最近從主產品被調到了火星,技術不到家的我感到壓力很大啊。提程式碼的時候也更加小心翼翼了,生怕搞壞了線上環境。
於是,我就打算給自己做個監控,好讓我能快速地發現問題,然後改掉bug。考慮到公司某些規定,就不詳細介紹了。下面簡單地描述一下,有個思路就行。
思路就是:
1. 分析Nginx錯誤日誌,用正則匹配出對應內容
2. SVN blame出錯誤程式碼的作者。
3. 藉助釘釘的群聊機器人,及時傳送出去。
在實現的過程中,遇到了很多問題。大概有這麼幾個:
1、 伺服器埠限制的比較死,不能單獨給自己開對外訪問的埠。
2、 重複錯誤的觸發。
3、 SVN blame認證問題(沒解決。。。)
... ...
整體準備採用Client和Server的模式。
Client內容如下:
使用一個Python指令碼,藉助crontab監控錯誤日誌,並正則匹配對應的內容,傳送給Server端處理。
#!/usr/bin python
# coding: utf8
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import re
import json
import urllib2
import time
# 2017/11/17 16:05:01 [error] 4004#0: *246620391 FastCGI sent in stderr: "PHP message: PHP Fatal error: Function name must be a string in /home/wwwroot/api.newtv.com/live.class.php on line 2242" while reading response header from upstream, client: 192.168.30.100, server: api.changbalive.com, request: "GET /api.php?ac=recordsingsong&curuserid=2635267&channelsrc=appstore&version=1.9.5&token=T777936552f7571e&bless=0&macaddress=A4D0E95D-AB54-48A1-BCC8-3EB0A530B2A7&ismember=0&openudid=d7be3882344bb889cd6c451880df1a834f1af960&systemversion=10.3.3&device=iPhone7,1&broken=0&songid=867712&secret=483f47528d HTTP/1.1", upstream: "fastcgi://127.0.0.1:9000", host: "api.changbalive.com"
# Compiled once at import time instead of on every call.
# Groups, in order: time of the error, error category, error description,
# full path of the failing file, failing line number.
# The \s* after "PHP message:" keeps the blank that follows the colon out of
# the category group (previously it was captured as " PHP Fatal error").
_ERRORLOG_PATTERN = re.compile(
    r"(.*?) \[error\] .*?PHP message:\s*(.*?): (.*?) in (.*?) on line (\d+).*"
)


def parse_errorlog(line):
    """Extract the PHP error details from one Nginx error-log line.

    Args:
        line: A single line of the Nginx error log.

    Returns:
        A list of 5-tuples of strings
        ``(time, category, description, full_path, line_number)``, one per
        match found in ``line``. The list is empty when the line does not
        carry a PHP message in the expected format.
    """
    return _ERRORLOG_PATTERN.findall(line)
def get_errorlog(type='api_newtv_error.log', log_dir='/var/log/nginx'):
    """Read every line of an error log file.

    Args:
        type: File name of the log inside ``log_dir``. The name shadows the
            ``type`` builtin but is kept because callers may pass it by
            keyword.
        log_dir: Directory that holds the log file. Defaults to the
            directory that used to be hard-coded.

    Returns:
        A list with one string per line of the file, line endings included.
        An empty file yields an empty list.

    Raises:
        IOError / OSError: If the file cannot be opened.
    """
    path = "{}/{}".format(log_dir, type)
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'r') as log_file:
        return log_file.readlines()
# TODO
# 使用HTTP請求,將找到的錯誤資訊emit到Server端
Server內容
用到的檔案大致有這麼幾個。
findbugauthor.sh receive.php utils.php
findbugauthor.sh
#!/usr/bin/env bash
# Print the author of one line of a file, as reported by `svn blame`.
#
# Usage: findbugauthor.sh DIRPATH FILENAME LINE
#   DIRPATH   directory (inside an SVN working copy) that contains the file
#   FILENAME  name of the file inside DIRPATH
#   LINE      1-based line number to look up
#
# The author is written to stdout as the last line of output.
#
# NOTE(review): the previous `export $PATH` expanded PATH's *value* and asked
# bash to export it as a variable name, which only produced a
# "not a valid identifier" message on stderr. That message is gone now, so any
# caller that merges stderr into stdout and picks the author by line position
# (instead of taking the last line) must be re-checked.

if [ "$#" -ne 3 ]; then
    echo "usage: $0 DIRPATH FILENAME LINE" >&2
    exit 64
fi

# Make sure child processes (svn, awk, ...) inherit PATH.
export PATH

DIRPATH=$1
FILENAME=$2
LINE=$3

# Stop instead of blaming a file in whatever directory we happen to be in.
cd "$DIRPATH" || exit 1

# `svn blame` prints "<revision> <author> <source line>"; keep column 2 of
# the requested line.
AUTHOR=$(svn blame "$FILENAME" | head -n "$LINE" | tail -n 1 | awk '{print $2}')
echo "$AUTHOR"
utils.php
<?php
// Runs as soon as this file is loaded: declares the response as HTML encoded in UTF-8.
header("Content-Type:text/html;charset=UTF-8");
/**
 * Build the identifier of an error location.
 *
 * @param string $path Full path of the file that raised the error.
 * @param int    $line Line number inside that file.
 * @return string MD5 hex digest of the path immediately followed by the line number.
 */
function getuniquecode($path, $line){
    return md5($path . $line);
}
/**
 * Send an HTTP request with cURL and return the response body.
 *
 * @param string $url     Target URL. Nothing is sent when it is empty.
 * @param string $post    Request body. When non-empty the request becomes a
 *                        POST labelled as application/json; otherwise no body
 *                        is sent.
 * @param int    $timeout Maximum number of seconds the whole transfer may take.
 * @return string|false|null The response body, false when cURL fails,
 *                           null when $url is empty.
 */
function phpPost( $url, $post = '', $timeout = 5 ){
    if( empty( $url ) ){
        return null;
    }
    $ch = curl_init();
    if( $ch === false ){
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Verify the server certificate and that it matches the host name.
    // Both checks used to be switched off, which lets anyone on the network
    // path impersonate the endpoint and read the request (including any
    // token carried in the URL).
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    if( !empty( $post ) ){
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type: application/json', 'Content-Length: ' . strlen($post)));
    }
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    $result = curl_exec($ch);
    curl_close($ch);
    return $result;
}
/**
 * Ask findbugauthor.sh who last changed a given line of a file.
 *
 * @param string $path Full path of the source file.
 * @param int    $line 1-based line number inside that file.
 * @return string The last line printed by the script (the script echoes the
 *                author as its final statement), or "" when it printed nothing.
 */
function getbashoutput($path, $line) {
    // dirname() instead of searching for the basename inside the path: the
    // search returned the wrong directory whenever the file name also
    // appeared earlier in the path (e.g. "/srv/a.php/a.php").
    $dirpath = dirname($path);
    $filename = basename($path);
    // Resolve the script next to this file rather than relative to the
    // current working directory.
    $script = __DIR__ . "/findbugauthor.sh";
    // Every argument is escaped: $path and $line originate from the HTTP
    // request and must not be able to inject shell syntax.
    $command = sprintf(
        "bash %s %s %s %d 2>&1",
        escapeshellarg($script),
        escapeshellarg($dirpath),
        escapeshellarg($filename),
        intval($line)
    );
    $output = array();
    exec($command, $output);
    if (empty($output)) {
        return "";
    }
    // stderr is merged into the output, so diagnostics may precede the
    // author; the author is always the final line.
    return trim(end($output));
}
//getbashoutput("/home/wwwroot/巴拉巴拉/你的檔案ListService.php", 142);
/**
 * Singleton wrapper around the Redis keys used for error bookkeeping:
 * a sorted set that counts occurrences per error code and a hash that stores
 * the date each error code was last notified.
 */
class RedisHelper{
    private static $_instance = null;
    // NOTE(review): the "zpi" prefix differs from the "api" prefix of
    // NOTIFY_KEY and looks like a typo. It is kept as-is because existing
    // counters are stored under this key.
    const APINEWTVERROR_KEY = "zpinewtvcom:error:zset";
    const NOTIFY_KEY = "apinewtvcom:notify:hash";

    // Connection handle; declared explicitly instead of being created as a
    // dynamic property in the constructor.
    private $redis;

    /** Connect to the local Redis server (7 second connect timeout). */
    private function __construct(){
        $this->redis = new Redis();
        $this->redis->connect("127.0.0.1", 6379, 7);
    }

    /**
     * Return the shared instance, creating it on first use.
     *
     * @return RedisHelper
     */
    public static function getInstance(){
        if( self::$_instance === null ) {
            self::$_instance = new RedisHelper();
        }
        return self::$_instance;
    }

    /**
     * Add to the occurrence counter of an error code.
     *
     * @param string $errorcode Error identifier.
     * @param int    $number    Amount to add (previously ignored: the
     *                          counter always grew by 1).
     */
    public function incrErrorNumber($errorcode, $number=1){
        $this->redis->zIncrBy(self::APINEWTVERROR_KEY, $number, $errorcode);
    }

    /**
     * Return the occurrence counter of an error code.
     *
     * @param string $errorcode Error identifier.
     * @return int The stored score converted with intval().
     */
    public function getErrorNumber($errorcode) {
        return intval($this->redis->zScore(self::APINEWTVERROR_KEY, $errorcode));
    }

    /**
     * Record today's date (format Ymd) as the notification date of an error code.
     *
     * @param string $errorcode Error identifier.
     */
    public function updateNotifyDate($errorcode){
        $this->redis->hSet(self::NOTIFY_KEY, $errorcode, date("Ymd"));
    }

    /**
     * Return the stored notification date of an error code.
     *
     * @param string $errorcode Error identifier.
     * @return mixed Whatever hGet returns for that field.
     */
    public function getNotifyDate($errorcode) {
        return $this->redis->hGet(self::NOTIFY_KEY, $errorcode);
    }

    /**
     * Return the most frequent errors with their counters.
     *
     * @param int $number Maximum number of entries to return.
     * @return array Highest-scored members first, fetched together with
     *               their scores; empty when $number is not positive.
     */
    public function getFrequentErrors($number=7) {
        $number = intval($number);
        if ($number <= 0) {
            // A stop index of -1 would mean "up to the last member", i.e. everything.
            return array();
        }
        // The stop index is inclusive, so N entries are the range 0 .. N-1
        // (the previous 0 .. N returned N+1 entries).
        return $this->redis->zRevRange(self::APINEWTVERROR_KEY, 0, $number - 1, true);
    }
}
// Skeleton of a DingTalk robot "text" message, stored as JSON text.
// The trailing comma after the last atMobiles entry was removed: it made the
// document invalid JSON, so json_decode() returned null for it.
$template = <<<EOF
{
    "msgtype": "text",
    "text": {
        "content": "我就是我, 是不一樣的煙火"
    },
    "at": {
        "atMobiles": [
            "156xxxx8827",
            "189xxxx8325"
        ],
        "isAtAll": false
    }
}
EOF;
/**
 * Singleton that posts text messages to a DingTalk group robot webhook.
 */
class Notifier{
    private static $instance = null;

    // Webhook URL; declared explicitly instead of being created as a
    // dynamic property in the constructor.
    private $url;

    private function __construct() {
        $this->url = "https://oapi.dingtalk.com/robot/send?access_token=b716e1f39b7fc7afbea04b2巴拉巴拉d4bb79db65a117d589f886d1757";
    }

    /**
     * Return the shared instance, creating it on first use.
     *
     * @return Notifier
     */
    public static function getInstance(){
        if(self::$instance === null) {
            self::$instance = new Notifier();
        }
        return self::$instance;
    }

    /**
     * Send a text message through the webhook.
     *
     * @param string $msg Message text.
     * @return mixed Whatever phpPost() returns for the request.
     */
    public function notify($msg){
        global $template;
        $data = is_string($template) ? json_decode($template, true) : null;
        if (!is_array($data)) {
            // Missing or undecodable template: start from an empty message;
            // every field that is sent is assigned below anyway.
            $data = array();
        }
        $data['msgtype'] = "text";
        $data['text']['content'] = $msg;
        // Sent as a string: an 11-digit number does not fit a 32-bit integer
        // and would be encoded as a float there.
        $data['at']['atMobiles'] = array("15801479216");
        $data['at']['isAtAll'] = false;
        return phpPost($this->url, json_encode($data));
    }
}
receive.php
<?php
// Entry point that receives one parsed error report, counts it, and sends a
// notification when the error is among the most frequent ones.
header("Content-Type:text/html;charset=UTF-8");
require __DIR__."/utils.php";

$time = isset($_REQUEST['time']) ? strval($_REQUEST['time']) : "";
$level = isset($_REQUEST['level']) ? strval($_REQUEST['level']) : "";
$description = isset($_REQUEST['description']) ? strval($_REQUEST['description']) : "";
$fullpath = isset($_REQUEST['fullpath']) ? strval($_REQUEST['fullpath']) : "";
$linenumber = isset($_REQUEST['linenumber']) ? intval($_REQUEST['linenumber']) : 0;

if(empty($time) || empty($level) || empty($description) || empty($fullpath) || empty($linenumber)) {
    echo json_encode(array("errcode"=>-1, "errmsg"=>"請求引數不完整"));
    // Stop here: an incomplete report used to be counted and processed anyway.
    exit;
}

$errorcode = getuniquecode($fullpath, $linenumber);
$helper = RedisHelper::getInstance();
$helper->incrErrorNumber($errorcode);

// The result is keyed by error code. Look the code up as a key instead of
// comparing intval() of two MD5 strings: every hash that starts with a letter
// converts to 0, so unrelated errors compared as equal and one request could
// trigger several notifications.
$errors = $helper->getFrequentErrors(7);
if(is_array($errors) && array_key_exists($errorcode, $errors)) {
    // Resolve the author only when a message is actually sent; it spawns an
    // external process.
    $bugauthor = getbashoutput($fullpath, $linenumber);
    $msg = "Bug 時間:{$time}\nBug級別:{$level}\n錯誤描述:{$description}\n檔案全路徑:{$fullpath}\n出錯行數:{$linenumber}\n程式碼負責人:{$bugauthor}\n";
    Notifier::getInstance()->notify($msg);
}

// Replaces the var_dump() debugging output with a response in the same shape
// as the error response above.
echo json_encode(array("errcode"=>0, "errmsg"=>"ok"));
實現的效果
不足之處
報警的觸發機制還沒完善,其實這塊要做的內容會很多的,根據不同的場景選擇不同的策略很重要,但是要做到靈活的處理,沒有一個很好的設計是不行的,這裡有興趣的可以自己思考思考。
crontab的時間間隔也是個問題,太小了對伺服器壓力稍微有一點點的影響(雖然這基本上也沒什麼影響,但是時間片太小了,觸發機制就得跟著更改下);時間片太大了,報警的靈敏度就下降了,也就失去了報警的意義。
總的來說,思路很簡單,但是真的去實現起來並能很好的應用到開發中,還是有很長的路要走的。這裡就當是拋磚引玉吧。