告警系統對接原則:以指令碼執行的返回值為準,返回值為0表示正常,返回值為1表示異常;異常時告警系統會根據預先配置的內容發出告警簡訊或郵件。
mysql撥測監控告警
#!/bin/bash
# MySQL liveness probe: runs "select 1" against the local instance.
# Exit status: 0 when the query yields 1, 1 in every other case.
probe=$(/apps/svr/mysql_3306/bin/mysql -uuserAndPassword -puserAndPassword -h127.0.0.1 -N -e "select 1" 2>/dev/null)

# Guard clause: anything that does not evaluate to 1 is a failed probe.
if ! [[ $probe -eq 1 ]]; then
  echo "ERROR,select 1 is not OK"
  exit 1
fi

echo "select 1 is OK"
exit 0
mysql連線數超過90%告警
#!/bin/bash
# MySQL connection-usage check.
# Compares the number of open connections (Threads_connected) with
# max_connections and alarms once usage reaches 90%.
# Exit status: 0 = usage below 90%,
#              1 = usage at or above 90%, or the values could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin

# Run one statement and print the value column of its first row.
query_value() {
  "$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N -e "$1" 2>/dev/null \
    | awk 'NR == 1 {print $2}'
}

re_status=$(query_value "show global status like 'Threads_connected'")
re_variables=$(query_value "show global variables like 'max_connections'")

# A missing or non-numeric answer means the server could not be queried (for
# instance because every connection slot is already taken); never report
# that situation as healthy.
if [[ ! $re_status =~ ^[0-9]+$ || ! $re_variables =~ ^[0-9]+$ ]] || (( re_variables == 0 )); then
  echo "ERROR,unable to read Threads_connected/max_connections"
  exit 1
fi

# Integer form of "used / max * 100 < 90"; a numeric comparison replaces the
# former lexicographic string test, which ranked "100.00%" below "90%".
if (( re_status * 100 < re_variables * 90 )); then
  echo "連線數正常"
  exit 0
else
  echo "當前連線數超過90%,告警!"
  exit 1
fi
mysql主從狀態監控異常告警
#!/bin/bash
# MySQL replication check.
# Alarms when replication is not running, and also when
# Seconds_Behind_Master is greater than 60 seconds.
# The monitoring account needs the "replication client" privilege:
#   grant replication client on *.* to repl@'%' ;
# Exit status: 0 = replication healthy, or this host has no slave status
#                  (master / standalone),
#              1 = replication stopped, lag > 60s, or the query failed.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
DATE=$(date +"%Y-%m-%d %H:%M:%S") # current date and time

# Query once so that every field below comes from the same snapshot, and
# check the client's exit status so an unreachable server is not mistaken
# for a master.
if ! slave_status=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -S /apps/run/mysql_3306/mysql.sock -e "show slave status\G" 2>/dev/null); then
  echo "[$DATE] [ERROR] Unable to query slave status!"
  exit 1
fi

# No rows at all: this host is a master or a standalone instance.
if [[ -z $slave_status ]]; then
  echo "Master"
  exit 0
fi

# Print the value of the "Name: value" line whose name equals $1.
slave_field() {
  awk -F': ' -v key="$1" '
    { name = $1; sub(/^[ \t]+/, "", name) }
    name == key { print $2; exit }
  ' <<<"$slave_status"
}

io_running=$(slave_field Slave_IO_Running)
sql_running=$(slave_field Slave_SQL_Running)
seconds_behind=$(slave_field Seconds_Behind_Master)

if [[ $io_running == Yes && $sql_running == Yes ]]; then
  # Both replication threads are running.
  echo "[$DATE] [INFO] Master-slave synchronization is running!"
else
  # Replication is broken: alarm.
  echo "[$DATE] [ERROR] Master-slave synchronization is not running!"
  exit 1
fi

# Lag check; a non-numeric value such as NULL is skipped.
if [[ $seconds_behind =~ ^[0-9]+$ ]] && (( seconds_behind > 60 )); then
  echo "[$DATE] [ERROR] Master-slave synchronization delay time is greater than 60 seconds!"
  exit 1
fi

exit 0
mysql叢集未提交長事務監控異常告警
#!/bin/bash
# Long-running transaction check.
# Counts InnoDB transactions that have been open for more than 60 seconds.
# Exit status: 1 when more than 50 such transactions exist, 0 otherwise.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
DATE=$(date +"%Y-%m-%d %H:%M:%S")

# The statement is kept in a variable so the pipeline below stays readable.
sql="select a.id as conn_id, time_to_sec(timediff(now(),b.trx_started)) as trx_open_seconds from information_schema.processlist a right outer join information_schema.innodb_trx b on a.id = b.trx_mysql_thread_id where time_to_sec(timediff(now(),b.trx_started))>60;"

# Drop the header row (it contains the column alias), then count what is left.
MYSQLCOMMIT=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -e "$sql" 2>/dev/null \
  | grep -v trx_open_seconds \
  | wc -l)

if (( MYSQLCOMMIT <= 50 )); then
  echo "[$DATE] [INFO] 事務超過60秒未提交數量: $MYSQLCOMMIT"
  exit 0
fi

echo "[$DATE] [WARNING] 事務超過60秒未提交數量超過50個!"
exit 1
mysql快取命中率
#!/bin/bash
# InnoDB buffer pool read hit-ratio check.
#   ratio = (1 - Innodb_buffer_pool_reads / Innodb_buffer_pool_read_requests) * 100
# Exit status: 0 = ratio is at least 99,
#              1 = ratio below 99, or the counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
Innodb_buffer_read_hit_ratio=0
DATE=$(date +"%Y-%m-%d %H:%M:%S")

# Read both counters and store the ratio in Innodb_buffer_read_hit_ratio.
# Returns non-zero when a counter is missing, non-numeric, or the request
# count is zero (the ratio is undefined in that case).
getBufferRatio() {
  local counters
  counters=$("/data01/svr/mysql_${port}/bin/mysql" -u"${user}" -p"${password}" -h"${host}" --port="${port}" -N \
    -e "show global status like 'Innodb_buffer_pool%';" 2>/dev/null)

  # Select each counter by its exact name instead of by output position.
  Innodb_buffer_pool_read_requests=$(awk '$1 == "Innodb_buffer_pool_read_requests" {print $2}' <<<"$counters")
  Innodb_buffer_pool_reads=$(awk '$1 == "Innodb_buffer_pool_reads" {print $2}' <<<"$counters")

  [[ $Innodb_buffer_pool_read_requests =~ ^[0-9]+$ ]] || return 1
  [[ $Innodb_buffer_pool_reads =~ ^[0-9]+$ ]] || return 1
  (( Innodb_buffer_pool_read_requests > 0 )) || return 1

  Innodb_buffer_read_hit_ratio=$(awk -v reads="$Innodb_buffer_pool_reads" \
    -v requests="$Innodb_buffer_pool_read_requests" \
    'BEGIN { printf "%.2f\n", (1 - reads / requests) * 100 }')
}

if ! getBufferRatio; then
  echo "[$DATE] [ERROR] unable to read Innodb_buffer_pool counters!"
  exit 1
fi

echo "$(date "+%Y-%m-%d_%H:%M:%S") $Innodb_buffer_read_hit_ratio"

# Compare on the integer part of the percentage.
if [ "${Innodb_buffer_read_hit_ratio%.*}" -lt 99 ]; then
  echo "[$DATE] [WARNING] buffer命中率低於99!"
  exit 1
else
  echo "[$DATE] [INFO] buffer命中率: $Innodb_buffer_read_hit_ratio"
  exit 0
fi
MySQL鎖表監控告警
#!/bin/bash
# InnoDB row-lock wait check.
# Reads Innodb_row_lock_current_waits and alarms when it is above zero.
# Exit status: 0 = no waits, 1 = waits present or the value is unavailable.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
DATE=$(date +"%Y-%m-%d %H:%M:%S")

a=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N \
  -e "SHOW STATUS LIKE 'Innodb_row_lock_current_waits'" 2>/dev/null \
  | awk '$1 == "Innodb_row_lock_current_waits" {print $2}')

# An empty value used to make the numeric test fail and fall through to the
# "normal" branch; treat it as an error instead.
if [[ ! $a =~ ^[0-9]+$ ]]; then
  echo "[$DATE] [ERROR] unable to read Innodb_row_lock_current_waits!"
  exit 1
fi

if (( a > 0 )); then
  echo "[$DATE] [WARNING] 出現鎖表!!"
  exit 1
else
  echo "[$DATE] [INFO] 鎖表檢查正常。"
  exit 0
fi
QPS大於10000告警
#!/bin/bash
# QPS check.
# Ratio is Questions / Uptime as reported by "mysqladmin status", i.e. the
# average number of questions per second since the server started.
# Exit status: 0 = ratio is at most 10000,
#              1 = ratio above 10000, or the status could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
Ratio=0
DATE=$(date +"%Y-%m-%d %H:%M:%S")

# Fetch the status line once and set Uptime and Questions from it, so both
# numbers belong to the same sample.
QPS() {
  local status_line
  status_line=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" status 2>/dev/null)
  Uptime=$(awk '{print $2}' <<<"$status_line")
  Questions=$(awk '{print $6}' <<<"$status_line")
}
QPS

# Reject missing values and a zero uptime before dividing.
if [[ ! $Uptime =~ ^[0-9]+$ || ! $Questions =~ ^[0-9]+$ ]] || (( Uptime == 0 )); then
  echo "[$DATE] [ERROR] unable to read Uptime/Questions from mysqladmin status!"
  exit 1
fi

Ratio=$(awk -v questions="$Questions" -v uptime="$Uptime" 'BEGIN { printf "%.2f\n", questions / uptime }')

# Compare on the integer part of the ratio.
if [ "${Ratio%.*}" -gt 10000 ]; then
  echo "[$DATE] [WARNING] QPS大於10000!"
  exit 1
else
  echo "[$DATE] [INFO] 當前QPS為: $Ratio"
  exit 0
fi
TPS大於4000告警
#!/bin/bash
# TPS check.
# TpsRatio is (Com_commit + Com_rollback) / Uptime, i.e. the average number
# of transactions per second since the server started.
# Exit status: 0 = ratio is at most 4000,
#              1 = ratio above 4000, or the counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
DATE=$(date +"%Y-%m-%d %H:%M:%S")

Uptime=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" status 2>/dev/null | awk '{print $2}')

# Read both counters from a single extended-status call. Rows look like
# "| Com_commit | 123 |", so the name is field 2 and the value is field 4.
# An exact name match avoids the gawk-only \< \> word anchors.
TPS() {
  local ext_status
  ext_status=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" extended-status 2>/dev/null)
  rollback=$(awk '$2 == "Com_rollback" {print $4}' <<<"$ext_status")
  commit=$(awk '$2 == "Com_commit" {print $4}' <<<"$ext_status")
}
TPS

# Reject missing values and a zero uptime before doing any arithmetic.
if [[ ! $Uptime =~ ^[0-9]+$ || ! $rollback =~ ^[0-9]+$ || ! $commit =~ ^[0-9]+$ ]] || (( Uptime == 0 )); then
  echo "[$DATE] [ERROR] unable to read Uptime/Com_commit/Com_rollback!"
  exit 1
fi

TpsRatio=$(awk -v total="$((rollback + commit))" -v uptime="$Uptime" 'BEGIN { printf "%.2f\n", total / uptime }')

# Compare on the integer part of the ratio.
if [ "${TpsRatio%.*}" -gt 4000 ]; then
  echo "[$DATE] [WARNING] TPS大於4000!"
  exit 1
else
  echo "[$DATE] [INFO] 當前TPS為: $TpsRatio"
  exit 0
fi
haproxy後端狀態監測
#!/bin/bash
# HAProxy backend status check.
# Fetches the status page, keeps the lines that mention one of the listed
# backends, and counts how many of those contain DOWN.
# Exit status: 0 only when that count is exactly 2, 1 otherwise.
down_count=$(curl -s http://admin:admin@10.172.95.97:2000/status \
  | grep -E 'order01|base|cust|idservice|irsc|sec|upc|ewe|ftpgw ' \
  | grep -c DOWN)

[[ $down_count = 2 ]] && exit 0
exit 1
JAVA記憶體溢位告警-OutOfMemory
#!/bin/bash
# Java OutOfMemoryError check.
# Scans the last 1000 lines of today's service log (file name suffix is the
# current month and day, MMDD) for java.lang.OutOfMemoryError.
# Exit status: 0 = no occurrence found, 1 = at least one occurrence.

# $(...) nests safely; the former nested backticks ended the substitution
# right before "date", so the pipeline never ran as written.
log_file="/apps/logs/svc/svc-node01-order01-$(date +%m%d).log"

# -F: match the class name literally (the dots are not regex wildcards).
result=$(tail -n 1000 "$log_file" | grep -c -F 'java.lang.OutOfMemoryError')

if [[ $result == 0 ]]; then
  exit 0
else
  exit 1
fi
redis叢集狀態異常告警
#!/bin/bash
# Redis cluster state check.
# Counts the "cluster info" lines that mention cluster_state and contain ok.
# Exit status: 0 when exactly one such line exists, 1 otherwise.
ok_lines=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | grep cluster_state \
  | grep -c ok)

case $ok_lines in
  1) exit 0 ;;
  *) exit 1 ;;
esac
redis節點狀態監測告警
#!/bin/bash
# Redis cluster node-count check.
# Reads cluster_known_nodes from "cluster info".
# Exit status: 0 when the cluster knows exactly 6 nodes, 1 otherwise.

# redis-cli prints the reply with CRLF line endings; without removing the
# carriage return the value would be "6\r" and never compare equal to 6.
result=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F ":" '$1 == "cluster_known_nodes" {print $2}')

if [[ $result == 6 ]]; then
  exit 0
else
  exit 1
fi
redis_slot異常告警
#!/bin/bash
# Redis slot coverage check.
# Reads cluster_slots_ok from "cluster info".
# Exit status: 0 when all 16384 slots are reported ok, 1 otherwise.

# Strip the carriage returns redis-cli emits at the end of each line so the
# extracted number can be compared exactly.
result=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F ":" '$1 == "cluster_slots_ok" {print $2}')

if [[ $result == 16384 ]]; then
  exit 0
else
  exit 1
fi
keepalivedVIP連通性監控告警
#!/bin/sh
# keepalived VIP reachability check.
# Collects IPv4 addresses from keepalived.conf (skipping lines that mention
# real servers, smtp_server, 127.0.0.1 or a '#') and pings each of them.
# Exit status: 0 = every address answered with 0% packet loss, 1 otherwise.
VIPS=$(grep -v real /etc/keepalived/keepalived.conf \
  | grep -v smtp_server \
  | grep -v 127.0.0.1 \
  | grep -v '#' \
  | grep -E -o "\b([0-9]{1,3}\.){3}[0-9]{1,3}\b" \
  | sort -nu)

num=0
# Word splitting on $VIPS is intentional: one address per word.
for ips in $VIPS; do
  # Take the integer in front of "% packet loss" wherever it appears on the
  # summary line, instead of relying on a fixed field position.
  result=$(ping -w 2 -c 3 "${ips}" | sed -n 's/.* \([0-9]*\)% packet loss.*/\1/p')
  # Anything other than exactly 0 (including no summary at all) is a failure.
  if [ "$result" != 0 ]; then
    # POSIX arithmetic: "let" is not available in every /bin/sh.
    num=$((num + 1))
  fi
done

if [ "$num" -eq 0 ]; then
  exit 0
else
  exit 1
fi
keepalivedVIP丟失告警
#!/bin/bash
# VIP loss check.
# Compares the current number of non-loopback IPv4 addresses with the number
# saved in /tmp/check_vip.log by a previous run.
# Exit status: 0 when the counts match, 1 when they differ (the saved count
# is then replaced by the current one). On the very first run the file is
# only created.
state_file=/tmp/check_vip.log
ip_count=$(ip a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | wc -l)

if [ -f "$state_file" ]; then
  vip_count=$(sudo cat "$state_file")
  if [ "$ip_count" != "$vip_count" ]; then
    # Count changed: store the new baseline and alarm.
    echo $ip_count > "$state_file"
    exit 1
  fi
  echo "vip ok"
  exit 0
else
  # First run: record the baseline.
  ip a | grep inet | grep -v 127.0.0.1 | grep -v inet6 | wc -l > "$state_file"
fi
keepalived腦裂預警
這裡解釋一下:由於腦裂的驗證需要結合多個節點的情況,而監控指令碼應儘量不做得過於複雜,因此這裡只在keepalived備節點做監控,只要發現VIP切換到備節點即發出告警,再由人工介入檢查是否出現腦裂。
#!/bin/bash
# keepalived split-brain early warning.
# Counts IPv4 addresses configured with a /32 prefix on this host.
# Exit status: 0 = none found, 1 = at least one /32 address is present.

# Match only "inet <addr>/32"; a bare "32" also occurs in ordinary addresses,
# MAC addresses and lifetimes, which caused false alarms.
result=$(ip addr | grep -cE 'inet [0-9.]+/32( |$)')

if [[ $result == 0 ]]; then
  exit 0
else
  echo "keepalived從節點出現32位的vip,可能出現腦裂現象"
  exit 1
fi
rocketmq叢集節點數量監控告警
#!/bin/bash
# RocketMQ cluster size check.
# Counts the lines of "mqadmin clusterList" that do not contain "Version".
# Exit status: 0 = at least 4 lines, 1 = fewer than 4.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
declare -x JAVA_HOME="/apps/tools/jdk"

NumCluster=$(sudo -E /apps/svr/mqbroker/rocketmq/bin/mqadmin clusterList -n 127.0.0.1:9876 | grep -v Version | wc -l)

# The warning is about too few nodes, so the test must be "less than";
# it used to be -gt, which contradicted the message below.
if [ "$NumCluster" -lt 4 ]; then
  echo "[$DATE] [WARNING] rocketMQ叢集節點小於4個!"
  exit 1
else
  echo "[$DATE] [INFO] rocketMQ叢集節點數量為: $NumCluster"
  exit 0
fi
rocketmq訊息數量異常告警
#!/bin/bash
# RocketMQ topic count check.
# Counts topicList entries that contain both Blue and CIDC and do not
# contain RETRY.
# Exit status: 1 when fewer than 50 such topics exist, 0 otherwise.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
export JAVA_HOME=/apps/tools/jdk
export JAVA_BIN=/apps/tools/jdk/bin

NumTopic=$(sudo -E /apps/svr/mqbroker/rocketmq/bin/mqadmin topicList -n 127.0.0.1:9876 2>/dev/null \
  | grep Blue \
  | grep CIDC \
  | grep -v RETRY \
  | wc -l)

if (( NumTopic >= 50 )); then
  echo "[$DATE] [INFO] rocketMQ訊息topic數量當前為: $NumTopic"
  exit 0
fi

echo "[$DATE] [WARNING] rocketMQ訊息主題小於50個!"
exit 1
logstash日誌報錯告警
#!/bin/bash
# Logstash log check.
# Looks for "error" in the last 2000 lines of the logstash log.
# Exit status: 0 = no matching line, 1 = at least one matching line.
error_lines=$(tail -n 2000 /apps/logs/logstash/logstash-plain.log | grep -c 'error')

[[ $error_lines = 0 ]] && exit 0
exit 1
elasticsearch叢集個數異常告警
#!/bin/bash
# Elasticsearch node-count check.
# Reads number_of_nodes from the cluster health API.
# Exit status: 0 when the cluster reports exactly 4 nodes, 1 otherwise.

# The URL is quoted so the shell never treats "?" as a glob character.
# Keeping only the digits of the value removes the leading blank and the
# trailing comma; the old pipeline left " 4", which never equalled 4.
result=$(curl -s 'http://10.172.95.1:9201/_cluster/health?pretty' \
  | awk -F ":" '/"number_of_nodes"/ { gsub(/[^0-9]/, "", $2); print $2; exit }')

if [[ $result == 4 ]]; then
  exit 0
else
  exit 1
fi
elasticsearch資料節點個數異常告警
#!/bin/bash
# Elasticsearch data-node count check.
# Reads number_of_data_nodes from the cluster health API.
# Exit status: 0 when the cluster reports exactly 4 data nodes, 1 otherwise.

# Quote the URL (it contains "?") and reduce the value to its digits so no
# leading blank or trailing comma can break the comparison.
result=$(curl -s 'http://10.172.95.1:9201/_cluster/health?pretty' \
  | awk -F ":" '/"number_of_data_nodes"/ { gsub(/[^0-9]/, "", $2); print $2; exit }')

if [[ $result == 4 ]]; then
  exit 0
else
  exit 1
fi
elasticsearch_java記憶體使用已超過48G
#!/bin/bash
# Elasticsearch JVM heap usage check.
# Reads heap_used_in_bytes from the cluster stats API and alarms when it is
# above 48G.
# Exit status: 0 = at or below the limit,
#              1 = above the limit, or the value could not be read.
# 51539607552 = 48G
limit_bytes=51539607552

result=$(curl -s 'http://10.172.95.1:9201/_cluster/stats?pretty' \
  | awk -F ":" '/"heap_used_in_bytes"/ { gsub(/[^0-9]/, "", $2); print $2; exit }')

# Without a numeric value the state is unknown; do not report it as healthy.
if [[ ! $result =~ ^[0-9]+$ ]]; then
  exit 1
fi

# Numeric comparison; the old [[ ... > ... ]] compared strings, and its exit
# codes were swapped (it returned 0 when the limit was exceeded).
if (( result > limit_bytes )); then
  exit 1
else
  exit 0
fi
zookeeper日誌告警
#!/bin/bash
# Log error check.
# Looks for "error" in the last 2000 lines of the log file below.
# Exit status: 0 = no matching line, 1 = at least one matching line.
# NOTE(review): this script is listed as the zookeeper log alarm but reads
# the logstash log; presumably a copy-paste leftover. Confirm the intended
# zookeeper log path.
log_file=/apps/logs/logstash/logstash-plain.log
matches=$(tail -n 2000 "$log_file" | grep -c 'error')

case $matches in
  0) exit 0 ;;
  *) exit 1 ;;
esac
zookeeper叢集follower-mode變更告警
#!/bin/bash
# ZooKeeper follower-mode change check.
# Counts the lines of "zkServer.sh status" that contain "follower".
# Exit status: 0 = no such line, 1 = at least one such line.
follower_lines=$(/apps/sh/zk/zkServer.sh status | grep -c follower)

if [[ $follower_lines != 0 ]]; then
  exit 1
fi
exit 0
zookeeper叢集leader-mode變更告警
#!/bin/bash
# ZooKeeper leader-mode change check.
# Counts the lines of "zkServer.sh status" that contain "leader".
# Exit status: 0 = no such line, 1 = at least one such line.
leader_lines=$(/apps/sh/zk/zkServer.sh status | grep -c leader)

[[ $leader_lines = 0 ]] || exit 1
exit 0
api介面層面監控告警
#!/bin/bash
# API-level probe.
# Sends one request to the endpoint below and counts the response lines that
# contain OK.
# Exit status: 0 when exactly one line contains OK, 1 otherwise.
url='http://10.172.95.186:8000/emop?appId=600006&method=SYAN_UNHQ_queryOfferStatus&channelTypeId=0&flowdId=202006091314501278181&format=json&status=1%0A'
payload='{ "productType": "vm"}'

# -w "\n" appends a newline so the last response line is terminated.
ok_lines=$(curl --location --request GET "$url" \
  --header 'Content-Type: text/plain' \
  --data "$payload" \
  -w "\n" \
  | grep -c OK)

case $ok_lines in
  1) exit 0 ;;
  *) exit 1 ;;
esac