監控系統告警指令碼集合

AoGang發表於2021-05-08

告警系統對接原則:以指令碼執行的返回值為準,返回值為0表示正常,返回值為1表示異常;告警系統會根據預先配置的內容發出告警簡訊或郵件。

mysql撥測監控告警

#!/bin/bash
# MySQL liveness probe: runs "select 1" against the local instance and
# reports through the exit status (0 = healthy, 1 = unhealthy).
mysql_bin=/apps/svr/mysql_3306/bin/mysql

# stderr is discarded, so only the query result (if any) is captured.
probe_output=$("$mysql_bin" -uuserAndPassword -puserAndPassword -h127.0.0.1 -N -e "select 1" 2>/dev/null)

# Anything other than a numeric 1 counts as a failed probe.
if ! [[ $probe_output -eq 1 ]]; then
 echo "ERROR,select 1 is not OK"
 exit 1
fi

echo "select 1 is OK"
exit 0

mysql連線數超過90%告警

#!/bin/bash
# Alerts when the number of open MySQL connections reaches 90% of
# max_connections.
# Exit status: 0 = normal, 1 = alert (also used when the counters cannot
# be read from the server).
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
threshold=90    # alert threshold, percent of max_connections

# Threads_connected is the number of currently open connections, i.e. the
# quantity that max_connections limits (Threads_running only counts the
# threads that are not sleeping).
re_status=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N \
  -e "show global status like 'Threads_connected'" 2>/dev/null | awk '{print $2}')

re_variables=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N \
  -e "show global variables like 'max_connections'" 2>/dev/null | awk '{print $2}')

# Both values must be integers and the divisor must be non-zero; otherwise
# the server could not be queried and the check must not report "normal".
if ! [[ $re_status =~ ^[0-9]+$ && $re_variables =~ ^[1-9][0-9]*$ ]]; then
 echo "ERROR, unable to read connection counters from MySQL"
 exit 1
fi

# Values are passed with -v instead of being spliced into the awk program.
result=$(awk -v used="$re_status" -v max="$re_variables" \
  'BEGIN { printf "%.2f", used / max * 100 }')

# Compare numerically: a string comparison would rank "100.00" below "90".
if awk -v pct="$result" -v limit="$threshold" 'BEGIN { exit !(pct + 0 < limit + 0) }'; then
 echo "連線數正常"
 exit 0
else
 echo "當前連線數超過90%,告警!"
 exit 1
fi

mysql主從狀態監控異常告警

#!/bin/bash
# MySQL replication monitor.
#   - Exit 1 when the replication IO thread or SQL thread is not running.
#   - Exit 1 when Seconds_Behind_Master is greater than 60 seconds.
#   - Print "Master" and exit 0 when "show slave status" returns nothing
#     (the instance is a master or a standalone server).
# The monitoring account needs the "replication client" privilege, e.g.:
#   grant replication client on *.* to repl@'%';
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
sock=/apps/run/mysql_3306/mysql.sock
max_delay=60    # maximum tolerated replication delay, in seconds

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

# Query once and reuse the output for every field.
slave_status=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -S "$sock" \
  -e "show slave status\G" 2>/dev/null)

# No output at all: this instance is not a replica.
if [ -z "$slave_status" ]; then
  echo "Master"
  exit 0
fi

# Prints the value of one "Name: value" row of the \G output. The name is
# matched exactly, so rows such as Slave_SQL_Running_State or error texts
# that merely contain a similar word cannot shift the result.
slave_field() {
  awk -F': *' -v key="$1" '
    { name = $1; sub(/^[ \t]+/, "", name) }
    name == key { print $2; exit }
  ' <<<"$slave_status"
}

io_running=$(slave_field Slave_IO_Running)
sql_running=$(slave_field Slave_SQL_Running)
seconds_behind=$(slave_field Seconds_Behind_Master)

if [ "$io_running" = "Yes" ] && [ "$sql_running" = "Yes" ]; then
  echo "[$DATE] [INFO] Master-slave synchronization is running!"
else
  # Replication is broken.
  echo "[$DATE] [ERROR] Master-slave synchronization is not running!"
  exit 1
fi

# Seconds_Behind_Master may be NULL; only compare when it is a number.
if [[ $seconds_behind =~ ^[0-9]+$ ]] && [ "$seconds_behind" -gt "$max_delay" ]; then
  echo "[$DATE] [ERROR] Master-slave synchronization delay time is greater than 60 seconds!"
  exit 1
fi

exit 0

mysql叢集未提交長事務監控異常告警

#!/bin/bash
# Counts InnoDB transactions that have been open for more than 60 seconds
# and exits 1 when there are more than 50 of them, 0 otherwise.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin

DATE=$(date +"%Y-%m-%d %H:%M:%S")

# One result row per transaction older than 60 seconds.
long_trx_sql="select a.id as conn_id, time_to_sec(timediff(now(),b.trx_started)) as trx_open_seconds from information_schema.processlist a right outer join information_schema.innodb_trx b on a.id = b.trx_mysql_thread_id  where  time_to_sec(timediff(now(),b.trx_started))>60;"

# The header row is filtered out before counting lines.
MYSQLCOMMIT=$($pwd/mysql -u$user -p$password -h$host --port=$port -e "$long_trx_sql" 2>/dev/null \
  | grep -v trx_open_seconds \
  | wc -l)

if (( MYSQLCOMMIT > 50 )); then
    echo "[$DATE] [WARNING] 事務超過60秒未提交數量超過50個!"
    exit 1
fi

echo "[$DATE] [INFO] 事務超過60秒未提交數量: $MYSQLCOMMIT"
exit 0

mysql快取命中率

#!/bin/bash
# InnoDB buffer pool read hit ratio check.
# ratio = (1 - Innodb_buffer_pool_reads / Innodb_buffer_pool_read_requests) * 100
# Exit status: 0 = ratio is at least 99, 1 = ratio below 99 or the
# counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
min_ratio=99    # minimum acceptable hit ratio, percent

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines
Innodb_buffer_read_hit_ratio=0

# Reads the two buffer pool counters and stores the hit ratio (two
# decimals) in Innodb_buffer_read_hit_ratio.
# Returns non-zero when the counters are missing or the divisor is zero.
getBufferRatio() {
    local status_rows
    status_rows=$(/data01/svr/mysql_${port}/bin/mysql -u"${user}" -p"${password}" -h"${host}" --port="${port}" -N \
        -e "show global status like 'Innodb_buffer_pool_read%';" 2>/dev/null)

    # Match the variable name exactly; -N output is "name<TAB>value".
    Innodb_buffer_pool_read_requests=$(awk '$1 == "Innodb_buffer_pool_read_requests" { print $2 }' <<<"$status_rows")
    Innodb_buffer_pool_reads=$(awk '$1 == "Innodb_buffer_pool_reads" { print $2 }' <<<"$status_rows")

    [[ $Innodb_buffer_pool_reads =~ ^[0-9]+$ ]] || return 1
    [[ $Innodb_buffer_pool_read_requests =~ ^[1-9][0-9]*$ ]] || return 1

    Innodb_buffer_read_hit_ratio=$(awk -v reads="$Innodb_buffer_pool_reads" \
        -v requests="$Innodb_buffer_pool_read_requests" \
        'BEGIN { printf "%.2f\n", (1 - reads / requests) * 100 }')
}

if ! getBufferRatio; then
    echo "[$DATE] [ERROR] unable to read InnoDB buffer pool counters"
    exit 1
fi
echo $(date "+%Y-%m-%d_%H:%M:%S") $Innodb_buffer_read_hit_ratio

# Only the integer part of the ratio is compared.
if [ "${Innodb_buffer_read_hit_ratio%.*}" -lt "$min_ratio" ]; then
    echo "[$DATE] [WARNING] buffer命中率低於99!"
    exit 1
else
    echo "[$DATE] [INFO] buffer命中率: $Innodb_buffer_read_hit_ratio"
    exit 0
fi

MySQL鎖表監控告警

#!/bin/bash
# Alerts when InnoDB reports row lock waits in progress
# (Innodb_row_lock_current_waits greater than 0).
# Exit status: 0 = no waits, 1 = waits present or the value could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

a=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N \
    -e "SHOW  STATUS LIKE 'Innodb_row_lock_current_waits'" 2>/dev/null \
    | awk '$1 == "Innodb_row_lock_current_waits" { print $2 }')

# An empty or non-numeric value means the query failed; do not report
# "normal" in that case.
if ! [[ $a =~ ^[0-9]+$ ]]; then
    echo "[$DATE] [ERROR] unable to read Innodb_row_lock_current_waits"
    exit 1
fi

if [ "$a" -gt 0 ]; then
    echo "[$DATE] [WARNING] 出現鎖表!!"
    exit 1
else
    echo "[$DATE] [INFO] 鎖表檢查正常。"
    exit 0
fi

QPS大於10000告警

#!/bin/bash
# Alerts when the average QPS (Questions / Uptime, i.e. the average since
# server start) is greater than 10000.
# Exit status: 0 = normal, 1 = alert or the counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
max_qps=10000
Ratio=0

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

# Reads Uptime and Questions from a single "mysqladmin status" call so both
# numbers come from the same snapshot. In that output the 2nd field is the
# uptime in seconds and the 6th field is the question count.
QPS() {
 read -r Uptime Questions < <("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" status 2>/dev/null \
   | awk '{ print $2, $6 }')
}
QPS

# Uptime is the divisor and therefore must be a non-zero integer.
if ! [[ $Questions =~ ^[0-9]+$ && $Uptime =~ ^[1-9][0-9]*$ ]]; then
    echo "[$DATE] [ERROR] unable to read Uptime/Questions from mysqladmin status"
    exit 1
fi

Ratio=$(awk -v questions="$Questions" -v uptime="$Uptime" 'BEGIN { printf "%.2f\n", questions / uptime }')

# Only the integer part is compared.
if [ "${Ratio%.*}" -gt "$max_qps" ]; then
    echo "[$DATE] [WARNING] QPS大於10000!"
    exit 1
else
    echo "[$DATE] [INFO] 當前QPS為: $Ratio"
    exit 0
fi

TPS大於4000告警

#!/bin/bash
# Alerts when the average TPS ((Com_commit + Com_rollback) / Uptime, i.e.
# the average since server start) is greater than 4000.
# Exit status: 0 = normal, 1 = alert or the counters could not be read.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
max_tps=4000

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

Uptime=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" status 2>/dev/null | awk '{ print $2 }')

# Reads Com_rollback and Com_commit from a single extended-status call.
# Rows look like "| Com_commit | 123 |", so the name is field 2 and the
# value is field 4; the name is matched exactly.
TPS() {
 local ext_status
 ext_status=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" extended-status 2>/dev/null)
 rollback=$(awk '$2 == "Com_rollback" { print $4 }' <<<"$ext_status")
 commit=$(awk '$2 == "Com_commit" { print $4 }' <<<"$ext_status")
}
TPS

# Uptime is the divisor and therefore must be a non-zero integer.
if ! [[ $rollback =~ ^[0-9]+$ && $commit =~ ^[0-9]+$ && $Uptime =~ ^[1-9][0-9]*$ ]]; then
    echo "[$DATE] [ERROR] unable to read Com_commit/Com_rollback/Uptime from mysqladmin"
    exit 1
fi

TpsRatio=$(awk -v trx="$(( rollback + commit ))" -v uptime="$Uptime" 'BEGIN { printf "%.2f\n", trx / uptime }')

# Only the integer part is compared.
if [ "${TpsRatio%.*}" -gt "$max_tps" ]; then
    echo "[$DATE] [WARNING] TPS大於4000!"
    exit 1
else
    echo "[$DATE] [INFO] 當前TPS為: $TpsRatio"
    exit 0
fi

haproxy後端狀態監測

#!/bin/bash
# Counts the haproxy status-page lines that mention one of the listed
# backends and contain DOWN. Exits 0 only when that count is exactly 2,
# otherwise exits 1.
down_count=$(curl -s  http://admin:admin@10.172.95.97:2000/status \
  | grep -E 'order01|base|cust|idservice|irsc|sec|upc|ewe|ftpgw ' \
  | grep DOWN \
  | wc -l)

[[ $down_count = 2 ]] && exit 0
exit 1

JAVA記憶體溢位告警-OutOfMemory

#!/bin/bash
# Alerts when java.lang.OutOfMemoryError appears in the last 1000 lines of
# today's service log.
# Exit status: 0 = not found, 1 = found.

# $(...) is used for the date: nested backticks would end the outer
# command substitution early.
log_file="/apps/logs/svc/svc-node01-order01-$(date +%m%d).log"

# grep -c prints the number of matching lines (0 when there are none or
# when the log file does not exist).
result=$(tail -n 1000 "$log_file" 2>/dev/null | grep -cF 'java.lang.OutOfMemoryError')

if [[ $result -eq 0 ]]; then
   exit 0
else
   exit 1
fi

redis叢集狀態異常告警

#!/bin/bash
# Redis cluster state check: exits 0 only when exactly one "cluster_state"
# line of CLUSTER INFO contains "ok", otherwise exits 1.
state_ok_lines=$(/apps/svr/ids/redis-cli  -c -p 11001  -h 127.0.0.1 cluster info \
  | grep cluster_state \
  | grep ok \
  | wc -l)

case $state_ok_lines in
  1) exit 0 ;;
  *) exit 1 ;;
esac

redis節點狀態監測告警

#!/bin/bash
# Redis cluster node count check: exits 0 when CLUSTER INFO reports
# cluster_known_nodes equal to 6, otherwise exits 1.
expected_nodes=6

# redis-cli prints CLUSTER INFO with CRLF line endings; the carriage return
# must be removed or the value would be "6\r" and never equal 6.
result=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F: '$1 == "cluster_known_nodes" { print $2 }')

if [[ $result == "$expected_nodes" ]]; then
   exit 0
else
   exit 1
fi

redis_slot異常告警

#!/bin/bash
# Redis slot check: exits 0 when CLUSTER INFO reports cluster_slots_ok
# equal to 16384, otherwise exits 1.
expected_slots=16384

# redis-cli prints CLUSTER INFO with CRLF line endings; the carriage return
# must be removed or the value would be "16384\r" and never equal 16384.
result=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F: '$1 == "cluster_slots_ok" { print $2 }')

if [[ $result == "$expected_slots" ]]; then
   exit 0
else
   exit 1
fi

keepalivedVIP連通性監控告警

#!/bin/sh
# Pings every IPv4 address found in keepalived.conf (excluding lines that
# contain "real", "smtp_server", 127.0.0.1 or "#") and exits 1 when any of
# them shows packet loss, 0 otherwise.
conf=/etc/keepalived/keepalived.conf

# sort -u (not -nu): with -n, addresses sharing the same leading number
# (e.g. 10.172.95.1 and 10.172.95.2) compare equal and -u would drop them.
VIPS=$(grep -v real "$conf" | grep -v smtp_server | grep -v 127.0.0.1 | grep -v '#' \
  | grep -E -o "\b([0-9]{1,3}\.){3}[0-9]{1,3}\b" | sort -u)

num=0    # number of addresses that failed the ping test
for ips in $VIPS
do
        # Healthy only when ping reports exactly "0% packet loss"; the leading
        # space keeps "100% packet loss" from matching.
        if ! ping -w 2 -c 3 "${ips}" 2>/dev/null | grep -q ' 0% packet loss'; then
              # POSIX arithmetic; "let" is not available in every /bin/sh.
              num=$((num + 1))
        fi
done

if [ "$num" -eq 0 ]; then
  exit 0
else
  exit 1
fi

keepalivedVIP丟失告警

#!/bin/bash
# Detects a change in the number of IPv4 addresses on this host compared
# with the count recorded in /tmp/check_vip.log.
#   - First run (no record file): record the current count.
#   - Count unchanged: print "vip ok" and exit 0.
#   - Count changed: record the new count and exit 1.
state_file=/tmp/check_vip.log

# Prints the number of "inet" lines, excluding 127.0.0.1 and inet6 lines.
count_ipv4() {
  ip a |grep inet|grep -v 127.0.0.1|grep -v inet6 |wc -l
}

ip_count=$(count_ipv4)

if [ -f "$state_file" ]; then
  vip_count=$(sudo cat "$state_file")
  if [ "$ip_count" != "$vip_count" ]; then
    echo $ip_count > "$state_file"
    exit 1
  fi
  echo "vip ok"
  exit 0
else
  count_ipv4 > "$state_file"
fi

keepalived腦裂預警

這裡解釋一下:由於腦裂的驗證需要結合多個節點的情況,而監控指令碼應儘量不要做得過於複雜,因此這裡只在keepalived備節點上做監控,只要發現VIP切換到備節點即發出告警,再由人工介入檢查是否出現腦裂。

#!/bin/bash
# Split-brain early warning for a keepalived backup node: alerts when any
# IPv4 address with a /32 prefix is configured on this host.
# Exit status: 0 = no /32 address, 1 = at least one /32 address present.

# Match only "inet <address>/32"; a plain "grep 32" would also hit
# addresses, MAC addresses or counters that merely contain "32".
result=$(ip -4 -o addr show | grep -cE 'inet [0-9.]+/32( |$)')

if [[ $result -eq 0 ]]; then
   exit 0
else
   echo "keepalived從節點出現32位的vip,可能出現腦裂現象"
   exit 1
fi

rocketmq叢集節點數量監控告警

#!/bin/bash
# Alerts when the RocketMQ cluster has fewer than 4 nodes.
# Exit status: 0 = at least 4 nodes, 1 = fewer than 4 nodes.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
declare -x JAVA_HOME="/apps/tools/jdk"    # exported so that "sudo -E" passes it on
min_nodes=4

# Counts the clusterList output lines that do not contain "Version"
# (presumably this drops the header row - confirm against real output).
NumCluster=$(sudo -E /apps/svr/mqbroker/rocketmq/bin/mqadmin clusterList -n 127.0.0.1:9876 | grep -v Version | wc -l)

# The warning says "fewer than 4", so the comparison must be -lt, not -gt.
if [ "$NumCluster" -lt "$min_nodes" ]; then
    echo "[$DATE] [WARNING] rocketMQ叢集節點小於4個!"
    exit 1
else
    echo "[$DATE] [INFO] rocketMQ叢集節點數量為: $NumCluster"
    exit 0
fi

rocketmq訊息數量異常告警

#!/bin/bash
# Counts the RocketMQ topics whose name contains both "Blue" and "CIDC"
# but not "RETRY"; exits 1 when there are fewer than 50, 0 otherwise.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
export JAVA_HOME=/apps/tools/jdk
export JAVA_BIN=/apps/tools/jdk/bin
mqadmin=/apps/svr/mqbroker/rocketmq/bin/mqadmin

NumTopic=$(sudo -E "$mqadmin" topicList -n 127.0.0.1:9876 2>/dev/null \
  | grep Blue \
  | grep CIDC \
  | grep -v RETRY \
  | wc -l)

if (( NumTopic >= 50 )); then
    echo "[$DATE] [INFO] rocketMQ訊息topic數量當前為: $NumTopic"
    exit 0
fi

echo "[$DATE] [WARNING] rocketMQ訊息主題小於50個!"
exit 1

logstash日誌報錯告警

#!/bin/bash
# Exits 1 when any of the last 2000 lines of the logstash log contains
# "error", 0 otherwise.
error_lines=$(tail -2000 /apps/logs/logstash/logstash-plain.log | grep 'error' | wc -l)

[[ $error_lines = 0 ]] || exit 1
exit 0

elasticsearch叢集個數異常告警

#!/bin/bash
# Elasticsearch node count check: exits 0 when the cluster health API
# reports number_of_nodes equal to 4, otherwise exits 1.
expected_nodes=4

# Extract only the digits of the value. Splitting the pretty-printed line
# ("number_of_nodes" : 4,) on ":" leaves a leading space, which makes a
# string comparison with 4 fail every time.
# The URL is quoted so that "?" is not treated as a glob character.
result=$(curl -s "http://10.172.95.1:9201/_cluster/health?pretty" \
  | sed -n 's/.*"number_of_nodes"[^0-9]*\([0-9][0-9]*\).*/\1/p')

if [[ $result == "$expected_nodes" ]]; then
   exit 0
else
   exit 1
fi

elasticsearch資料節點個數異常告警

#!/bin/bash
# Elasticsearch data node count check: exits 0 when the cluster health API
# reports number_of_data_nodes equal to 4, otherwise exits 1.
expected_data_nodes=4

# Extract only the digits of the value. Splitting the pretty-printed line
# ("number_of_data_nodes" : 4,) on ":" leaves a leading space, which makes
# a string comparison with 4 fail every time.
# The URL is quoted so that "?" is not treated as a glob character.
result=$(curl -s "http://10.172.95.1:9201/_cluster/health?pretty" \
  | sed -n 's/.*"number_of_data_nodes"[^0-9]*\([0-9][0-9]*\).*/\1/p')

if [[ $result == "$expected_data_nodes" ]]; then
   exit 0
else
   exit 1
fi

elasticsearch_java記憶體使用已超過48G

#!/bin/bash
# Alerts when the JVM heap used by the Elasticsearch cluster
# (heap_used_in_bytes from the cluster stats API) exceeds 48G.
# Exit status: 0 = within the limit, 1 = over the limit or the value could
# not be read.
heap_limit=51539607552    # 48G in bytes

# Extract only the digits of the first heap_used_in_bytes value.
# The URL is quoted so that "?" is not treated as a glob character.
result=$(curl -s "http://10.172.95.1:9201/_cluster/stats?pretty" \
  | sed -n 's/.*"heap_used_in_bytes"[^0-9]*\([0-9][0-9]*\).*/\1/p' \
  | head -n 1)

# No numeric value means the API could not be queried.
if ! [[ $result =~ ^[0-9]+$ ]]; then
   exit 1
fi

# (( )) compares as integers; ">" inside [[ ]] compares as strings.
# Being over the limit is the abnormal case, so it must exit 1.
if (( result > heap_limit )); then
   exit 1
else
   exit 0
fi

zookeeper日誌告警

#!/bin/bash
# Exits 1 when any of the last 2000 lines of the monitored log contains
# "error", 0 otherwise.
#
# NOTE(review): the default path below is the logstash log, not a
# ZooKeeper log - this looks like a copy-paste leftover. Set ZK_LOG_FILE
# to the real ZooKeeper log path; the default is kept only for backward
# compatibility.
log_file=${ZK_LOG_FILE:-/apps/logs/logstash/logstash-plain.log}

result=$(tail -n 2000 "$log_file" | grep -c 'error')

if [[ $result -eq 0 ]]; then
   exit 0
else
   exit 1
fi

zookeeper叢集follower-mode變更告警

#!/bin/bash
# Exits 1 when the output of "zkServer.sh status" contains at least one
# line with "follower", 0 when it contains none.
follower_lines=$(/apps/sh/zk/zkServer.sh status | grep follower | wc -l)

if [[ $follower_lines != 0 ]]; then
   exit 1
fi
exit 0

zookeeper叢集leader-mode變更告警

#!/bin/bash
# Exits 1 when the output of "zkServer.sh status" contains at least one
# line with "leader", 0 when it contains none.
leader_lines=$(/apps/sh/zk/zkServer.sh status | grep leader | wc -l)

case $leader_lines in
  0) exit 0 ;;
  *) exit 1 ;;
esac

api介面層面監控告警

#!/bin/bash
# API-level probe: sends one request to the emop endpoint and exits 0 only
# when exactly one line of the response contains "OK", otherwise exits 1.
endpoint='http://10.172.95.186:8000/emop?appId=600006&method=SYAN_UNHQ_queryOfferStatus&channelTypeId=0&flowdId=202006091314501278181&format=json&status=1%0A'
payload='{  "productType": "vm"}'

# -w "\n" appends a newline after the response body.
ok_lines=$(curl --location --request GET "$endpoint" \
    --header 'Content-Type: text/plain' \
    --data "$payload" \
    -w "\n" \
  | grep OK \
  | wc -l)

[[ $ok_lines = 1 ]] && exit 0
exit 1

相關文章