MySQL 監控系統

wenaini發表於2009-02-05
Oracle 的監控工具,市面上非常多,最著名的應該是 Quest 的產品 Spotlight,基本上能夠做到的全做到了。但是針對 MySQL 的監控產品還是非常少:Quest 的 Spotlight on MySQL 對 5 以下的版本和 MyISAM 引擎的監控能力非常有限;另外一個比較好的產品是 Oracle Grid Control 上的一個 MySQL 控制元件,是由民間開發的,功能很強,也通過了 Oracle 的認證,但麻煩的是,它要求 Grid Control 10.2.0.3、MySQL 4.0.27 以上,那麼對於一些比較老的版本,或者是部署 Grid Agent 有問題的環境就比較吃力了。大部分 DBA 想的辦法都是寫指令碼,這裡就公佈一個指令碼,基本上可以滿足絕大部分的 MySQL 和 OS 監控要求。

# ---------------------------------------------------------------------------
# Monitor configuration. Every *_flag is a switch: 1 turns the corresponding
# check on, any other value turns it off.
# ---------------------------------------------------------------------------

# Thresholds
disk_pct=80          # usage percentage above which a filesystem is reported
process_limit=300    # processlist size above which connections are reported

# MySQL locations
data_dir=/usr/local/mysql/data
errlog=/usr/local/mysql/data/$(hostname).err
# awk regular expression of .MYD files that are allowed to exceed 3Gb
excludfile="(v3_buylog_080508.MYD|v3_buylog.MYD)"

# Backup / NAS / log locations
nasdir=/backup
backup_dirlist=/root/bklist.lst
bkinput_file=/root/bk.tmp
log_dir=/usr/checklog

# Switches
nas_flag=0
myserver_flag=1
conn_flag=1
errlog_flag=1
rep_flag=0
bigfile_flag=1
exclud_flag=1
bk_flag=0
diskspace_flag=1
delay_flag=0
catchsql_flag=0

# Date stamps (GNU date): yesterday in the three backup naming schemes, and
# the day whose monitor log is due for purging.
bkday=$(date -d "1 days ago" +%y%m%d)
sqlday=$(date -d "1 days ago" +%Y%m%d)
tgzday=$(date -d "1 days ago" +%Y%m%d)
logday=$(date -d "10 days ago" +%Y%m%d)
# Open this run in today's monitor log: a five-row banner of '#' followed by
# the start marker.
banner_row=0
while [ "$banner_row" -lt 5 ]; do
  echo "#############################################################################################################" >>"$log_dir/$(date +%Y%m%d).log"
  banner_row=$((banner_row + 1))
done
echo "--------time $(date +%k:%M:%S) monitor start-----------------------------------" >>"$log_dir/$(date +%Y%m%d).log"

#purge old logs
# Only the single log named after $logday is removed on each run.
if [ -f "$log_dir/$logday.log" ]; then
  rm "$log_dir/$logday.log"
fi
#check nas
# Logs whether the filesystem holding $nasdir is fuller than $disk_pct percent.
# Runs only when $nas_flag is 1.
#
# Fixes over the published version:
#   * the loop had lost its input redirection and its "done" was fused with
#     "else" (syntax error);
#   * "df -h" wraps long device names onto two lines, which is why the old
#     code read the 4th field of the last line; "df -P" always prints one
#     line per filesystem with the capacity in column 5;
#   * no scratch file under /root is needed any more.
if [ "$nas_flag" -eq 1 ]
then
  echo "$(date +%k:%M:%S) nas monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  # Line 1 of the df output is the header, line 2 the filesystem of $nasdir.
  nas_pct=$(df -P "$nasdir" | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  case "$nas_pct" in
    '' | *[!0-9]*)
      # df failed or printed something unexpected: report it instead of
      # silently logging nothing.
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      echo "$(date +%k:%M:%S) Error:cannot read nas usage" >>"$log_dir/$(date +%Y%m%d).log"
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      ;;
    *)
      if [ "$nas_pct" -gt "$disk_pct" ]
      then
        echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
        echo "$(date +%k:%M:%S) Error:nas out of space" >>"$log_dir/$(date +%Y%m%d).log"
        echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      else
        echo "$(date +%k:%M:%S) nas space is ok" >>"$log_dir/$(date +%Y%m%d).log"
      fi
      ;;
  esac
else
  echo "$(date +%k:%M:%S) nas monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi

#Check mysql process
# Pings the server through mysqladmin and records in today's monitor log
# whether it answered. Skipped unless $myserver_flag is 1.
if [ "$myserver_flag" -eq 1 ]; then
  echo "$(date +%k:%M:%S) mysqld process monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  # Number of mysqladmin output lines that contain "alive".
  live_flag=$(mysqladmin ping | grep -c alive)
  if [ "$live_flag" -eq 1 ]; then
    echo "$(date +%k:%M:%S) Mysql is alive" >>"$log_dir/$(date +%Y%m%d).log"
  else
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    echo "$(date +%k:%M:%S) Error:Mysql is dead" >>"$log_dir/$(date +%Y%m%d).log"
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
  fi
else
  echo "$(date +%k:%M:%S) mysqld process monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi
#Check mysql status
# Compares the number of rows in "show processlist" with $process_limit and,
# when the limit is exceeded, dumps the full processlist into the log.
# Skipped unless $conn_flag is 1.
#
# Fix: the client prints a column-header row in batch mode, so piping the
# plain output to "wc -l" reported one connection too many. -N
# (--skip-column-names) suppresses that row. The count still includes the
# connection opened by this check itself.
if [ "$conn_flag" -eq 1 ]
then
  echo "$(date +%k:%M:%S) mysql connection monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  prscnt=$(mysql -N -e "show processlist" | wc -l)
  if [ "$prscnt" -gt "$process_limit" ]
  then
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    echo "$(date +%k:%M:%S) Error:There is to many connections,the number is $prscnt,pls check mysql status" >>"$log_dir/$(date +%Y%m%d).log"
    mysql -e "show full processlist" >>"$log_dir/$(date +%Y%m%d).log"
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
  else
    echo "$(date +%k:%M:%S) current connections of mysql:$prscnt" >>"$log_dir/$(date +%Y%m%d).log"
  fi
else
  echo "$(date +%k:%M:%S) mysql connection monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi
#Check mysql errorlog
# Scans $errlog for lines containing "err" (case-insensitive) and copies them
# into the monitor log under an "Error:" heading. Skipped unless $errlog_flag
# is 1. When $delay_flag is 1, lines containing
# "Error reading relay log event" are ignored.
#
# Fixes over the published version:
#   * the alarm was raised on "err" but the dump used "error", so an alarm
#     could be logged with no supporting lines; the dump now shows exactly
#     the lines that raised the alarm;
#   * with $delay_flag=1 the dump still contained the relay-log lines that
#     had been excluded from the count;
#   * the two near-identical branches are merged. Any $delay_flag value other
#     than 1 is now treated like 0 (previously such values skipped the check).
if [ "$errlog_flag" -eq 1 ]
then
  echo "$(date +%k:%M:%S) mysql alert log monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  if [ -f "$errlog" ]
  then
    if [ "$delay_flag" -eq 1 ]
    then
      err_lines=$(grep -i err "$errlog" | grep -v 'Error reading relay log event')
    else
      err_lines=$(grep -i err "$errlog")
    fi
    if [ -n "$err_lines" ]
    then
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      echo "$(date +%k:%M:%S) Error:There is some error with mysql,pls check log" >>"$log_dir/$(date +%Y%m%d).log"
      printf '%s\n' "$err_lines" >>"$log_dir/$(date +%Y%m%d).log"
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    else
      echo "$(date +%k:%M:%S) mysql is running,no mistake in alertlog" >>"$log_dir/$(date +%Y%m%d).log"
    fi
  else
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    echo "$(date +%k:%M:%S) Error:no errorlog" >>"$log_dir/$(date +%Y%m%d).log"
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
  fi
else
  echo "$(date +%k:%M:%S) mysql alert log monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi
#Replication Check
# Looks for "Slave_running ... ON" in mysqladmin extended-status and logs the
# result. The check runs only when $delay_flag is 0 and $rep_flag is 1; the
# reason for skipping is logged otherwise.
if [ "$delay_flag" -eq 0 ]; then
  echo "$(date +%k:%M:%S) not delay replication, so slave monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  if [ "$rep_flag" -eq 1 ]; then
    echo "$(date +%k:%M:%S) slave monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
    # Count of Slave_running status lines that also contain "ON".
    slave_flag=$(/usr/local/mysql/bin/mysqladmin extended-status | grep Slave_running | grep -c ON)
    if [ "$slave_flag" -eq 0 ]; then
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      echo "$(date +%k:%M:%S) Error:replication error" >>"$log_dir/$(date +%Y%m%d).log"
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    else
      echo "$(date +%k:%M:%S) replication running" >>"$log_dir/$(date +%Y%m%d).log"
    fi
  else
    echo "$(date +%k:%M:%S) slave monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
  fi
else
  echo "$(date +%k:%M:%S) delay replication, so slave monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi
#Check table which is more than 3Gb
# Reports every file under $data_dir whose name contains "MYD" and whose size
# is at least 3221225472 bytes (3Gb). Skipped unless $bigfile_flag is 1.
# Unless $exclud_flag is 0, file names matching the awk regular expression in
# $excludfile are not reported.
#
# Fixes over the published version:
#   * both loops had lost their input redirection and their "done" was fused
#     with "else" (syntax error);
#   * the list came from parsing "ls -lR" and was written to file.txt inside
#     the MySQL data directory after a "cd" that was never undone; GNU find
#     now selects by size directly and nothing is written to $data_dir;
#   * with exclusion on, "file bigger than 3Gb but it is a big table" was
#     logged even when no big file existed at all; that message is now kept
#     for the case where big files exist but all of them are excluded.
if [ "$bigfile_flag" -eq 1 ]
then
  echo "$(date +%k:%M:%S) MYD 3G monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  # "-size +3221225471c" means strictly more than that many bytes, i.e.
  # >= 3221225472. %f prints the file name without its directory.
  big_files=$(find "$data_dir" -type f -name '*MYD*' -size +3221225471c -printf '%f\n')
  if [ "$exclud_flag" -eq 0 ]
  then
    report_files=$big_files
  else
    report_files=$(printf '%s\n' "$big_files" | awk -v skip="$excludfile" '$0 != "" && $0 !~ skip')
  fi
  if [ -n "$report_files" ]
  then
    printf '%s\n' "$report_files" | while IFS= read -r big_file
    do
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      echo "$(date +%k:%M:%S) Error:datafile $big_file is bigger than 3Gb" >>"$log_dir/$(date +%Y%m%d).log"
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    done
  elif [ -n "$big_files" ]
  then
    # Big files exist, but every one of them is on the exclusion list.
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    echo "$(date +%k:%M:%S) file bigger than 3Gb but it is a big table" >>"$log_dir/$(date +%Y%m%d).log"
    echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
  else
    echo "$(date +%k:%M:%S) no datafile bigger than 3Gb" >>"$log_dir/$(date +%Y%m%d).log"
  fi
else
  echo "$(date +%k:%M:%S) MYD 3G monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi
#Check backup
# Verifies yesterday's backup for every entry in $backup_dirlist. Skipped
# unless $bk_flag is 1. Each non-comment line of the list holds three
# whitespace-separated fields:
#     <dbname> <backup directory> <type>
# and the type selects what is expected inside the backup directory:
#     1      directory  data$bkday
#     0      file       <dbname>_$sqlday.sql
#     other  file       db.$tgzday.tgz
# A backup that is missing, or smaller than 1000 KB according to du, is
# logged as an error.
#
# Fixes over the published version:
#   * the loop had lost its input redirection and its "done" was fused with
#     "else" (syntax error);
#   * the list was filtered into $bkinput_file even when the monitor was off,
#     which fails if the list does not exist;
#   * type 0 measured "du *$sqlday*", which breaks the numeric test as soon as
#     more than one file matches; the file that was tested with -f is now the
#     one that is measured;
#   * paths are built from $bk_dir instead of relying on an unchecked "cd";
#   * blank lines in the list are skipped instead of being treated as a tgz
#     backup with an empty name.
if [ "$bk_flag" -eq 1 ]
then
  echo "$(date +%k:%M:%S) backup monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  # Any line containing '#' is treated as a comment.
  grep -v "#" "$backup_dirlist" >"$bkinput_file"
  while read -r dbname bk_dir bk_type bk_extra
  do
    [ -n "$dbname" ] || continue
    case "$bk_type" in
      1) bk_path="$bk_dir/data$bkday";           bk_test=-d; bk_what=directory ;;
      0) bk_path="$bk_dir/${dbname}_$sqlday.sql"; bk_test=-f; bk_what=file ;;
      *) bk_path="$bk_dir/db.$tgzday.tgz";       bk_test=-f; bk_what=file ;;
    esac
    if [ "$bk_test" "$bk_path" ]
    then
      # -s: one total line; -k: size in 1 KB units regardless of environment.
      bk_size=$(du -sk "$bk_path" | awk '{print $1}')
      if [ "${bk_size:-0}" -lt 1000 ]
      then
        echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
        echo "$(date +%k:%M:%S) Error:backup of $dbname is too small" >>"$log_dir/$(date +%Y%m%d).log"
        echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      else
        echo "$(date +%k:%M:%S) backup of $dbname is ok" >>"$log_dir/$(date +%Y%m%d).log"
      fi
    else
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      echo "$(date +%k:%M:%S) Error:no $bk_what,backup of $dbname error" >>"$log_dir/$(date +%Y%m%d).log"
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    fi
  done <"$bkinput_file"
else
  echo "$(date +%k:%M:%S) backup monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi
#Check datadir partition
# Logs, for every mounted filesystem, whether its usage exceeds $disk_pct
# percent. Skipped unless $diskspace_flag is 1. When $nas_flag is 1 the
# filesystem that holds $nasdir is left out here, since the NAS check
# compares that same filesystem against the same threshold.
#
# Fixes over the published version:
#   * the loop had lost its input redirection and its "done" was fused with
#     "else" (syntax error);
#   * "df -h" wraps long device names onto two lines, shifting the columns so
#     that empty or non-numeric values reached the numeric test; "df -P"
#     prints exactly one line per filesystem;
#   * the NAS entry was excluded by dropping the last two df lines, which only
#     works if it is listed last and wrapped; it is now excluded by mount
#     point;
#   * no scratch file under /root is needed any more.
if [ "$diskspace_flag" -eq 1 ]
then
  echo "$(date +%k:%M:%S) disk space monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  nas_mount=
  if [ "$nas_flag" -eq 1 ]
  then
    # Column 6 of "df -P" is the mount point.
    nas_mount=$(df -P "$nasdir" | awk 'NR == 2 { print $6 }')
  fi
  # Skip the header, skip the NAS mount, strip '%', and pass on only values
  # that are plain integers (some pseudo filesystems print '-').
  df -P |
    awk -v skip="$nas_mount" 'NR > 1 && $6 != skip { sub(/%/, "", $5); if ($5 ~ /^[0-9]+$/) print $5 }' |
    while read -r dsk
    do
      if [ "$dsk" -gt "$disk_pct" ]
      then
        echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
        echo "$(date +%k:%M:%S) Error:some disk partition out of space" >>"$log_dir/$(date +%Y%m%d).log"
        echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      else
        echo "$(date +%k:%M:%S) disk space is ok" >>"$log_dir/$(date +%Y%m%d).log"
      fi
    done
else
  echo "$(date +%k:%M:%S) disk space monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi

#check catchsql daemon
# Restarts /root/crontab_scripts/catchsql.sh in the background when no process
# with "catchsql.sh" in its command line is found. Skipped unless
# $catchsql_flag is 1. A missing script is logged as an error.
#
# Fixes over the published version:
#   * "ps | grep catchsql.sh | wc -l" equal to 1 was taken to mean "only the
#     grep itself matched". Whether that grep shows up in the ps snapshot is a
#     race; when it does not, the count is 0 and a dead daemon is never
#     restarted. The bracket pattern below can never match the grep process,
#     so "not running" is simply a count of 0;
#   * the restart ran after an unchecked "cd" that also changed the working
#     directory for the rest of the script; it now runs in a subshell and
#     only if the cd succeeded.
if [ "$catchsql_flag" -eq 1 ]
then
  echo "$(date +%k:%M:%S) catchsql daemon monitor is on" >>"$log_dir/$(date +%Y%m%d).log"
  runflag=$(ps -ef | grep -c '[c]atchsql\.sh')
  if [ "$runflag" -eq 0 ]
  then
    if [ -f /root/crontab_scripts/catchsql.sh ]
    then
      # Started from its own directory, as before, with stdout discarded.
      ( cd /root/crontab_scripts && /usr/bin/nohup ./catchsql.sh >/dev/null & )
    else
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
      echo "$(date +%k:%M:%S) Error: no catchsql script" >>"$log_dir/$(date +%Y%m%d).log"
      echo "-------------------------------------------------------------------------------" >>"$log_dir/$(date +%Y%m%d).log"
    fi
  fi
else
  echo "$(date +%k:%M:%S) catchsql daemon monitor is off" >>"$log_dir/$(date +%Y%m%d).log"
fi

# Close this run in today's monitor log, then mail the log: an error report if
# the log contains the word "Error", an all-clear otherwise.
#
# Fixes over the published version:
#   * both mail commands had lost the redirection that feeds them the log as
#     the message body; the leftover tokens ("echo", "fi") ended up on the
#     mail command lines, so the "if" was never closed;
#   * $mailadd was never assigned anywhere in the script. It is now taken
#     from the environment and falls back to the local root mailbox.
#     NOTE(review): set mailadd to the real recipient list.
#   * the log path was hard-coded although $log_dir is used everywhere else.
echo "--------time $(date +%k:%M:%S) monitor end-----------------------------------" >>"$log_dir/$(date +%Y%m%d).log"

log=$log_dir/$(date +%Y%m%d).log
mailadd=${mailadd:-root}
# Addresses of eth0/eth1 as printed by the legacy "inet addr:" ifconfig format.
HOST_IP="$(/sbin/ifconfig eth0 | grep 'inet addr' | awk '{print $2}' | sed -e 's/.*://')"
HOST_IP2="$(/sbin/ifconfig eth1 | grep 'inet addr' | awk '{print $2}' | sed -e 's/.*://')"
errorct=$(grep -c Error "$log")
# grep prints no count when the log cannot be read; treat that as zero.
if [ "${errorct:-0}" -gt 0 ]
then
  mail -s "Error Report~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~,machine $(hostname) has some error with it,the ip is $HOST_IP and $HOST_IP2" "$mailadd" <"$log"
else
  echo "machine $(hostname) is ok,IP is $HOST_IP and $HOST_IP2" >>"$log"
  mail -s "Server $HOST_IP $HOST_IP2 is OK" "$mailadd" <"$log"
fi
#########################################

#clear_log.sh
# Resets the monitoring state and re-runs the monitor: empties today's monitor
# log, copies the MySQL error log aside and empties it, runs monitor.sh, then
# prints whether the fresh monitor log contains the word "Error".
monitorlog="/usr/checklog/$(date +%Y%m%d).log"
errlog="/usr/local/mysql/data/$(hostname).err"
old_errlog="/usr/local/mysql/data/$(hostname).err.olg"

: >"$monitorlog"
# Keep a copy of the current error log (overwriting any earlier copy) before
# truncating it.
cat "$errlog" >"$old_errlog"
: >"$errlog"

/root/crontab_scripts/monitor.sh

errorct=$(grep -c Error "$monitorlog")
if [ "$errorct" -gt 0 ]; then
  echo "there is still so mistake"
else
  echo "system is ok"
fi

來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/79686/viewspace-1016767/,如需轉載,請註明出處,否則將追究法律責任。

相關文章