Nagios監控mongodb分片叢集服務實戰
Mongodb外掛下載地址為:git clone https://github.com/mzupan/nagios-plugin-mongodb.git(GitHub已停用git://協議,請使用https地址),剛開始本人這裡沒有安裝git環境,找網友草根幫忙下載的,之後上傳到了csdn資源頁面,新的下載地址為:
2,新增新的mongodb監控命令
因為mongodb服務是和mysql從庫共用一臺物理機,之前已經做了基礎nagios以及mysql服務監控,所以這裡只需要在原來的基礎上新增mongodb命令和服務即可。Nagios監控mysql請參考:http://blog.itpub.net/26230597/viewspace-760141/ 以及 http://blog.itpub.net/26230597/viewspace-1217246/ 。所以這裡需要新增的mongodb監控命令如下所示:
-
[root@wgq objects]# cd /usr/local/nagios/etc/objects
-
[root@wgq objects]# vim commands.cfg
-
define command {
-
command_name check_mongodb
-
command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$
-
}
-
-
define command {
-
command_name check_mongodb_database
-
command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -d $ARG5$
-
}
-
-
define command {
-
command_name check_mongodb_collection
-
command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -d $ARG5$ -c $ARG6$
-
}
-
-
define command {
-
command_name check_mongodb_replicaset
-
command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -r $ARG5$
-
}
-
-
define command {
-
command_name check_mongodb_query
-
command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -q $ARG5$
- }
3,新增mongodb監控服務
-
#檢測mongodb服務的連線時間,超過2秒就普通報警,5秒就嚴重報警
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Connect Check
-
check_command check_mongodb!connect!30000!2!5
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
#檢查mongodb的連線數使用率,已用連線超過可用連線總數的70%就普通報警,超過80%就嚴重報警
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Free Connections
-
check_command check_mongodb!connections!27017!70!80
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
-
#檢查mongodb複製延遲(replication lag),延遲超過15秒就普通報警,超過30秒就嚴重報警,確保primary和secondary的資料時間是一致的。
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Replication Lag
-
check_command check_mongodb!replication_lag!27017!15!30
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
#檢查mongodb記憶體使用率,閾值與mongodb所在機器的總記憶體數相關
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Memory Usage
-
check_command check_mongodb!memory!27017!20!28
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
#檢查mongodb Mapped的記憶體使用率,閾值與mongodb所在機器的總記憶體數相關
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Mapped Memory Usage
-
check_command check_mongodb!memory_mapped!27017!20!28
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
#檢查Lock Time的百分率,如果lock time佔據mongo執行時間的5%就普通報警,如果超過10%就嚴重報警
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Lock Percentage
-
check_command check_mongodb!lock!27017!5!10
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check Average Flush Time,檢查mongo伺服器的平均flush時間,如果超過100ms就普通報警,超過200ms就嚴重報警
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Flush Average
-
check_command check_mongodb!flushing!27017!100!200
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check Last Flush Time,檢查最新的flush時間,如果超過200ms就普通報警,超過400ms就嚴重報警
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Last Flush Time
-
check_command check_mongodb!last_flush_time!27017!200!400
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check status of mongodb replicaset,檢查mongo複製的狀態
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB state
-
check_command check_mongodb!replset_state!27017!0!0
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check status of index miss ratio,檢查索引未命中率,如果超過0.005就普通報警,超過0.01就嚴重報警
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Index Miss Ratio
-
check_command check_mongodb!index_miss_ratio!27017!.005!.01
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check number of databases and number of collections
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Number of databases
-
check_command check_mongodb!databases!27017!300!500
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Number of collections
-
check_command check_mongodb!collections!27017!300!500
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check size of a database,檢查庫的大小
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Database size your-database
-
check_command check_mongodb_database!database_size!27017!300!500!your-database
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check index size of a database,檢查庫索引的大小
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Database index size your-database
-
check_command check_mongodb_database!database_indexes!27017!50!100!your-database
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check index size of a collection,檢查集合collection的索引大小
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Database index size your-database
-
check_command check_mongodb_collection!collection_indexes!27017!50!100!your-database!your-collection
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check the primary server of replicaset,檢查複製的primary服務
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Replicaset Master Monitor: your-replicaset
-
check_command check_mongodb_replicaset!replica_primary!27017!0!1!your-replicaset
-
#示例:check_command check_mongodb_replicaset!replica_primary!27017!0!1!shard2
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
-
# Check the number of queries per second,檢查每一秒的查詢數量
-
define service{
-
host_name dbm1slave1
-
service_description MongoDB Updates per Second
-
check_command check_mongodb_query!queries_per_second!27017!200!150!update
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check Primary Connection,檢查複製中與primary庫的連線時間,超過2秒就普通報警,超過4秒就嚴重報警
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Connect Check
-
check_command check_mongodb!connect_primary!27017!2!4
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
-
}
-
-
# Check Collection State,檢查collection狀態,檢查mongo服務組列表的每一個主機,可以檢查重要collection的高可用性(鎖、超時、服務配置的可用性),如果發現一個查詢失敗就會報警。
-
define service{
-
host_name dbm1slave1
-
service_description Mongo Collection State
-
check_command check_mongodb!collection_state!27017!your-database!your-collection
-
max_check_attempts 5
-
normal_check_interval 3
-
retry_check_interval 2
-
check_period 24x7
-
notification_interval 10
-
notification_period 24x7
-
notification_options w,u,c,r
-
contact_groups ops
- }
4,檢視部分監控項效果
配置完nagios端服務,重啟下service nagios restart; 等上幾分鐘,nagios監控介面就會出現完整的mongo服務資訊,如下所示:
5,從ps中確定mongodb的架構
[root@db-m1-slave-1 ~]# ps -eaf|grep mongo
mongodb 2457 1 0 2013 ? 2-03:39:08 ./mongod --configsvr --dbpath /home/data/mongodb/config --port 20000 --logpath /home/data/mongodb/config.log --logappend --fork
mongodb 2804 1 0 2013 ? 1-10:02:33 mongos --configdb 192.168.12.62:20000,192.168.12.63:20000,192.168.12.72:20000 --port 30000 --chunkSize 64 --logpath /home/data/mongodb/mongos.log --logappend --fork
mongodb 3072 1 0 2013 ? 1-10:17:20 mongod --shardsvr --replSet shard1 --port 27017 --dbpath /home/data/mongodb/shard11 --oplogSize 2048 --logpath /home/data/mongodb/shard11.log --logappend --fork
root 11179 9391 0 11:14 pts/1 00:00:00 grep mongo
mongodb 30414 1 0 Feb14 ? 1-06:20:50 mongod --shardsvr --replSet shard2 --port 27018 --dbpath /home/data/mongodb/shard21 --oplogSize 2048 --logpath /home/data/mongodb/shard21.log --logappend --fork
[root@db-m1-slave-1 ~]#
看到有4個mongo程式,
a) 啟動引數有“--configdb”的就是叢集入口程式;
b) Shard Server,啟動引數帶“--shardsvr --replSet”的是叢集分片的一個片組啟動程式,用於儲存實際的資料塊,也就是27017埠和27018埠的mongodb服務例項。至於如何判斷27017埠中哪個是primary哪個是secondary,需要去登入27017埠執行rs.status();去檢視一下。
c) Config Server:啟動引數帶“--configsvr”的程式,儲存了整個Cluster Metadata,其中包括chunk資訊,也就是20000埠的mongodb服務例項。
d) Route Server:啟動引數帶“mongos --configdb”的程式,前端路由,客戶端由此接入,且讓整個叢集看上去像單一資料庫,前端應用可以透明使用,也就是30000埠的mongodb例項。
6,除錯中出現過的錯誤
錯誤1:
[root@wgq nagios ~]# tail -f /usr/local/nagios/var/nagios.log
[1412819956] Warning: Return code of 13 for check of service 'Mongo Memory Usage' on host 'dbm1slave1' was out of bounds.
[1412819956] SERVICE ALERT: dbm1slave1;Mongo Memory Usage;CRITICAL;SOFT;1;(Return code of 13 is out of bounds)
[1412819975] Warning: Return code of 13 for check of service 'Mongodb Connect Check' on host 'dbm1slave1' was out of bounds.
[1412819975] SERVICE ALERT: dbm1slave1;Mongodb Connect Check;CRITICAL;SOFT;1;(Return code of 13 is out of bounds)
[1412820058] Warning: Return code of 13 for check of service 'Mongo Free Connections' on host 'dbm1slave1' was out of bounds.
返回碼13表示許可權不足(Permission denied),需要把外掛檔案的屬主改為nagios使用者,並賦予其讀、寫、執行許可權:
chmod 770 /usr/lib/nagios/plugins/check_mongodb.py
chown -R nagios.nagios /usr/lib/nagios/plugins/check_mongodb.py
錯誤2:
監控介面Status Information一欄出現 No module named pymongo報錯提示資訊:
出現這個提示是因為需要安裝pymongo模組,執行easy_install pymongo命令安裝即可,如下所示:
[root@wgq objects]# easy_install pymongo
Searching for pymongo
Reading
Best match: pymongo 2.7.2
......
zip_safe flag not set; analyzing archive contents...
Adding pymongo 2.7.2 to easy-install.pth file
Installed /usr/lib/python2.6/site-packages/pymongo-2.7.2-py2.6-linux-x86_64.egg
Processing dependencies for pymongo
Finished processing dependencies for pymongo
參考文章:
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/26230597/viewspace-1293589/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- mongodb 分片叢集建立分片集合MongoDB
- MongoDB分片叢集新增分片(自用)MongoDB
- MongoDB 分片叢集搭建MongoDB
- MongoDB叢集之分片MongoDB
- 搭建MongoDB分片叢集MongoDB
- mongodb 分片叢集設定MongoDB
- MongoDB分片叢集常用操作MongoDB
- mongodb副本叢集和分片叢集佈署MongoDB
- Nagios監控lvs服務iOS
- MongoDB分片叢集chunk的概念MongoDB
- 高可用mongodb叢集(分片+副本)MongoDB
- MongoDB分片儲存的叢集架構實現MongoDB架構
- MongoDB Sharding(二) -- 搭建分片叢集MongoDB
- Mongodb分散式叢集副本集+分片MongoDB分散式
- mongodb的分散式叢集(3、分片)MongoDB分散式
- 搭建高可用MongoDB叢集(四):分片MongoDB
- linux下Mongodb叢集搭建:分片+副本集LinuxMongoDB
- 【MongoDB】分片(sharding)+副本集(replSet)叢集搭建MongoDB
- Mongodb副本集+分片叢集環境部署記錄MongoDB
- MongoDB健壯叢集——用副本集做分片MongoDB
- 高可用的MongoDB叢集-實戰篇MongoDB
- Zookeeper叢集 + Kafka叢集 + KafkaOffsetMonitor 監控薦Kafka
- 搭建 MongoDB分片(sharding) / 分割槽 / 叢集環境MongoDB
- MongoDB叢集設定集合分片生效及檢視集合分片情況MongoDB
- 叢集監控工具ganglia
- .NET Core+MongoDB叢集搭建與實戰MongoDB
- MongoDB實戰系列之六:mongodb的高可用叢集設計實戰薦MongoDB
- 分片叢集元件元件
- 部署分片叢集
- 專案實戰:zabbix監控MySQL狀態、服務資訊MySql
- Mongodb 效能監控工具FreeMonitoring,mongostat,mongotop,Profiler,索引,分片,事務超時,MongoDB調優MongoDB索引
- MongoDB 分片叢集均衡器導致的效能下降MongoDB
- Mongodb主從複製/ 副本集/分片叢集介紹MongoDB
- mongoDB研究筆記:分片叢集的工作機制MongoDB筆記
- python監控MongoDB服務程序,故障釘釘告警PythonMongoDB
- redis sentinel 叢集監控 配置Redis
- MongoDB 4.2分片叢集搭建及與3.4分片叢集搭建時的一些異同MongoDB
- prometheus監控golang服務實踐PrometheusGolang