結合Ansible技術監控Storm叢集
1、我的hosts配置
# vim /etc/hosts
192.168.1.100 storm_zk1
192.168.1.101 storm_zk2
192.168.1.102 storm_zk3
192.168.1.103 storm_nimbus
192.168.1.104 storm_supervisor1
192.168.1.105 storm_supervisor2
192.168.1.106 storm_supervisor3
192.168.1.107 storm_supervisor4
192.168.1.108 storm_supervisor5
192.168.1.109 storm_supervisor6
2、我的storm配置
# vim /usr/local/storm/conf/storm.yaml
drpc.servers:
- "storm_supervisor1"
- "storm_supervisor2"
- "storm_supervisor3"
storm.zookeeper.servers:
- "storm_zk1"
- "storm_zk2"
- "storm_zk3"
storm.local.dir: "/data/storm/workdir"
nimbus.host: "storm_nimbus"
nimbus.thrift.port: 6627
nimbus.thrift.max_buffer_size: 1048576
nimbus.childopts: "-Xmx1024m"
nimbus.task.timeout.secs: 30
nimbus.supervisor.timeout.secs: 60
nimbus.monitor.freq.secs: 10
nimbus.cleanup.inbox.freq.secs: 600
nimbus.inbox.jar.expiration.secs: 3600
nimbus.task.launch.secs: 240
nimbus.reassign: true
nimbus.file.copy.expiration.secs: 600
nimbus.topology.validator: "backtype.storm.nimbus.DefaultTopologyValidator"
storm.zookeeper.port: 2181
storm.zookeeper.root: "/data/storm/zkinfo"
storm.cluster.mode: "distributed"
storm.local.mode.zmq: false
ui.port: 8080
ui.childopts: "-Xmx768m"
supervisor.slots.ports:
- 6700
- 6701
- 6702
- 6703
- 6704
- 6705
- 6706
- 6707
- 6708
- 6709
supervisor.childopts: "-Xmx2048m"
supervisor.worker.start.timeout.secs: 240
supervisor.worker.timeout.secs: 30
supervisor.monitor.frequency.secs: 3
supervisor.heartbeat.frequency.secs: 5
supervisor.enable: true
worker.childopts: "-Xmx4096m"
topology.max.spout.pending: 5000
storm.zookeeper.session.timeout: 5000
storm.zookeeper.connection.timeout: 3000
storm.zookeeper.retry.times: 6
storm.zookeeper.retry.interval: 2000
storm.zookeeper.retry.intervalceiling.millis: 30000
storm.thrift.transport: "backtype.storm.security.auth.SimpleTransportPlugin"
storm.messaging.transport: "backtype.storm.messaging.netty.Context"
storm.messaging.netty.server_worker_threads: 50
storm.messaging.netty.client_worker_threads: 50
storm.messaging.netty.buffer_size: 20971520
storm.messaging.netty.max_retries: 100
storm.messaging.netty.max_wait_ms: 1000
storm.messaging.netty.min_wait_ms: 100
3、nimbus節點部署
# vim /data/scripts/monitor_status_for_storm.sh
#!/bin/bash
# Storm cluster watchdog settings (nimbus node).
# The interpreter is bash, not sh: the rest of this script relies on bash-only
# syntax (the `function` keyword, `[[ ... ]]` tests and C-style `for ((...))`).
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin
. /etc/profile
## Address and port of the Storm UI (monitoring page)
MON_SRV_IPADDR="192.168.1.103"
MON_SRV_PORT="8080"
## Set to 1 once the UI port has been seen open by the port scan
SCAN_FLAG=0
## Base working directory for this script's data files
BASE_PATH="/data/scripts"
## Ansible inventory file listing the addresses of failed Storm supervisors
FAIL_SUPERVISOR_LIST="${BASE_PATH}/fail_supervisor.txt"
#---------------------------------------------------------------------------------------------------
## Restart the Storm nimbus and ui daemons on this host.
## Globals:   none
## Arguments: none
## Outputs:   none (daemon output is discarded)
## Returns:   exit status of the final `sleep`
restart_storm_nimbus_server()
{
  local storm_pids
  # Take ONE process snapshot and reuse it, so the list that is tested and the
  # list that is killed cannot differ (the process table can change between
  # two separate `ps` calls, leaving `kill` with no arguments).
  storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')
  if [ -n "${storm_pids}" ]; then
    # Deliberately unquoted: the value holds one PID per line and each PID
    # must reach kill as its own argument.
    # shellcheck disable=SC2086
    kill -9 ${storm_pids}
  fi
  # Relaunch both daemons detached from this shell, discarding their output.
  nohup /usr/local/storm/bin/storm nimbus >/dev/null 2>&1 &
  nohup /usr/local/storm/bin/storm ui >/dev/null 2>&1 &
  # Fixed 30 s pause before returning to the caller (presumably to let the
  # daemons finish starting -- the script does not verify that they did).
  sleep 30
}
#---------------------------------------------------------------------------------------------------
## 1. Check that the Storm UI port is reachable.
##    Probe up to three times, pausing 10 s after each failed probe, and
##    restart nimbus/ui only when no probe saw the port open.
##    (`cond && a; break || b` would run `break` unconditionally on the first
##    pass, so the retry is written as an explicit if-block.)
for ((i=0; i<3; i++)); do
  RETVAL=$(/usr/bin/nmap -n -sS -p "${MON_SRV_PORT}" "${MON_SRV_IPADDR}" | grep open)
  if [[ -n "${RETVAL}" ]]; then
    SCAN_FLAG=1
    break
  fi
  sleep 10
done
if [[ ${SCAN_FLAG} -ne 1 ]]; then
  restart_storm_nimbus_server
fi
#---------------------------------------------------------------------------------------------------
## 2. Compare the supervisor hosts shown on the UI page with the expected list
##    in supervisor_list.txt to find supervisors that are missing.
## NOTE(review): the published pattern was empty (`s//\n/g`), which sed rejects;
## it looks like an HTML tag was stripped when the article was rendered.
## `<td>` is assumed because the next stage expects lines that start with the
## host name and are followed by '<' -- confirm against the real UI markup.
curl -s "${MON_SRV_IPADDR}:${MON_SRV_PORT}/" \
  | sed 's/<td>/\n/g' \
  | awk -F '<' '/^storm_/{print $1}' \
  | awk '!/nimbus/{print}' \
  | sort > "${BASE_PATH}/supervisor_list_from_page.txt"
## An empty host list from the page is treated as a nimbus failure.
## The comparison below still runs on that (empty) list afterwards.
if ! grep -q . "${BASE_PATH}/supervisor_list_from_page.txt"; then
  restart_storm_nimbus_server
fi
## Names that occur in only one of the two lists end up in the "failed" file.
sort -nr "${BASE_PATH}/supervisor_list_from_page.txt" "${BASE_PATH}/supervisor_list.txt" \
  | uniq -u > "${BASE_PATH}/supervisor_list_for_failed.txt"
## Nothing differs: clean up and stop.
if ! grep -q . "${BASE_PATH}/supervisor_list_for_failed.txt"; then
  rm -f "${BASE_PATH}/supervisor_list_for_failed.txt"
  exit 0
fi
#---------------------------------------------------------------------------------------------------
## 3. Build an Ansible inventory with the IP address of every failed supervisor.
##    The file is truncated first so a leftover from an interrupted run cannot
##    leak old hosts into this run.
echo "[fail_supervisor]" > "${FAIL_SUPERVISOR_LIST}"
while IFS= read -r SUPERVISOR_NAMEADDR || [[ -n "${SUPERVISOR_NAMEADDR}" ]]; do
  [[ -n "${SUPERVISOR_NAMEADDR}" ]] || continue
  # Resolve the host name through /etc/hosts, ignoring lines containing '#';
  # if several lines match, the last one wins.
  TEMP_IPADDR=$(grep -w -- "${SUPERVISOR_NAMEADDR}" /etc/hosts | grep -v '#' | awk '{print $1}' | tail -1)
  # Skip names that do not resolve instead of writing an empty inventory line.
  [[ -n "${TEMP_IPADDR}" ]] || continue
  echo "${TEMP_IPADDR}" >> "${FAIL_SUPERVISOR_LIST}"
done < "${BASE_PATH}/supervisor_list_for_failed.txt"
#---------------------------------------------------------------------------------------------------
## 4. Restart the Storm supervisor service remotely on every host in the inventory.
/usr/local/bin/ansible -i "${FAIL_SUPERVISOR_LIST}" fail_supervisor -m shell -a "/data/scripts/restart_storm_service.sh"
rm -f "${FAIL_SUPERVISOR_LIST}"
# vim /data/scripts/supervisor_list.txt
storm_supervisor1
storm_supervisor2
storm_supervisor3
storm_supervisor4
storm_supervisor5
storm_supervisor6
# touch /var/run/check_storm.lock
# crontab -e
*/2 * * * * (flock --timeout=0 /var/run/check_storm.lock /data/scripts/monitor_status_for_storm.sh >/dev/null 2>&1)
4、supervisor節點部署
# vim /data/scripts/restart_storm_service.sh
#!/bin/bash
# Restart the Storm supervisor on this node: kill every process whose
# `ps aux` line matches both "java" and "storm", then start the supervisor
# in the background with its output discarded.
# The interpreter is bash because the script uses `[[ ... ]]`.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin
. /etc/profile
# Take one process snapshot and reuse it, so the list that is tested and the
# list that is killed cannot differ.
storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')
if [[ -n "${storm_pids}" ]]; then
  # Deliberately unquoted: one PID per word.
  # shellcheck disable=SC2086
  kill -9 ${storm_pids}
fi
nohup /usr/local/storm/bin/storm supervisor >/dev/null 2>&1 &
# vim /etc/hosts
192.168.1.100 storm_zk1
192.168.1.101 storm_zk2
192.168.1.102 storm_zk3
192.168.1.103 storm_nimbus
192.168.1.104 storm_supervisor1
192.168.1.105 storm_supervisor2
192.168.1.106 storm_supervisor3
192.168.1.107 storm_supervisor4
192.168.1.108 storm_supervisor5
192.168.1.109 storm_supervisor6
2、我的storm配置
# vim /usr/local/storm/conf/storm.yaml
drpc.servers:
- "storm_supervisor1"
- "storm_supervisor2"
- "storm_supervisor3"
storm.zookeeper.servers:
- "storm_zk1"
- "storm_zk2"
- "storm_zk3"
storm.local.dir: "/data/storm/workdir"
nimbus.host: "storm_nimbus"
nimbus.thrift.port: 6627
nimbus.thrift.max_buffer_size: 1048576
nimbus.childopts: "-Xmx1024m"
nimbus.task.timeout.secs: 30
nimbus.supervisor.timeout.secs: 60
nimbus.monitor.freq.secs: 10
nimbus.cleanup.inbox.freq.secs: 600
nimbus.inbox.jar.expiration.secs: 3600
nimbus.task.launch.secs: 240
nimbus.reassign: true
nimbus.file.copy.expiration.secs: 600
nimbus.topology.validator: "backtype.storm.nimbus.DefaultTopologyValidator"
storm.zookeeper.port: 2181
storm.zookeeper.root: "/data/storm/zkinfo"
storm.cluster.mode: "distributed"
storm.local.mode.zmq: false
ui.port: 8080
ui.childopts: "-Xmx768m"
supervisor.slots.ports:
- 6700
- 6701
- 6702
- 6703
- 6704
- 6705
- 6706
- 6707
- 6708
- 6709
supervisor.childopts: "-Xmx2048m"
supervisor.worker.start.timeout.secs: 240
supervisor.worker.timeout.secs: 30
supervisor.monitor.frequency.secs: 3
supervisor.heartbeat.frequency.secs: 5
supervisor.enable: true
worker.childopts: "-Xmx4096m"
topology.max.spout.pending: 5000
storm.zookeeper.session.timeout: 5000
storm.zookeeper.connection.timeout: 3000
storm.zookeeper.retry.times: 6
storm.zookeeper.retry.interval: 2000
storm.zookeeper.retry.intervalceiling.millis: 30000
storm.thrift.transport: "backtype.storm.security.auth.SimpleTransportPlugin"
storm.messaging.transport: "backtype.storm.messaging.netty.Context"
storm.messaging.netty.server_worker_threads: 50
storm.messaging.netty.client_worker_threads: 50
storm.messaging.netty.buffer_size: 20971520
storm.messaging.netty.max_retries: 100
storm.messaging.netty.max_wait_ms: 1000
storm.messaging.netty.min_wait_ms: 100
3、nimbus節點部署
# vim /data/scripts/monitor_status_for_storm.sh
#!/bin/bash
# Storm cluster watchdog settings (nimbus node).
# The interpreter is bash, not sh: the rest of this script relies on bash-only
# syntax (the `function` keyword, `[[ ... ]]` tests and C-style `for ((...))`).
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin
. /etc/profile
## Address and port of the Storm UI (monitoring page)
MON_SRV_IPADDR="192.168.1.103"
MON_SRV_PORT="8080"
## Set to 1 once the UI port has been seen open by the port scan
SCAN_FLAG=0
## Base working directory for this script's data files
BASE_PATH="/data/scripts"
## Ansible inventory file listing the addresses of failed Storm supervisors
FAIL_SUPERVISOR_LIST="${BASE_PATH}/fail_supervisor.txt"
#---------------------------------------------------------------------------------------------------
## Restart the Storm nimbus and ui daemons on this host.
## Globals:   none
## Arguments: none
## Outputs:   none (daemon output is discarded)
## Returns:   exit status of the final `sleep`
restart_storm_nimbus_server()
{
  local storm_pids
  # Take ONE process snapshot and reuse it, so the list that is tested and the
  # list that is killed cannot differ (the process table can change between
  # two separate `ps` calls, leaving `kill` with no arguments).
  storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')
  if [ -n "${storm_pids}" ]; then
    # Deliberately unquoted: the value holds one PID per line and each PID
    # must reach kill as its own argument.
    # shellcheck disable=SC2086
    kill -9 ${storm_pids}
  fi
  # Relaunch both daemons detached from this shell, discarding their output.
  nohup /usr/local/storm/bin/storm nimbus >/dev/null 2>&1 &
  nohup /usr/local/storm/bin/storm ui >/dev/null 2>&1 &
  # Fixed 30 s pause before returning to the caller (presumably to let the
  # daemons finish starting -- the script does not verify that they did).
  sleep 30
}
#---------------------------------------------------------------------------------------------------
## 1. Check that the Storm UI port is reachable.
##    Probe up to three times, pausing 10 s after each failed probe, and
##    restart nimbus/ui only when no probe saw the port open.
##    (`cond && a; break || b` would run `break` unconditionally on the first
##    pass, so the retry is written as an explicit if-block.)
for ((i=0; i<3; i++)); do
  RETVAL=$(/usr/bin/nmap -n -sS -p "${MON_SRV_PORT}" "${MON_SRV_IPADDR}" | grep open)
  if [[ -n "${RETVAL}" ]]; then
    SCAN_FLAG=1
    break
  fi
  sleep 10
done
if [[ ${SCAN_FLAG} -ne 1 ]]; then
  restart_storm_nimbus_server
fi
#---------------------------------------------------------------------------------------------------
## 2. Compare the supervisor hosts shown on the UI page with the expected list
##    in supervisor_list.txt to find supervisors that are missing.
## NOTE(review): the published pattern was empty (`s//\n/g`), which sed rejects;
## it looks like an HTML tag was stripped when the article was rendered.
## `<td>` is assumed because the next stage expects lines that start with the
## host name and are followed by '<' -- confirm against the real UI markup.
curl -s "${MON_SRV_IPADDR}:${MON_SRV_PORT}/" \
  | sed 's/<td>/\n/g' \
  | awk -F '<' '/^storm_/{print $1}' \
  | awk '!/nimbus/{print}' \
  | sort > "${BASE_PATH}/supervisor_list_from_page.txt"
## An empty host list from the page is treated as a nimbus failure.
## The comparison below still runs on that (empty) list afterwards.
if ! grep -q . "${BASE_PATH}/supervisor_list_from_page.txt"; then
  restart_storm_nimbus_server
fi
## Names that occur in only one of the two lists end up in the "failed" file.
sort -nr "${BASE_PATH}/supervisor_list_from_page.txt" "${BASE_PATH}/supervisor_list.txt" \
  | uniq -u > "${BASE_PATH}/supervisor_list_for_failed.txt"
## Nothing differs: clean up and stop.
if ! grep -q . "${BASE_PATH}/supervisor_list_for_failed.txt"; then
  rm -f "${BASE_PATH}/supervisor_list_for_failed.txt"
  exit 0
fi
#---------------------------------------------------------------------------------------------------
## 3. Build an Ansible inventory with the IP address of every failed supervisor.
##    The file is truncated first so a leftover from an interrupted run cannot
##    leak old hosts into this run.
echo "[fail_supervisor]" > "${FAIL_SUPERVISOR_LIST}"
while IFS= read -r SUPERVISOR_NAMEADDR || [[ -n "${SUPERVISOR_NAMEADDR}" ]]; do
  [[ -n "${SUPERVISOR_NAMEADDR}" ]] || continue
  # Resolve the host name through /etc/hosts, ignoring lines containing '#';
  # if several lines match, the last one wins.
  TEMP_IPADDR=$(grep -w -- "${SUPERVISOR_NAMEADDR}" /etc/hosts | grep -v '#' | awk '{print $1}' | tail -1)
  # Skip names that do not resolve instead of writing an empty inventory line.
  [[ -n "${TEMP_IPADDR}" ]] || continue
  echo "${TEMP_IPADDR}" >> "${FAIL_SUPERVISOR_LIST}"
done < "${BASE_PATH}/supervisor_list_for_failed.txt"
#---------------------------------------------------------------------------------------------------
## 4. Restart the Storm supervisor service remotely on every host in the inventory.
/usr/local/bin/ansible -i "${FAIL_SUPERVISOR_LIST}" fail_supervisor -m shell -a "/data/scripts/restart_storm_service.sh"
rm -f "${FAIL_SUPERVISOR_LIST}"
# vim /data/scripts/supervisor_list.txt
storm_supervisor1
storm_supervisor2
storm_supervisor3
storm_supervisor4
storm_supervisor5
storm_supervisor6
# touch /var/run/check_storm.lock
# crontab -e
*/2 * * * * (flock --timeout=0 /var/run/check_storm.lock /data/scripts/monitor_status_for_storm.sh >/dev/null 2>&1)
4、supervisor節點部署
# vim /data/scripts/restart_storm_service.sh
#!/bin/bash
# Restart the Storm supervisor on this node: kill every process whose
# `ps aux` line matches both "java" and "storm", then start the supervisor
# in the background with its output discarded.
# The interpreter is bash because the script uses `[[ ... ]]`.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin
. /etc/profile
# Take one process snapshot and reuse it, so the list that is tested and the
# list that is killed cannot differ.
storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')
if [[ -n "${storm_pids}" ]]; then
  # Deliberately unquoted: one PID per word.
  # shellcheck disable=SC2086
  kill -9 ${storm_pids}
fi
nohup /usr/local/storm/bin/storm supervisor >/dev/null 2>&1 &
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/9034054/viewspace-2052113/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- Zookeeper叢集 + Kafka叢集 + KafkaOffsetMonitor 監控薦Kafka
- 叢集監控工具ganglia
- Storm叢集搭建ORM
- redis sentinel 叢集監控 配置Redis
- 使用monit監控stormORM
- 用 Weave Scope 監控叢集 - 每天5分鐘玩轉 Docker 容器技術(175)Docker
- Redis安裝+叢集+效能監控Redis
- prometheus監控k8s叢集PrometheusK8S
- redis3.0叢集監控指令碼RedisS3指令碼
- 使用夜鶯+categraf監控redis和redis叢集Redis
- storm叢集啟動停止指令碼ORM指令碼
- CentOS6.5 安裝Storm叢集CentOSORM
- storm與kafka結合ORMKafka
- LVS叢集技術
- Linux 叢集技術(轉)Linux
- Oracle叢集技術 | 叢集的自啟動系列(一)Oracle
- Ganglia監控Hadoop叢集的安裝部署Hadoop
- Ansible部署K8s叢集K8S
- 基於 ZooKeeper 實現爬蟲叢集的監控爬蟲
- 關於Oracle 12c的叢集監控(CHM)Oracle
- Elasticsearch叢集監控工具bigdesk外掛安裝Elasticsearch
- Nagios監控mongodb分片叢集服務實戰iOSMongoDB
- 高效能Linux叢集管理監控之道(轉)Linux
- Redis叢集技術及Codis實踐Redis
- 資料庫叢集技術漫談資料庫
- vivo 容器叢集監控系統架構與實踐架構
- 叢集式數字監控應用模型研究(一) (轉)模型
- 【知識分享】伺服器叢集和伺服器叢集技術伺服器
- Linux叢集技術的熱點-虛擬技術(轉)Linux
- ss命令結合zabbix對socket做監控
- 技術分享| anyRTC 影片監控融合方案
- 大資料技術 - 監控預警大資料
- ai行為識別技術監控AI
- 技術與藝術的結合,HMS Core讓手機主題趣味叢生
- 網站安全監控的方法講解,網站安全監控技術網站
- 如何用Prometheus監控十萬container的Kubernetes叢集PrometheusAI
- 容器叢集監控系統架構如何對症下藥?架構
- Prometheus多叢集監控的3種方案,你選哪種?Prometheus