prometheus監控+alertmanager告警
配置告警規則
1、建立規則目錄
# -p: succeed if the directory already exists and create missing parents.
mkdir -p /usr/local/prometheus/rules
2、編寫告警規則檔案。
vim /usr/local/prometheus/rules/rule.yml
#新增以下配置
# Prometheus alerting rules. Pod state metrics here are the kube_pod_* series
# (presumably exported by kube-state-metrics - confirm it is scraped).
groups:
- name: instance-abnormal
  rules:
  # Pod has not been Ready for 60s and its containers have never restarted.
  # Match on namespace AND pod so that same-named pods in different
  # namespaces are not cross-matched.
  - alert: POD新增告警!
    expr: |
      kube_pod_status_ready{condition="true"} == 0
      and on(namespace, pod)
      kube_pod_container_status_restarts_total == 0
    for: 60s
    labels:
      name: instance
      severity: Warning
      instance: "{{ $labels.pod }}"
    annotations:
      summary: "k8s叢集告警!"
      description: "{{ $labels.pod }} 為新增節點!"
  # Pod has not been Ready for 60s and at least one container restart has
  # been counted for it.
  - alert: POD重啟告警!
    expr: |
      kube_pod_status_ready{condition="true"} == 0
      and on(namespace, pod)
      kube_pod_container_status_restarts_total > 0
    for: 60s
    labels:
      name: instance
      severity: Critical
    annotations:
      summary: "k8s叢集POD重啟!"
      description: "{{ $labels.pod }} 正在重啟!"
- name: instance-down
  rules:
  # Fires when the Ready/"true" condition series of a node has stayed at 0
  # for one minute.
  - alert: k8s叢集節點down!
    expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
    for: 1m
    annotations:
      summary: 'k8s叢集{{ $labels.node }}節點down!'
      description: '{{ $labels.node }} 節點不可用,請儘快檢查!'
    labels:
      severity: Critical
- name: resource-status
  rules:
  # Pod CPU: cores consumed (rate of the CPU-seconds counter) divided by the
  # sum of the pod's CPU limits. Pods without limits yield no series.
  - alert: POD cpu使用率過高!
    expr: |
      sum by (pod, namespace) (rate(container_cpu_usage_seconds_total{name!=""}[60s])) /
      sum by (pod, namespace) (kube_pod_container_resource_limits{resource="cpu"}) > 0.8
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "CPU使用率超過80%!"
      description: "{{ $labels.pod }} CPU使用率超過80%,已超過1分鐘,請檢查!"
  # Pod memory: memory is a gauge, so it must NOT be wrapped in rate().
  # The working set is used because it excludes reclaimable page cache.
  - alert: POD記憶體使用率過高!
    expr: |
      sum by (pod, namespace) (container_memory_working_set_bytes{name!=""}) /
      sum by (pod, namespace) (kube_pod_container_resource_limits{resource="memory"}) > 0.8
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "記憶體使用率超過80%!"
      description: "{{ $labels.pod }} 記憶體使用率超過80%,已超過1分鐘,請檢查!"
  # Host alerts below assume node_exporter is scraped (node_* metrics) - confirm.
  # Its series are identified by the "instance" label, not "pod"/"node".
  #
  # Host CPU: percentage of non-idle time per instance. The "> 80" threshold
  # is required; without a comparison the expression always returns a sample
  # and the alert fires permanently.
  - alert: 主機cpu使用率過高!
    expr: |
      100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 80
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "{{ $labels.instance }} CPU使用率超過80%!"
      description: "{{ $labels.instance }} CPU使用率超過80%,已超過1分鐘,請檢查!"
  # Host memory: share of total memory that is not available, in percent.
  - alert: 主機記憶體使用率過高!
    expr: |
      (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 80
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "{{ $labels.instance }} 記憶體使用率超過80%!"
      description: "{{ $labels.instance }} 記憶體使用率超過80%,已超過1分鐘,請檢查!"
  # Host disk: used share per filesystem, restricted to ext*/xfs so that
  # pseudo filesystems (tmpfs, overlay, ...) are ignored. Adjust the fstype
  # matcher to the filesystems actually in use.
  - alert: 主機磁碟使用率過高!
    expr: |
      (1 - node_filesystem_avail_bytes{fstype=~"ext.?|xfs"}
         / node_filesystem_size_bytes{fstype=~"ext.?|xfs"}) * 100 > 80
    for: 1m
    labels:
      severity: Warning
    annotations:
      summary: "{{ $labels.instance }} 磁碟使用率超過80%!"
      description: "{{ $labels.instance }} ({{ $labels.mountpoint }}) 磁碟使用率超過80%,已超過1分鐘,請檢查!"
根據自己的需求配置。
3、新增prometheus配置。
vim /usr/local/prometheus/prometheus.yml
# Add the following configuration.
# alerting: where Prometheus sends firing alerts (the Alertmanager address).
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - xxx.xxx.xxx.xxx:9093
# rule_files: alerting rule files to load.
rule_files:
  - "/usr/local/prometheus/rules/rule.yml"
4、重新載入prometheus(/-/reload 介面需要prometheus啟動時加上 --web.enable-lifecycle 引數才可用;若未開啟,可改為向prometheus程序傳送SIGHUP訊號或直接重啟服務)
curl -X POST http://localhost:9090/-/reload
部署alertmanager
1、下載alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz
2、解壓、移動到安裝目錄、配置版本軟連線。
# Unpack the release archive.
tar -zxf alertmanager-0.26.0.linux-amd64.tar.gz
# Move it to a versioned install directory.
mv alertmanager-0.26.0.linux-amd64 /opt/alertmanager-0.26.0
# Version-independent symlink used by the paths in the rest of this guide.
ln -s /opt/alertmanager-0.26.0 /usr/local/alertmanager
3、配置systemd管理
vim /usr/lib/systemd/system/alertmanager.service
# systemd unit that runs Alertmanager from the /usr/local/alertmanager symlink.
[Unit]
Description=Alertmanager Service
After=network.target

[Service]
# A trailing backslash continues the command on the next line.
ExecStart=/usr/local/alertmanager/alertmanager \
  --storage.path=/usr/local/alertmanager/data \
  --config.file=/usr/local/alertmanager/alertmanager.yml

[Install]
WantedBy=multi-user.target
4、啟動alertmanager,設定為開機啟動
# Make systemd pick up the newly created unit file before using it.
systemctl daemon-reload
systemctl start alertmanager
systemctl enable alertmanager
配置郵件告警
1、修改alertmanager.yml配置,配置郵箱告警。
vim /usr/local/alertmanager/alertmanager.yml
#修改檔案內容
# Alertmanager configuration that delivers every alert by e-mail.
global:
  # SMTP server address. Before configuring, check that SMTP is enabled for
  # the mailbox and that port 25 is reachable.
  smtp_smarthost: "smtp.139.com:25"
  # Sender address.
  smtp_from: "xxxxxxxx@139.com"
  # Mailbox user.
  smtp_auth_username: "xxxxxxxx"
  # Mailbox password: this must be the client authorization code generated
  # when SMTP is enabled. It expires and then has to be reset in the mailbox
  # system.
  smtp_auth_password: "xxxxxxxx"
  # Whether to require an encrypted connection; the default is true.
  smtp_require_tls: false
route:
  # Grouping key.
  group_by:
    - alertname
  # Wait 30s so other alerts arriving in that window are sent together;
  # after sending, the next notification waits for group_interval.
  group_wait: 30s
  # Interval between two notifications.
  group_interval: 5m
  # Interval for re-sending the same alert.
  repeat_interval: 1h
  # Receiver name; must match a name under receivers.
  receiver: email
receivers:
  # Receiver name.
  - name: "email"
    email_configs:
      # Address that receives the mail.
      - to: "xxxxxxxx@qq.com"
2、重啟alertmanager
systemctl restart alertmanager
配置釘釘告警
1、下載釘釘通知系統工具
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
2、解壓、移動至安裝目錄,建立軟連線
# Unpack the release archive.
tar -zxf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
# Move it to a versioned install directory.
mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 /opt/prometheus-webhook-dingtalk-2.1.0
# Version-independent symlink used by the paths in the rest of this guide.
ln -s /opt/prometheus-webhook-dingtalk-2.1.0 /usr/local/prometheus-webhook-dingtalk
3、建立釘釘告警模板
# Create the template directory (no error if it already exists), then edit the template.
mkdir -p /usr/local/prometheus-webhook-dingtalk/templates
vim /usr/local/prometheus-webhook-dingtalk/templates/service.tmpl
在service.tmpl檔案中加入以下配置
{{/* DingTalk notification templates. config.yml uses "service.title" and "service.content". */}}
{{/* Blank lines between fields keep each field on its own line in the rendered Markdown. */}}

{{ template "service.title" . }}
{{/* Message title: delegates to the shared subject line. */}}
{{ define "service.title" }}{{ template "__subject" . }}{{ end }}

{{/* Subject: [STATUS] and, while firing, the number of firing alerts. */}}
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{{ end }}

{{ template "service.content" . }}
{{/* Message body: one section for firing alerts, one for resolved alerts. */}}
{{ define "service.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
========監控到{{ .Alerts.Firing | len }}個故障========
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
========已恢復{{ .Alerts.Resolved | len }}個故障========
{{ template "__resolved_list" .Alerts.Resolved }}
---
{{ end }}
{{ end }}

{{/* One entry per firing alert. Adding 28800e9 ns (8 hours) shifts the timestamp before formatting. */}}
{{ define "__alert_list" }}{{ range . }}
---

**告警型別**: {{ .Labels.alertname }}

**告警級別**: {{ .Labels.severity }}

**告警狀態**: {{ .Status }}

**告警主題**: {{ .Annotations.summary }}

**告警詳情**: {{ .Annotations.description }}

**觸發時間**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

{{/* One entry per resolved alert; same fields plus the recovery time. */}}
{{ define "__resolved_list" }}{{ range . }}
---

**告警型別**: {{ .Labels.alertname }}

**告警級別**: {{ .Labels.severity }}

**告警狀態**: {{ .Status }}

**告警主題**: {{ .Annotations.summary }}

**告警詳情**: {{ .Annotations.description }}

**觸發時間**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}

**恢復時間**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
4、修改配置檔案
cp /usr/local/prometheus-webhook-dingtalk/config.example.yml /usr/local/prometheus-webhook-dingtalk/config.yml
vim /usr/local/prometheus-webhook-dingtalk/config.yml
修改config.yml檔案
# Template files to load.
templates:
  - /usr/local/prometheus-webhook-dingtalk/templates/*.tmpl
# Named webhook targets; "webhook1" is the name used in the Alertmanager
# webhook URL (/dingtalk/webhook1/send).
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxx
    message:
      # Quoted so YAML does not parse the leading "{{" as a flow mapping.
      title: '{{ template "service.title" . }}'
      text: '{{ template "service.content" . }}'
xxxxxxxx為釘釘機器人的token
5、配置systemd管理指令碼(服務名為dingtalk,與後面的 systemctl start dingtalk 對應)
vim /usr/lib/systemd/system/dingtalk.service
# systemd unit that runs prometheus-webhook-dingtalk with the config.yml from the install directory.
[Unit]
Description=prometheus webhook dingtalk
After=network.target

[Service]
# A trailing backslash continues the command on the next line.
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
  --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml

[Install]
WantedBy=multi-user.target
6、啟動服務,設定為開機啟動
# Make systemd pick up the newly created unit file before using it.
systemctl daemon-reload
systemctl start dingtalk
systemctl enable dingtalk
7、修改alertmanager配置,告警資訊同時傳送到郵箱和釘釘
vim /usr/local/alertmanager/alertmanager.yml
修改為以下配置
# Alertmanager configuration that sends each alert to both e-mail and DingTalk.
global:
  resolve_timeout: 5m
route:
  group_by:
    - alertname
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 2h
  receiver: "default"
  routes:
    # continue: true keeps matching the routes that follow.
    - receiver: "email"
      continue: true
    # More receivers can be appended after this one if others also need the alert.
    - receiver: "dingding"
      continue: true
receivers:
  - name: "default"
  - name: "email"
    email_configs:
      # Address that receives the alert mail.
      - to: "xxxxxxx@qq.com"
        # Address that sends the alert mail.
        from: "xxxxxxxx@139.com"
        smarthost: "smtp.139.com:25"
        auth_username: "xxxxxxxx"
        auth_password: "xxxxxxxx"
        require_tls: false
        # Also send a notification when the alert is resolved.
        send_resolved: true
  - name: "dingding"
    webhook_configs:
      # Address of the prometheus-webhook-dingtalk service.
      - url: "http://xxx.xxx.xxx.xxx:8060/dingtalk/webhook1/send"
        # Also send a notification when the alert is resolved.
        send_resolved: true
8、重啟alertmanager
systemctl restart alertmanager