Nagios資料提取和維護
Nagios資料提取和維護
=======================================================================
提取http服務的主機名,服務名,埠號,及url
create table test1
(
SELECT
display_name, address, host_object_id
FROM
nagios_hosts
WHERE
host_object_id IN (
SELECT
host_object_id
FROM
nagios_services
WHERE
check_command_object_id IN (
SELECT
object_id
FROM
nagios_commands
WHERE
command_line LIKE "%check_http -H%"
AND config_type = 1
)
AND config_type = 1
)
AND config_type = 1
)
------------
create table test2
(
SELECT
host_object_id,
display_name,
check_command_args
FROM
nagios_services
WHERE
check_command_object_id IN (
SELECT
object_id
FROM
nagios_commands
WHERE
command_line LIKE "%check_http -H%"
AND config_type = 1
)
AND config_type = 1
)
--------------
SELECT
test1.display_name,
test1.address,
test2.display_name,
test2.check_command_args
FROM
test1,
test2
WHERE
test1.host_object_id = test2.host_object_id;
------------
drop table test1;
drop table test2;
-------------
----------------------------------------------------------------------------------
主機描述 ip web描述 埠及web
**-**.172.28.**.162.**m1.app.rhl6.4 172.28.**.162 **web 450**!http://**
****-**.172.28.**.163.**m2.app.rhl6.4 172.28.**.163 **web 450**!http://**
----------------------------------------------------------------------------------
=======================================================================
=======================================================================
查詢服務組的服務分類列表:
SELECT
a.servicegroup_id,
b.alias,
a.address,
a.display_name,
a.t2name,
a.talias
FROM
(
SELECT
m.address,
m.display_name,
m.t2name,
m.alias talias,
p.servicegroup_id
FROM
nagios_servicegroup_members p,
(
SELECT
t2.service_object_id,
t1.alias,
t1.address,
t1.display_name,
t2.display_name t2name
FROM
nagios_hosts t1,
nagios_services t2
WHERE
t1.host_object_id = t2.host_object_id
AND t1.config_type = '1'
) m
WHERE
p.service_object_id = m.service_object_id
) a,
nagios_servicegroups b
WHERE
a.servicegroup_id = b.servicegroup_id
ORDER BY
1 DESC;
=======================================================================
=======================================================================
已納入Nagios監控平臺進行監控的伺服器數量:
SELECT
'Linux伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE '%rhl%'
UNION
SELECT
'AIX伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE '%AIX'
OR host_name LIKE '%aix'
UNION
SELECT
'Windows伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE '%win'
UNION
SELECT
'虛擬機器平臺Esxi物理伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE 'ESX%';
-----------------------------------------------------------------------------
Linux伺服器 22111
AIX伺服器 1011
Windows伺服器 5411
虛擬機器平臺Esxi物理伺服器 661
------------------------------------------------------------------------------
一個故障事件即為Nagios監控平臺監控到的一次嚴重級別的事件。
主機存活、網頁和資料庫連線的監控頻率為1分鐘一次,其它監控指標一般為5分鐘一次
================================================================================
================================================================================
本月具體服務事件前10排名:
SELECT
service_description,
count(
DISTINCT ctime,
service_description
) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
service_description
ORDER BY
count(*) DESC
LIMIT 10;
-----------------------------------------------------------------------------
-----------------------------------------------------------------------------
==============================================================================
==============================================================================
本月最頻繁發生故障的伺服器前10:
SELECT
host_name,
count(DISTINCT ctime, host_name) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
host_name
ORDER BY
count(DISTINCT ctime, host_name) DESC
LIMIT 10;
-------------------------------------------------------------------------------
host_name event_number
172.16.200.53 163
192.168.9.173 5
192.168.9.178 5
192.168.9.185 5
-------------------------------------------------------------------------------
===============================================================================
===============================================================================
本月最頻繁發生故障的伺服器前10排名-對應的主要故障事件:
drop procedure if exists top_server_events;
DELIMITER $$
CREATE PROCEDURE top_server_events(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare v_count int default 0;
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR
select host_name,count(distinct ctime,host_name) as event_number from log
where from_unixtime(ctime,'%Y%m') = v_ym
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name
order by count(distinct ctime,host_name) desc
limit 10;
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name,v_count;
WHILE stopFlag <> 1 DO
select v_host_name,service_description,count(distinct ctime,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=v_ym and host_name = v_host_name
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by service_description
having count(distinct ctime,service_description) > 300
order by count(distinct ctime,service_description) desc
limit 10;
FETCH v_cursor INTO v_host_name,v_count;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
call top_server_events(201406);
生成伺服器CPU使用率報表資料:
call cpu_util_linux(201406);
call cpu_util_aix(201406);
call cpu_util_win(201406);
call cpu_util_esxi(201406);
本月"CPU使用率"峰值超過80%的伺服器:
select host_name,cpu_max,cpu_avg from cpu_util where ym=201406 and cpu_max > 80;
本月“CPU使用率”峰值超過80%的伺服器-對應的主要故障事件:
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
select host_name from cpu_util where ym=201406 and cpu_max > 80 )
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name,service_description
order by count(distinct ctime,host_name,service_description) desc
limit 10 ;
本月"CPU使用率"平均值超過50%的伺服器:
select host_name,cpu_avg,cpu_max from cpu_util where ym=201406 and cpu_avg > 50;
本月“CPU使用率”平均值超過50%的伺服器-對應的故障事件:
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
select host_name from cpu_util where ym=201406 and cpu_avg > 50 )
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name,service_description
order by count(distinct ctime,host_name,service_description) desc
limit 10;
附1:
create table cpu_util (ym int(6),host_name varchar(255),cpu_max float(5,2),cpu_avg float(5,2));
drop procedure if exists cpu_util_linux;
DELIMITER $$
CREATE PROCEDURE cpu_util_linux(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%rhl%' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "CpuUser"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
drop procedure if exists cpu_util_aix;
DELIMITER $$
CREATE PROCEDURE cpu_util_aix(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%AIX' or host_name like '%aix' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "cpuusage"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
drop procedure if exists cpu_util_win;
DELIMITER $$
CREATE PROCEDURE cpu_util_win(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%win' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "processor usage"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
drop procedure if exists cpu_util_esxi;
DELIMITER $$
CREATE PROCEDURE cpu_util_esxi(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like 'ESX%' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="伺服器CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "cpu_usage"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
==================================================================================
==================================================================================
本月應用系統網頁故障情況
SELECT
service_description,
output,
count(
DISTINCT ctime,
service_description,
output
) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = 201406
AND (
service_description LIKE '%WEB%'
OR service_description LIKE '%web%'
)
AND service_description IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
service_description,
output
ORDER BY
count(
DISTINCT ctime,
service_description,
output
) DESC
LIMIT 10;
--------------------------------------------------------------------------------------
=======================================================================================
=======================================================================================
SELECT
host_name,
count(DISTINCT ctime, host_name)
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = 201406
AND STATUS = "DOWN"
GROUP BY
host_name
ORDER BY
count(*) DESC;
========================================================================================
本月資料庫訪問故障
SELECT
service_description,
count(
DISTINCT ctime,
service_description
) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = 201406
AND (
service_description LIKE '%資料庫監聽%'
OR service_description LIKE '%資料庫連線時間%'
)
AND output IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
service_description
ORDER BY
count(
DISTINCT ctime,
service_description
) DESC;
===========================================================================================
===========================================================================================
可用率計算表:centreon->report->
SELECT
from_unixtime(
date_start,
'%Y-%m-%d %H:%i:%S'
) date_start,
from_unixtime(
date_end,
'%Y-%m-%d %H:%i:%S'
) date_end,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime
FROM
log_archive_service
WHERE
service_id = (
SELECT
service_id
FROM
index_data
WHERE
service_description = "可用率_新核心資料庫"
);
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
可用率演算法:
OKTimeScheduled: 26101
CRITICALTimeScheduled: 60
UNDETERMINEDTimeScheduled: 60239
total: 86400
ok% = 26101/86400*100=30.20949074 =OKTimeScheduled/total*100
未確定% = 60239/86400=69.72106481 =UNDETERMINEDTimeScheduled/total*100
critical% =60/86400=0.069444444 =CRITICALTimeScheduled/total*100
ok(%) =(1-60/26101)*100=99.77012375 =(1-CRITICALTimeScheduled/OKTimeScheduled)*100
可用率維護操作:
1、由於監控平臺本身的原因導致缺少某一天的監控資料,則需要根據實際情況增加1條記錄,否則對可用率有很大影響。
select
from_unixtime(date_start,'%Y-%m-%d %H:%i:%S') date_start2,
from_unixtime(date_end,'%Y-%m-%d %H:%i:%S') date_end2,
log_id,
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
from log_archive_service
where service_id=(select service_id
from index_data
where service_description="可用率_新核心資料庫"
);
缺少一天的資料,則會產生undetermined 1天:
| 2014-04-09 00:00:00 | 2014-04-10 00:00:00 | 26939 | 59 | 768 | 86400
| 2014-04-11 00:00:00 | 2014-04-12 00:00:00 | 28766 | 59 | 768 | 86400
補2014-04-10一條記錄:
insert into log_archive_service(
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
)
values
(
59,
768,
86400,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
unix_timestamp('2014-04-10 00:00:00'),
unix_timestamp('2014-04-11 00:00:00')
);
2、由於誤報,需要對可用率的資料進行調整
UPDATE log_archive_service
SET OKTimeScheduled = 86400,
CRITICALTimeScheduled = 0,
CRITICALnbEvent = 0,
OKnbEvent = 0
WHERE
from_unixtime(date_start, '%Y-%m-%d') >= "2014-03-01"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-03-31"
AND service_id = (
SELECT
service_id
FROM
index_data
WHERE
service_description = "可用率_核心資料庫"
);
===============================================================================
UPDATE log_archive_service
SET OKTimeScheduled = 86400,
CRITICALTimeScheduled = 0,
CRITICALnbEvent = 0,
OKnbEvent = 0,
MaintenanceTime = 0
WHERE
from_unixtime(date_start, '%Y-%m-%d') >= "2014-06-25"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-06-26"
AND service_id = (
SELECT
service_id
FROM
index_data
WHERE
service_description = "可用率_核心資料庫"
);
=================================================================================
=======================================================================
提取http服務的主機名,服務名,埠號,及url
create table test1
(
SELECT
display_name, address, host_object_id
FROM
nagios_hosts
WHERE
host_object_id IN (
SELECT
host_object_id
FROM
nagios_services
WHERE
check_command_object_id IN (
SELECT
object_id
FROM
nagios_commands
WHERE
command_line LIKE "%check_http -H%"
AND config_type = 1
)
AND config_type = 1
)
AND config_type = 1
)
------------
create table test2
(
SELECT
host_object_id,
display_name,
check_command_args
FROM
nagios_services
WHERE
check_command_object_id IN (
SELECT
object_id
FROM
nagios_commands
WHERE
command_line LIKE "%check_http -H%"
AND config_type = 1
)
AND config_type = 1
)
--------------
SELECT
test1.display_name,
test1.address,
test2.display_name,
test2.check_command_args
FROM
test1,
test2
WHERE
test1.host_object_id = test2.host_object_id;
------------
drop table test1;
drop table test2;
-------------
----------------------------------------------------------------------------------
主機描述 ip web描述 埠及web
**-**.172.28.**.162.**m1.app.rhl6.4 172.28.**.162 **web 450**!http://**
****-**.172.28.**.163.**m2.app.rhl6.4 172.28.**.163 **web 450**!http://**
----------------------------------------------------------------------------------
=======================================================================
=======================================================================
查詢服務組的服務分類列表:
SELECT
a.servicegroup_id,
b.alias,
a.address,
a.display_name,
a.t2name,
a.talias
FROM
(
SELECT
m.address,
m.display_name,
m.t2name,
m.alias talias,
p.servicegroup_id
FROM
nagios_servicegroup_members p,
(
SELECT
t2.service_object_id,
t1.alias,
t1.address,
t1.display_name,
t2.display_name t2name
FROM
nagios_hosts t1,
nagios_services t2
WHERE
t1.host_object_id = t2.host_object_id
AND t1.config_type = '1'
) m
WHERE
p.service_object_id = m.service_object_id
) a,
nagios_servicegroups b
WHERE
a.servicegroup_id = b.servicegroup_id
ORDER BY
1 DESC;
=======================================================================
=======================================================================
已納入Nagios監控平臺進行監控的伺服器數量:
SELECT
'Linux伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE '%rhl%'
UNION
SELECT
'AIX伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE '%AIX'
OR host_name LIKE '%aix'
UNION
SELECT
'Windows伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE '%win'
UNION
SELECT
'虛擬機器平臺Esxi物理伺服器',
count(DISTINCT(host_name))
FROM
index_data
WHERE
host_name LIKE 'ESX%';
-----------------------------------------------------------------------------
Linux伺服器 22111
AIX伺服器 1011
Windows伺服器 5411
虛擬機器平臺Esxi物理伺服器 661
------------------------------------------------------------------------------
一個故障事件即為Nagios監控平臺監控到的一次嚴重級別的事件。
主機存活、網頁和資料庫連線的監控頻率為1分鐘一次,其它監控指標一般為5分鐘一次
================================================================================
================================================================================
本月具體服務事件前10排名:
SELECT
service_description,
count(
DISTINCT ctime,
service_description
) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
service_description
ORDER BY
count(*) DESC
LIMIT 10;
-----------------------------------------------------------------------------
-----------------------------------------------------------------------------
==============================================================================
==============================================================================
本月最頻繁發生故障的伺服器前10:
SELECT
host_name,
count(DISTINCT ctime, host_name) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
host_name
ORDER BY
count(DISTINCT ctime, host_name) DESC
LIMIT 10;
-------------------------------------------------------------------------------
host_name event_number
172.16.200.53 163
192.168.9.173 5
192.168.9.178 5
192.168.9.185 5
-------------------------------------------------------------------------------
===============================================================================
===============================================================================
本月最頻繁發生故障的伺服器前10排名-對應的主要故障事件:
drop procedure if exists top_server_events;
DELIMITER $$
CREATE PROCEDURE top_server_events(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare v_count int default 0;
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR
select host_name,count(distinct ctime,host_name) as event_number from log
where from_unixtime(ctime,'%Y%m') = v_ym
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name
order by count(distinct ctime,host_name) desc
limit 10;
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name,v_count;
WHILE stopFlag <> 1 DO
select v_host_name,service_description,count(distinct ctime,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=v_ym and host_name = v_host_name
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by service_description
having count(distinct ctime,service_description) > 300
order by count(distinct ctime,service_description) desc
limit 10;
FETCH v_cursor INTO v_host_name,v_count;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
call top_server_events(201406);
生成伺服器CPU使用率報表資料:
call cpu_util_linux(201406);
call cpu_util_aix(201406);
call cpu_util_win(201406);
call cpu_util_esxi(201406);
本月"CPU使用率"峰值超過80%的伺服器:
select host_name,cpu_max,cpu_avg from cpu_util where ym=201406 and cpu_max > 80;
本月“CPU使用率”峰值超過80%的伺服器-對應的主要故障事件:
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
select host_name from cpu_util where ym=201406 and cpu_max > 80 )
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name,service_description
order by count(distinct ctime,host_name,service_description) desc
limit 10 ;
本月"CPU使用率"平均值超過50%的伺服器:
select host_name,cpu_avg,cpu_max from cpu_util where ym=201406 and cpu_avg > 50;
本月“CPU使用率”平均值超過50%的伺服器-對應的故障事件:
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
select host_name from cpu_util where ym=201406 and cpu_avg > 50 )
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name,service_description
order by count(distinct ctime,host_name,service_description) desc
limit 10;
附1:
create table cpu_util (ym int(6),host_name varchar(255),cpu_max float(5,2),cpu_avg float(5,2));
drop procedure if exists cpu_util_linux;
DELIMITER $$
CREATE PROCEDURE cpu_util_linux(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%rhl%' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "CpuUser"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
drop procedure if exists cpu_util_aix;
DELIMITER $$
CREATE PROCEDURE cpu_util_aix(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%AIX' or host_name like '%aix' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "cpuusage"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
drop procedure if exists cpu_util_win;
DELIMITER $$
CREATE PROCEDURE cpu_util_win(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%win' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "processor usage"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
drop procedure if exists cpu_util_esxi;
DELIMITER $$
CREATE PROCEDURE cpu_util_esxi(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like 'ESX%' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
select v_ym,v_host_name,max(value),avg(value)
from data_bin where id_metric = (
select metric_id from metrics where index_id = (
select id from index_data c where c.service_description="伺服器CPU使用率"
and c.host_name = v_host_name
)
and metric_name = "cpu_usage"
)
and from_unixtime(ctime,'%Y%m') = v_ym ;
FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
==================================================================================
==================================================================================
本月應用系統網頁故障情況
SELECT
service_description,
output,
count(
DISTINCT ctime,
service_description,
output
) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = 201406
AND (
service_description LIKE '%WEB%'
OR service_description LIKE '%web%'
)
AND service_description IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
service_description,
output
ORDER BY
count(
DISTINCT ctime,
service_description,
output
) DESC
LIMIT 10;
--------------------------------------------------------------------------------------
=======================================================================================
=======================================================================================
SELECT
host_name,
count(DISTINCT ctime, host_name)
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = 201406
AND STATUS = "DOWN"
GROUP BY
host_name
ORDER BY
count(*) DESC;
========================================================================================
本月資料庫訪問故障
SELECT
service_description,
count(
DISTINCT ctime,
service_description
) AS event_number
FROM
log
WHERE
from_unixtime(ctime, '%Y%m') = 201406
AND (
service_description LIKE '%資料庫監聽%'
OR service_description LIKE '%資料庫連線時間%'
)
AND output IS NOT NULL
AND (
STATUS = "CRITICAL"
OR STATUS = "DOWN"
)
GROUP BY
service_description
ORDER BY
count(
DISTINCT ctime,
service_description
) DESC;
===========================================================================================
===========================================================================================
可用率計算表:centreon->report->
SELECT
from_unixtime(
date_start,
'%Y-%m-%d %H:%i:%S'
) date_start,
from_unixtime(
date_end,
'%Y-%m-%d %H:%i:%S'
) date_end,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime
FROM
log_archive_service
WHERE
service_id = (
SELECT
service_id
FROM
index_data
WHERE
service_description = "可用率_新核心資料庫"
);
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
可用率演算法:
OKTimeScheduled: 26101
CRITICALTimeScheduled: 60
UNDETERMINEDTimeScheduled: 60239
total: 86400
ok% = 26101/86400*100=30.20949074 =OKTimeScheduled/total*100
未確定% = 60239/86400=69.72106481 =UNDETERMINEDTimeScheduled/total*100
critical% =60/86400=0.069444444 =CRITICALTimeScheduled/total*100
ok(%) =(1-60/26101)*100=99.77012375 =(1-CRITICALTimeScheduled/OKTimeScheduled)*100
可用率維護操作:
1、由於監控平臺本身的原因導致缺少某一天的監控資料,則需要根據實際情況增加1條記錄,否則對可用率有很大影響。
select
from_unixtime(date_start,'%Y-%m-%d %H:%i:%S') date_start2,
from_unixtime(date_end,'%Y-%m-%d %H:%i:%S') date_end2,
log_id,
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
from log_archive_service
where service_id=(select service_id
from index_data
where service_description="可用率_新核心資料庫"
);
缺少一天的資料,則會產生undetermined 1天:
| 2014-04-09 00:00:00 | 2014-04-10 00:00:00 | 26939 | 59 | 768 | 86400
| 2014-04-11 00:00:00 | 2014-04-12 00:00:00 | 28766 | 59 | 768 | 86400
補2014-04-10一條記錄:
insert into log_archive_service(
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
)
values
(
59,
768,
86400,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
unix_timestamp('2014-04-10 00:00:00'),
unix_timestamp('2014-04-11 00:00:00')
);
2、由於誤報,需要對可用率的資料進行調整
UPDATE log_archive_service
SET OKTimeScheduled = 86400,
CRITICALTimeScheduled = 0,
CRITICALnbEvent = 0,
OKnbEvent = 0
WHERE
from_unixtime(date_start, '%Y-%m-%d') >= "2014-03-01"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-03-31"
AND service_id = (
SELECT
service_id
FROM
index_data
WHERE
service_description = "可用率_核心資料庫"
);
===============================================================================
UPDATE log_archive_service
SET OKTimeScheduled = 86400,
CRITICALTimeScheduled = 0,
CRITICALnbEvent = 0,
OKnbEvent = 0,
MaintenanceTime = 0
WHERE
from_unixtime(date_start, '%Y-%m-%d') >= "2014-06-25"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-06-26"
AND service_id = (
SELECT
service_id
FROM
index_data
WHERE
service_description = "可用率_核心資料庫"
);
=================================================================================
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/29500582/viewspace-1315905/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- 資料維護和基礎架構維護-有感架構
- 維護樹狀資料
- 維護資料庫安全資料庫
- 資料庫常用維護命令資料庫
- Oracle資料庫日常維護Oracle資料庫
- 深入FDO----資料維護
- 瞭解和使用kfed維護ASM後設資料ASM
- Access資料庫日常維護和Access資料庫最佳化方法資料庫
- 資料探勘和資料提取能做什麼?
- 1.7. 建立和維護資料庫密碼檔案資料庫密碼
- [MySQL光速入門]029 資料庫管理和維護MySql資料庫
- Databricks 第6篇:Spark SQL 維護資料庫和表SparkSQL資料庫
- 六、資料庫管理與維護資料庫
- 網上 平臺玩提取平臺維護資料異常瀏覽失敗怎麼辦
- SAP RETAIL MM41維護商品主資料的時候可以維護分類資料AI
- 資訊系統資料維護的週期和頻率
- Oracle資料庫密碼檔案的使用和維護(轉)Oracle資料庫密碼
- nodebb搭建 維護 discuz 資料遷移
- 物料主資料的維護狀態
- ORACLE資料庫管理維護綱要Oracle資料庫
- 資料探勘和資料提取該怎麼區分?
- 資料庫和XML __ 從資料庫中提取xml資料庫XML
- PROC提取資料
- 《大型資料庫技術》MySQL管理維護資料庫MySql
- Oracle資料庫維護的重要性Oracle資料庫
- 【轉】Oracle資料庫日常維護手冊Oracle資料庫
- oralce動態維護資料庫的序列資料庫
- Informix_on_line資料庫維護技巧ORM資料庫
- Oracle資料庫中索引的維護 ztOracle資料庫索引
- PostgreSQL學習手冊(資料庫維護)SQL資料庫
- Oracle資料庫日常維護手冊 (zt)Oracle資料庫
- Oracle資料庫中索引的維護(zt)Oracle資料庫索引
- Oracle資料庫中索引的維護(轉)Oracle資料庫索引
- 29. 使用MySQL之資料庫維護MySql資料庫
- 索引優化和維護索引優化
- 達夢(DM)資料庫的表空間建立和遷移維護資料庫
- SAP CRM和C4C的產品主資料price維護
- 使用NLP和ML來提取和構造Web資料Web