Nagios資料提取和維護

=======================================================================
提取http服務的主機名，服務名，埠號，及url
create table test1
(
SELECT
   display_name, address, host_object_id
FROM
   nagios_hosts
WHERE
   host_object_id IN (
       SELECT
           host_object_id
       FROM
           nagios_services
       WHERE
           check_command_object_id IN (
               SELECT
                   object_id
               FROM
                   nagios_commands
               WHERE
                   command_line LIKE "%check_http -H%"
               AND config_type = 1
           )
       AND config_type = 1
   )
AND config_type = 1
)
------------
create table test2
(
SELECT
   host_object_id,
   display_name,
   check_command_args
FROM
   nagios_services
WHERE
   check_command_object_id IN (
       SELECT
           object_id
       FROM
           nagios_commands
       WHERE
           command_line LIKE "%check_http -H%"
       AND config_type = 1
   )
AND config_type = 1
)

--------------
SELECT
   test1.display_name,
   test1.address,
   test2.display_name,
   test2.check_command_args
FROM
   test1,
   test2
WHERE
   test1.host_object_id = test2.host_object_id;

------------
drop table test1;
drop table test2;
-------------
----------------------------------------------------------------------------------
主機描述                                 ip            web描述        埠及web
**-**.172.28.**.162.**m1.app.rhl6.4   172.28.**.162   **web   450**!http://**
****-**.172.28.**.163.**m2.app.rhl6.4   172.28.**.163   **web   450**!http://**
----------------------------------------------------------------------------------
=======================================================================

=======================================================================
查詢服務組的服務分類列表：
SELECT
   a.servicegroup_id,
   b.alias,
   a.address,
   a.display_name,
   a.t2name,
   a.talias
FROM
   (
       SELECT
           m.address,
           m.display_name,
           m.t2name,
           m.alias talias,
           p.servicegroup_id
       FROM
           nagios_servicegroup_members p,
           (
               SELECT
                   t2.service_object_id,
                   t1.alias,
                   t1.address,
                   t1.display_name,
                   t2.display_name t2name
               FROM
                   nagios_hosts t1,
                   nagios_services t2
               WHERE
                   t1.host_object_id = t2.host_object_id
               AND t1.config_type = '1'
           ) m
       WHERE
           p.service_object_id = m.service_object_id
   ) a,
   nagios_servicegroups b
WHERE
   a.servicegroup_id = b.servicegroup_id
ORDER BY
   1 DESC;

=======================================================================

=======================================================================
已納入Nagios監控平臺進行監控的伺服器數量：
SELECT
   'Linux伺服器',
   count(DISTINCT(host_name))
FROM
   index_data
WHERE
   host_name LIKE '%rhl%'
UNION
   SELECT
       'AIX伺服器',
       count(DISTINCT(host_name))
   FROM
       index_data
   WHERE
       host_name LIKE '%AIX'
   OR host_name LIKE '%aix'
   UNION
       SELECT
           'Windows伺服器',
           count(DISTINCT(host_name))
       FROM
           index_data
       WHERE
           host_name LIKE '%win'
       UNION
           SELECT
               '虛擬機器平臺Esxi物理伺服器',
               count(DISTINCT(host_name))
           FROM
               index_data
           WHERE
               host_name LIKE 'ESX%';
-----------------------------------------------------------------------------
Linux伺服器   22111
AIX伺服器   1011
Windows伺服器   5411
虛擬機器平臺Esxi物理伺服器   661
------------------------------------------------------------------------------

一個故障事件即為Nagios監控平臺監控到的一次嚴重級別的事件。
主機存活、網頁和資料庫連線的監控頻率為1分鐘一次，其它監控指標一般為5分鐘一次
================================================================================

================================================================================
本月具體服務事件前10排名：
SELECT
   service_description,
   count(
       DISTINCT ctime,
       service_description
   ) AS event_number
FROM
   log
WHERE
   from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
   STATUS = "CRITICAL"
   OR STATUS = "DOWN"
)
GROUP BY
   service_description
ORDER BY
   count(*) DESC
LIMIT 10;
-----------------------------------------------------------------------------

-----------------------------------------------------------------------------
==============================================================================

==============================================================================
本月最頻繁發生故障的伺服器前10:
SELECT
   host_name,
   count(DISTINCT ctime, host_name) AS event_number
FROM
   log
WHERE
   from_unixtime(ctime, '%Y%m') = "201409"
AND service_description IS NOT NULL
AND (
   STATUS = "CRITICAL"
   OR STATUS = "DOWN"
)
GROUP BY
   host_name
ORDER BY
   count(DISTINCT ctime, host_name) DESC
LIMIT 10;

-------------------------------------------------------------------------------
host_name      event_number
172.16.200.53   163
192.168.9.173   5
192.168.9.178   5
192.168.9.185   5
-------------------------------------------------------------------------------
===============================================================================

===============================================================================

本月最頻繁發生故障的伺服器前10排名-對應的主要故障事件：
drop procedure if exists top_server_events;
DELIMITER $$
CREATE PROCEDURE top_server_events(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare v_count int default 0;
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR
    select host_name,count(distinct ctime,host_name) as event_number from log
      where from_unixtime(ctime,'%Y%m') = v_ym
        and service_description is not null and ( status="CRITICAL" or status="DOWN" )
      group by host_name
      order by count(distinct ctime,host_name) desc
      limit 10;
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name,v_count;
WHILE stopFlag <> 1 DO
    select v_host_name,service_description,count(distinct ctime,service_description) as event_number from log
      where from_unixtime(ctime,'%Y%m')=v_ym and host_name = v_host_name
        and service_description is not null and ( status="CRITICAL" or status="DOWN" )
      group by service_description
      having count(distinct ctime,service_description) > 300
      order by count(distinct ctime,service_description) desc
      limit 10;
      FETCH v_cursor INTO v_host_name,v_count;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;

call top_server_events(201406);

生成伺服器CPU使用率報表資料：
call cpu_util_linux(201406);
call cpu_util_aix(201406);
call cpu_util_win(201406);
call cpu_util_esxi(201406);

本月"CPU使用率"峰值超過80%的伺服器：
select host_name,cpu_max,cpu_avg from cpu_util where ym=201406 and cpu_max > 80;

本月“CPU使用率”峰值超過80%的伺服器-對應的主要故障事件：
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
    select host_name from cpu_util where ym=201406 and cpu_max > 80 )
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name,service_description
order by count(distinct ctime,host_name,service_description) desc
limit 10 ;

本月"CPU使用率"平均值超過50%的伺服器：
select host_name,cpu_avg,cpu_max from cpu_util where ym=201406 and cpu_avg > 50;

本月“CPU使用率”平均值超過50%的伺服器-對應的故障事件：
select host_name,service_description,count(distinct ctime,host_name,service_description) as event_number from log
where from_unixtime(ctime,'%Y%m')=201406 and host_name in (
    select host_name from cpu_util where ym=201406 and cpu_avg > 50 )
and service_description is not null and ( status="CRITICAL" or status="DOWN" )
group by host_name,service_description
order by count(distinct ctime,host_name,service_description) desc
limit 10;

附1：
create table cpu_util (ym int(6),host_name varchar(255),cpu_max float(5,2),cpu_avg float(5,2));

drop procedure if exists cpu_util_linux;
DELIMITER $$
CREATE PROCEDURE cpu_util_linux(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%rhl%' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "CpuUser"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;

drop procedure if exists cpu_util_aix;
DELIMITER $$
CREATE PROCEDURE cpu_util_aix(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%AIX' or host_name like '%aix' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "cpuusage"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;

drop procedure if exists cpu_util_win;
DELIMITER $$
CREATE PROCEDURE cpu_util_win(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like '%win' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "processor usage"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;

drop procedure if exists cpu_util_esxi;
DELIMITER $$
CREATE PROCEDURE cpu_util_esxi(in v_ym int(6))
BEGIN
declare v_host_name varchar(255);
declare stopFlag int default 0;
DECLARE v_cursor CURSOR FOR select distinct(host_name) from index_data where host_name like 'ESX%' ;
/* DECLARE CONTINUE HANDLER FOR NOT FOUND set stopFlag = 1 ; */
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET stopFlag = 1 ;
select "start time ... ",now();
OPEN v_cursor;
FETCH v_cursor INTO v_host_name;
WHILE stopFlag <> 1 DO
      insert into cpu_util(ym,host_name,cpu_max,cpu_avg)
          select v_ym,v_host_name,max(value),avg(value)
          from data_bin where id_metric = (
            select metric_id from metrics where index_id = (
              select id from index_data c where c.service_description="伺服器CPU使用率"
                and c.host_name = v_host_name
              )
              and metric_name = "cpu_usage"
            )
          and from_unixtime(ctime,'%Y%m') = v_ym ;
      FETCH v_cursor INTO v_host_name;
END WHILE;
CLOSE v_cursor;
select "end time ... ",now();
END $$
DELIMITER ;
==================================================================================

==================================================================================
本月應用系統網頁故障情況

SELECT
   service_description,
   output,
   count(
       DISTINCT ctime,
       service_description,
       output
   ) AS event_number
FROM
   log
WHERE
   from_unixtime(ctime, '%Y%m') = 201406
AND (
   service_description LIKE '%WEB%'
   OR service_description LIKE '%web%'
)
AND service_description IS NOT NULL
AND (
   STATUS = "CRITICAL"
   OR STATUS = "DOWN"
)
GROUP BY
   service_description,
   output
ORDER BY
   count(
       DISTINCT ctime,
       service_description,
       output
   ) DESC
LIMIT 10;
--------------------------------------------------------------------------------------

=======================================================================================

=======================================================================================
SELECT
   host_name,
   count(DISTINCT ctime, host_name)
FROM
   log
WHERE
   from_unixtime(ctime, '%Y%m') = 201406
AND STATUS = "DOWN"
GROUP BY
   host_name
ORDER BY
   count(*) DESC;

========================================================================================

本月資料庫訪問故障

SELECT
   service_description,
   count(
       DISTINCT ctime,
       service_description
   ) AS event_number
FROM
   log
WHERE
   from_unixtime(ctime, '%Y%m') = 201406
AND (
   service_description LIKE '%資料庫監聽%'
   OR service_description LIKE '%資料庫連線時間%'
)
AND output IS NOT NULL
AND (
   STATUS = "CRITICAL"
   OR STATUS = "DOWN"
)
GROUP BY
   service_description
ORDER BY
   count(
       DISTINCT ctime,
       service_description
   ) DESC;

===========================================================================================

===========================================================================================

可用率計算表：centreon->report->
SELECT
   from_unixtime(
       date_start,
       '%Y-%m-%d %H:%i:%S'
   ) date_start,
   from_unixtime(
       date_end,
       '%Y-%m-%d %H:%i:%S'
   ) date_end,
   OKTimeScheduled,
   OKnbEvent,
   OKTimeAverageAck,
   OKTimeAverageRecovery,
   WARNINGTimeScheduled,
   WARNINGnbEvent,
   WARNINGTimeAverageAck,
   WARNINGTimeAverageRecovery,
   UNKNOWNTimeScheduled,
   UNKNOWNnbEvent,
   UNKNOWNTimeAverageAck,
   UNKNOWNTimeAverageRecovery,
   CRITICALTimeScheduled,
   CRITICALnbEvent,
   CRITICALTimeAverageAck,
   CRITICALTimeAverageRecovery,
   UNDETERMINEDTimeScheduled,
   MaintenanceTime
FROM
   log_archive_service
WHERE
   service_id = (
       SELECT
           service_id
       FROM
           index_data
       WHERE
           service_description = "可用率_新核心資料庫"
   );

----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
可用率演算法：
OKTimeScheduled:   26101
CRITICALTimeScheduled:   60
UNDETERMINEDTimeScheduled: 60239
total:   86400

ok%   = 26101/86400*100=30.20949074           =OKTimeScheduled/total*100
未確定%   = 60239/86400=69.72106481           =UNDETERMINEDTimeScheduled/total*100
critical%   =60/86400=0.069444444             =CRITICALTimeScheduled/total*100
ok(%)   =(1-60/26101)*100=99.77012375         =(1-CRITICALTimeScheduled/OKTimeScheduled)*100

可用率維護操作:

1、由於監控平臺本身的原因導致缺少某一天的監控資料，則需要根據實際情況增加1條記錄，否則對可用率有很大影響。

select
from_unixtime(date_start,'%Y-%m-%d %H:%i:%S') date_start2,
from_unixtime(date_end,'%Y-%m-%d %H:%i:%S') date_end2,
log_id,
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
from log_archive_service
where service_id=(select service_id
                  from index_data
                  where service_description="可用率_新核心資料庫"
                 );

缺少一天的資料，則會產生undetermined 1天：
| 2014-04-09 00:00:00 | 2014-04-10 00:00:00 | 26939 |      59 |        768 |           86400
| 2014-04-11 00:00:00 | 2014-04-12 00:00:00 | 28766 |      59 |        768 |           86400

補2014-04-10一條記錄：
insert into log_archive_service(
host_id,
service_id,
OKTimeScheduled,
OKnbEvent,
OKTimeAverageAck,
OKTimeAverageRecovery,
WARNINGTimeScheduled,
WARNINGnbEvent,
WARNINGTimeAverageAck,
WARNINGTimeAverageRecovery,
UNKNOWNTimeScheduled,
UNKNOWNnbEvent,
UNKNOWNTimeAverageAck,
UNKNOWNTimeAverageRecovery,
CRITICALTimeScheduled,
CRITICALnbEvent,
CRITICALTimeAverageAck,
CRITICALTimeAverageRecovery,
UNDETERMINEDTimeScheduled,
MaintenanceTime,
date_start,
date_end
)
values
(
59,
768,
86400,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
unix_timestamp('2014-04-10 00:00:00'),
unix_timestamp('2014-04-11 00:00:00')
);

2、由於誤報，需要對可用率的資料進行調整

UPDATE log_archive_service
SET OKTimeScheduled = 86400,
CRITICALTimeScheduled = 0,
CRITICALnbEvent = 0,
OKnbEvent = 0
WHERE
   from_unixtime(date_start, '%Y-%m-%d') >= "2014-03-01"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-03-31"
AND service_id = (
   SELECT
       service_id
   FROM
       index_data
   WHERE
       service_description = "可用率_核心資料庫"
);
===============================================================================

UPDATE log_archive_service
SET OKTimeScheduled = 86400,
CRITICALTimeScheduled = 0,
CRITICALnbEvent = 0,
OKnbEvent = 0,
MaintenanceTime = 0
WHERE
   from_unixtime(date_start, '%Y-%m-%d') >= "2014-06-25"
AND from_unixtime(date_start, '%Y-%m-%d') <= "2014-06-26"
AND service_id = (
   SELECT
       service_id
   FROM
       index_data
   WHERE
       service_description = "可用率_核心資料庫"
);

=================================================================================

Nagios資料提取和維護

相關文章