Dell PowerEdge 2950 Perc 5/i RAID monitoring Nagios

Проверяем какой RAID установлен на сервере

# lspci -v | grep RAID
02:0e.0 RAID bus controller: Dell PowerEdge Expandable RAID controller 5
        Subsystem: Dell PERC 5/i Integrated RAID Controller

Мониторить Perc 5/i будем с помощью megacli

# megacli -V
  MegaCLI SAS RAID Management Tool  Ver 8.05.06 Aug 28, 2012
  (c)Copyright 2011, LSI Corporation, All Rights Reserved.
  Exit Code: 0x00

Скачиваем и распаковываем архив 8.05.06_MegaCLI.zip

# wget http://www.lsi.com/downloads/Public/MegaRAID%20Common%20Files/8.05.06_MegaCLI.zip
# unzip 8.05.06_MegaCLI.zip
# cd MegaCli_Linux 
# rpm -ivh MegaCli-8.05.06-1.noarch.rpm

Пакет установлен

# rpm -qa | grep -i megacli
MegaCli-8.05.06-1.noarch
# yum info MegaCli
Installed Packages
Name        : MegaCli
Arch        : noarch
Version     : 8.05.06
Release     : 1
Size        : 5.4 M
Repo        : installed
Summary     : MegaCli SAS RAID Management Utility.
License     : LSI Logic Corporation
Description : MegaCli is used to manage SAS RAID controllers.

Список файлов, принадлежащих данному пакету MegaCli-8.05.06-1.noarch

# rpm -ql MegaCli-8.05.06-1.noarch
/opt/MegaRAID/MegaCli/MegaCli
/opt/MegaRAID/MegaCli/MegaCli64
/opt/MegaRAID/MegaCli/libstorelibir-2.so.13.05-0

Создадим симлинк для удобной работы с утилитой MegaCli64

# ln -sf /opt/MegaRAID/MegaCli/MegaCli64 /usr/bin/MegaCli

Посмотреть полную информацию о контроллере

# MegaCli -AdpAllInfo -aALL
                 Capabilities
                ================
RAID Level Supported             : RAID0, RAID1, RAID5, RAID00, RAID10, RAID50, SRL 3 supported
Supported Drives                 : SAS, SATA
Allowed Mixing:
Mix in Enclosure Allowed

                 Device Present
                ================
Virtual Drives    : 1
  Degraded        : 0
  Offline         : 0
Physical Devices  : 7
  Disks           : 6
  Critical Disks  : 0
  Failed Disks    : 0

Частичная информация о контроллере

# MegaCli -LDInfo -LALL -aALL
Adapter 0 -- Virtual Drive Information:
Virtual Drive: 0 (Target Id: 0)
Name                :RAID5
RAID Level          : Primary-5, Secondary-0, RAID Level Qualifier-3
Size                : 9.093 TB
Parity Size         : 1.818 TB
State               : Optimal
Strip Size          : 64 KB
Number Of Drives    : 6
Span Depth          : 1
Default Cache Policy: WriteBack, ReadAhead, Direct, No Write Cache if Bad BBU
Current Cache Policy: WriteThrough, ReadAhead, Direct, No Write Cache if Bad BBU
Default Access Policy: Read/Write
Current Access Policy: Read/Write
Disk Cache Policy   : Disk's Default
Encryption Type     : None
Is VD Cached: No

Список физических дисков в системе и их статус

# MegaCli -PDList -aALL
# MegaCli -PDList -aALL | grep state
Firmware state: Online, Spun Up
Firmware state: Online, Spun Up
Firmware state: Online, Spun Up
Firmware state: Online, Spun Up
Firmware state: Online, Spun Up
Firmware state: Online, Spun Up

Небольшой скрипт для проверки состояния дисков

# cd /usr/lib64/nagios/plugins
# vi analysis.awk

# analysis.awk - summarize MegaCli physical-drive output, one line per drive.
#
# Usage: MegaCli -PDList -aALL | awk -f analysis.awk
#
# A "Device Id" line opens a new drive record; the "Firmware state" and
# "Inquiry" lines that follow are stored under the same record index.

# Third field of the "Device Id" line is used as the drive number.
/Device Id/      { counter += 1; device[counter] = $3 }

# Third field only, i.e. the first word of the state including its comma
# (e.g. "Online," from "Firmware state: Online, Spun Up").
/Firmware state/ { state_drive[counter] = $3 }

# Fields 3-6 of the Inquiry line, joined by single spaces, name the drive.
# The separators must be plain ASCII double quotes; typographic quotes are
# a syntax error in AWK.
/Inquiry/        { name_drive[counter] = $3 " " $4 " " $5 " " $6 }

END {
    for (i = 1; i <= counter; i += 1)
        printf("Device %02d (%s) status is: %s \n", device[i], name_drive[i], state_drive[i])
}

Проверка

# MegaCli -PDList -aALL | awk -f analysis.awk
Device 00 (ATA WDC WD000000-0000001 WD-0000001) status is: Online,
Device 01 (ATA WDC WD000000-0000002 WD-0000002) status is: Online,
Device 02 (ATA WDC WD000000-0000003 WD-0000003) status is: Online,
Device 03 (ATA WDC WD000000-0000004 WD-0000004) status is: Online,
Device 04 (ATA WDC WD000000-0000005 WD-0000005) status is: Online,
Device 05 (ATA WDC WD000000-0000006 WD-0000006) status is: Online,

Создадим проверку RAID массива при помощи вот этого скрипта check_perc5i.pl

Сделаем файл исполняемым и перенесем в папку

/usr/lib/nagios/plugins для 32-битных систем
/usr/lib64/nagios/plugins для 64-битных систем
# chmod +x check_perc5i.pl
# mv check_perc5i.pl /usr/lib64/nagios/plugins

Проверим скрипт

# ./check_perc5i.pl
OK - VirtualDrives=1, Degraded=0, Offline=0, PhysicalDevices=7, Disks=6,
CriticalDisks=0, FailedDisks=0, MemoryCorrectableErrors=0, MemoryUncorrectableErrors=0

Создадим симлинк для быстрой проверки состояния массива

# ln -sf /usr/lib64/nagios/plugins/check_perc5i.pl /usr/bin/checkraid
# checkraid
OK - VirtualDrives=1, Degraded=0, Offline=0, PhysicalDevices=7, Disks=6,
CriticalDisks=0, FailedDisks=0, MemoryCorrectableErrors=0, MemoryUncorrectableErrors=0

Добавим в Nagios проверку RAID при помощи nrpe и вот этого скрипта check_megacli

#!/bin/sh
#
# check_megacli - Nagios/NRPE plugin that reports MegaRAID (PERC) array health.
#
# Runs `MegaCli -AdpAllInfo -aAll -NoLog` once and inspects its counters:
#   Degraded       >= 1  -> CRITICAL  "RAID CRITICAL - <matching line(s)>"
#   Failed Disks   >= 1  -> WARNING   "DISK FAILURE - <matching line(s)>"
#   Critical Disks >= 1  -> WARNING   "DISK CRITICAL - <matching line(s)>"
#   otherwise            -> OK        "RAID OK: Raid is optimal"
# If the MegaCli output contains no "Degraded" counter at all (tool missing,
# not permitted to run, no adapter), the result is UNKNOWN rather than a
# false "RAID OK".

# Defaults so the exit codes are defined even when utils.sh is unavailable;
# a utils.sh found below may redefine them.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

# Load the Nagios plugin helpers from the 64-bit or 32-bit plugin directory.
# The readability test avoids sourcing a missing file, which would leave the
# STATE_* variables unset in the original form of this script.
for utils_file in /usr/lib64/nagios/plugins/utils.sh /usr/lib/nagios/plugins/utils.sh; do
    if [ -r "$utils_file" ]; then
        # shellcheck source=/dev/null
        . "$utils_file"
        break
    fi
done

# Query the controller(s), print one status line, return the Nagios state.
main() {
    # stderr is dropped on purpose so only the status line below is printed.
    adp_info=$(MegaCli -AdpAllInfo -aAll -NoLog 2>/dev/null)

    # Without a "Degraded" counter there is nothing to judge the array by.
    if ! printf '%s\n' "$adp_info" | grep -q "Degraded *:"; then
        echo "RAID UNKNOWN - MegaCli returned no adapter status (not installed or insufficient privileges?)"
        return "$STATE_UNKNOWN"
    fi

    # Each variable holds the matching counter line(s) when the count is non-zero.
    degraded=$(printf '%s\n' "$adp_info" | grep "Degraded *: [1-9]")
    failed=$(printf '%s\n' "$adp_info" | grep "Failed Disks *: [1-9]")
    critical=$(printf '%s\n' "$adp_info" | grep "Critical Disks *: [1-9]")

    if [ -n "$degraded" ]; then
        echo "RAID CRITICAL - $degraded"
        return "$STATE_CRITICAL"
    elif [ -n "$failed" ]; then
        echo "DISK FAILURE - $failed"
        return "$STATE_WARNING"
    elif [ -n "$critical" ]; then
        echo "DISK CRITICAL - $critical"
        return "$STATE_WARNING"
    fi

    echo "RAID OK: Raid is optimal"
    return "$STATE_OK"
}

# Must stay the last command: its return value becomes the plugin's exit status.
main "$@"

Сделаем файл исполняемым и перенесем в папку

# chmod +x check_megacli
# mv check_megacli /usr/lib64/nagios/plugins

Проверим скрипт

# /usr/lib64/nagios/plugins/check_megacli
RAID OK: Raid is optimal

Добавим на хост машине в /etc/nagios/nrpe.cfg

command[check_megacli]=/usr/lib64/nagios/plugins/check_megacli

Перестартуем nrpe

# service nrpe restart
Shutting down Nagios NRPE daemon (nrpe):                   [  OK  ]
Starting Nagios NRPE daemon (nrpe):                        [  OK  ]

На сервере Nagios в /etc/nagios/hosts/TEST.cfg пропишем проверку сервиса

# Service object for host TEST: runs the NRPE command "check_megacli"
# (defined in nrpe.cfg on the monitored host) under the description "RAID".
define service{
               # Inherit remaining settings from the generic-service template.
               use                           generic-service
               host_name                     TEST
               service_description           RAID
               is_volatile                   0
               check_period                  24x7
               # Up to 3 check attempts, with check and retry intervals of 1.
               max_check_attempts            3
               normal_check_interval         1
               retry_check_interval          1
               contact_groups                admins
               notification_interval         120
               notification_period           24x7
               # c,r = notify on CRITICAL and on RECOVERY only.
               # NOTE(review): check_megacli exits with WARNING for its
               # "DISK FAILURE" and "DISK CRITICAL" results; with "c,r" those
               # states would not send notifications - confirm this is intended.
               notification_options          c,r
               # check_nrpe is invoked with the remote command name as its argument.
               check_command                 check_nrpe!check_megacli
}

Перестартуем Nagios

# service nagios reload
nagios (pid 19539) is running...
Reloading nagios:                                          [  OK  ]

Результат