Nagios failover setup
Once you have Nagios configured, you can set up failover so that if the Nagios master goes offline, a standby slave enables notifications and active checks. Here are the setup notes and a custom script I wrote in Python to achieve this.
The following command checks whether Nagios is running locally:
[root@scrappy nagios]# /usr/local/nagios/libexec/check_nagios -F /usr/local/nagios/var/status.dat -e 1 -C '/usr/local/nagios/bin/nagios -d /usr/local/nagios/etc/nagios.cfg'
NAGIOS OK: 1 process, status log updated 5 seconds ago
Now add the following command to the NRPE config (nrpe.cfg) on the remote machine (slave/master) so that its Nagios process can be verified remotely. You must restart NRPE on the machine where it is installed so that it rereads nrpe.cfg.
command[check_nagios_failover]=/usr/local/nagios/libexec/check_nagios -F /usr/local/nagios/var/status.dat -e 1 -C '/usr/local/nagios/bin/nagios -d /usr/local/nagios/etc/nagios.cfg'
Test the remote NRPE command:
[root@scrappy nagios]# /usr/local/nagios/libexec/check_nrpe -H james -c check_nagios_failover
NAGIOS OK: 15 processes, status log updated 0 seconds ago
Modify nagios.cfg on the Nagios slave as follows, then restart Nagios:
execute_service_checks=0
enable_notifications=0
check_external_commands=1
Now set up a crontab entry on the slave to check for master failure every minute:
[root@scrappy etc]# crontab -l
* * * * * nagios /usr/local/nagios/set_slave_status.py > /dev/null
Create the script below and change the master's host name or IP address (the `master` variable) to match your environment.
#!/usr/bin/python
import os
import commands
import sys
# --- Configuration: adjust these values for your environment ---
# Host name (or IP) of the Nagios master that is queried through NRPE.
master = 'james'
# Host name of this standby machine; only used in status messages.
slave = 'scrappy'
# File that remembers the failover state last written by this script.
tmp_file = '/tmp/nagios-failover-state.txt'
# Nagios external command file (nagios.cfg must set check_external_commands=1).
commandfile = '/usr/local/nagios/var/rw/nagios.cmd'
# Current Unix time as a string; used as the timestamp of external commands.
now = commands.getoutput('date +%s')
# Recipient of the failover notification mails.
email = 'email@domain.com'

# Ask the master, via NRPE, whether its Nagios process is running.
out = commands.getoutput('/usr/local/nagios/libexec/check_nrpe -H ' + master + ' -c check_nagios_failover')
# Index of 'OK' in the check output. str.find() returns -1 when 'OK' is
# absent, which means the master did not report a healthy Nagios process.
master_return_val = out.find('OK')
#store state information
def failover_save_state(x):
    """Write the failover state string `x` to `tmp_file`, replacing its content.

    The file is closed even if the write raises.
    """
    with open(tmp_file, 'w') as fh:
        fh.write(x)
def sync_nagios_files():
    """Copy the master's Nagios object configuration to this host and reload Nagios.

    Runs rsync against `master` for /usr/local/nagios/etc/objects/ and, only if
    rsync succeeded, reloads the local Nagios through its init script. When
    rsync fails (for example because the master host is unreachable) a warning
    is written to stderr and the reload is skipped, so the success message is
    never printed for a sync that did not happen.
    """
    rsync_status = os.system('rsync -av ' + master + ':/usr/local/nagios/etc/objects/* /usr/local/nagios/etc/objects/')
    if rsync_status != 0:
        sys.stderr.write("rsync from master (" + master + ") failed; Nagios not reloaded.\n")
        return
    os.system('/etc/init.d/nagios reload')
    print("Master conf files synced with Slave.")
#read current state information
# A missing, unreadable or empty state file (e.g. on the very first run) is
# treated as an unknown state, so the branch below always brings the slave
# into the state that matches the master's current health.
try:
    with open(tmp_file, 'r') as fh:
        current_state = fh.readline().strip()
except IOError:
    current_state = ''

#enable or disable checks for hosts
#print current_state
# master_return_val is the position of 'OK' in the NRPE check output;
# a negative value means 'OK' was not found, i.e. the master is down.
if master_return_val < 0:
    if current_state == 'enabled':
        print("Nagios slave (" + slave + ") is active. Nagios master (" + master + ") is down. No state change")
    else:
        # Take over: switch on notifications and active checks on the slave
        # through the Nagios external command file.
        for external_command in ('ENABLE_NOTIFICATIONS',
                                 'START_EXECUTING_HOST_CHECKS',
                                 'START_EXECUTING_SVC_CHECKS'):
            os.system('/usr/bin/printf "[%lu] ' + external_command + '\n" ' + now + ' > ' + commandfile)
        os.system('echo elvis has left the building | mail -s "[Nagios] Master Down! Slave Enabled." ' + email)
        print("Nagios Failover enabled")
        failover_save_state('enabled')
        # NOTE(review): the original indentation was lost; the sync is assumed
        # to belong to the state-change branch rather than to every run.
        sync_nagios_files()
else:
    if current_state == 'disabled':
        print("Nagios master (" + master + ") is enabled. No state change.")
    else:
        # Stand down: the master is healthy again, so the slave stops
        # notifying and stops executing active checks.
        for external_command in ('DISABLE_NOTIFICATIONS',
                                 'STOP_EXECUTING_HOST_CHECKS',
                                 'STOP_EXECUTING_SVC_CHECKS'):
            os.system('/usr/bin/printf "[%lu] ' + external_command + '\n" ' + now + ' > ' + commandfile)
        os.system('echo We are all out of donuts. | mail -s "[Nagios] Master Restored. Slave Disabled." ' + email)
        print("Nagios master restored")
        failover_save_state('disabled')
        sync_nagios_files()
Make the script executable and owned by the nagios user:
[root@scrappy nagios]# chown nagios.nagios /usr/local/nagios/etc/set_slave_status.py
[root@scrappy etc]# chmod 755 set_slave_status.py
