#!/usr/bin/env python
#
# Nagios plug-in that reports how far Slony replication is lagging.
#
# The script reads st_lag_num_events from the _slony_erp.sl_status view,
# compares it with the --warning and --critical thresholds, prints a single
# status line and exits with the matching Nagios return value (see ERRORS).
#
# The syntax used here is valid on both Python 2.6+ and Python 3: print() is
# always called with exactly one argument and exceptions are bound with "as".
#

import getopt
import signal
import string  # unused here; kept because other tooling may rely on it
import sys
import time  # unused here; kept because other tooling may rely on it

try:
    import psycopg2
except ImportError:
    # Reported as UNKNOWN from main() instead of dying with a traceback,
    # whose exit status (1) Nagios would interpret as WARNING.
    psycopg2 = None

# Database Parameters
# NOTE(review): credentials are hard-coded; consider moving them to a
# configuration file or the environment.
db_host = 'localhost'
db_name = 'marian'
db_user = 'marian'
db_password = 'marian'

# Miscellaneous variables
PROGNAME = "check_slony_lag"
VERSION = "1.0.0"
opt_timeout = 10
opt_warning = -1     # -1 means "not supplied on the command line"
opt_critical = -1    # -1 means "not supplied on the command line"
opt_debug = False

# Dictionary of Nagios Plug-in return-values
ERRORS = {
    'OK': 0,
    'WARNING': 1,
    'CRITICAL': 2,
    'UNKNOWN': 3,
    'DEPENDENT': 4,
}


# function: debug_print
# ============================================================================
def debug_print(txt):
    """Print txt only when debugging output (--debug) is enabled."""
    if opt_debug:
        print(txt)


# Options that have to be handled by default:
# --help, --version, --timeout, --warning, --critical
def usage():
    """Print the command-line help text."""
    print("%s: [--help] [--version] [--timeout=n] --warning=n --critical=n" % PROGNAME)
    print("")
    print(" -h, --help This help message")
    print(" -d, --debug Enable debugging output")
    print(" -v, --version Version of this program")
    print(" -t, --timeout= Max amount of time to execute")
    print(" -w, --warning= (required) Warning number of lagged events")
    print(" -c, --critical= (required) Critical number of lagged events")
    print("")
    print("Example:")
    print("")
    print(" %s --timeout=20 --warning=80 --critical=90" % PROGNAME)
    print("")


def handler(signum, frame):
    """SIGALRM handler: report the timeout and exit with UNKNOWN."""
    print("%s UNKNOWN: Timed out after %d seconds" % (PROGNAME, opt_timeout))
    sys.exit(ERRORS['UNKNOWN'])


def _int_option(opt, val):
    """Convert an option value to int, exiting with UNKNOWN if it is not one."""
    try:
        return int(val)
    except ValueError:
        print("Error: option '%s' requires an integer value, got '%s'" % (opt, val))
        sys.exit(ERRORS['UNKNOWN'])


def parse_options(argv):
    """Parse the command-line arguments in argv (without the program name).

    Returns a (timeout, warning, critical, debug) tuple. Any problem with the
    arguments, as well as --help and --version, terminates the program with
    the UNKNOWN return value via SystemExit.
    """
    timeout = opt_timeout
    warning = opt_warning
    critical = opt_critical
    debug = opt_debug

    try:
        opts, _args = getopt.getopt(
            argv, "hvdt:w:c:",
            ["help", "version", "debug", "timeout=", "warning=", "critical="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(ERRORS['UNKNOWN'])

    for opt, val in opts:
        if opt in ("-v", "--version"):
            print("%s v%s" % (PROGNAME, VERSION))
            # Stop here; otherwise the missing-threshold error would follow.
            sys.exit(ERRORS['UNKNOWN'])
        elif opt in ("-t", "--timeout"):
            timeout = _int_option(opt, val)
        elif opt in ("-w", "--warning"):
            warning = _int_option(opt, val)
        elif opt in ("-c", "--critical"):
            critical = _int_option(opt, val)
        elif opt in ("-d", "--debug"):
            debug = True
        elif opt in ("-h", "--help"):
            usage()
            sys.exit(ERRORS['UNKNOWN'])

    if critical == -1 or warning == -1:
        print("Error: Make sure you specify a '--warning' AND '--critical'")
        sys.exit(ERRORS['UNKNOWN'])

    if critical < warning:
        print("Error: The 'warning' value should be less than 'critical' value")
        sys.exit(ERRORS['UNKNOWN'])

    return timeout, warning, critical, debug


def evaluate_lag(lag, warning, critical):
    """Map a lag event count onto a Nagios result.

    Returns a (return_value, status_line) tuple: CRITICAL when lag is at or
    above critical, WARNING when it is at or above warning, OK otherwise.
    """
    if lag >= critical:
        return (ERRORS['CRITICAL'],
                "%s CRITICAL: Replication is lagged by %d events!" % (PROGNAME, lag))
    if lag >= warning:
        return (ERRORS['WARNING'],
                "%s WARNING: %d events waiting to be replicated." % (PROGNAME, lag))

    if lag == 0:
        message = "%s OK: No replicated events in queue." % (PROGNAME,)
    elif lag == 1:
        message = "%s OK: Replication is only lagged by 1 event." % (PROGNAME,)
    else:
        message = "%s OK: Replication is only lagged by %d events." % (PROGNAME, lag)
    return ERRORS['OK'], message


def fetch_lag():
    """Return st_lag_num_events from the first sl_status row.

    Returns 0 when the query yields no row or a NULL value. Database errors
    (psycopg2.Error) propagate to the caller.
    """
    db = psycopg2.connect("user=" + db_user + " password=" + db_password
                          + " dbname=" + db_name + " host=" + db_host)
    try:
        db_cursor = db.cursor()
        # This is hard-coded for the _slony_erp schema
        # NOTE(review): only the first row returned is inspected; if
        # sl_status can hold several rows, confirm that this is intended.
        db_cursor.execute("select st_lag_num_events from _slony_erp.sl_status")
        db_row = db_cursor.fetchone()
    finally:
        db.close()

    debug_print("db_row: %s" % (db_row,))
    if db_row is None or db_row[0] is None:
        return 0
    return db_row[0]


def main(argv=None):
    """Run the check and return the Nagios return value.

    argv defaults to sys.argv[1:]. The status line is printed to stdout.
    """
    global opt_timeout, opt_warning, opt_critical, opt_debug

    if argv is None:
        argv = sys.argv[1:]
    opt_timeout, opt_warning, opt_critical, opt_debug = parse_options(argv)

    if psycopg2 is None:
        print("%s UNKNOWN: The psycopg2 module is not installed" % (PROGNAME,))
        return ERRORS['UNKNOWN']

    # Set the signal handler and an alarm of opt_timeout seconds
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(opt_timeout)
    try:
        st_lag_num_events = fetch_lag()
    except psycopg2.Error as err:
        # Covers connection failures as well as query failures. The error
        # text is collapsed onto one line so the status stays a single line.
        returnVal = ERRORS['CRITICAL']
        returnStr = "%s CRITICAL: Database error: %s" % (
            PROGNAME, " ".join(str(err).split()))
    else:
        debug_print("st_lag_num_events: %d" % (st_lag_num_events,))
        returnVal, returnStr = evaluate_lag(
            st_lag_num_events, opt_warning, opt_critical)
    finally:
        # The database work is over; cancel the pending alarm.
        signal.alarm(0)

    # Print the status of this check
    print(returnStr)
    return returnVal


if __name__ == "__main__":
    # Exit with our return value
    sys.exit(main())