  1. Long post. I'm posting this mostly for information: this is what we did to collect everything through SNMP rather than open up additional ports such as SSH. If you have ideas for making it better, I'm all ears! Nothing fancy: I modified the code from the real-world example that snmp pass_persist gives on its page. The main script loads its settings from a YAML config file, then calls each Python module, passing the snmp pass_persist object (pp) and the config object to them. Each module runs a quick test to see whether it actually needs to run; for example, if the machine doesn't run DNS, then dns_stats doesn't need to run. The File Monitor and DNS Stats modules are capable of monitoring multiple instances. Things that I still want: -- The file monitor OIDs change when a file is added or removed, so maybe we should set LM to do autodiscovery based on a key or lookup instead. -- The same applies to software RAIDs; they may need a state file to cache things in or, again, key/lookups instead. -- dns_stats calls an external bash script to do the parsing because my Python-fu is very weak. -- Clean up / insert comments everywhere! snmpd.conf (add this line to the bottom) pass_persist . 
/usr/share/snmp/extensions/lm/ config.yml: (/usr/share/snmp/extensions/lm/) --- dns: instances: - name: 'Main DNS' config: '/etc/named.conf' stats: '/var/named/data/named_stats.txt' - name: 'Special voodoo DNS' config: '/etc/named.voodoo.conf' stats: '/var/named.voodoo/data/named_stats.txt' file_monitor: directories: - '/var/opt/logs/servers' - '/var/opt/logs/network' Main script: (/usr/share/snmp/extensions/lm/ #!/usr/bin/python -u # -*- coding:Utf-8 -*- # Option -u is needed for communication with snmpd import sys sys.path.append('/usr/share/snmp/extensions/lm/libraries') import snmp_passpersist as snmp import os, re, socket, syslog, time, errno import LM_Config as cfg import LM_Software_Raid import LM_FileMonitor import LM_dns_stats # Global vars pp = None config = cfg.loadConfig() VER = "Logic Monitor SNMP Helper v0.1" POLLING_INTERVAL = 60 MAX_RETRY = 5 OID_BASE = "." def update_data(): pp.add_str('0.0',VER) ## This is needed because the variable isn't getting reset to 0 inside the modules LM_Software_Raid.doUpdate(pp) LM_FileMonitor.doUpdate(pp,config) LM_dns_stats.doUpdate(pp,config) def main(): syslog.openlog(sys.argv[0],syslog.LOG_PID) retry_timestamp=int(time.time()) retry_counter=MAX_RETRY while retry_counter>0: try: global pp syslog.syslog(syslog.LOG_INFO,"Starting Logic Monitor SNMP Helper...") # Load helpers pp=snmp.PassPersist(OID_BASE) pp.start(update_data,POLLING_INTERVAL) # Should'nt return (except if updater thread has died) except KeyboardInterrupt: print "Exiting on user request." 
sys.exit(0) except IOError, e: if e.errno == errno.EPIPE: syslog.syslog(syslog.LOG_INFO,"Snmpd had close the pipe, exiting...") sys.exit(0) else: syslog.syslog(syslog.LOG_WARNING,"Updater thread as died: IOError: %s" % (e)) except Exception, e: syslog.syslog(syslog.LOG_WARNING,"Main thread as died: %s: %s" % (e.__class__.__name__, e)) else: syslog.syslog(syslog.LOG_WARNING,"Updater thread as died: %s" % (pp.error)) syslog.syslog(syslog.LOG_WARNING,"Restarting monitoring in 15 sec...") time.sleep(15) # Errors frequency detection now=int(time.time()) if (now - 3600) > retry_timestamp: # If the previous error is older than 1H retry_counter=MAX_RETRY # Reset the counter else: retry_counter-=1 # Else countdown retry_timestamp=now syslog.syslog(syslog.LOG_ERR,"Too many retry, abording... Please check if xen is running !") sys.exit(1) if __name__ == "__main__": main() (/usr/share/snmp/extensions/lm/libraries) #!/usr/bin/python import sys sys.path.append('/usr/share/snmp/extensions/lm/libraries') import yaml file = '/usr/share/snmp/extensions/lm/config.yml' def loadConfig(): try: with open(file) as f: return yaml.load(f) except: print(file + " doesn't exist") (/usr/share/snmp/extensions/lm/libraries) #!/usr/bin/python import sys sys.path.append('/usr/share/snmp/extensions/lm/libraries') import re import mdstat import json import argparse import LM_Config as cfg md = mdstat.parse() def testModule(): config = cfg.loadConfig() if len(md['devices']) > 0: return True else: return False def to_bool(*args): try: if args[1] == "rev": if args[0] == True or args[0] == None: return 1 elif args[0] == False: return 0 except: if args[0] == True or args[0] == None: return 0 elif args[0] == False: return 1 else: return s def doUpdate(pp): if testModule(): # headers for each section in the for loop(s) pp.add_str('1.0',"md name") pp.add_str('1.1',"md status") # 1.2 is in the next loop pp.add_str('1.3',"md raid type") pp.add_str('1.4',"Disks in software raids") pp.add_str('1.5',"Disks Faulty 
Status") pp.add_str('1.6',"DiskX belongs to mdX") i = 0 for mdCounter,mdValue in enumerate(sorted(md['devices'])): # 1.0.x md names pp.add_str('1.0.' + str(mdCounter),mdValue) # 1.1.x md status pp.add_str('1.1.' + str(mdCounter),to_bool(md['devices'][mdValue]['active'])) # 1.2.x.0.0 md disk count pp.add_str('1.2.' + str(mdCounter) + '.0.0', mdValue + " disk count") # 1.2.x.1.0 md non faulted disk count pp.add_str('1.2.' + str(mdCounter) + '.1.0', mdValue + " non faulted disk count") # 1.3.x md type pp.add_str('1.3.' + str(mdCounter),md['devices'][mdValue]['personality']) fault = 0; count = 0 for dCounter,dValue in enumerate(md['devices'][mdValue]['disks']): pp.add_str('1.4.' + str(i),dValue) disk_fault=to_bool(md['devices'][mdValue]['disks'][dValue]['faulty'],"rev") fault = fault + to_bool(md['devices'][mdValue]['disks'][dValue]['faulty']) pp.add_str('1.5.' + str(i),disk_fault) pp.add_str('1.6.' + str(i), mdValue) i = i + 1 count = count + 1 # 1.2.x.0.x & 1.2.x.1.x pp.add_str('1.2.' + str(mdCounter) + '.0.1', count) pp.add_str('1.2.' 
+ str(mdCounter) + '.1.1', fault) (/usr/share/snmp/extensions/lm/libraries) #!/usr/bin/python import sys sys.path.append('/usr/share/snmp/extensions/lm/libraries') import os import time import glob from stat import * dirs = [] # Nice little function to convert True / None to 0 and False to 1 def to_bool(s): if s == True or s == None: return 0 elif s == False: return 1 else: return s def testModule(config): for dir in config['file_monitor']['directories']: if os.path.isdir(dir): dirs.append(dir) if len(dirs) > 0: return True else: return False def doUpdate(pp,config): # Test if we should run this module and return anything if testModule(config): x = 0 pp.add_str('2.0.0',"Directories") pp.add_str('2.0.1',dirs) for dir in dirs: os.chdir(dir) pp.add_str('2.2.0', "Last Check Time (epoch)") pp.add_str('2.2.1', time.time()) pp.add_str('2.2.3', "Files") pp.add_str('2.2.4', "Modification Time at last check (epoch)") pp.add_str('2.2.5', "Full Path") pp.add_str('2.2.6', "File size") pp.add_str('2.2.7', "Created (epoch)") pp.add_str('2.2.8', "Access Time (epoch)") for counter,value in enumerate(glob.glob("*")): statinfo = os.stat(value) pp.add_str('2.2.3.' + str(counter), value) pp.add_str('2.2.4.' + str(counter), str(statinfo[8])) pp.add_str('2.2.5.' + str(counter), dir + '/' + value) pp.add_str('2.2.6.' + str(counter), str(statinfo[6])) pp.add_str('2.2.7.' + str(counter), str(statinfo[9])) pp.add_str('2.2.8.' 
+ str(counter), str(statinfo[7])) x = x + 1 else: pp.add_str('2',"LM_FileMonitor module did not find any directories to monitor") (/usr/share/snmp/extensions/lm/libraries) #!/usr/bin/python import sys sys.path.append('/usr/share/snmp/extensions/lm/libraries') import os import re import subprocess dns_stats = [] dns_desc = [] def testModule(config): for instance in config['dns']['instances']: stats_file = instance['stats'] if os.path.exists(stats_file) and os.access(stats_file, os.R_OK): dns_stats.append(stats_file) dns_desc.append(instance['name']) if len(dns_stats) > 0: return True else: return False def doUpdate(pp,config): if testModule(config): i,dnsx = 0,0 for stats_file in dns_stats: if dnsx == len(config['dns']['instances']): dnsx = 0 pp.add_str('3.0.' + str(dnsx), str(dns_desc[dnsx])) pp.add_str('3.1.' + str(dnsx),"Stats from " + stats_file) script = "/usr/share/snmp/extensions/lm/helper_scripts/" incoming = subprocess.check_output([script, stats_file, 'incoming']) outgoing = subprocess.check_output([script, stats_file, 'outgoing']) resolver = subprocess.check_output([script, stats_file, 'resolver']) socket = subprocess.check_output([script, stats_file, 'socket']) ## Incoming i = 0 for s in re.split('\n',incoming): data = re.split(',', s) if len(data[0]) > 0 and data[1] > 0: pp.add_str('' + str(i), data[0]) pp.add_str('3.2.1.' + str(dnsx) + '.' + str(i), data[1]) i = i + 1 ## Outgoing i = 0 for s in re.split('\n',outgoing): data = re.split(',', s) if len(data[0]) > 0: pp.add_str('' + str(i), data[0]) pp.add_str('3.3.1.' + str(dnsx) + '.' + str(i), data[1]) i = i + 1 ## resolver i = 0 for s in re.split('\n',resolver): data = re.split(',', s) if len(data[0]) > 0: pp.add_str('' + str(i), data[0]) pp.add_str('3.4.1.' + str(dnsx) + '.' + str(i), data[1]) i = i + 1 ## socket i = 0 for s in re.split('\n',socket): data = re.split(',', s) if len(data[0]) > 0: pp.add_str('' + str(i), data[0]) pp.add_str('3.5.1.' + str(dnsx) + '.' 
+ str(i), data[1]) i = i + 1 dnsx = dnsx + 1 else: pp.add_str('3' + "LM_dns_stats module did not find any bind intances from the config file") (/usr/share/snmp/extensions/lm/helper_scripts/) #!/bin/bash file=${1} [ -x ${file} ] && exit 0 [ -f ${file} ] || exit 0 dnsNames=(A A6 AAAA ANY CNAME DNSKEY DS MX NAPTR NS PTR SOA SPF SRV TXT) resNames=('mismatch responses received' 'IPv4 queries sent' 'IPv4 responses received' 'NXDOMAIN received' 'SERVFAIL received' 'FORMERR received' 'query retries' 'query timeouts' 'queries with RTT < 10ms' 'queries with RTT 10-100ms' 'queries with RTT 100-500ms' 'queries with RTT 500-800ms' 'queries with RTT 800-1600ms' 'queries with RTT > 1600ms') sockNames=('UDP/IPv4 sockets opened' 'UDP/IPv4 sockets closed' 'UDP/IPv4 socket bind failures' 'UDP/IPv4 connections established' 'UDP/IPv4 recv errors' 'TCP/IPv4 sockets opened' 'TCP/IPv4 sockets closed' 'TCP/IPv4 socket bind failures' 'TCP/IPv4 connections established' 'TCP/IPv4 recv errors') now_epoch=$(date +%s) mtime_epoch=$(stat ${file} -c %W) function updateStats { [ -f ${file} ] && [ $((now_epoch-mtime_epoch)) -gt 300 ] && rm -f $file && rndc stats } function getStats { start=$1 end=$2 regx1="sed -n '/${start}/,/${end}/p'" data=$(cat ${file} | \ eval ${regx1} | \ egrep '[0-9]' | \ awk '{ print $1" "$2 }') while read value name; do names+=(${name}) values+=("${value}") done <<< "${data}" for n in $(eval echo \${${3}[@]}); do regx2="${n}" if [[ ! 
"${names[@]}" =~ "$regx2" ]]; then names+=(${n}) values+=("0") fi done x=0 for n in ${names[@]}; do echo "${names[$x]},${values[$x]}" x=$((x+1)) done } function inStats { getStats "Incoming Q" "Outgoing Q" "dnsNames" } function outStats { getStats "Outgoing Q" "^+" "dnsNames" } function resStats { for n in "${resNames[@]}"; do regexp=" ${n}$" name=$(echo ${n}) value=$(egrep "${regexp}" ${file} | \ sed -n 's/.* \([0-9]*\) \([A-Za-z].*\)/\1/p') [ ${#value} -eq 0 ] && value=0 echo "${name},${value}" done } function resSocket { for n in "${sockNames[@]}"; do regexp=" ${n}$" value=$(egrep "${regexp}" ${file} | \ sed -n 's/.* \([0-9]*\) \([A-Za-z].*\)/\1/p') [ ${#value} -eq 0 ] && value=0 echo "${n},${value}" done } # all returns get the timestamps.. echo "stats_epoch,${mtime_epoch}" echo "now_epoch,${now_epoch}" case ${2} in incoming) inStats;; outgoing) outStats;; resolver) resStats;; socket) resSocket;; *) inStats;; esac # Call createStats last.. if we destroy first then query the file, values will be small to 0 as the values start at time of stats creation. If we call at the end, the next polling will have data during the time period from poll to poll. updateStats exit 0