Just finished my first Python script. It’s a Nagios plugin that checks Dell hardware components via the omreport utility. It’s designed to be used client-side via NRPE (or check_by_ssh). Additional usage information can be found in the script’s docstrings as well as via the --help option. Some gotchas:
The OMSA services must be running before the plugin is used: run srv-admin.sh start on Redhat-based systems or /etc/init.d/dataeng start on Debian-based ones. The order in which the services start is crucial: the necessary device drivers must be loaded before the IPMI module is loaded.
#!/usr/bin/env python
import optparse
import subprocess
import sys
def check_storage():
    """Check Dell storage components (vdisks, pdisks, controller batteries).

    Queries omreport through parse_om() for every controller in the system,
    gathers the resulting component dicts from ALL controllers into a single
    list and hands that list to disp_results() exactly once, so the plugin
    emits one status line and one exit status covering every controller.

    omreport commands performed:
        omreport storage controller
        omreport storage vdisk controller=X
        omreport storage pdisk controller=X
        omreport storage battery controller=X
    """
    # Fields kept from each omreport section; identical for every
    # controller, so they are defined once outside the loop.
    vdisk_filter = ["Status", "Name", "State"]
    pdisk_filter = ["Status", "Name", "State", "FailurePredicted"]
    battery_filter = ["Status", "Name", "State"]

    # List of valid controller IDs (i.e. ['0', '1']).
    controllers = [x['ID'] for x in parse_om("storage controller", ["ID"])]

    # Accumulate across controllers rather than overwriting per iteration;
    # disp_results() may call sys.exit(), so it must only run after every
    # controller has been queried.
    components = []
    for controller in controllers:
        components += parse_om("storage vdisk controller=" + controller,
                               vdisk_filter)
        components += parse_om("storage pdisk controller=" + controller,
                               pdisk_filter)
        components += parse_om("storage battery controller=" + controller,
                               battery_filter)

    # Parse and display results for the whole system.
    disp_results(components)
def check_chassis(args):
    """Check Dell chassis components.

    Verifies that every user-specified component name in ``args`` is valid
    (case-insensitively). The special name "all" disables filtering so every
    chassis component is reported. The results of 'omreport chassis',
    gathered via parse_om(), are passed as a dict to disp_results().

    args -- sequence of component names taken from the command line.

    Writes an error to stderr and exits with status 1 when an argument is
    not a known chassis component.
    """
    valid_components = ('fans',
                        'intrusion',
                        'memory',
                        'powersupplies',
                        'processors',
                        'temperatures',
                        'voltages',
                        'hardwarelog',
                        'batteries')

    # Keep the caller's sequence untouched while it is being iterated; the
    # filter that is finally passed on lives in its own name.
    wanted = args
    for arg in args:
        name = arg.lower()
        if name == "all":
            # An empty filter makes parse_om() return every component.
            wanted = ""
        elif name not in valid_components:
            # sys.stderr.write() behaves the same on Python 2 and Python 3,
            # unlike the "print >>" statement.
            sys.stderr.write(
                '\nError: invalid chassis argument: %s' % arg + "\n")
            sys.exit(1)

    # parse_om() returns a list of dicts; chassis output fits in the first
    # one, in the form component:status.
    chas = parse_om("chassis", wanted)[0]
    disp_results(chas, chassis=True)
def _parse_omreport_text(text, suffix, filters):
    """Convert raw omreport output into a list of dicts (see parse_om())."""
    result = [{}]
    # All spaces are stripped so "Status : Ok" becomes "Status:Ok".
    for item in text.replace(' ', '').splitlines():
        # Filter out useless items such as titles, blank lines.
        if ":" not in item or "SEVERITY" in item:
            continue
        # Split on the first colon only; values may contain colons too.
        key, val = item.split(":", 1)
        # Reverse chassis output to match others, ["Ok:Fans"] to ["Fans:Ok"].
        if suffix == "chassis":
            key, val = val, key
        # Limit result to those specified in filters.
        if filters and key.lower() not in filters:
            continue
        # A repeated key marks the start of the next component's record.
        if key in result[-1]:
            result.append({})
        result[-1][key] = val
    return result


def parse_om(suffix, filters=""):
    """Return results from the omreport utility as a list of dicts.

    Runs /usr/sbin/omreport with the sub-command given in ``suffix`` (split
    on whitespace into arguments). Lines whose key does not match one of the
    optional ``filters`` (compared case-insensitively) are dropped; an empty
    ``filters`` keeps everything. Attempts to provide useful error output in
    instances where OMSA fails us.

    suffix  -- omreport sub-command, e.g. "storage pdisk controller=0".
    filters -- iterable of key names to keep.

    Exits with status 1 (after writing to stderr) when omreport cannot be
    executed or when no usable output was produced.
    NOTE(review): Nagios treats exit status 1 as WARNING; status 3 (UNKNOWN)
    may be more appropriate for these failures. Left unchanged here.
    """
    filters = [x.lower() for x in filters]
    cmd = ["/usr/sbin/omreport"] + suffix.split()
    try:
        # universal_newlines=True makes communicate() return text instead of
        # bytes on Python 3, so the string handling below works on both
        # Python 2 and Python 3.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                universal_newlines=True)
        data = proc.communicate()[0]
    except OSError:
        # sys.exc_info() instead of "except OSError, e" / "as e" keeps this
        # valid syntax on both Python 2 and Python 3.
        err = sys.exc_info()[1]
        sys.stderr.write("Error running '%s', %s" % (" ".join(cmd), err)
                         + "\n")
        sys.exit(1)

    result = _parse_omreport_text(data, suffix, filters)

    # Sometimes omreport returns zero output if omsa services aren't started.
    if not result[0]:
        sys.stderr.write('Error: "omreport %s" returned 0 output.' % suffix
                         + "\n")
        sys.stderr.write('Is OMSA running? "srvadmin-services.sh status".'
                         + "\n")
        sys.exit(1)
    return result
def _storage_message(component, template):
    """Format one storage component's name/state, noting predicted failure."""
    msg = template % (component['Name'], component['State'])
    if component.get('FailurePredicted') == 'Yes':
        msg += ", Failure Predicted"
    return msg


def disp_results(components, chassis=""):
    """Display component status results, total component count, & exit status.

    Sorts every component into one of three lists (success, warning,
    critical) based on its status. If any components are critical or warning
    then only those components are printed, otherwise all components are
    printed. A count summary is appended to the printed line.

    components -- when ``chassis`` is true: a dict mapping component name
                  to status. Otherwise: a list of dicts with the keys
                  'Status', 'Name', 'State' and optionally
                  'FailurePredicted'.
    chassis    -- any true value selects the chassis (dict) format.

    Exits with status 2 if anything is critical and 1 if anything is in
    warning; returns normally when everything is Ok.

    Controller batteries in state "Charging" are ignored as they clutter the
    Nagios status screen.
    """
    succ = []
    warn = []
    crit = []

    if chassis:
        for key in components:
            status = components[key]
            entry = key + ":" + status
            if status == "Ok":
                succ.append(entry)
            elif status == "Critical":
                crit.append(entry)
            else:
                warn.append(entry)
    else:
        # components is a list of dictionaries.
        for value in components:
            if value['Status'] == 'Ok':
                succ.append("%s: Ok" % value['Name'])
            elif value['Status'] == 'Critical':
                crit.append(_storage_message(value, "%s in state:'%s'"))
            else:
                # All remaining statuses: non-critical, non-recoverable, etc.
                # Skip when controller battery is in state charging.
                if value['Name'].startswith("Battery") and \
                        value['State'] == "Charging":
                    continue
                warn.append(_storage_message(value, "%s in state: '%s'"))

    count_suffix = " - [%s:Success, %s:Warning, %s:Critical]" % \
        (len(succ), len(warn), len(crit))

    # print(<single expression>) is valid on both Python 2 and Python 3.
    if crit:
        # Join both lists in one pass so the last critical entry and the
        # first warning entry are separated by ", " as well.
        print(", ".join(crit + warn) + count_suffix)
        sys.exit(2)
    elif warn:
        print(", ".join(warn) + count_suffix)
        sys.exit(1)
    else:
        print(", ".join(succ) + count_suffix)
class _LinePreservingFormatter(optparse.IndentedHelpFormatter):
    """Help formatter that keeps explicit newlines in descriptions.

    optparse normally re-wraps description text into one paragraph. This
    formatter wraps each newline-separated line on its own, so the usage
    examples can be listed one per line.
    """

    def format_description(self, description):
        # Imported locally so the block does not depend on a new
        # module-level import.
        import textwrap
        if not description:
            return ""
        indent = " " * self.current_indent
        width = max(self.width - self.current_indent, 11)
        lines = [textwrap.fill(line, width,
                               initial_indent=indent,
                               subsequent_indent=indent)
                 for line in description.splitlines()]
        return "\n".join(lines) + "\n"


def _run_check(check, *check_args):
    """Run one check function and return the exit status it asked for."""
    try:
        check(*check_args)
    except SystemExit:
        # sys.exc_info() keeps the syntax valid on Python 2 and Python 3.
        code = sys.exc_info()[1].code
        if code is None:
            return 0
        if isinstance(code, int):
            return code
        # sys.exit("message") semantics: show the message, status 1.
        sys.stderr.write("%s\n" % (code,))
        return 1
    return 0


def main():
    """Handle optparse usage and call the appropriate check function(s).

    Exits with the highest status requested by any check that was run
    (0 when every check passed). When neither --storage nor --chassis is
    given, the help text is printed and the exit status is 3 (Nagios
    UNKNOWN), so a mis-configured command is never reported as OK.
    --chassis without component arguments is rejected via parser.error().
    """
    parser = optparse.OptionParser(
        description="This Nagios plugin checks the health of Dell hardware. "
                    "Plugin requires omreport, part of Dell's OMSA package.",
        formatter=_LinePreservingFormatter())

    # One example per line; _LinePreservingFormatter keeps the newlines that
    # the stock optparse formatter would otherwise wrap away.
    examples = optparse.OptionGroup(parser, "Examples",
                                    "check_dell -s\n"
                                    "check_dell -c fans memory voltages\n"
                                    "check_dell -c all")

    arguments = optparse.OptionGroup(parser, "Valid chassis args",
                                     "all fans intrusion memory "
                                     "powersupplies processors "
                                     "temperatures voltages "
                                     "hardwarelog batteries")

    parser.add_option_group(examples)
    parser.add_option_group(arguments)

    parser.add_option('-s',
                      '--storage',
                      action='store_true',
                      default=False,
                      help='check virtual and physical disks')

    parser.add_option('-c',
                      '--chassis',
                      action='store_true',
                      default=False,
                      help='check specified chassis components')

    (opts, args) = parser.parse_args()

    if not opts.chassis and not opts.storage:
        parser.print_help()
        # Nothing was checked: report UNKNOWN rather than an implicit OK.
        sys.exit(3)
    if opts.chassis and len(args) < 1:
        parser.error('--chassis takes one or more arguments')

    # If the checks are run together the output will be two lines. Each
    # check may call sys.exit() itself, so the exit is intercepted to make
    # sure a storage problem cannot prevent the chassis check from running;
    # the worst status of the two is reported.
    status = 0
    if opts.storage:
        status = max(status, _run_check(check_storage))
    if opts.chassis:
        status = max(status, _run_check(check_chassis, args))
    sys.exit(status)
# Run the plugin only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
[...] OMSA provides an interface to view and administer hardware on a Dell server. OMSA offers both a Web interface and a CLI and uses the OpenIPMI standard; thus it requires that OpenIPMI be installed. Check out Dell’s site for mounds of mostly useless detail – Openmanage. The company I work for uses Dell servers almost exclusively. We have well over 10,000 PowerEdge servers deployed throughout several DCs. So we don’t have the luxury of manually auditing server health by peering at a server’s front panel; it must be automated. We use the Dell OMSA application on a growing percentage of our servers to monitor the state of Physical Disks, Virtual Disks, Memory, Controllers, Controller Batteries, etc. We monitor the status of these various components via Nagios by using the NRPE daemon and a custom check plugin that I wrote in Python. [...]
Great post!