quick cleanup of perflogserver.py
parent b37eb2fe74
commit 5c7f9b7dc4
1 changed file with 359 additions and 341 deletions
@@ -2,25 +2,36 @@
 #
 # (c)2011-2012 the Boeing Company
 #
-# perfmon.py - CORE server and node performace metrics logger and alarmer
-# server metrics: loadave1, 5, 15, mem, used cpu% of total, cpu1, cpu2, ..., cpun
-# node metrics: throughput, mem, cpu total, usr, sys, wait
-#
-import os, sys, time, re, optparse, signal, commands, pdb
-
+"""
+perflogserver.py - CORE server and node performace metrics logger and alarmer
+server metrics: loadave1, 5, 15, mem, used cpu% of total, cpu1, cpu2, ..., cpun
+node metrics: throughput, mem, cpu total, usr, sys, wait
+"""
+
+import commands
+import optparse
+import os
+import pdb
+import signal
+import sys
+import time
+
+
 def readfile(fname):
-    lines=[]
+    lines = []
     try:
         f = open(fname, "r")
-    except:
+    except IOError:
         if options.timestamp == True:
-            print str(time.time()),
-        print "ERROR: failed to open file %s\n" % fname
-    else :
-        lines = f.readlines()
-        f.close()
+            print str(time.time()),
+        print "ERROR: failed to open file %s\n" % fname
+    else:
+        lines = f.readlines()
+        f.close()
     return lines
 
+
 def numcpus():
     lines = readfile("/proc/stat")
     n = 0
@@ -30,35 +41,36 @@ def numcpus():
             n += 1
     return n
 
 
 def handler(signum, frame):
-    print "stop timestamp:", str(time.time()) + ", cyclecount=", cyclecount, ", caught signal", signum
+    print "stop timestamp:", str(
+        time.time()) + ", cyclecount=", cyclecount, ", caught signal", signum
    sys.exit(0)
 
 
 class ServerMetrics(object):
     def __init__(self):
-        self.smetrics = { "serverloadavg1" : 0.0,
-                          "serverloadavg5" : 0.0,
-                          "serverloadavg15" : 0.0,
-                          "serverusedmemory" : 0.0,
-                          "serverusedcputime" : 0.0,
-                          "processorusedcputime" : [] }
+        self.smetrics = {"serverloadavg1": 0.0,
+                         "serverloadavg5": 0.0,
+                         "serverloadavg15": 0.0,
+                         "serverusedmemory": 0.0,
+                         "serverusedcputime": 0.0,
+                         "processorusedcputime": []}
 
-    # set values from val = (nump, ldavg1, ldavg5, adavg15, mem, cpu, p1cpu, p2cpu...)
     def setvalues(self, val):
+        """
+        Set values from val = (nump, ldavg1, ldavg5, adavg15, mem, cpu, p1cpu, p2cpu...).
+        """
         self.smetrics["serverloadavg1"] = val[0]
         self.smetrics["serverloadavg5"] = val[1]
         self.smetrics["serverloadavg15"] = val[2]
         self.smetrics["serverusedmemory"] = val[4]
         self.smetrics["serverusedcputime"] = val[5]
-        #print self.smetrics.keys(), self.smetrics.values()
 
-        pcpu = []
-        for ind in range(5,len(val)):
-            pcpu.append(val[ind])
-        # print "[" + ",".join(map(lambda(x):str(round(x, 2)), pcpu)) +"]"
-
+        pcpu = []
+        for ind in range(5, len(val)):
+            pcpu.append(val[ind])
         self.smetrics["processorusedcputime"] = pcpu
-        #print self.smetrics.keys(), self.smetrics.values()
 
     def setvalue(self, key, val):
         self.smetrics[key] = val
@@ -70,457 +82,470 @@ class ServerMetrics(object):
         return self.smetrics.keys()
 
     def tocsv(self):
-        rv = "Server"
+        rv = "Server"
         for k in self.smetrics:
-            # print k, self.smetrics[k]
             if isinstance(self.smetrics[k], float):
                 rv += ", %.2f" % self.smetrics[k]
-            else:
+            else:
                 if isinstance(self.smetrics[k], list):
-                    rv += ", [" + \
-                        ", ".join(map(lambda(x):str(round(x, 2)), self.smetrics[k])) \
-                        + "]"
-                else:
-                    rv += ", " + str(self.smetrics[k])
-        return rv
+                    values = ", ".join(str(round(x, 2)) for x in self.smetrics[k])
+                    rv += ", [%s]" % values
+                else:
+                    rv += ", " + str(self.smetrics[k])
+        return rv
 
 
 def readserverthresholds(filename):
     if filename is None:
-        return
+        return
 
     lines = readfile(filename)
     for l in lines:
-        mval = l.strip().split('=')
-        #print "read line %s" % mval
-        if len(mval) > 1 :
-            thekey = mval[0].strip()
-            theval = mval[1].strip()
-            if thekey in serverthresholds.getkeys():
-                serverthresholds.setvalue(thekey, float(theval))
-                # print thekey," = %.2f" % float(theval)
-
-def checkserverthreshold(metricval):
-    # print out an alarm if a ServerMetrics value crosses threshold
+        mval = l.strip().split('=')
+        if len(mval) > 1:
+            thekey = mval[0].strip()
+            theval = mval[1].strip()
+            if thekey in serverthresholds.getkeys():
+                serverthresholds.setvalue(thekey, float(theval))
+
+
+def checkserverthreshold(metricval):
+    """
+    Print out an alarm if a ServerMetrics value crosses threshold.
+    """
     for key in serverthresholds.getkeys():
-        # print "checking threshold of key = ", key
-        if key == "processorusedcputime":
-            pcpus = metricval.getvalue(key)
-            # print key, pcpus, serverthresholds[key]
-            for ind in range(0, len(pcpus)):
-                # print ind, pcpus[ind]
-                if pcpus[ind] > serverthresholds.getvalue(key):
+        if key == "processorusedcputime":
+            pcpus = metricval.getvalue(key)
+            for ind in range(0, len(pcpus)):
+                if pcpus[ind] > serverthresholds.getvalue(key):
                     alarm = ["server", os.uname()[1], str(ind) + key,
-                        "%.2f" % pcpus[ind], ">", serverthresholds.getvalue(key)]
+                             "%.2f" % pcpus[ind], ">", serverthresholds.getvalue(key)]
                     if options.timestamp:
-                        print str(time.time()) + ",",
-                    print ", ".join(map(lambda(x):str(x), alarm))
-        else:
-            if metricval.getvalue(key) > serverthresholds.getvalue(key):
+                        print str(time.time()) + ",",
+                    print ", ".join(str(x) for x in alarm)
+        else:
+            if metricval.getvalue(key) > serverthresholds.getvalue(key):
                 alarm = ["server", os.uname()[1], key,
-                    "%.2f" % metricval.getvalue(key), ">", serverthresholds.getvalue(key)]
+                         "%.2f" % metricval.getvalue(key), ">", serverthresholds.getvalue(key)]
                 if options.timestamp:
-                    print str(time.time()) + ",",
-                print ", ".join(map(lambda(x):str(x), alarm))
+                    print str(time.time()) + ",",
+                print ", ".join(str(x) for x in alarm)
 
 
 def collectservercputimes():
-    # return cpu times in ticks of this server total and each processor 3*(1+#cpu) columns
-    # (user+nice, sys, idle) from each /proc/stat cpu lines assume columns are:
-    # cpu# user nice sys idle iowait irq softirq steal guest (man 5 proc)
+    """
+    Return cpu times in ticks of this server total and each processor 3*(1+#cpu) columns
+    (user+nice, sys, idle) from each /proc/stat cpu lines assume columns are:
+    cpu# user nice sys idle iowait irq softirq steal guest (man 5 proc)
+    """
     rval = {}
     lines = readfile("/proc/stat")
     for i in range(ncpus + 1):
         items = lines[i].split()
-        (user, nice, sys, idle) = map(lambda(x): int(x), items[1:5])
-        rval[i] = [user+nice, sys, idle]
-    return rval
-
-def csvservercputimes(cputimes):
-    # return a csv string of this server total and each processor's cpu times
-    # (usr, sys, idle) in ticks
-    rval = ''
-    for i in range(len(cputimes)):
-        rval += ', '.join(map(lambda(x):str(x), cputimes[i]))
+        user, nice, sys, idle = [int(x) for x in items[1:5]]
+        rval[i] = [user+nice, sys, idle]
+    return rval
+
+
+def csvservercputimes(cputimes):
+    """
+    Return a csv string of this server total and each processor's cpu times
+    (usr, sys, idle) in ticks.
+    """
+    rval = ''
+    for i in range(len(cputimes)):
+        rval += ", ".join(str(x) for x in cputimes[i])
     return rval
 
 
 def calcservercputimes(cputimea, cputimeb):
-    # return cpu used/total % of this server total and each processor (1+#cpu columns)
+    """
+    Return cpu used/total % of this server total and each processor (1+#cpu columns).
+    """
     p = {}
     for n in range(ncpus + 1):
-        # print cputimeb[n]
         p[n] = []
         for i in range(len(cputimea[n])):
             p[n].append(cputimeb[n][i] - cputimea[n][i])
-        # print p[n]
-        total = sum(p[n]) # cpu times total delta
-        # print total
+        # cpu times total delta
+        total = sum(p[n])
         if total == 0:
             p[n] = 0.0
         else:
             p[n] = 100 - ((100.0 * p[n][-1]) / total)
     return p
 
-def collectservermems():
-    # return memory (total, free) in KB from proc/meminfo
+
+def collectservermems():
+    """
+    Return memory (total, free) in KB from proc/meminfo.
+    """
     lines = readfile("/proc/meminfo")
-    mem = map(lambda(x):x.split(), lines[0:2])
-    return map(lambda(x):int(x), zip(*mem)[1])
+    mem = [x.split() for x in lines[0:2]]
+    return [int(x) for x in zip(*mem)[1]]
 
-def csvservermems(mems):
-    # return a csv string of this server memory (total, free)
-    return ", ".join(map(lambda x: str(x), mems))
-
-def calcserverusedmem(mems):
-    # return int(100*(MemTotal-MemFree)/MemTotal) from /proc/meminfo
+def csvservermems(mems):
+    """
+    Return a csv string of this server memory (total, free).
+    """
+    return ", ".join(str(x) for x in mems)
+
+
+def calcserverusedmem(mems):
+    """
+    Return int(100*(MemTotal-MemFree)/MemTotal) from /proc/meminfo.
+    """
     return 100 * (mems[0] - mems[1]) / mems[0]
 
 
 def collectservermetrics(cputimes, mems, thresholdcheck):
-    # return ServerMetrics object with a dictionary of
-    # loadavg1,loadavg5,loadavg15, usedmem%, usedcpu% for total, cpu1, cpu2, ...
+    """
+    Return ServerMetrics object with a dictionary of
+    loadavg1,loadavg5,loadavg15, usedmem%, usedcpu% for total, cpu1, cpu2, ...
+    """
     metricval = []
-    ldavgs=os.getloadavg()
+    ldavgs = os.getloadavg()
     for v in ldavgs:
         metricval.append(v)
     metricval.append(calcserverusedmem(mems))
 
     for i in range(ncpus + 1):
-        metricval.append(cputimes[i])
-        # print cputimes[i]
+        metricval.append(cputimes[i])
 
     srvmetrics = ServerMetrics()
     srvmetrics.setvalues(metricval)
-    # print srvmetrics.tocsv()
 
     if thresholdcheck:
-        checkserverthreshold(srvmetrics)
+        checkserverthreshold(srvmetrics)
 
     return srvmetrics
 
 
 def csvservermetrics(srvmetrics):
-    # return a csv string of ServerMetrics.tocsv()
-    # loadavg1,loadavg5,loadavg15, usedmem%, usedcpu% for total, cpu1, cpu2, ...
+    """
+    Return a csv string of ServerMetrics.tocsv()
+    loadavg1,loadavg5,loadavg15, usedmem%, usedcpu% for total, cpu1, cpu2, ...
+    """
     rv = ""
     if options.timestamp:
-        rv = str(time.time()) + ", "
+        rv = str(time.time()) + ", "
     rv += srvmetrics.tocsv()
     return rv
 
 
 def csvserverbaseline():
-    # return a csv string of raw server metrics data: memfree, memtotal, cpuused, cpusystem, cpuidle
+    """
+    Return a csv string of raw server metrics data: memfree, memtotal, cpuused, cpusystem, cpuidle.
+    """
     return "memory (total, free) = " + csvservermems(collectservermems()) + "\ncputime (used, sys, idl) = " + csvservercputimes(collectservercputimes())
 
 
 class NodeMetrics(object):
     def __init__(self):
-        self.nmetrics = {"nodethroughput" : 0.0,
-                         "nodeusedmemory" : 0.0,
-                         "nodetotalcpu" : 0.0,
-                         "nodeusercpu" : 0.0,
-                         "nodesystemcpu" : 0.0,
-                         "nodewaitcpu" : 0.0}
+        self.nmetrics = {"nodethroughput": 0.0,
+                         "nodeusedmemory": 0.0,
+                         "nodetotalcpu": 0.0,
+                         "nodeusercpu": 0.0,
+                         "nodesystemcpu": 0.0,
+                         "nodewaitcpu": 0.0}
 
-    # set values from val = (throughput, mem, tcpu, ucpu, scpu, wcpu):
     def setvalues(self, val):
-        self.nmetrics["nodethroughput"] = val[0]
-        self.nmetrics["nodeusedmemory"] = val[1]
-        self.nmetrics["nodetotalcpu"] = val[2]
-        self.nmetrics["nodeusercpu"] = val[3]
-        self.nmetrics["nodesystemcpu"] = val[4]
-        self.nmetrics["nodewaitcpu"] = val[5]
+        """
+        Set values from val = (throughput, mem, tcpu, ucpu, scpu, wcpu).
+        """
+        self.nmetrics["nodethroughput"] = val[0]
+        self.nmetrics["nodeusedmemory"] = val[1]
+        self.nmetrics["nodetotalcpu"] = val[2]
+        self.nmetrics["nodeusercpu"] = val[3]
+        self.nmetrics["nodesystemcpu"] = val[4]
+        self.nmetrics["nodewaitcpu"] = val[5]
 
     def setvalue(self, key, val):
-        self.nmetrics[key] = val
+        self.nmetrics[key] = val
 
     def getvalue(self, key):
-        return self.nmetrics[key]
+        return self.nmetrics[key]
 
     def getkeys(self):
-        return self.nmetrics.keys()
+        return self.nmetrics.keys()
 
     def tocsv(self):
-        return ", ".join(map(lambda(x):str(x), self.nmetrics.values()))
+        return ", ".join(str(x) for x in self.nmetrics.values())
 
 
 class LogSession(object):
     def __init__(self):
         self.nodethresholds = NodeMetrics()
-        # set node threshold default values:
-        # nodethroughput=20.0, nodeusedmemory=15.0, nodetotalcpu=90.0,
-        # nodeusercpu=30.0, nodewaitcpu=50.0, nodesystemcpu=20.0}
-        self.nodethresholds.setvalues([20.0, 15.0, 90.0, 30.0, 50.0, 20.0])
-        if options.configfile is not None:
-            self.readnodethresholds(options.configfile)
+        # set node threshold default values:
+        # nodethroughput=20.0, nodeusedmemory=15.0, nodetotalcpu=90.0,
+        # nodeusercpu=30.0, nodewaitcpu=50.0, nodesystemcpu=20.0}
+        self.nodethresholds.setvalues([20.0, 15.0, 90.0, 30.0, 50.0, 20.0])
+        if options.configfile is not None:
+            self.readnodethresholds(options.configfile)
         self.pids = {}
         self.nodemetricsA = {}
         self.nodemetricsB = {}
         self.nodemetricsC = {}
 
-    def getpids(self):
-        # return dict of all CORE session pids in a dict using node name as the keys
-        # parent pid (vnoded) is the first value
+    def getpids(self):
+        """
+        Return dict of all CORE session pids in a dict using node name as the keys
+        parent pid (vnoded) is the first value.
+        """
         self.pids = {}
-        nodes = commands.getstatusoutput("ls /tmp/pycore.%s/*pid" % options.session)
+        nodes = commands.getstatusoutput(
+            "ls /tmp/pycore.%s/*pid" % options.session)
         if nodes[0] != 0:
-            # if options.timestamp == True:
-            # print str(time.time()),
-            # print "CORE session %s has not created nodes" % options.session
-            return
-
+            return
+
         nodes = nodes[1].split('\n')
         for nod in nodes:
-            nodename = nod.split('/')[-1].strip(".pid")
-            self.pids[nodename] = commands.getoutput("cat %s" % nod)
-
+            nodename = nod.split('/')[-1].strip(".pid")
+            self.pids[nodename] = commands.getoutput("cat %s" % nod)
+
         # do not expect failure of this command
         procs = commands.getoutput('ps -eo ppid,pid,comm').split('\n')
 
         # build self.pids dict with key=nodename and val="ppid,pid,cmd"
         for nname in self.pids:
-            # print nname, self.pids[nname]
             if self.pids[nname] == "":
-                if options.timestamp == True:
-                    print str(time.time()),
-                print "ERROR: null vnoded pid of node: %s" % nname
-            else:
-                childprocs = []
-                ppid = self.pids[nname]
+                if options.timestamp == True:
+                    print str(time.time()),
+                print "ERROR: null vnoded pid of node: %s" % nname
+            else:
+                childprocs = []
+                ppid = self.pids[nname]
                 for proc in procs:
-                    val=proc.split()
-                    if ppid == val[1]:
-                        childprocs.append([val[1], val[2]] )
-                    if ppid == val[0]:
-                        childprocs.append([val[1], val[2]])
+                    val = proc.split()
+                    if ppid == val[1]:
+                        childprocs.append([val[1], val[2]])
+                    if ppid == val[0]:
+                        childprocs.append([val[1], val[2]])
                 self.pids[nname] = childprocs
-        # print nname, self.pids[nname]
         return self.pids
 
     def printsesspids(self):
         if self.pids == {}:
             return {}
         for pp in self.pids:
-            if self.pids[pp] != []:
+            if self.pids[pp] != []:
                 for ap in range(len(self.pids[pp]) - 1):
-                    print ", " + self.pids[pp][ap][0], # ap pid
-                    print ", " + self.pids[pp][ap][1], # ap cmd
-                    procmetrics = map(lambda(x):str(x),self.pids[pp][ap][-1])
-                    print ", " + ", ".join(procmetrics),
-                    nodemetrics = map(lambda(x):str(x), self.pids[pp][-1])
-                    print ", " + ", ".join(nodemetrics)
+                    # ap pid
+                    print ", " + self.pids[pp][ap][0],
+                    # ap cmd
+                    print ", " + self.pids[pp][ap][1],
+                    procmetrics = [str(x) for x in self.pids[pp][ap][-1]]
+                    print ", " + ", ".join(procmetrics),
+                    nodemetrics = [str(x) for x in self.pids[pp][-1]]
+                    print ", " + ", ".join(nodemetrics)
 
     def getprocessmetrics(self, pid):
-        # return [cpu#, vsize(kb), ttime, utime, stime, wtime]
-        # from a /proc/pid/stat (a single line file) assume columns are:
-        # pid(0) comm(1) state ppid pgrp sess tty_nr tpgid flags
-        # minflt cmiflt majflt cmajflt # utime(12) stime cutime cstime
-        # priority nice num_threads itrealvalue starttime vsize(22) rss rsslim
-        # startcode endcode startstack kstkesp signal blocked sigignore sigcatch
-        # wchan nswap cnswap exit_signal processor(38) rt_priority
-        # policy ioblock guest_time cguest_time (man 5 proc)
-        #rval = ProcessMetrics()
-        #rval.__init__()
+        """
+        Return [cpu#, vsize(kb), ttime, utime, stime, wtime]
+        from a /proc/pid/stat (a single line file) assume columns are:
+        pid(0) comm(1) state ppid pgrp sess tty_nr tpgid flags
+        minflt cmiflt majflt cmajflt # utime(12) stime cutime cstime
+        priority nice num_threads itrealvalue starttime vsize(22) rss rsslim
+        startcode endcode startstack kstkesp signal blocked sigignore sigcatch
+        wchan nswap cnswap exit_signal processor(38) rt_priority
+        policy ioblock guest_time cguest_time (man 5 proc)
+        """
         rval = {}
         lines = readfile("/proc/" + pid + "/stat")
         if lines == []:
-            return rval
+            return rval
         items = lines[0].split()
-        (utime, stime, cutime, cstime) = map(lambda(x):int(x), items[13:17])
-        # print ">???", utime, stime, cutime, cstime
+        utime, stime, cutime, cstime = [int(x) for x in items[13:17]]
         rval = (items[38], # last run processor
-                int(items[22])/1000, # process virtual mem in kb
-                utime + stime + cutime + cstime,# totoal time
+                int(items[22])/1000, # process virtual mem in kb
+                utime + stime + cutime + cstime, # totoal time
                 utime, # user time
                 stime, # system time
                 cutime + cstime) # wait time
-        # print "pid --- processmetrics", rval
         return rval
 
     def getnodethroughput(self, pid):
-        # return node throughput of total receive and transmit packets in kb
+        """
+        Return node throughput of total receive and transmit packets in kb.
+        """
         lines = readfile("/proc/" + pid + "/net/dev")
         if lines == []:
-            return -0.00
-        ifs = map(lambda(x): x.split(), lines[2:])
+            return -0.00
+        ifs = [x.split() for x in lines[2:]]
         ifm = zip(*ifs)
-        rv = sum(map(lambda(x):int(x), ifm[1])) # received bytes
-        tr = sum(map(lambda(x):int(x), ifm[9])) # transmited bytes
-        #print 'node thruput :', rv, tr, (rv + tr)/1000
+        rv = sum(int(x) for x in ifm[1]) # received bytes
+        tr = sum(int(x) for x in ifm[9]) # transmited bytes
        return (rv + tr)/1000
 
     def getnodemetrics(self, mindex):
-        # return NodeMetrics with indexed by nodename, values are rows of
-        # [ [ppid, vnoded, [cpu#, vmem(kb), ttime, utime, stime, wtime]],
-        # [cpid, cmd, [cpu#, vmem(kb), ttime, utime, stime, wtime]], ... ,
-        # [thrput, vmem(kb), ttime, utime, stime, wtime]]
-        if mindex == 'a':
-            metricref = self.nodemetricsA
-        else:
-            metricref = self.nodemetricsB
+        """
+        Return NodeMetrics with indexed by nodename, values are rows of
+        [ [ppid, vnoded, [cpu#, vmem(kb), ttime, utime, stime, wtime]],
+        [cpid, cmd, [cpu#, vmem(kb), ttime, utime, stime, wtime]], ... ,
+        [thrput, vmem(kb), ttime, utime, stime, wtime]]
+        """
+        if mindex == 'a':
+            metricref = self.nodemetricsA
+        else:
+            metricref = self.nodemetricsB
 
         self.getpids()
-        # print " inside getnodemetrics()", self.pids
         if self.pids == {}:
-            return {}
-
+            return {}
+
         for nod in self.pids:
             nmetric = NodeMetrics()
             nmetric.__init__()
             nodeapps = {}
-            for ap in range(len(self.pids[nod])): # get each process metrics
-                procm = self.getprocessmetrics(self.pids[nod][ap][0])
-                if procm == []:
-                    if options.timestamp == True:
-                        print str(time.time()),
-                    print "WARNING: transient process", self.pids[nod][ap][1], \
-                        "/", self.pids[nod][ap][0], "on node %s" % nod
-                else:
-                    nodeapps[ap] = procm
-                    self.pids[nod][ap].append(nodeapps[ap])
-            processm = zip(*nodeapps.values()) # get overall node metrics
-            # print processm
-            if len(processm) > 0:
-                # if nod == 'n6':
-                # print nod, self.getnodethroughput(self.pids[nod][0][0])
-                nmetric.setvalues(( self.getnodethroughput(self.pids[nod][0][0]),
-                    sum(map(lambda(x):int(x), processm[1])), # vsize(kb)
-                    sum(map(lambda(x):int(x), processm[2])), # ttime
-                    sum(map(lambda(x):int(x), processm[3])), # utime
-                    sum(map(lambda(x):int(x), processm[4])), # stime
-                    sum(map(lambda(x):int(x), processm[5])))) # wtime
+            for ap in range(len(self.pids[nod])): # get each process metrics
+                procm = self.getprocessmetrics(self.pids[nod][ap][0])
+                if procm == []:
+                    if options.timestamp == True:
+                        print str(time.time()),
+                    print "WARNING: transient process", self.pids[nod][ap][1], \
+                        "/", self.pids[nod][ap][0], "on node %s" % nod
+                else:
+                    nodeapps[ap] = procm
+                    self.pids[nod][ap].append(nodeapps[ap])
+            processm = zip(*nodeapps.values()) # get overall node metrics
+            if len(processm) > 0:
+                nmetric.setvalues((self.getnodethroughput(self.pids[nod][0][0]),
+                                   # vsize(kb)
+                                   sum(int(x) for x in processm[1]),
+                                   # ttime
+                                   sum(int(x) for x in processm[2]),
+                                   # utime
+                                   sum(int(x) for x in processm[3]),
+                                   # stime
+                                   sum(int(x) for x in processm[4]),
+                                   sum(int(x) for x in processm[5]))) # wtime
             metricref[nod] = nmetric
-            # print nod, self.pids[nod][0][0], metricref[nod].tocsv()
         return metricref
 
-    def setnodemetricsC(self, key, val):
-        self.nodemetricsC[key] = val
+    def setnodemetricsC(self, key, val):
+        self.nodemetricsC[key] = val
 
-    def printnodemetrics(self, mindex):
+    def printnodemetrics(self, mindex):
         if mindex == 'c':
-            mm = self.nodemetricsC
-        else:
-            if mindex == 'a':
-                mm = self.nodemetricsA
-            else:
-                mm = self.nodemetricsB
+            mm = self.nodemetricsC
+        else:
+            if mindex == 'a':
+                mm = self.nodemetricsA
+            else:
+                mm = self.nodemetricsB
 
-        for k in self.nodemetricsC:
+        for k in self.nodemetricsC:
             if options.timestamp:
                 print str(time.time()) + ",",
-            print k, ",", mm[k].tocsv()
+            print k, ",", mm[k].tocsv()
 
-    def readnodethresholds(self, filename):
-        if filename is None:
-            return
-        lines = readfile(filename)
+    def readnodethresholds(self, filename):
+        if filename is None:
+            return
+        lines = readfile(filename)
         for l in lines:
-            mval = l.strip().split('=')
-            # print "read line %s" % mval
-            if len(mval) > 1 :
-                thekey = mval[0].strip()
-                theval = mval[1].strip()
-                if thekey in self.nodethresholds.getkeys():
-                    self.nodethresholds.setvalue(thekey, float(theval))
-                    #print thekey," = %.2f" % float(theval)
-
+            mval = l.strip().split('=')
+            if len(mval) > 1:
+                thekey = mval[0].strip()
+                theval = mval[1].strip()
+                if thekey in self.nodethresholds.getkeys():
+                    self.nodethresholds.setvalue(thekey, float(theval))
 
     def checknodethresholds(self, nname):
-        # print "check node thresholds", nname
-        calcm = self.nodemetricsC[nname]
+        calcm = self.nodemetricsC[nname]
         for keyname in self.nodethresholds.getkeys():
-            # print "HIII", keyname, calcm.getvalue(keyname), self.nodethresholds.getvalue(keyname)
-            if float(calcm.getvalue(keyname)) > float(self.nodethresholds.getvalue(keyname)):
-                # print calculatednodem.getvalue(m)
-                alarm = ["node", nname + "/" + self.pids[nname][0][0], keyname,\
-                    calcm.getvalue(keyname), ">", self.nodethresholds.getvalue(keyname)]
+            if float(calcm.getvalue(keyname)) > float(self.nodethresholds.getvalue(keyname)):
+                alarm = ["node", nname + "/" + self.pids[nname][0][0], keyname,
+                         calcm.getvalue(keyname), ">", self.nodethresholds.getvalue(keyname)]
                 if options.timestamp:
                     print str(time.time()) + ",",
-                print ", ".join(map(lambda(x):str(x), alarm))
+                print ", ".join(str(x) for x in alarm)
 
     def calcnodemetrics(self, cputimea, cputimeb, mems):
-        # return a dict of nodemetrics indexed by node name
-        # nodemetrics[nodename][-1] = node/host%
+        """
+        Return a dict of nodemetrics indexed by node name
+        nodemetrics[nodename][-1] = node/host%.
+        """
         p = []
         for i in range(len(cputimeb[0])):
             p.append(cputimeb[0][i] - cputimea[0][i])
         hostusedcpu = p[0] + p[1]
         hostusedmem = mems[0] - mems[1]
         if hostusedcpu == 0:
-            print "WARNING: host used cpu = 0, ", p[0], p[1]
-            hostusedcpu = 1
+            print "WARNING: host used cpu = 0, ", p[0], p[1]
+            hostusedcpu = 1
         if hostusedmem == 0:
-            print "WARNING: host used mem = 0, ", mems[0], mems[1]
-            hostusedmem = 1
+            print "WARNING: host used mem = 0, ", mems[0], mems[1]
+            hostusedmem = 1
 
-        nodesa = self.nodemetricsA
-        nodesb = self.nodemetricsB
+        nodesa = self.nodemetricsA
+        nodesb = self.nodemetricsB
         for nod in nodesb:
-            calcm = self.nodemetricsC
+            calcm = self.nodemetricsC
             calcm = NodeMetrics()
-            calcm.__init__()
-            if (nod in nodesa):
-                try:
-                    if (nodesb[nod] == []) | (nodesa[nod] == []) | \
-                        ( False == isinstance(nodesb[nod], NodeMetrics)) | \
-                        ( False == isinstance(nodesa[nod], NodeMetrics)):
-                        if options.timestamp == True:
-                            print str(time.time()),
-                        print "Warning: nodes %s is not fully instanciated" % nod
-                    else:
-                        # calc throughput kbps
-                        #print "node b : ", nodesb[nod].tocsv()
-                        #print "node a : ", nodesa[nod].tocsv()
-                        #if nod == 'n6':
-                        #print nodesb[nod].getvalue("nodethroughput"), nodesa[nod].getvalue("nodethroughput")
-                        calcm.setvalue("nodethroughput", "%.2f" % (8 * (nodesb[nod].getvalue("nodethroughput") \
-                            - nodesa[nod].getvalue("nodethroughput")) / options.interval))
-                        # calc mem node used / host used
-                        calcm.setvalue("nodeusedmemory", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodeusedmemory") / hostusedmem)))
-
-                        # calc total cpu time node / host
-                        calcm.setvalue("nodetotalcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodetotalcpu")\
-                            - nodesa[nod].getvalue("nodetotalcpu")) /hostusedcpu))
-                        # calc user cpu time node / host
-                        calcm.setvalue("nodeusercpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodeusercpu")\
-                            - nodesa[nod].getvalue("nodeusercpu")) /hostusedcpu))
-                        # calc system cpu time node / host
-                        calcm.setvalue("nodesystemcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodesystemcpu")\
-                            - nodesa[nod].getvalue("nodesystemcpu")) /hostusedcpu))
-                        # calc waitcpu time node / host
-                        calcm.setvalue("nodewaitcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodewaitcpu")\
-                            - nodesa[nod].getvalue("nodewaitcpu")) /hostusedcpu))
+            calcm.__init__()
+            if (nod in nodesa):
+                try:
+                    if (nodesb[nod] == []) | (nodesa[nod] == []) | \
+                            (False == isinstance(nodesb[nod], NodeMetrics)) | \
+                            (False == isinstance(nodesa[nod], NodeMetrics)):
+                        if options.timestamp == True:
+                            print str(time.time()),
+                        print "Warning: nodes %s is not fully instanciated" % nod
+                    else:
+                        # calc throughput kbps
+                        calcm.setvalue("nodethroughput", "%.2f" % (8 * (nodesb[nod].getvalue("nodethroughput")
+                                                                        - nodesa[nod].getvalue("nodethroughput")) / options.interval))
+                        # calc mem node used / host used
+                        calcm.setvalue("nodeusedmemory", "%.2f" % (
+                            100.0 * (nodesb[nod].getvalue("nodeusedmemory") / hostusedmem)))
 
-                        #print nod, calcm.tocsv()
-                        #print '=========================='
-                        logsession.nodemetricsC[nod] = calcm
-                        # logsession.printnodemetrics('c')
+                        # calc total cpu time node / host
+                        calcm.setvalue("nodetotalcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodetotalcpu")
+                                                                          - nodesa[nod].getvalue("nodetotalcpu")) / hostusedcpu))
+                        # calc user cpu time node / host
+                        calcm.setvalue("nodeusercpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodeusercpu")
+                                                                         - nodesa[nod].getvalue("nodeusercpu")) / hostusedcpu))
+                        # calc system cpu time node / host
+                        calcm.setvalue("nodesystemcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodesystemcpu")
+                                                                           - nodesa[nod].getvalue("nodesystemcpu")) / hostusedcpu))
+                        # calc waitcpu time node / host
+                        calcm.setvalue("nodewaitcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodewaitcpu")
+                                                                         - nodesa[nod].getvalue("nodewaitcpu")) / hostusedcpu))
+                        logsession.nodemetricsC[nod] = calcm
 
                     if options.alarm is not None:
-                        logsession.checknodethresholds(nod)
-                except IndexError:
-                    pass
-            else:
-                print "Warning: transient node %s " % nod
+                        logsession.checknodethresholds(nod)
+                except IndexError:
+                    pass
+            else:
+                print "Warning: transient node %s " % nod
 
         return nodesb
 
 
 def main():
     usagestr = "%prog [-h] [options] [args]\n\nLog server and optional CORE session metrics to stdout."
-    parser = optparse.OptionParser(usage = usagestr)
-    parser.set_defaults(interval=2, timestamp=False,
-        configfile = "/etc/core/perflogserver.conf",\
-        alarm = True, session = None)
-    parser.add_option("-i", "--interval", dest = "interval", type = int,
-        help = "seconds to wait between samples; default=%s" %
+    parser = optparse.OptionParser(usage=usagestr)
+    parser.set_defaults(interval=2, timestamp=False,
+                        configfile="/etc/core/perflogserver.conf",
+                        alarm=True, session=None)
+    parser.add_option("-i", "--interval", dest="interval", type=int,
+                      help="seconds to wait between samples; default=%s" %
                       parser.defaults["interval"])
-    parser.add_option("-t", "--timestamp", action = "store_true",
-        dest = "timestamp",
-        help = "include timestamp on each line")
-    parser.add_option("-c", "--configfile", dest = "configfile",
-        type = "string",
-        help = "read threshold values from the specified file;\
+    parser.add_option("-t", "--timestamp", action="store_true",
+                      dest="timestamp",
+                      help="include timestamp on each line")
+    parser.add_option("-c", "--configfile", dest="configfile",
+                      type="string",
+                      help="read threshold values from the specified file;\
                       default=%s" % parser.defaults["configfile"])
-    parser.add_option("-a", "--alarm", action = "store_true",
-        dest = "alarm",
-        help = "generate alarms based threshold check on each cycle")
-    parser.add_option("-s", "--session", dest = "session", type = int,
-        help = "CORE session id; default=%s" %
+    parser.add_option("-a", "--alarm", action="store_true",
+                      dest="alarm",
+                      help="generate alarms based threshold check on each cycle")
+    parser.add_option("-s", "--session", dest="session", type=int,
+                      help="CORE session id; default=%s" %
                       parser.defaults["session"])
     global options
     global ncpus
@@ -528,8 +553,7 @@ def main():
     global logsession
     global cyclecount
 
-    (options, args) = parser.parse_args()
-    # print options
+    options, _args = parser.parse_args()
 
     signal.signal(signal.SIGINT, handler)
     signal.signal(signal.SIGTERM, handler)
@@ -538,51 +562,45 @@ def main():
 
     # server threshold dictionary - a ServerMetrics instant with default values
     serverthresholds = ServerMetrics()
-    # set to server threshold default values: serverloadavg1=3.5,
-    # serverloadavg5=3.5, serverloadavg15=3.5, serverusedmemory=80.0,
+    # set to server threshold default values: serverloadavg1=3.5,
+    # serverloadavg5=3.5, serverloadavg15=3.5, serverusedmemory=80.0,
     # serverusedcputime=80.0, processorusedcputime=90.0
     serverthresholds.setvalues([3.5, 3.5, 3.5, 80.0, 80.0, 90.0])
     if options.alarm is True:
         # read server threshold values from configuration file
-        readserverthresholds(options.configfile)
-
+        readserverthresholds(options.configfile)
+
     if options.session is not None:
         logsession = LogSession()
-        # print logsession
 
     # mark host log baseline
-    print "server: ", ", ".join(map(lambda(x):str(x), os.uname())), ",", ncpus, "CPU cores"
+    print "server: ", ", ".join(str(x) for x in os.uname()), ",", ncpus, "CPU cores"
     print "start timestamp:", time.time(), ", baseline data: "
     print csvserverbaseline()
-    print "server metrics: ", ", ".join(map(lambda(x):str(x), serverthresholds.getkeys()))
+    print "server metrics: ", ", ".join(str(x) for x in serverthresholds.getkeys())
     if options.session is not None:
-        print "node metrics: nodename, ", ", ".join(map(lambda(x):str(x), logsession.nodethresholds.getkeys()))
+        print "node metrics: nodename, ", ", ".join(str(x) for x in logsession.nodethresholds.getkeys())
 
     cyclecount = 0
     while True:
         cputimea = collectservercputimes()
         if options.session is not None:
             nodesa = logsession.getnodemetrics('a')
-            # print "nodes a:", nodesa
 
         time.sleep(options.interval)
 
         cputimeb = collectservercputimes()
-        mems = collectservermems()
-
+        mems = collectservermems()
         calccputime = calcservercputimes(cputimea, cputimeb)
-        m = csvservermetrics(collectservermetrics(calccputime, mems, options.alarm))
-        print m
+        m = csvservermetrics(collectservermetrics(
+            calccputime, mems, options.alarm))
+        print m
 
         if options.session is not None:
-            nodesb = logsession.getnodemetrics('b')
-            # print "nodes b:", nodesb
-            if nodesb != {}:
-                logsession.calcnodemetrics(cputimea, cputimeb, mems)
-                logsession.printnodemetrics('c')
+            nodesb = logsession.getnodemetrics('b')
+            if nodesb != {}:
+                logsession.calcnodemetrics(cputimea, cputimeb, mems)
+                logsession.printnodemetrics('c')
 
-        sys.stdout.flush()
+        sys.stdout.flush()
         cyclecount = cyclecount + 1
 
 
 if __name__ == "__main__":
     main()
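Most of the cleanup above is mechanical: the comment headers become docstrings, the combined import is split into one import per line, keyword arguments lose the spaces around "=", and Python 2 map(lambda(x): ...) calls are rewritten as list comprehensions or generator expressions. A minimal before/after sketch of that last idiom (the variable names here are illustrative, not taken from the file):

    vals = [1.234, 5.678]
    # old style removed by the commit (Python 2 tuple-parameter lambda)
    csv_old = ", ".join(map(lambda(x): str(round(x, 2)), vals))
    # new style added by the commit
    csv_new = ", ".join(str(round(x, 2)) for x in vals)
    assert csv_old == csv_new  # both yield "1.23, 5.68"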
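For reference, readserverthresholds() and LogSession.readnodethresholds() in the file above parse a plain key=value file (default path /etc/core/perflogserver.conf) and only accept keys already present in the ServerMetrics or NodeMetrics dictionaries. A hypothetical fragment of such a file, using keys and the default values that appear in the code:

    serverloadavg1 = 3.5
    serverusedmemory = 80.0
    processorusedcputime = 90.0
    nodethroughput = 20.0
    nodetotalcpu = 90.0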