Removing deprecated perf scripts
This commit is contained in:
parent
527a44748d
commit
76846fefeb
12 changed files with 1 additions and 830 deletions
|
@ -236,7 +236,6 @@ AC_CONFIG_FILES([Makefile
|
||||||
gui/Makefile
|
gui/Makefile
|
||||||
gui/icons/Makefile
|
gui/icons/Makefile
|
||||||
scripts/Makefile
|
scripts/Makefile
|
||||||
scripts/perf/Makefile
|
|
||||||
man/Makefile
|
man/Makefile
|
||||||
docs/Makefile
|
docs/Makefile
|
||||||
daemon/Makefile
|
daemon/Makefile
|
||||||
|
|
|
@ -13,7 +13,7 @@ DISTCLEANFILES = Makefile.in core-daemon.service core-daemon
|
||||||
|
|
||||||
EXTRA_DIST = core-daemon.in core-daemon.service.in
|
EXTRA_DIST = core-daemon.in core-daemon.service.in
|
||||||
|
|
||||||
SUBDIRS = perf
|
SUBDIRS =
|
||||||
|
|
||||||
# install startup scripts based on --with-startup=option configure option
|
# install startup scripts based on --with-startup=option configure option
|
||||||
# init.d (default), systemd
|
# init.d (default), systemd
|
||||||
|
|
|
@ -1,25 +0,0 @@
|
||||||
# CORE
|
|
||||||
# (c)2011-2012 the Boeing Company.
|
|
||||||
# See the LICENSE file included in this distribution.
|
|
||||||
#
|
|
||||||
# author: Jeff Ahrenholz <jeffrey.m.ahrenholz@boeing.com>
|
|
||||||
#
|
|
||||||
# Makefile for installing performance monitoring scripts.
|
|
||||||
#
|
|
||||||
|
|
||||||
DISTCLEANFILES = Makefile.in
|
|
||||||
|
|
||||||
|
|
||||||
# Hook scripts installed to /usr[/local]/share/core/examples/hooks
|
|
||||||
hooksdir=$(pkgdatadir)/examples/hooks
|
|
||||||
dist_hooks_SCRIPTS = perflogstart.sh perflogstop.sh sessiondatacollect.sh \
|
|
||||||
timesyncstop.sh timesyncstart.sh perflogserver.py \
|
|
||||||
configuration_hook.sh datacollect_hook.sh
|
|
||||||
|
|
||||||
# Files installed to /etc/core/
|
|
||||||
perfconfdir=$(CORE_CONF_DIR)
|
|
||||||
dist_perfconf_DATA = perflogserver.conf
|
|
||||||
|
|
||||||
uninstall-hook:
|
|
||||||
rmdir -p $(perfconfdir) || true
|
|
||||||
rmdir -p $(hooksdir) || true
|
|
|
@ -1,6 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# experiment hook script; write commands here to execute on the host at the
|
|
||||||
# specified state
|
|
||||||
|
|
||||||
sh /usr/local/share/core/examples/hooks/timesyncstop.sh
|
|
||||||
sh /usr/local/share/core/examples/hooks/perflogstart.sh
|
|
|
@ -1,7 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# experiment hook script; write commands here to execute on the host at the
|
|
||||||
# specified state
|
|
||||||
|
|
||||||
sh /usr/local/share/core/examples/hooks/perflogstop.sh
|
|
||||||
sh /usr/local/share/core/examples/hooks/sessiondatacollect.sh
|
|
||||||
sh /usr/local/share/core/examples/hooks/timesyncstart.sh
|
|
|
@ -1,37 +0,0 @@
|
||||||
# Configuration file for CORE performance logging to generate alarms
|
|
||||||
#
|
|
||||||
# The string lef of "=" is the key of the performance metric - do not change it
|
|
||||||
|
|
||||||
# raise alarm if 1-minute, 5-minute, 16-minute load average
|
|
||||||
# rises above those values
|
|
||||||
serverloadavg1 = 3.6
|
|
||||||
serverloadavg5 = 3.7
|
|
||||||
serverloadavg15 = 3.8
|
|
||||||
|
|
||||||
# raise alarm if the server used memory rises above this value
|
|
||||||
serverusedmemory = 81.0
|
|
||||||
|
|
||||||
# raise alarm if the server used cpu time rises above this value
|
|
||||||
serverusedcputime = 91.0
|
|
||||||
|
|
||||||
# raise alarm if any server processor used cpu time rises above this value
|
|
||||||
processorusedcputime = 92.0
|
|
||||||
|
|
||||||
# raise alarm if any node throughput rises above this value
|
|
||||||
nodethroughput = 21.0
|
|
||||||
|
|
||||||
# raise alarm if any node used memory rises above this value
|
|
||||||
nodeusedmemory = 11.0
|
|
||||||
|
|
||||||
# raise alarm if any node total cpu time rises above this value
|
|
||||||
nodetotalcpu = 91.0
|
|
||||||
|
|
||||||
# raise alarm if any node used cpu time rises above this value
|
|
||||||
nodeusercpu = 41.0
|
|
||||||
|
|
||||||
# raise alarm if any node system cpu time rises above this value
|
|
||||||
nodesystemcpu = 21.0
|
|
||||||
|
|
||||||
# raise alarm if any node wait cpu time rises above this value
|
|
||||||
nodewaitcpu = 51.0
|
|
||||||
|
|
|
@ -1,603 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
#
|
|
||||||
# (c)2011-2012 the Boeing Company
|
|
||||||
#
|
|
||||||
|
|
||||||
"""
|
|
||||||
perflogserver.py - CORE server and node performace metrics logger and alarmer
|
|
||||||
server metrics: loadave1, 5, 15, mem, used cpu% of total, cpu1, cpu2, ..., cpun
|
|
||||||
node metrics: throughput, mem, cpu total, usr, sys, wait
|
|
||||||
"""
|
|
||||||
|
|
||||||
import commands
|
|
||||||
import optparse
|
|
||||||
import os
|
|
||||||
import signal
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def readfile(fname):
|
|
||||||
lines = []
|
|
||||||
try:
|
|
||||||
f = open(fname, "r")
|
|
||||||
except IOError:
|
|
||||||
if options.timestamp == True:
|
|
||||||
print(str(time.time()),)
|
|
||||||
print("ERROR: failed to open file %s\n" % fname)
|
|
||||||
else:
|
|
||||||
lines = f.readlines()
|
|
||||||
f.close()
|
|
||||||
return lines
|
|
||||||
|
|
||||||
|
|
||||||
def numcpus():
|
|
||||||
lines = readfile("/proc/stat")
|
|
||||||
n = 0
|
|
||||||
for l in lines[1:]:
|
|
||||||
if l[:3] != "cpu":
|
|
||||||
break
|
|
||||||
n += 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
|
|
||||||
def handler(signum, frame):
|
|
||||||
print("stop timestamp:", str(time.time()) + ", cyclecount=", cyclecount, ", caught signal", signum)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
|
|
||||||
class ServerMetrics(object):
|
|
||||||
def __init__(self):
|
|
||||||
self.smetrics = {"serverloadavg1": 0.0,
|
|
||||||
"serverloadavg5": 0.0,
|
|
||||||
"serverloadavg15": 0.0,
|
|
||||||
"serverusedmemory": 0.0,
|
|
||||||
"serverusedcputime": 0.0,
|
|
||||||
"processorusedcputime": []}
|
|
||||||
|
|
||||||
def setvalues(self, val):
|
|
||||||
"""
|
|
||||||
Set values from val = (nump, ldavg1, ldavg5, adavg15, mem, cpu, p1cpu, p2cpu...).
|
|
||||||
"""
|
|
||||||
self.smetrics["serverloadavg1"] = val[0]
|
|
||||||
self.smetrics["serverloadavg5"] = val[1]
|
|
||||||
self.smetrics["serverloadavg15"] = val[2]
|
|
||||||
self.smetrics["serverusedmemory"] = val[4]
|
|
||||||
self.smetrics["serverusedcputime"] = val[5]
|
|
||||||
|
|
||||||
pcpu = []
|
|
||||||
for ind in range(5, len(val)):
|
|
||||||
pcpu.append(val[ind])
|
|
||||||
self.smetrics["processorusedcputime"] = pcpu
|
|
||||||
|
|
||||||
def setvalue(self, key, val):
|
|
||||||
self.smetrics[key] = val
|
|
||||||
|
|
||||||
def getvalue(self, key):
|
|
||||||
return self.smetrics[key]
|
|
||||||
|
|
||||||
def getkeys(self):
|
|
||||||
return self.smetrics.keys()
|
|
||||||
|
|
||||||
def tocsv(self):
|
|
||||||
rv = "Server"
|
|
||||||
for k in self.smetrics:
|
|
||||||
if isinstance(self.smetrics[k], float):
|
|
||||||
rv += ", %.2f" % self.smetrics[k]
|
|
||||||
else:
|
|
||||||
if isinstance(self.smetrics[k], list):
|
|
||||||
values = ", ".join(str(round(x, 2)) for x in self.smetrics[k])
|
|
||||||
rv += ", [%s]" % values
|
|
||||||
else:
|
|
||||||
rv += ", " + str(self.smetrics[k])
|
|
||||||
return rv
|
|
||||||
|
|
||||||
|
|
||||||
def readserverthresholds(filename):
|
|
||||||
if filename is None:
|
|
||||||
return
|
|
||||||
|
|
||||||
lines = readfile(filename)
|
|
||||||
for l in lines:
|
|
||||||
mval = l.strip().split('=')
|
|
||||||
if len(mval) > 1:
|
|
||||||
thekey = mval[0].strip()
|
|
||||||
theval = mval[1].strip()
|
|
||||||
if thekey in serverthresholds.getkeys():
|
|
||||||
serverthresholds.setvalue(thekey, float(theval))
|
|
||||||
|
|
||||||
|
|
||||||
def checkserverthreshold(metricval):
|
|
||||||
"""
|
|
||||||
Print out an alarm if a ServerMetrics value crosses threshold.
|
|
||||||
"""
|
|
||||||
for key in serverthresholds.getkeys():
|
|
||||||
if key == "processorusedcputime":
|
|
||||||
pcpus = metricval.getvalue(key)
|
|
||||||
for ind, pcpu in enumerate(pcpus):
|
|
||||||
if pcpu > serverthresholds.getvalue(key):
|
|
||||||
alarm = ["server", os.uname()[1], str(ind) + key,
|
|
||||||
"%.2f" % pcpus[ind], ">", serverthresholds.getvalue(key)]
|
|
||||||
if options.timestamp:
|
|
||||||
print(str(time.time()) + ",",)
|
|
||||||
print(", ".join(str(x) for x in alarm))
|
|
||||||
else:
|
|
||||||
if metricval.getvalue(key) > serverthresholds.getvalue(key):
|
|
||||||
alarm = ["server", os.uname()[1], key,
|
|
||||||
"%.2f" % metricval.getvalue(key), ">", serverthresholds.getvalue(key)]
|
|
||||||
if options.timestamp:
|
|
||||||
print(str(time.time()) + ",",)
|
|
||||||
print(", ".join(str(x) for x in alarm))
|
|
||||||
|
|
||||||
|
|
||||||
def collectservercputimes():
|
|
||||||
"""
|
|
||||||
Return cpu times in ticks of this server total and each processor 3*(1+#cpu) columns
|
|
||||||
(user+nice, sys, idle) from each /proc/stat cpu lines assume columns are:
|
|
||||||
cpu# user nice sys idle iowait irq softirq steal guest (man 5 proc)
|
|
||||||
"""
|
|
||||||
rval = {}
|
|
||||||
lines = readfile("/proc/stat")
|
|
||||||
for i in range(ncpus + 1):
|
|
||||||
items = lines[i].split()
|
|
||||||
user, nice, sys, idle = [int(x) for x in items[1:5]]
|
|
||||||
rval[i] = [user+nice, sys, idle]
|
|
||||||
return rval
|
|
||||||
|
|
||||||
|
|
||||||
def csvservercputimes(cputimes):
|
|
||||||
"""
|
|
||||||
Return a csv string of this server total and each processor's cpu times
|
|
||||||
(usr, sys, idle) in ticks.
|
|
||||||
"""
|
|
||||||
rval = ''
|
|
||||||
for i in range(len(cputimes)):
|
|
||||||
rval += ", ".join(str(x) for x in cputimes[i])
|
|
||||||
return rval
|
|
||||||
|
|
||||||
|
|
||||||
def calcservercputimes(cputimea, cputimeb):
|
|
||||||
"""
|
|
||||||
Return cpu used/total % of this server total and each processor (1+#cpu columns).
|
|
||||||
"""
|
|
||||||
p = {}
|
|
||||||
for n in range(ncpus + 1):
|
|
||||||
p[n] = []
|
|
||||||
for i in range(len(cputimea[n])):
|
|
||||||
p[n].append(cputimeb[n][i] - cputimea[n][i])
|
|
||||||
# cpu times total delta
|
|
||||||
total = sum(p[n])
|
|
||||||
if total == 0:
|
|
||||||
p[n] = 0.0
|
|
||||||
else:
|
|
||||||
p[n] = 100 - ((100.0 * p[n][-1]) / total)
|
|
||||||
return p
|
|
||||||
|
|
||||||
|
|
||||||
def collectservermems():
|
|
||||||
"""
|
|
||||||
Return memory (total, free) in KB from proc/meminfo.
|
|
||||||
"""
|
|
||||||
lines = readfile("/proc/meminfo")
|
|
||||||
mem = [x.split() for x in lines[0:2]]
|
|
||||||
return [int(x) for x in zip(*mem)[1]]
|
|
||||||
|
|
||||||
|
|
||||||
def csvservermems(mems):
|
|
||||||
"""
|
|
||||||
Return a csv string of this server memory (total, free).
|
|
||||||
"""
|
|
||||||
return ", ".join(str(x) for x in mems)
|
|
||||||
|
|
||||||
|
|
||||||
def calcserverusedmem(mems):
|
|
||||||
"""
|
|
||||||
Return int(100*(MemTotal-MemFree)/MemTotal) from /proc/meminfo.
|
|
||||||
"""
|
|
||||||
return 100 * (mems[0] - mems[1]) / mems[0]
|
|
||||||
|
|
||||||
|
|
||||||
def collectservermetrics(cputimes, mems, thresholdcheck):
|
|
||||||
"""
|
|
||||||
Return ServerMetrics object with a dictionary of
|
|
||||||
loadavg1,loadavg5,loadavg15, usedmem%, usedcpu% for total, cpu1, cpu2, ...
|
|
||||||
"""
|
|
||||||
metricval = []
|
|
||||||
ldavgs = os.getloadavg()
|
|
||||||
for v in ldavgs:
|
|
||||||
metricval.append(v)
|
|
||||||
metricval.append(calcserverusedmem(mems))
|
|
||||||
|
|
||||||
for i in range(ncpus + 1):
|
|
||||||
metricval.append(cputimes[i])
|
|
||||||
|
|
||||||
srvmetrics = ServerMetrics()
|
|
||||||
srvmetrics.setvalues(metricval)
|
|
||||||
|
|
||||||
if thresholdcheck:
|
|
||||||
checkserverthreshold(srvmetrics)
|
|
||||||
|
|
||||||
return srvmetrics
|
|
||||||
|
|
||||||
|
|
||||||
def csvservermetrics(srvmetrics):
|
|
||||||
"""
|
|
||||||
Return a csv string of ServerMetrics.tocsv()
|
|
||||||
loadavg1,loadavg5,loadavg15, usedmem%, usedcpu% for total, cpu1, cpu2, ...
|
|
||||||
"""
|
|
||||||
rv = ""
|
|
||||||
if options.timestamp:
|
|
||||||
rv = str(time.time()) + ", "
|
|
||||||
rv += srvmetrics.tocsv()
|
|
||||||
return rv
|
|
||||||
|
|
||||||
|
|
||||||
def csvserverbaseline():
|
|
||||||
"""
|
|
||||||
Return a csv string of raw server metrics data: memfree, memtotal, cpuused, cpusystem, cpuidle.
|
|
||||||
"""
|
|
||||||
return "memory (total, free) = " + csvservermems(collectservermems()) + "\ncputime (used, sys, idl) = " + csvservercputimes(collectservercputimes())
|
|
||||||
|
|
||||||
|
|
||||||
class NodeMetrics(object):
|
|
||||||
def __init__(self):
|
|
||||||
self.nmetrics = {"nodethroughput": 0.0,
|
|
||||||
"nodeusedmemory": 0.0,
|
|
||||||
"nodetotalcpu": 0.0,
|
|
||||||
"nodeusercpu": 0.0,
|
|
||||||
"nodesystemcpu": 0.0,
|
|
||||||
"nodewaitcpu": 0.0}
|
|
||||||
|
|
||||||
def setvalues(self, val):
|
|
||||||
"""
|
|
||||||
Set values from val = (throughput, mem, tcpu, ucpu, scpu, wcpu).
|
|
||||||
"""
|
|
||||||
self.nmetrics["nodethroughput"] = val[0]
|
|
||||||
self.nmetrics["nodeusedmemory"] = val[1]
|
|
||||||
self.nmetrics["nodetotalcpu"] = val[2]
|
|
||||||
self.nmetrics["nodeusercpu"] = val[3]
|
|
||||||
self.nmetrics["nodesystemcpu"] = val[4]
|
|
||||||
self.nmetrics["nodewaitcpu"] = val[5]
|
|
||||||
|
|
||||||
def setvalue(self, key, val):
|
|
||||||
self.nmetrics[key] = val
|
|
||||||
|
|
||||||
def getvalue(self, key):
|
|
||||||
return self.nmetrics[key]
|
|
||||||
|
|
||||||
def getkeys(self):
|
|
||||||
return self.nmetrics.keys()
|
|
||||||
|
|
||||||
def tocsv(self):
|
|
||||||
return ", ".join(str(x) for x in self.nmetrics.values())
|
|
||||||
|
|
||||||
|
|
||||||
class LogSession(object):
|
|
||||||
def __init__(self):
|
|
||||||
self.nodethresholds = NodeMetrics()
|
|
||||||
# set node threshold default values:
|
|
||||||
# nodethroughput=20.0, nodeusedmemory=15.0, nodetotalcpu=90.0,
|
|
||||||
# nodeusercpu=30.0, nodewaitcpu=50.0, nodesystemcpu=20.0}
|
|
||||||
self.nodethresholds.setvalues([20.0, 15.0, 90.0, 30.0, 50.0, 20.0])
|
|
||||||
if options.configfile is not None:
|
|
||||||
self.readnodethresholds(options.configfile)
|
|
||||||
self.pids = {}
|
|
||||||
self.nodemetricsA = {}
|
|
||||||
self.nodemetricsB = {}
|
|
||||||
self.nodemetricsC = {}
|
|
||||||
|
|
||||||
def getpids(self):
|
|
||||||
"""
|
|
||||||
Return dict of all CORE session pids in a dict using node name as the keys
|
|
||||||
parent pid (vnoded) is the first value.
|
|
||||||
"""
|
|
||||||
self.pids = {}
|
|
||||||
nodes = commands.getstatusoutput(
|
|
||||||
"ls /tmp/pycore.%s/*pid" % options.session)
|
|
||||||
if nodes[0] != 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
nodes = nodes[1].split('\n')
|
|
||||||
for nod in nodes:
|
|
||||||
nodename = nod.split('/')[-1].strip(".pid")
|
|
||||||
self.pids[nodename] = commands.getoutput("cat %s" % nod)
|
|
||||||
|
|
||||||
# do not expect failure of this command
|
|
||||||
procs = commands.getoutput('ps -eo ppid,pid,comm').split('\n')
|
|
||||||
|
|
||||||
# build self.pids dict with key=nodename and val="ppid,pid,cmd"
|
|
||||||
for nname in self.pids:
|
|
||||||
if self.pids[nname] == "":
|
|
||||||
if options.timestamp == True:
|
|
||||||
print(str(time.time()),)
|
|
||||||
print("ERROR: null vnoded pid of node: %s" % nname)
|
|
||||||
else:
|
|
||||||
childprocs = []
|
|
||||||
ppid = self.pids[nname]
|
|
||||||
for proc in procs:
|
|
||||||
val = proc.split()
|
|
||||||
if ppid == val[1]:
|
|
||||||
childprocs.append([val[1], val[2]])
|
|
||||||
if ppid == val[0]:
|
|
||||||
childprocs.append([val[1], val[2]])
|
|
||||||
self.pids[nname] = childprocs
|
|
||||||
return self.pids
|
|
||||||
|
|
||||||
def printsesspids(self):
|
|
||||||
if self.pids == {}:
|
|
||||||
return {}
|
|
||||||
for pp in self.pids:
|
|
||||||
if self.pids[pp] != []:
|
|
||||||
for ap in range(len(self.pids[pp]) - 1):
|
|
||||||
# ap pid
|
|
||||||
print(", " + self.pids[pp][ap][0],)
|
|
||||||
# ap cmd
|
|
||||||
print(", " + self.pids[pp][ap][1],)
|
|
||||||
procmetrics = [str(x) for x in self.pids[pp][ap][-1]]
|
|
||||||
print(", " + ", ".join(procmetrics),)
|
|
||||||
nodemetrics = [str(x) for x in self.pids[pp][-1]]
|
|
||||||
print(", " + ", ".join(nodemetrics))
|
|
||||||
|
|
||||||
def getprocessmetrics(self, pid):
|
|
||||||
"""
|
|
||||||
Return [cpu#, vsize(kb), ttime, utime, stime, wtime]
|
|
||||||
from a /proc/pid/stat (a single line file) assume columns are:
|
|
||||||
pid(0) comm(1) state ppid pgrp sess tty_nr tpgid flags
|
|
||||||
minflt cmiflt majflt cmajflt # utime(12) stime cutime cstime
|
|
||||||
priority nice num_threads itrealvalue starttime vsize(22) rss rsslim
|
|
||||||
startcode endcode startstack kstkesp signal blocked sigignore sigcatch
|
|
||||||
wchan nswap cnswap exit_signal processor(38) rt_priority
|
|
||||||
policy ioblock guest_time cguest_time (man 5 proc)
|
|
||||||
"""
|
|
||||||
rval = {}
|
|
||||||
lines = readfile("/proc/" + pid + "/stat")
|
|
||||||
if lines == []:
|
|
||||||
return rval
|
|
||||||
items = lines[0].split()
|
|
||||||
utime, stime, cutime, cstime = [int(x) for x in items[13:17]]
|
|
||||||
rval = (items[38], # last run processor
|
|
||||||
int(items[22])/1000, # process virtual mem in kb
|
|
||||||
utime + stime + cutime + cstime, # totoal time
|
|
||||||
utime, # user time
|
|
||||||
stime, # system time
|
|
||||||
cutime + cstime) # wait time
|
|
||||||
return rval
|
|
||||||
|
|
||||||
def getnodethroughput(self, pid):
|
|
||||||
"""
|
|
||||||
Return node throughput of total receive and transmit packets in kb.
|
|
||||||
"""
|
|
||||||
lines = readfile("/proc/" + pid + "/net/dev")
|
|
||||||
if lines == []:
|
|
||||||
return -0.00
|
|
||||||
ifs = [x.split() for x in lines[2:]]
|
|
||||||
ifm = zip(*ifs)
|
|
||||||
rv = sum(int(x) for x in ifm[1]) # received bytes
|
|
||||||
tr = sum(int(x) for x in ifm[9]) # transmited bytes
|
|
||||||
return (rv + tr)/1000
|
|
||||||
|
|
||||||
def getnodemetrics(self, mindex):
|
|
||||||
"""
|
|
||||||
Return NodeMetrics with indexed by nodename, values are rows of
|
|
||||||
[ [ppid, vnoded, [cpu#, vmem(kb), ttime, utime, stime, wtime]],
|
|
||||||
[cpid, cmd, [cpu#, vmem(kb), ttime, utime, stime, wtime]], ... ,
|
|
||||||
[thrput, vmem(kb), ttime, utime, stime, wtime]]
|
|
||||||
"""
|
|
||||||
if mindex == 'a':
|
|
||||||
metricref = self.nodemetricsA
|
|
||||||
else:
|
|
||||||
metricref = self.nodemetricsB
|
|
||||||
|
|
||||||
self.getpids()
|
|
||||||
if self.pids == {}:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
for nod in self.pids:
|
|
||||||
nmetric = NodeMetrics()
|
|
||||||
nmetric.__init__()
|
|
||||||
nodeapps = {}
|
|
||||||
for ap in range(len(self.pids[nod])): # get each process metrics
|
|
||||||
procm = self.getprocessmetrics(self.pids[nod][ap][0])
|
|
||||||
if procm == []:
|
|
||||||
if options.timestamp == True:
|
|
||||||
print(str(time.time()),)
|
|
||||||
print("WARNING: transient process", self.pids[nod][ap][1],
|
|
||||||
"/", self.pids[nod][ap][0], "on node %s" % nod)
|
|
||||||
else:
|
|
||||||
nodeapps[ap] = procm
|
|
||||||
self.pids[nod][ap].append(nodeapps[ap])
|
|
||||||
processm = zip(*nodeapps.values()) # get overall node metrics
|
|
||||||
if len(processm) > 0:
|
|
||||||
nmetric.setvalues((self.getnodethroughput(self.pids[nod][0][0]),
|
|
||||||
# vsize(kb)
|
|
||||||
sum(int(x) for x in processm[1]),
|
|
||||||
# ttime
|
|
||||||
sum(int(x) for x in processm[2]),
|
|
||||||
# utime
|
|
||||||
sum(int(x) for x in processm[3]),
|
|
||||||
# stime
|
|
||||||
sum(int(x) for x in processm[4]),
|
|
||||||
sum(int(x) for x in processm[5]))) # wtime
|
|
||||||
metricref[nod] = nmetric
|
|
||||||
return metricref
|
|
||||||
|
|
||||||
def setnodemetricsC(self, key, val):
|
|
||||||
self.nodemetricsC[key] = val
|
|
||||||
|
|
||||||
def printnodemetrics(self, mindex):
|
|
||||||
if mindex == 'c':
|
|
||||||
mm = self.nodemetricsC
|
|
||||||
else:
|
|
||||||
if mindex == 'a':
|
|
||||||
mm = self.nodemetricsA
|
|
||||||
else:
|
|
||||||
mm = self.nodemetricsB
|
|
||||||
|
|
||||||
for k in self.nodemetricsC:
|
|
||||||
if options.timestamp:
|
|
||||||
print(str(time.time()) + ",",)
|
|
||||||
print(k, ",", mm[k].tocsv())
|
|
||||||
|
|
||||||
def readnodethresholds(self, filename):
|
|
||||||
if filename is None:
|
|
||||||
return
|
|
||||||
lines = readfile(filename)
|
|
||||||
for l in lines:
|
|
||||||
mval = l.strip().split('=')
|
|
||||||
if len(mval) > 1:
|
|
||||||
thekey = mval[0].strip()
|
|
||||||
theval = mval[1].strip()
|
|
||||||
if thekey in self.nodethresholds.getkeys():
|
|
||||||
self.nodethresholds.setvalue(thekey, float(theval))
|
|
||||||
|
|
||||||
def checknodethresholds(self, nname):
|
|
||||||
calcm = self.nodemetricsC[nname]
|
|
||||||
for keyname in self.nodethresholds.getkeys():
|
|
||||||
if float(calcm.getvalue(keyname)) > float(self.nodethresholds.getvalue(keyname)):
|
|
||||||
alarm = ["node", nname + "/" + self.pids[nname][0][0], keyname,
|
|
||||||
calcm.getvalue(keyname), ">", self.nodethresholds.getvalue(keyname)]
|
|
||||||
if options.timestamp:
|
|
||||||
print(str(time.time()) + ",",)
|
|
||||||
print(", ".join(str(x) for x in alarm))
|
|
||||||
|
|
||||||
def calcnodemetrics(self, cputimea, cputimeb, mems):
|
|
||||||
"""
|
|
||||||
Return a dict of nodemetrics indexed by node name
|
|
||||||
nodemetrics[nodename][-1] = node/host%.
|
|
||||||
"""
|
|
||||||
p = []
|
|
||||||
for i in range(len(cputimeb[0])):
|
|
||||||
p.append(cputimeb[0][i] - cputimea[0][i])
|
|
||||||
hostusedcpu = p[0] + p[1]
|
|
||||||
hostusedmem = mems[0] - mems[1]
|
|
||||||
if hostusedcpu == 0:
|
|
||||||
print("WARNING: host used cpu = 0, ", p[0], p[1])
|
|
||||||
hostusedcpu = 1
|
|
||||||
if hostusedmem == 0:
|
|
||||||
print("WARNING: host used mem = 0, ", mems[0], mems[1])
|
|
||||||
hostusedmem = 1
|
|
||||||
|
|
||||||
nodesa = self.nodemetricsA
|
|
||||||
nodesb = self.nodemetricsB
|
|
||||||
for nod in nodesb:
|
|
||||||
calcm = self.nodemetricsC
|
|
||||||
calcm = NodeMetrics()
|
|
||||||
calcm.__init__()
|
|
||||||
if (nod in nodesa):
|
|
||||||
try:
|
|
||||||
if (nodesb[nod] == []) | (nodesa[nod] == []) | \
|
|
||||||
(False == isinstance(nodesb[nod], NodeMetrics)) | \
|
|
||||||
(False == isinstance(nodesa[nod], NodeMetrics)):
|
|
||||||
if options.timestamp == True:
|
|
||||||
print(str(time.time()),)
|
|
||||||
print("Warning: nodes %s is not fully instanciated" % nod)
|
|
||||||
else:
|
|
||||||
# calc throughput kbps
|
|
||||||
calcm.setvalue("nodethroughput", "%.2f" % (8 * (nodesb[nod].getvalue("nodethroughput")
|
|
||||||
- nodesa[nod].getvalue("nodethroughput")) / options.interval))
|
|
||||||
# calc mem node used / host used
|
|
||||||
calcm.setvalue("nodeusedmemory", "%.2f" % (
|
|
||||||
100.0 * (nodesb[nod].getvalue("nodeusedmemory") / hostusedmem)))
|
|
||||||
|
|
||||||
# calc total cpu time node / host
|
|
||||||
calcm.setvalue("nodetotalcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodetotalcpu")
|
|
||||||
- nodesa[nod].getvalue("nodetotalcpu")) / hostusedcpu))
|
|
||||||
# calc user cpu time node / host
|
|
||||||
calcm.setvalue("nodeusercpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodeusercpu")
|
|
||||||
- nodesa[nod].getvalue("nodeusercpu")) / hostusedcpu))
|
|
||||||
# calc system cpu time node / host
|
|
||||||
calcm.setvalue("nodesystemcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodesystemcpu")
|
|
||||||
- nodesa[nod].getvalue("nodesystemcpu")) / hostusedcpu))
|
|
||||||
# calc waitcpu time node / host
|
|
||||||
calcm.setvalue("nodewaitcpu", "%.2f" % (100.0 * (nodesb[nod].getvalue("nodewaitcpu")
|
|
||||||
- nodesa[nod].getvalue("nodewaitcpu")) / hostusedcpu))
|
|
||||||
logsession.nodemetricsC[nod] = calcm
|
|
||||||
|
|
||||||
if options.alarm is not None:
|
|
||||||
logsession.checknodethresholds(nod)
|
|
||||||
except IndexError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
print("Warning: transient node %s " % nod)
|
|
||||||
|
|
||||||
return nodesb
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
usagestr = "%prog [-h] [options] [args]\n\nLog server and optional CORE session metrics to stdout."
|
|
||||||
parser = optparse.OptionParser(usage=usagestr)
|
|
||||||
parser.set_defaults(interval=2, timestamp=False,
|
|
||||||
configfile="/etc/core/perflogserver.conf",
|
|
||||||
alarm=True, session=None)
|
|
||||||
parser.add_option("-i", "--interval", dest="interval", type=int,
|
|
||||||
help="seconds to wait between samples; default=%s" % parser.defaults["interval"])
|
|
||||||
parser.add_option("-t", "--timestamp", action="store_true",
|
|
||||||
dest="timestamp",
|
|
||||||
help="include timestamp on each line")
|
|
||||||
parser.add_option("-c", "--configfile", dest="configfile",
|
|
||||||
type="string",
|
|
||||||
help="read threshold values from the specified file;default=%s" % parser.defaults["configfile"])
|
|
||||||
parser.add_option("-a", "--alarm", action="store_true",
|
|
||||||
dest="alarm",
|
|
||||||
help="generate alarms based threshold check on each cycle")
|
|
||||||
parser.add_option("-s", "--session", dest="session", type=int,
|
|
||||||
help="CORE session id; default=%s" % parser.defaults["session"])
|
|
||||||
global options
|
|
||||||
global ncpus
|
|
||||||
global serverthresholds
|
|
||||||
global logsession
|
|
||||||
global cyclecount
|
|
||||||
|
|
||||||
options, _args = parser.parse_args()
|
|
||||||
|
|
||||||
signal.signal(signal.SIGINT, handler)
|
|
||||||
signal.signal(signal.SIGTERM, handler)
|
|
||||||
|
|
||||||
ncpus = numcpus()
|
|
||||||
|
|
||||||
# server threshold dictionary - a ServerMetrics instant with default values
|
|
||||||
serverthresholds = ServerMetrics()
|
|
||||||
# set to server threshold default values: serverloadavg1=3.5,
|
|
||||||
# serverloadavg5=3.5, serverloadavg15=3.5, serverusedmemory=80.0,
|
|
||||||
# serverusedcputime=80.0, processorusedcputime=90.0
|
|
||||||
serverthresholds.setvalues([3.5, 3.5, 3.5, 80.0, 80.0, 90.0])
|
|
||||||
if options.alarm is True:
|
|
||||||
# read server threshold values from configuration file
|
|
||||||
readserverthresholds(options.configfile)
|
|
||||||
|
|
||||||
if options.session is not None:
|
|
||||||
logsession = LogSession()
|
|
||||||
|
|
||||||
# mark host log baseline
|
|
||||||
print("server: ", ", ".join(str(x) for x in os.uname()), ",", ncpus, "CPU cores")
|
|
||||||
print("start timestamp:", time.time(), ", baseline data: ")
|
|
||||||
print(csvserverbaseline())
|
|
||||||
print("server metrics: ", ", ".join(str(x) for x in serverthresholds.getkeys()))
|
|
||||||
if options.session is not None:
|
|
||||||
print("node metrics: nodename, ", ", ".join(str(x) for x in logsession.nodethresholds.getkeys()))
|
|
||||||
|
|
||||||
cyclecount = 0
|
|
||||||
while True:
|
|
||||||
cputimea = collectservercputimes()
|
|
||||||
if options.session is not None:
|
|
||||||
nodesa = logsession.getnodemetrics("a")
|
|
||||||
time.sleep(options.interval)
|
|
||||||
cputimeb = collectservercputimes()
|
|
||||||
mems = collectservermems()
|
|
||||||
calccputime = calcservercputimes(cputimea, cputimeb)
|
|
||||||
m = csvservermetrics(collectservermetrics(
|
|
||||||
calccputime, mems, options.alarm))
|
|
||||||
print(m)
|
|
||||||
|
|
||||||
if options.session is not None:
|
|
||||||
nodesb = logsession.getnodemetrics('b')
|
|
||||||
if nodesb != {}:
|
|
||||||
logsession.calcnodemetrics(cputimea, cputimeb, mems)
|
|
||||||
logsession.printnodemetrics('c')
|
|
||||||
|
|
||||||
sys.stdout.flush()
|
|
||||||
cyclecount = cyclecount + 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@ -1,10 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# (c)2010-2012 the Boeing Company
|
|
||||||
# author: Yueli Yang <yueli.yang@boeing.com>
|
|
||||||
#
|
|
||||||
# start core performance logging and collect output to file
|
|
||||||
# /tmp/pycore.nnnn/perf<sessionid>.log
|
|
||||||
HOOKS_DIR=$(dirname $0)
|
|
||||||
|
|
||||||
sid=$(pwd | awk -F / {'print $3'} | awk -F . {'print $2'})
|
|
||||||
python $HOOKS_DIR/perflogserver.py -t -a -c /etc/core/perflogserver.conf -s $sid > perf$sid.log &
|
|
|
@ -1,11 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# (c)2010-2012 the Boeing Company
|
|
||||||
# author: Yueli Yang <yueli.yang@boeing.com>
|
|
||||||
#
|
|
||||||
# terminate core perfromance logging process
|
|
||||||
|
|
||||||
perfproc=$(ps aux | grep perflogserver | grep python | awk {'print $2'})
|
|
||||||
if [ ! $perfproc = "" ]; then
|
|
||||||
echo "terminating core performance log process $perfproc"
|
|
||||||
kill -9 $perfproc
|
|
||||||
fi
|
|
|
@ -1,52 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# (c)2010-2012 the Boeing Company
|
|
||||||
# author: Yueli Yang <yueli.yang@boeing.com>
|
|
||||||
#
|
|
||||||
# Create a tar ball of the CORE session's runtime folder /tmp/pycore.nnnn
|
|
||||||
# Collection such runtime tar balls from all distributed servers to folder
|
|
||||||
# /tmp/<sessionid>-<date>-<time> for example: /tmp/56779-11Oct14-09:33:13
|
|
||||||
|
|
||||||
currentdir=$(pwd)
|
|
||||||
sid=${currentdir##*.}
|
|
||||||
ts=$(date +%y%h%d-%T)
|
|
||||||
logfile=/tmp/corelog.tgz
|
|
||||||
echo Collect data from localhost:$currentdir to $logfile
|
|
||||||
cd ..
|
|
||||||
tar -czf $logfile ${currentdir##*/}
|
|
||||||
if [ ! $? = 0 ]; then
|
|
||||||
echo Failed to collect CORE data from localhost:$currentdir to $logfile
|
|
||||||
fi
|
|
||||||
cd $currentdir
|
|
||||||
|
|
||||||
m=$(grep master $currentdir/servers)
|
|
||||||
if [ ! $sid = ${m##*=} ]; then
|
|
||||||
# quite if this is not a master server
|
|
||||||
echo not a master server
|
|
||||||
exit
|
|
||||||
fi
|
|
||||||
|
|
||||||
# On a master server, create a folder to harvest overall core emulation data
|
|
||||||
logdir=/tmp/$sid-$ts
|
|
||||||
if [ ! -e $logdir ]; then
|
|
||||||
echo create folder $logdir
|
|
||||||
mkdir $logdir
|
|
||||||
fi
|
|
||||||
cp $logfile $logdir/localhost-${currentdir##*.}.tgz
|
|
||||||
|
|
||||||
# harvest CORE data from distributed servers
|
|
||||||
hs=$(grep -v master= $currentdir/servers | awk {'print $2'})
|
|
||||||
echo hosts are localhost $hs
|
|
||||||
for h in $hs; do
|
|
||||||
echo checking host $h ...
|
|
||||||
out=$(ping -c 1 -w 1 $h | grep " 0 received,")
|
|
||||||
if [ " $out " = " " ]; then
|
|
||||||
slavesid=$(ssh $h tar -tzf $logfile | awk -F / {'print $1'})
|
|
||||||
if [ $? = 0 ]; then
|
|
||||||
destlogfile=$logdir/$h-${slavesid##*.}.tgz
|
|
||||||
echo Collect data from $h:$logfile to $destlogfile
|
|
||||||
scp $h:$logfile $destlogfile
|
|
||||||
else
|
|
||||||
echo $logfile could not be found on host $h
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
done
|
|
|
@ -1,26 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# (c)2010-2012 the Boeing Company
|
|
||||||
# author: Yueli Yang <yueli.yang@boeing.com>
|
|
||||||
#
|
|
||||||
# start time sync with ntpd if it is installed but not started
|
|
||||||
#
|
|
||||||
# Usage: sudo timesyncstart.sh [debug]
|
|
||||||
|
|
||||||
dbg=$1
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
date +%y%h%d-%T-%N
|
|
||||||
fi
|
|
||||||
if [ -e /usr/sbin/ntpd ]; then
|
|
||||||
/etc/init.d/ntp status
|
|
||||||
if [ $? = 3 ]; then
|
|
||||||
/etc/init.d/ntp start
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
echo "start ntpd process"
|
|
||||||
ntptime
|
|
||||||
ntpq -c peers
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
date +%y%h%d-%T-%N
|
|
||||||
fi
|
|
|
@ -1,51 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
# (c)2010-2012 the Boeing Company
|
|
||||||
# author: Yueli Yang <yueli.yang@boeing.com>
|
|
||||||
#
|
|
||||||
# Stop timesync service. If it was not yet in sync with any ntp server,
|
|
||||||
# adjust its time to the first ntp server in # configured in /etc/ntp.conf
|
|
||||||
#
|
|
||||||
# Usage: sudo sh timesyncstop.sh [deubg]
|
|
||||||
|
|
||||||
dbg=$1
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
date +%y%h%d-%T-%N
|
|
||||||
fi
|
|
||||||
svs=""
|
|
||||||
if [ -e /usr/sbin/ntpd ]; then
|
|
||||||
# command "sudo /usr/sbin/ntpd -q -g" will start the ntp service and
|
|
||||||
# keep it on until it performs a good synchronization, then it leaves.
|
|
||||||
# It is not quick enough for our need here
|
|
||||||
/etc/init.d/ntp status
|
|
||||||
if [ $? = 0 ]; then
|
|
||||||
svs=$(ntpq -c peers | awk {'print $1'} | grep ^*)
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
ntpq -c peers
|
|
||||||
echo "get time servers for later need: $svs"
|
|
||||||
ntptime
|
|
||||||
echo "terminate ntpd process"
|
|
||||||
fi
|
|
||||||
/etc/init.d/ntp stop
|
|
||||||
fi
|
|
||||||
|
|
||||||
# if there are time servers configured in file /etc/ntp.conf,
|
|
||||||
# adjust time to the first server
|
|
||||||
if [ "$svs" = "" ]; then
|
|
||||||
svs=$(grep ^server /etc/ntp.conf | awk {'print $2'})
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
echo "$svs"
|
|
||||||
fi
|
|
||||||
for sv in "$svs"; do
|
|
||||||
ntpdate $sv
|
|
||||||
# may use "sntp -s $sv" or "rdate" if ntpdate is deprecated
|
|
||||||
break
|
|
||||||
done
|
|
||||||
else
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
echo No time adjustment because time sync was stable with $svs
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
if [ "$dbg" = "debug" ]; then
|
|
||||||
date +%y%h%d-%T-%N
|
|
||||||
fi
|
|
Loading…
Reference in a new issue