# vim: set ts=4 sw=4 expandtab syntax=python:
"""
ngxstats.master
Realtime Nginx stats aggregation tool
Daemon lifecycle management
Copyright (c) 2019-2020 InMotion Hosting, Inc.
https://www.inmotionhosting.com/
@author J. Hipps <jacobh@inmotionhosting.com>
"""
# pylint: disable=invalid-name
import multiprocessing
import logging
import logging.handlers
from queue import Empty, Full
from time import time
from collections import Counter
import numpy
from setproctitle import setproctitle
from ngxstats import zmq_client, postgres, influx, stats
from ngxstats.util import gconf
logger = logging.getLogger('ngxstats')
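# Registry of subservice worker processes, keyed by subservice name;
# populated by start()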
workers = {'zmq_client': None, 'postgres': None, 'influx': None, 'stats': None}


def start():
"""
Spawn subservices
"""
logger.info(
"master: main service started. pid = %d",
multiprocessing.current_process().pid,
)
try:
setproctitle("ngxstats: master")
    except Exception:
        # setting the process title is cosmetic; ignore any failure
        pass
# Setup IPC queues
logger.debug("master: creating IPC queues")
q_incoming = multiprocessing.Queue(gconf.queue_max_size)
q_outgoing = multiprocessing.Queue(gconf.queue_max_size)
q_outaggro = multiprocessing.Queue(gconf.queue_max_size)
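    # q_incoming: raw log entries from the zmq_client listener
    # q_outgoing: batched raw entries handed to the postgres writer
    # q_outaggro: aggregated metrics handed to the influx writer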
# Spawn worker procs and attach their queues
logger.debug("master: spawning subservices")
if not gconf.disable_stats:
workers['zmq_client'] = multiprocessing.Process(
name="ngxstats: zmq_client",
target=zmq_client.start,
args=(q_incoming,),
)
workers['postgres'] = multiprocessing.Process(
name="ngxstats: postgres", target=postgres.start, args=(q_outgoing,)
)
workers['influx'] = multiprocessing.Process(
name="ngxstats: influx", target=influx.start, args=(q_outaggro,)
)
if gconf.stats_api_enable:
workers['stats'] = multiprocessing.Process(
name="ngxstats: stats", target=stats.start
)
for tworker, tproc in workers.items():
if tproc:
logger.debug("master: spawn %s...", tworker)
tproc.start()
    log_queue = []    # entries buffered since the last flush
    last_check = 0.0  # timestamp of the last flush
# Run main upkeep loop
logger.debug(
"master: entering main control loop; flush_interval = %ds / queue_max_size = %d",
gconf.flush_interval,
gconf.queue_max_size,
)
while True:
if not gconf.disable_stats:
            # grab new zmq log entries, blocking for up to 0.1s while
            # waiting for new messages; a short blocking get has proven
            # more reliable here than non-blocking polling
try:
log_queue.append(q_incoming.get(True, 0.1))
except Empty:
pass
now = time()
if now - last_check >= gconf.flush_interval:
if not gconf.disable_stats:
logger.debug(
"master: flush_interval reached; now - last_check = %01.2fs",
now - last_check,
)
if now - last_check > (gconf.flush_interval * 1.5):
logger.warning(
"master: processing loop lag: %01.2fs",
now - last_check - gconf.flush_interval,
)
try:
qremain = q_incoming.qsize()
                except Exception:
                    # qsize() is unimplemented on some platforms (e.g. macOS)
                    qremain = 0
logger.debug(
"master: log_queue size = %d / remain = %d",
len(log_queue),
qremain,
)
                # collate and flush data to the postgres writer; put
                # without blocking so a full queue raises Full rather
                # than stalling the control loop indefinitely
                try:
                    q_outgoing.put(log_queue, block=False)
                except Full:
                    logger.error(
                        "q_outgoing is full! Log metrics will be lost."
                    )
# perform aggregation on log_queue data
# & flush aggro data to influxdb
                if log_queue:
                    try:
                        q_outaggro.put(do_aggro(log_queue), block=False)
                    except Full:
                        logger.error(
                            "q_outaggro is full! Aggregate metrics will be lost."
                        )
# reset log_queue
log_queue = []
last_check = now


def do_aggro(indata):
    """
    Aggregate incoming log data for the current time interval.

    To create a new aggregation for Influx, add it here.
    """
aggro = {
'requests': 0, # Total requests
'bytes': 0, # Total bytes sent
'rps': 0.0, # Requests per second average
'bps': 0.0, # Bytes per second average
'tdelta': 0.0, # Time delta
'rt_avg': 0.0, # Request Time - Mean
'rt_med': 0.0, # Request Time - Median
'rt_max': 0.0, # Request Time - Max
'stat_404': 0, # Status count: 404
'stat_406': 0, # Status count: 406
'stat_429': 0, # Status count: 429
'stat_499': 0, # Status count: 499
'stat_500': 0, # Status count: 500
'stat_502': 0, # Status count: 502
'stat_503': 0, # Status count: 503
'stat_503r': 0, # Status count: 503 (when eorigin=NULL)
'stat_504': 0, # Status count: 504
'stat_508': 0, # Status count: 508
'scheme_http': 0, # Total HTTP requests
'scheme_https': 0, # Total HTTPS requests
'proto_http10': 0, # Total HTTP/1.0 requests
'proto_http11': 0, # Total HTTP/1.1 requests
'proto_http2': 0, # Total HTTP/2 requests
'top_vhost': "", # Top vhost/server block
'top_addr': "", # Top client address
'req_wplogin': 0, # Request count: wp-login.php
'req_xmlrpc': 0, # Request count: xmlrpc.php
'req_wlwmanifest': 0, # Request count: wlwmanifest.xml
}
    counters = {
        'vhost': [],  # server_name per request (feeds top_vhost)
        'addr': [],   # remote_addr per request (feeds top_addr)
        'rt': [],     # request_time samples (feeds rt_avg/rt_med/rt_max)
        'ts': [],     # request timestamps (feeds tdelta/rps/bps)
    }
# Aggregate requests
for treq in indata:
aggro['requests'] += 1
aggro['bytes'] += int(treq.get('body_bytes_sent', 0))
counters['vhost'].append(treq.get('server_name'))
counters['addr'].append(treq.get('remote_addr'))
        # skip entries with missing or malformed ts/request_time values
        try:
            counters['ts'].append(float(treq.get('ts')))
        except Exception:
            pass
        try:
            counters['rt'].append(float(treq.get('request_time')))
        except Exception:
            pass
        try:
            # count 'real' 503 errors here (where eorigin is unset);
            # other tracked statuses increment their 'stat_<code>'
            # counter, while untracked codes (e.g. 200) are silently
            # skipped via the KeyError caught below
            if treq.get('status') == '503' and not treq.get('eorigin'):
                aggro['stat_503r'] += 1
            else:
                aggro['stat_' + treq.get('status')] += 1
        except Exception:
            pass
try:
aggro['scheme_' + treq.get('scheme')] += 1
except Exception:
pass
if treq.get('proto') == 'HTTP/2.0':
aggro['proto_http2'] += 1
elif treq.get('proto') == 'HTTP/1.1':
aggro['proto_http11'] += 1
elif treq.get('proto') == 'HTTP/1.0':
aggro['proto_http10'] += 1
        # Aggregate WordPress-specific 'bruteforce' requests; default
        # to '' so a missing 'request' field cannot raise TypeError
        if "wp-login.php" in treq.get('request', ''):
            aggro['req_wplogin'] += 1
        elif "xmlrpc.php" in treq.get('request', ''):
            aggro['req_xmlrpc'] += 1
        elif "wlwmanifest.xml" in treq.get('request', ''):
            aggro['req_wlwmanifest'] += 1
    # Determine period/delta-t; fall back to the configured interval
    # if timestamps are missing or the computed period is zero, which
    # would otherwise divide by zero below
    try:
        period = numpy.max(counters['ts']) - numpy.min(counters['ts'])
    except Exception:
        period = gconf.flush_interval
    if period <= 0:
        period = gconf.flush_interval
    # Calculate final metrics
    aggro['tdelta'] = period
    aggro['rps'] = float(aggro['requests']) / period
    aggro['bps'] = float(aggro['bytes']) / period
    if counters['rt']:
        # guard against an empty sample list, which would make
        # numpy.max() raise and numpy.mean() return NaN
        aggro['rt_avg'] = numpy.mean(counters['rt'])
        aggro['rt_med'] = numpy.median(counters['rt'])
        aggro['rt_max'] = numpy.max(counters['rt'])
try:
aggro['top_vhost'] = Counter(counters['vhost']).most_common(1)[0][0]
aggro['top_addr'] = Counter(counters['addr']).most_common(1)[0][0]
except Exception as e:
logger.warning(
"master: do_aggro: failed to determine top vhost/addr: %s", str(e)
)
return aggro
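

if __name__ == '__main__':
    # Minimal standalone entry point -- a sketch only; it assumes the
    # surrounding daemon wrapper normally configures logging and
    # populates gconf before calling start()
    start()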