# Mini Shell
# vim: set ts=4 sw=4 expandtab syntax=python:
"""
ngxutil.logparse
Nginx access.log parser
@author J. Hipps <jacobh@inmotionhosting.com>
"""
import logging
import re
import socket
import math
from time import time
from collections import OrderedDict
import arrow
from tailer import Tailer
from ngxutil.cache import find_cache_item_url
from ngxutil.util import *
# Logger obtained under the 'ngxutil' name; used for all messages in this module
logger = logging.getLogger('ngxutil')

# Pattern describing one access.log line; NgxTailer compiles it and uses the
# named groups as the keys of every parsed entry.
# The [loc=...], [br=...], [tls=...]/[cipher=...]/[scheme=...] and [eorigin=...]
# sections are optional in the pattern, so their groups may come back as None.
# The 'bytes' group matches either digits or a literal '-'.
LOGRGX = r'(?P<clientip>[0-9a-fA-F\.\:]{7,32}) (?P<ident>[^ ]+) (?P<user>[^ ]+) ' \
r'\[(?P<timestamp>[^\]]+)\] "(?P<method>[A-Za-z0-9]+) (?P<uri>.+) ' \
r'HTTP/(?P<httpversion>[0-9\.]{1,4})" (?P<status>[0-9]{3}) (?P<bytes>[0-9]+|\-) ' \
r'"(?P<referrer>.+)" "(?P<agent>[^\"]+)" ' \
r'(?:\[loc=(?P<loc>[^\]]+)\] )?' \
r'\[rt=(?P<request_time>[^\]]+)\] \[proto=(?P<protocol>[^\]]+)\] \[args=(?P<args>[^\]]*)\] ' \
r'\[ucs=(?P<upstream_cache_status>[^\]]*)\] \[xpc=(?P<proxy_cache>[^\]]+)\] ' \
r'\[uct=(?P<upstream_connect_time>[^\]]*)\] \[gz=(?P<gzip_ratio>[^\]]*)\]' \
r'(?: \[br=(?P<br_ratio>[^\]]*)\])?' \
r'(?: \[tls=(?P<tls_version>[^\]]*)\] \[cipher=(?P<tls_ciphers>[^\]]*)\] \[scheme=(?P<scheme>[^\]]*)\])?' \
r'(?: \[eorigin=(?P<eorigin>[^\]]*)\])? ' \
r'(?P<vhost>[a-zA-Z\._:\-0-9]+) (?P<server_host>[a-zA-Z\._:\-0-9]+) ' \
r'(?P<server_hostname>[a-zA-Z\._:\-0-9]+) (?P<connection_id>[0-9]+)'
class NgxTailer(Tailer):
    """
    Reverse reader/parser for NGINX access logs

    Starting at the end of the file, walks backwards one batch of lines
    at a time, runs every line through the compiled LOGRGX pattern and
    stops once enough entries were gathered or a batch yields nothing.
    Built on tailer.Tailer from the PyTailer module
    """
    # Compiled access-log pattern, shared by all instances
    rgx = re.compile(LOGRGX)
    # Entries timestamped before this moment are dropped by parse_logline()
    span_start = arrow.get(0)
    # Entry count after which read_log() stops fetching batches (None = no limit)
    lastlines = None

    def up_next_lines(self, lines=100):
        """
        Return the batch of up to @lines lines that precedes the current
        file position, then leave the file positioned at the start of
        that batch so the next call continues further back

        Presumably each seek_line() call steps back one line (inherited
        from Tailer -- verify there). Returns a list; empty if no data
        was read.
        """
        started = time()
        batch_end = self.file.tell()

        # Walk backwards until @lines steps were taken or seek_line() gives up
        for _ in range(lines):
            if not self.seek_line():
                break

        batch_start = self.file.tell()
        raw = self.file.read(batch_end - batch_start - 1)
        # Reading moved the position forward again; rewind to the batch start
        self.seek(batch_start)

        result = self.splitlines(raw) if raw else []

        elapsed = time() - started
        logger.debug("*** NgxTailer prof: up_next_lines() d-t = %f", elapsed)
        return result

    def read_log(self, lastlines=None, span=None):
        """
        Collect parsed entries from the end of the log backwards

        @lastlines: stop fetching once at least this many entries were
                    collected (batches are 1000 lines, so more may be
                    returned)
        @span:      only keep entries from the last @span hours
        Returns a list of entry dicts, or None if @span cannot be
        turned into a start time
        """
        began = time()
        self.lastlines = lastlines

        if not span:
            self.span_start = arrow.get(0)
        else:
            try:
                self.span_start = arrow.now().shift(hours=-int(span))
            except Exception as e:
                logger.error("Failed to parse span start time: %s", str(e))
                return None

        logger.debug("Using span=%s (span_start=%s) / lastlines=%s", span, self.span_start, lastlines)

        self.seek_end()
        collected = []
        while True:
            batch_began = time()
            parsed = self.parse_lineset(self.up_next_lines(1000))

            # A batch with no usable entries ends the scan
            if not parsed:
                break
            collected.extend(parsed)

            if self.lastlines and len(collected) >= self.lastlines:
                break

            batch_elapsed = time() - batch_began
            logger.debug("*** NgxTailer prof: read_log() loop d-t = %f [len(tset) = %d / len(logset) = %d]", batch_elapsed, len(parsed), len(collected))

        total_elapsed = time() - began
        logger.debug("NgxTailer: Parsed %d lines in %f seconds", len(collected), total_elapsed)
        return collected

    def parse_lineset(self, lines):
        """
        Run parse_logline() over every item of @lines and return a list
        of the entries that parsed successfully, in the same order
        """
        started = time()
        entries = [entry for entry in map(self.parse_logline, lines) if entry]
        elapsed = time() - started
        logger.debug("*** NgxTailer prof: parse_lineset() d-t = %f", elapsed)
        return entries

    def parse_logline(self, line):
        """
        Parse a single log @line

        Returns a dict of the pattern's named groups, or None when the
        line does not match, its timestamp cannot be parsed, or it is
        older than span_start
        """
        try:
            fields = self.rgx.match(line.strip()).groupdict()
            stamp = arrow.get(fields['timestamp'], 'DD/MMM/YYYY:HH:mm:ss Z')
            return None if stamp < self.span_start else fields
        except Exception as e:
            # A non-matching line ends up here too (match() returned None)
            logger.debug("Failed to parse line ('%s'): %s", line.strip(), str(e))
            return None
def get_cache_hitrate(logdata):
    """
    Tally log entries by their 'proxy_cache' value

    Returns a dict mapping every distinct 'proxy_cache' value found in
    @logdata to the number of entries carrying it; entries without the
    key are counted under None
    """
    counts = {}
    for entry in logdata:
        state = entry.get('proxy_cache')
        counts[state] = counts.get(state, 0) + 1
    return counts
def get_status_hitrate(logdata):
    """
    Count responses per status code and per status class

    The returned dict always holds the class keys '1xx'..'5xx' (starting
    at 0) plus one key per exact status seen in @logdata. Entries with
    no status are skipped, and 5xx responses whose 'eorigin' is 'cpanel'
    are ignored entirely.
    """
    odict = {'1xx': 0, '2xx': 0, '3xx': 0, '4xx': 0, '5xx': 0}
    for tline in logdata:
        status = tline.get('status')
        if not status:
            continue
        if tline.get('eorigin') == 'cpanel' and status[0] == '5':
            continue

        odict[status] = odict.get(status, 0) + 1

        try:
            odict[status[0] + 'xx'] += 1
        except (KeyError, TypeError):
            # Status outside 1xx-5xx (KeyError) or not a string (TypeError):
            # keep the exact-status count, skip the class bucket.
            pass
    return odict
def get_reqtime(logdata):
    """
    Return min/avg/max request time (floats)

    Reads 'request_time' from every entry of @logdata (0.0 when the key
    is absent) and returns a dict with the keys 'avg', 'min' and 'max'.
    For empty @logdata all three values are 0.0 instead of raising on
    the division / min() / max() of an empty list.
    """
    rtimes = [float(x.get('request_time', 0.0)) for x in logdata]
    if not rtimes:
        return {'avg': 0.0, 'min': 0.0, 'max': 0.0}

    return {
        'avg': math.fsum(rtimes) / float(len(rtimes)),
        'min': min(rtimes),
        'max': max(rtimes),
    }
def get_bytes(logdata):
    """
    Return TX bytes

    Sums the 'bytes' field of every entry in @logdata and returns the
    total as a float. LOGRGX lets this field be a literal '-' instead of
    a number; such entries (and None / missing values) count as 0 rather
    than making float() raise.
    """
    sizes = []
    for entry in logdata:
        raw = entry.get('bytes', 0.0)
        sizes.append(0.0 if raw in ('-', None) else float(raw))
    return math.fsum(sizes)
def get_top_uris(logdata, rmethod=None, czone=None, topcount=10, colorize=True):
    """
    Rank requests by hit count and return the @topcount most frequent

    Requests are grouped by method, status, server_host and uri.
    @rmethod:  when set, only entries with exactly this method are counted
               (eg. POST, GET, etc.)
    @czone:    when set, each result is looked up through
               find_cache_item_url() (http:// first, then https://) and its
               'cache' value reports expired / valid / not found;
               otherwise 'cache' is '-'
    @colorize: wrap the cache state with strcolor() instead of returning
               the plain lowercase word
    Returns a dict keyed by "<method> <status> <host> <uri>", most hits
    first, each value holding method, status, host, uri, hits and cache
    """
    hits = {}
    for entry in logdata:
        if rmethod and rmethod != entry['method']:
            continue
        label = "{method} {status} {server_host} {uri}".format(**entry)
        hits[label] = hits.get(label, 0) + 1

    ranked = sorted(hits.items(), key=lambda pair: pair[1], reverse=True)

    result = OrderedDict()
    for label, count in ranked[:topcount]:
        # maxsplit=3 keeps any spaces inside the URI intact
        method, status, host, uri = label.split(' ', 3)

        if not czone:
            cache_state = '-'
        else:
            cached = (find_cache_item_url(czone, 'http://{}{}'.format(host, uri), method)
                      or find_cache_item_url(czone, 'https://{}{}'.format(host, uri), method))
            if not cached:
                cache_state = strcolor('red', 'NO') if colorize else 'no'
            elif cached['expired']:
                cache_state = strcolor('yellow', 'EXPIRED') if colorize else 'expired'
            else:
                cache_state = strcolor('green', 'VALID') if colorize else 'valid'

        result[label] = {'method': method, 'status': status, 'host': host,
                         'uri': uri, 'hits': count, 'cache': cache_state}
    return dict(result)
def get_top_hosts(logdata, topcount=10):
    """
    Rank 'server_host' values by hit count

    Returns a dict of at most @topcount hosts, most hits first, each
    mapped to {'host': <host>, 'hits': <count>}
    """
    hits = {}
    for entry in logdata:
        host = "{server_host}".format(**entry)
        hits[host] = hits.get(host, 0) + 1

    ranked = sorted(hits.items(), key=lambda pair: pair[1], reverse=True)
    return {host: {'host': host, 'hits': count} for host, count in ranked[:topcount]}
def get_top_locations(logdata, topcount=10):
    """
    Rank 'loc' values by hit count

    Returns a dict mapping at most @topcount locations to their number
    of hits, most hits first
    """
    hits = {}
    for entry in logdata:
        where = entry['loc']
        hits[where] = hits.get(where, 0) + 1

    ranked = sorted(hits.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:topcount])
def get_top_agents(logdata, topcount=10):
    """
    Rank 'agent' (User-Agent) values by hit count

    Returns a dict mapping at most @topcount agents to their number of
    hits, most hits first
    """
    hits = {}
    for entry in logdata:
        agent = entry['agent']
        hits[agent] = hits.get(agent, 0) + 1

    ranked = sorted(hits.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:topcount])
def get_top_ips(logdata, rresolve=True, rtimeout=2.0, topcount=10):
    """
    Return list of top client IPs and number of hits, up to a max of @topcount

    @rresolve: when True, a reverse lookup is attempted for each returned
               IP; a failed lookup yields None instead of raising
    @rtimeout: value installed as the default socket timeout while the
               lookups run; the previous default is restored afterwards
    Returns a dict keyed by client IP, most hits first, each value being
    {'hits': <count>, 'reverse': <hostname or None>}
    """
    tally = {}
    for tline in logdata:
        tip = tline['clientip']
        tally[tip] = tally.get(tip, 0) + 1

    top = sorted(tally.items(), key=lambda x: x[1], reverse=True)[:topcount]

    # The default timeout is process-wide state: remember it so it can be
    # put back, otherwise every socket created later inherits @rtimeout.
    # NOTE(review): whether this timeout actually bounds gethostbyaddr()
    # depends on the platform resolver -- confirm.
    prev_timeout = socket.getdefaulttimeout()
    socket.setdefaulttimeout(rtimeout)
    odict = OrderedDict()
    try:
        for tip, thit in top:
            tres = None
            if rresolve:
                try:
                    tres = socket.gethostbyaddr(tip)[0]
                except Exception:
                    # Best-effort lookup; unlike a bare except this lets
                    # KeyboardInterrupt / SystemExit through.
                    tres = None
            odict[tip] = {'hits': thit, 'reverse': tres}
    finally:
        socket.setdefaulttimeout(prev_timeout)
    return dict(odict)
def get_totals(logdata):
    """
    Calculate totals for request time

    Returns a dict with the key 'rt_tot': the sum of 'request_time'
    over all entries of @logdata, as a float (0.0 for empty input;
    entries without the key contribute 0.0).
    """
    # The accumulator must exist before '+=' and the value has to come
    # from the current entry; previously both were missing, so any
    # non-empty input raised.
    odict = {'rt_tot': 0.0}
    for treq in logdata:
        odict['rt_tot'] += float(treq.get('request_time', 0.0))
    return odict
def filter_log(logdata, field, match):
    """
    Filter log lines by @field, matching @match

    A @match of '*' returns @logdata itself, unfiltered. Otherwise a new
    list is returned with the entries whose @field value, lowercased,
    equals @match (so @match is expected in lowercase). Entries where
    the field is missing or None never match anything but ''.
    """
    if match == '*':
        return logdata
    # Optional regex groups are stored as None, and dict.get()'s default
    # only covers missing keys -- coerce None to '' before lowercasing.
    return [x for x in logdata if (x.get(field) or '').lower() == match]
def parse_log(logpath, lastlines=None, span=None):
    """
    Open the access log at @logpath and parse it with NgxTailer

    @lastlines and @span are handed to NgxTailer.read_log() unchanged.
    Returns whatever read_log() produced, or None when anything goes
    wrong while opening, reading or reporting on the file (the error is
    logged, not raised)
    """
    try:
        with open(logpath, 'r') as logfile:
            entries = NgxTailer(logfile).read_log(lastlines, span)
            logger.debug("Read %d lines from %s (requested %s)", len(entries), logpath, lastlines)
    except Exception as err:
        logger.error("Failed to read log [%s]: %s", logpath, str(err))
        return None
    return entries
# Zerion Mini Shell 1.0