Mini Shell

Current File : //opt/sharedrads/alp.py
#! /opt/imh-python/bin/python3
''' Apache Log Parser - Parse Apache domain access logs '''

import os
import sys
import logging
import re
import json
from argparse import ArgumentParser
from time import time
from collections import defaultdict
from platform import node as hostname

import envinfo
from dns import resolver, reversename, exception

from rads import setup_logging, color

__maintainer__ = "Daniel K"
__email__ = "danielk@inmotionhosting.com"
__version__ = "1.0.2"
__date__ = "2016-09-16"


# Template for the directory that holds Apache domain logs.
# The base path is read from envinfo; the trailing "{0!s}" placeholder is
# filled in with str.format() by logs_for_user(), using either a cPanel
# username or "." to address the base directory itself.
USER_DOMLOG_DIR = envinfo.get_data()['apache_domlogs'] + "/{0!s}/"
# Maximum number of log files parsed in --multilogs mode on shared servers.
# main() applies it by slicing the list returned by logs_for_user().
MAX_LOGS_SHARED = 50

# Module-level logger used by every function in this script.
LOGGER = logging.getLogger(__name__)


def ptr_lookup(ip_addr):
    """Return the PTR record for an IP address, or a short status string.

    Args:
        ip_addr: textual IPv4 or IPv6 address.

    Returns:
        The first PTR answer as a string. When the lookup cannot be
        completed, one of the fixed strings "Invalid IP Address",
        "No Record Found", "Query Timed Out", "No nameservers found" or
        "No Answer" is returned instead, so callers can always print the
        result.
    """
    try:
        myresolver = resolver.Resolver()
        # Cap each lookup at one second so a slow resolver cannot stall
        # the whole report.
        myresolver.lifetime = 1.0
        myresolver.timeout = 1.0

        # from_address() raises dns.exception.SyntaxError for text that is
        # not a well-formed address; that is handled below rather than
        # being allowed to abort the run.
        question_name = reversename.from_address(ip_addr)
        answers = myresolver.query(question_name, "PTR")
        return str(answers[0])

    except exception.SyntaxError:
        LOGGER.debug("Not a valid IP address: %s", ip_addr)
        return "Invalid IP Address"
    except resolver.NXDOMAIN:
        return "No Record Found"
    except exception.Timeout:
        LOGGER.debug("Query Timed out looking for %s", ip_addr)
        return "Query Timed Out"
    except resolver.NoNameservers:
        LOGGER.debug("No nameservers found for %s", ip_addr)
        return "No nameservers found"
    except resolver.NoAnswer:
        LOGGER.debug("No answer for %s", ip_addr)
        return "No Answer"


def domlog_lines(source):
    '''Yield raw log lines from a file, or from STDIN when source is "-".

    Args:
        source: path of a log file, or "-" to read from sys.stdin.

    Yields:
        Each line of the input as a string, including its line ending.

    A missing or unreadable file is logged and produces no lines rather
    than raising, so one bad source does not abort the whole run.
    '''

    if source == "-":
        LOGGER.info("Processing from STDIN.")
        yield from sys.stdin
        return

    LOGGER.info("Process file %s", source)
    try:
        # Access logs can contain arbitrary bytes sent by clients; replace
        # undecodable bytes instead of dying with UnicodeDecodeError.
        with open(source, encoding='utf-8', errors='replace') as file_handle:
            yield from file_handle
    except FileNotFoundError:
        LOGGER.warning("Log file %s does not exist", source)
    except OSError:
        LOGGER.error("Error reading file %s", source)


def trim_dict(dictionary, entries):
    '''Return a new dict holding the top entries ordered by value.

    Args:
        dictionary: mapping whose values can be compared with each other.
        entries: maximum number of items to keep.

    Returns:
        A new dict with at most `entries` items, inserted from the largest
        value to the smallest. Items with equal values keep their original
        relative order. A non-positive `entries` yields an empty dict.
    '''

    if entries <= 0:
        return {}

    # sorted() is stable even with reverse=True, so ties stay in
    # insertion order.
    ranked = sorted(
        dictionary.items(), key=lambda pair: pair[1], reverse=True
    )
    return dict(ranked[:entries])


def parse_domlogs(source, numlines=10, add_ptr=False):
    '''Parse an Apache domain log and tally the traffic it describes.

    Args:
        source: path of the log file to read, or "-" for STDIN. It is
            passed straight to domlog_lines().
        numlines: how many of the top requests, user agents and IPs to keep.
        add_ptr: when true, also look up a PTR record for each top IP.

    Returns:
        A dict with the keys 'status_codes', 'daily_hourly' (date -> hour ->
        hits), 'requests', 'user_agents', 'top_ips' and 'linecount'.
        'top_ips_with_ptr' is present only when add_ptr is true.
    '''

    results = {
        'status_codes': defaultdict(int),
        'daily_hourly': defaultdict(lambda: defaultdict(int)),
        'requests': defaultdict(int),
        'user_agents': defaultdict(int),
        'top_ips': defaultdict(int),
        'linecount': 0,
    }

    # Single regex to match all log lines.
    # It stores each entry in named groups, even though not all groups
    # are used by this script. You can see the names listed below
    # as (?P<name>...).
    #
    # The address pattern uses one character class covering IPv4, IPv6 and
    # IPv4-mapped IPv6 text. Trying a digits-and-dots alternative first
    # would capture only the leading "2001" of an address such as
    # "2001:db8::1".
    rx_logline = re.compile(
        r'^(?P<ips>(?P<ip>[0-9a-fA-F:.]+)'  # First (client) IP
        r'(?:,\s*[0-9a-fA-F:.]+)*)\s+'  # Optional further IPs
        r'(?P<logname>\S+)\s+(?P<user>\S+)\s+'  # Could find logged in users
        r'\[(?P<date>[0-9]+/[a-zA-Z]+/[0-9]+):'
        r'(?P<time>(?P<hour>[0-9]+):[0-9]+:[0-9]+ [0-9-+]+)\]\s+'
        r'"(?P<request>(?P<type>[A-Z]+)\s+(?P<uri>\S+)) [^"]*"\s+'
        r'(?P<status>[0-9]+|-)\s+(?P<size>[0-9]+|-)\s+'
        r'"(?P<referrer>[^"]*)"\s+'
        r'"(?P<useragent>.*)"$'
    )

    for line in domlog_lines(source):
        results['linecount'] += 1
        match_logline = rx_logline.search(line)
        if match_logline is None:
            LOGGER.warning("Missed log line: %s", line)
            continue

        status = match_logline.group('status')
        # Requests are keyed by status code plus request so that the same
        # URI with different outcomes is counted separately.
        request = "{: <4} {}".format(status, match_logline.group('request'))

        results['status_codes'][status] += 1
        results['requests'][request] += 1
        results['top_ips'][match_logline.group('ip')] += 1
        results['user_agents'][match_logline.group('useragent')] += 1
        results['daily_hourly'][match_logline.group('date')][
            match_logline.group('hour')
        ] += 1

    results['requests'] = trim_dict(results['requests'], numlines)
    results['user_agents'] = trim_dict(results['user_agents'], numlines)
    results['top_ips'] = trim_dict(results['top_ips'], numlines)

    if add_ptr:
        # Only the already-trimmed top IPs are resolved, which bounds the
        # number of DNS queries to numlines.
        ip_ptr = {}
        for ip_addr, hits in results['top_ips'].items():
            ptr_record = ptr_lookup(ip_addr)
            ip_ptr[f"{ip_addr: <15} {ptr_record}"] = hits
        results['top_ips_with_ptr'] = ip_ptr

    return results


def logs_for_user(cpuser):
    '''List the domlog files for cpuser. If cpuser is None, use all domlogs.

    Args:
        cpuser: cPanel username, or None to look in the base domlog
            directory instead of a per-user one.

    Returns:
        A list of full paths to regular files in the chosen directory,
        skipping any name containing "_log", "-ssl" or "ftpxferlog".
        The list is empty when the directory cannot be listed.
    '''

    if cpuser is None:
        LOGGER.info("Choosing domlog for all users")
        cpuser = '.'
    else:
        LOGGER.info("Choosing domlog for %s", cpuser)

    domlog_dir = USER_DOMLOG_DIR.format(cpuser)
    try:
        filenames = os.listdir(domlog_dir)
    except OSError as exc:
        # A user without a domlog directory (or one we cannot read) simply
        # has no logs; callers already handle an empty list.
        LOGGER.warning("Unable to list domlogs in %s: %s", domlog_dir, exc)
        return []

    logfile_list = []
    for filename in filenames:
        if (
            "_log" in filename
            or "-ssl" in filename
            or "ftpxferlog" in filename
        ):
            continue
        logfile = os.path.join(domlog_dir, filename)
        if os.path.isfile(logfile):
            logfile_list.append(logfile)

    return logfile_list


def choose_logfile(cpuser):
    '''
    Pick the single most relevant log file for a cPanel user.

    Selection order:
      1. if the user has exactly one log file, use it;
      2. if exactly one file was modified in the last 24 hours, use it;
      3. otherwise take the largest file among the recent ones, or among
         all files when none are recent.

    If cpuser is None, the search covers all logs.

    Returns the chosen path, or None when there is no log file at all or
    every candidate is empty.
    '''

    candidates = logs_for_user(cpuser)

    if not candidates:
        LOGGER.warning("Could not find valid log file for %s", cpuser)
        return None
    if len(candidates) == 1:
        LOGGER.debug("Only one log file for %s: %s", cpuser, candidates[0])
        return candidates[0]

    # Files touched within the last 24 hours (86400 seconds)
    recent = [
        path
        for path in candidates
        if os.path.getmtime(path) > (time() - 86400)
    ]

    if len(recent) == 1:
        LOGGER.debug("Only one recent log file for %s: %s", cpuser, recent[0])
        return recent[0]

    if recent:
        candidates = recent
    else:
        # Nothing recent, so every file stays in the running.
        LOGGER.debug("No recent logs for %s", cpuser)

    # First file with the strictly largest non-zero size wins.
    domlog, largest = None, 0
    for path in candidates:
        size = os.path.getsize(path)
        if size > largest:
            largest, domlog = size, path

    return domlog


def print_title(title, width):
    '''Print a green section header padded with "~" towards width'''

    # Characters reserved for the decoration around the title
    base_header_size = 8

    # Cut the title down when it would not fit in the requested width
    shown = title[: width - base_header_size]

    # A non-positive count simply yields no filler
    filler = "~" * (width - (len(shown) + base_header_size))

    print(color.green("~~ {0!s} ~~{1}".format(shown, filler)))


def print_tall(title, array, numlines, width):
    '''Print a titled list, one "count     item" row per line, largest first'''

    print_title(title, width)
    ranked = sorted(array, key=lambda key: array[key], reverse=True)
    for shown, item in enumerate(ranked, start=1):
        # Each row is cut to the requested output width
        print(f"{array[item]: 6}     {item}"[:width])
        if shown == numlines:
            return


def print_wide(title, array, numlines, width):
    '''Print titled "key: value" cells packed several to a line'''

    print_title(title, width)
    rows_done = 0
    used = 0
    for key, value in array.items():
        cell = f"{key}: {value}  "
        if used + len(cell) >= width:
            # The cell would overflow: finish this row and start another,
            # unless the row budget is already spent.
            rows_done += 1
            print()
            used = 0
            if rows_done == numlines:
                return
        used += len(cell)
        print(cell, end=' ')

    print()


def parse_args():
    '''
    Define the command line interface, parse the arguments and set up logging.

    Exits after printing version details when -V/--version is given.

    Returns a tuple of
    (sources, show_ptr, numlines, width, json, all, multilogs).
    When no source is given, sources is ["-"], meaning STDIN.
    '''

    cli = ArgumentParser(description=__doc__)

    cli.add_argument(
        "-a",
        "--all",
        action='store_true',
        help=(
            "Search all users. Do not limit search to single user. "
            "Overrides any usernames or paths given."
        ),
    )
    cli.add_argument(
        "-m",
        "--multilogs",
        action='store_true',
        help="Return results for all log files, rather than just one.",
    )

    # -p and -P cannot be combined
    ptr_opts = cli.add_mutually_exclusive_group()
    ptr_opts.add_argument(
        "-p",
        "--with-ptr",
        action='store_true',
        help="Get PTR records for IPs. This is the default.",
    )
    ptr_opts.add_argument(
        "-P",
        "--no-ptr",
        action='store_true',
        help="Do not resolve PTRs for IPs. Overrides -p.",
    )

    cli.add_argument(
        "-V",
        "--version",
        action='store_true',
        help="Print version information and exit.",
    )

    display_opts = cli.add_argument_group("Output options")
    display_opts.add_argument(
        "-n",
        "--numlines",
        action='store',
        type=int,
        default=10,
        help="Number of lines to display in each section. The default is 10.",
    )
    display_opts.add_argument(
        "-w",
        "--width",
        action='store',
        type=int,
        default=110,
        help="Width of output in characters. The default is 110.",
    )
    display_opts.add_argument(
        "-j", "--json", action='store_true', help="Output data as JSON instead."
    )

    log_opts = cli.add_argument_group("Error logging options")
    # -v, -q and --loglevel all write to the same destination, so only one
    # of them may be used at a time.
    level_opts = log_opts.add_mutually_exclusive_group()
    level_opts.add_argument(
        '-v',
        '--verbose',
        dest='loglevel',
        action='store_const',
        const='debug',
        help="Use verbose logging.",
    )
    level_opts.add_argument(
        '-q',
        '--quiet',
        dest='loglevel',
        action='store_const',
        const='critical',
        help='Log only critical errors',
    )
    level_opts.add_argument(
        '--loglevel',
        dest='loglevel',
        type=str,
        choices=['error', 'info', 'debug', 'warning', 'critical'],
        help=(
            "Specify the verbosity of logging output. "
            "The default is 'warning'."
        ),
    )
    log_opts.add_argument(
        "-o",
        "--output",
        action='store',
        type=str,
        default='',
        help="Output logging to the specified file.",
    )

    cli.add_argument(
        'sources',
        metavar='(USER|LOG)',
        type=str,
        nargs='*',
        help=(
            "Either a cPanel user or an Apache domain log file. "
            "'-' will be handled as STDIN. "
            "If none are given, then the script will attempt to gather "
            "data from the STDIN."
        ),
    )

    opts = cli.parse_args()

    if opts.version:
        print(f"Apache Log Parser version {__version__}")
        print(f"Last modified on {__date__}.")
        sys.exit(0)

    # Fall back to WARNING when no level option was supplied
    level_name = 'warning' if opts.loglevel is None else opts.loglevel
    logging_level = getattr(logging, level_name.upper())

    if opts.output:
        setup_logging(path=opts.output, loglevel=logging_level, print_out=False)
    else:
        setup_logging(
            path='/var/log/messages',
            loglevel=logging_level,
            print_out=sys.stderr,
        )

    if not opts.sources:
        LOGGER.info("No sources. Using STDIN.")
        opts.sources.append("-")

    return (
        opts.sources,
        not opts.no_ptr,
        opts.numlines,
        opts.width,
        opts.json,
        opts.all,
        opts.multilogs,
    )


def print_results(results, numlines, width):
    '''Print a report for each parsed source to the terminal.

    Args:
        results: iterable of (source, result) pairs, where result is the
            dict produced by parse_domlogs().
        numlines: maximum number of rows to print in each section.
        width: output width in characters.
    '''

    for source, result in results:
        if result['linecount'] < 1:
            print(f"{source} is empty.")
            continue

        print(color.yellow(f"Results for {source}:"))
        for day, hourly_hits in result['daily_hourly'].items():
            print_wide(f"Hourly hits ({day})", hourly_hits, numlines, width)
        print_wide(
            "HTTP response codes", result['status_codes'], numlines, width
        )
        print_tall("Top Requests", result['requests'], numlines, width)
        print_tall("Top user agents", result['user_agents'], numlines, width)

        # 'top_ips_with_ptr' only exists when PTR lookups were requested,
        # so it must be fetched with .get() to support --no-ptr.
        ips_with_ptr = result.get('top_ips_with_ptr')
        if ips_with_ptr is not None:
            print_tall("Top IPs with PTRs", ips_with_ptr, numlines, width)
        else:
            print_tall("Top IPs", result['top_ips'], numlines, width)

        print("\n")


def _parse_user_logs(cpuser, multilogs, log_limit, numlines, show_ptr):
    '''Parse the domlog(s) of one cPanel user, or of all users if None.

    Returns a list of (log path, parse_domlogs() result) tuples. The list
    is empty when no usable log file could be chosen.
    '''

    if multilogs:
        if cpuser is None:
            LOGGER.info("Source is all log files.")
        else:
            LOGGER.info("Source is all files for : %s", cpuser)
        # log_limit of None leaves the list unsliced
        domlogs = logs_for_user(cpuser)[:log_limit]
    else:
        domlog = choose_logfile(cpuser)
        if domlog is None:
            # Nothing to parse; passing None on would raise a TypeError.
            LOGGER.warning(
                "No usable log file for %s",
                "all users" if cpuser is None else cpuser,
            )
            return []
        LOGGER.info("Source is user file: %s", domlog)
        domlogs = [domlog]

    return [
        (domlog, parse_domlogs(domlog, numlines, show_ptr))
        for domlog in domlogs
    ]


def main():
    '''Main function for script.

    Parses the command line, gathers results for every requested source
    (STDIN, a log file path, or a cPanel user) and prints them either as
    JSON or as a formatted report. Exits with status 255 when a source is
    neither "-", an existing file, nor a known cPanel user.
    '''

    (
        sources,
        show_ptr,
        numlines,
        width,
        show_json,
        all_users,
        multilogs,
    ) = parse_args()

    # On shared servers, limit the number of log files searched
    if any(shared_type in hostname() for shared_type in ["biz", "hub", "res"]):
        log_limit = MAX_LOGS_SHARED
    else:
        log_limit = None

    # The complete results of our search.
    # This is an array of tuples, with each tuple being
    # (string, dict) where string is the source, and dict is the entries
    results = []

    if all_users:
        # If all_users, ignore other sources
        results.extend(
            _parse_user_logs(None, multilogs, log_limit, numlines, show_ptr)
        )
    else:
        # Loop through user/paths, adding the results
        for source in sources:
            if source == '-':
                LOGGER.info("Source is STDIN: %s", source)
                sections_dict = parse_domlogs(source, numlines, show_ptr)
                results.append(("STDIN", sections_dict))
            elif os.path.isfile(source):
                LOGGER.info("Source is file: %s", source)
                sections_dict = parse_domlogs(source, numlines, show_ptr)
                results.append((source, sections_dict))
            elif os.path.isfile(f"/var/cpanel/users/{source!s}"):
                results.extend(
                    _parse_user_logs(
                        source, multilogs, log_limit, numlines, show_ptr
                    )
                )
            else:
                LOGGER.warning("Unable to determine log file for: %s", source)
                # An integer sets the exit status; a string would be
                # printed and the status would become 1.
                sys.exit(255)

    if show_json:
        print(
            json.dumps(
                results, sort_keys=True, indent=4, separators=(',', ': ')
            )
        )
    else:
        print_results(results, numlines, width)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Zerion Mini Shell 1.0