ircstats

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
#Copyright (C) 2009  Joe Blaylock <jrbl@jrbl.org>
#
#This program is free software: you can redistribute it and/or modify it under 
#the terms of the GNU General Public License as published by the Free Software 
#Foundation, either version 3 of the License, or (at your option) any later 
#version.
#
#This program is distributed in the hope that it will be useful, but WITHOUT 
#ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
#FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more 
#details.
#
#You should have received a copy of the GNU General Public License along with 
#this program.  If not, see <http://www.gnu.org/licenses/>.
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
"""ircstats - a little script to parse XML IRC logs and gather stats from them

Cf. http://forge.blueoxen.net/wiki/IRC_Analytics
Cf. RFC 2812
"""

import os, sys
import xml.dom.minidom as md
from datetime import datetime

import UserTable
from InputColloquyIRC import *


#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
# Module-level debug switch; the __main__ block sets it to True when -d/--debug is given.
DEBUG     = False

# Usage line handed to optparse.OptionParser by setup_optparse(); optparse
# substitutes %prog with the name of the running script.
HELP_USAGE_EN = """usage: %prog [options] [file1.xml] [file2.xml] [...]"""

#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+

def usersByID(userTable):
    """Generate the entries of userTable, visiting its keys in ascending sorted order."""
    ordered_keys = list(userTable.keys())
    ordered_keys.sort()
    for key in ordered_keys:
        yield userTable[key]

def count_messages(user, day = None):
    """Return how many messages `user` has on record.

    user.messages is keyed by timestamps offering a date() method.  With a
    (truthy) `day`, only timestamps falling on that date are counted;
    otherwise every message counts.
    """
    timestamps = user.messages.keys()
    if not day:
        return len(timestamps)
    return sum(1 for stamp in timestamps if stamp.date() == day)

def count_actions(user,day = None):
    """Return how many actions `user` has on record, optionally limited to one date.

    user.actions is keyed by timestamps offering a date() method; when `day`
    is given (and truthy) only the timestamps on that date are kept.
    """
    stamps = user.actions.keys()
    if day:
        stamps = [stamp for stamp in stamps if stamp.date() == day]
    return len(stamps)

def dailyStatsForUser(userTable, id, eco, daylist):
    """Returns data structure containing everything interesting about a single user.

    userTable -- user table; indexed by id and asked for idToName(id)
    id        -- key of the user inside userTable
    eco       -- mapping of day -> (total messages, total actions) by everybody on that day
    daylist   -- the days to report on, in output order

    Returns a variable-length list of the form:
    [ id, name, total messages, total message ratio, total acts, total acts ratio,
      day1 messages, day1 message ratio, day1 acts, day1 act ratio, ... ]

    Every ratio is the user's count divided by the matching grand total; a
    ratio is 0 whenever that grand total is 0.  The overall grand totals are
    the sums of the per-day totals in eco over daylist.

    FIXME: actually, user's messages and acts should be a dict by day, then by time
           Then most of this infrastructure can go away
    """
    def summary( m, tm, a, ta):
        # (count, share of total) for messages and then for actions; an empty
        # total gives a share of 0 instead of a division by zero.
        r_m = float(m)/tm if tm else 0
        r_a = float(a)/ta if ta else 0
        return (m, r_m, a, r_a)

    name = userTable.idToName(id)
    user_object = userTable[id]

    # Grand totals over the whole reporting period, accumulated in one pass.
    t_msgs = 0
    t_acts = 0
    for day in daylist:
        day_msgs, day_acts = eco[day]
        t_msgs += day_msgs
        t_acts += day_acts

    retVal = [ id, name ]
    retVal.extend( summary( count_messages(user_object), t_msgs, count_actions(user_object), t_acts ) )
    for day in daylist:
        day_msgs, day_acts = eco[day]
        retVal.extend( summary( count_messages(user_object, day), day_msgs,
                                count_actions(user_object, day), day_acts ) )
    return retVal
        
def statsForUser(user, msgcount = 0, actcount = 0):
    """Summarise one user as (nick, message_count, act_count, message_ratio, act_ratio).

    msgcount -- number of messages seen overall, from every user
    actcount -- number of actions seen overall, from every user

    message_ratio is this user's share of msgcount and act_ratio this user's
    share of actcount; either ratio is 0 when the corresponding overall count
    is 0.
    """
    def share(part, whole):
        # Guard against an empty overall count.
        if not whole:
            return 0
        return float(part)/whole

    nick = user.nick
    msgs = count_messages(user)
    acts = count_actions(user)
    return (nick, msgs, acts, share(msgs, msgcount), share(acts, actcount))

def count_everything(userTable, day = None):
    """Walk the whole user table and return (message total, action total).

    `day` is forwarded to count_messages() and count_actions() for every user,
    so passing a date restricts both totals to that date.

    FIXME: this is an unbearably slow and stupid algorithm.  We should be computing and caching this data.
    """
    totals = [0, 0]
    for key in userTable.keys():
        entry = userTable[key]
        totals[0] += count_messages(entry, day)
        totals[1] += count_actions(entry, day)
    return tuple(totals)

def getDayList(userTable):
    """Return the sorted list of distinct dates on which any user spoke or acted.

    Every user in userTable is visited; the date() of each key in the user's
    `messages` and `actions` mappings is collected.  An empty table yields
    an empty list.

    FIXME: this still makes a complete pass through the user database on every call.
           We should be computing and caching this data.
    """
    # A set makes de-duplication constant-time per timestamp, instead of
    # rescanning a growing list for every message and action.
    days = set()
    for user_id in userTable.keys():
        user = userTable[user_id]
        days.update(t.date() for t in user.messages.keys())
        days.update(t.date() for t in user.actions.keys())
    return sorted(days)

def getReportHeader(typeword, daylist, short = " cnt"):
    """Build the three-line heading that precedes a per-user report.

    Line one is "<typeword> by user:".  Line two labels the overall columns
    "All" and repeats each day of daylist above its pair of columns.  Line
    three names the columns themselves: `short` for the count and "%Tot" for
    the ratio, once for the overall figures and once per day.  The result has
    no trailing newline.
    """
    day_columns = "  %10s %10s"

    pieces = [typeword + " by user:\n"]

    # Second line: overall labels followed by one date pair per day.
    pieces.append("\t%4s  %4s  %20s " % ("All", "All", " "))
    for day in daylist:
        stamp = str(day)
        pieces.append(day_columns % (stamp, stamp))
    pieces.append('\n')

    # Third line: column captions, repeated for every day.
    pieces.append('\t%4s  %4s  %15s      ' % (short, "%Tot", "Name/Nick"))
    for day in daylist:
        pieces.append(day_columns % (short, "%Tot"))

    return "".join(pieces)

def getReportLineByFunc(func, total, table, eco, daylist):
    """Yield one formatted report line per user whose count is positive.

    func    -- callable taking a user entry (table[key]) and returning a count
    total   -- overall count used as the denominator of the first ratio
    table   -- user table; iterated for keys, indexed by key, asked for idToName(key)
    eco     -- mapping of day -> (message total, action total)
    daylist -- days to emit a column pair for, in order

    Users for which func returns nothing above 0 are skipped.  A ratio whose
    denominator is 0 is reported as 0 instead of raising ZeroDivisionError,
    matching the ratio handling elsewhere in this file.

    NOTE(review): func is never given the day, so every per-day column pair
    repeats the user's overall count, and the per-day ratio always divides by
    the day's *message* total (first element of eco[day]).  That looks
    unintended -- confirm before relying on the per-day columns.
    """
    def ratio(part, whole):
        return float(part)/whole if whole else 0

    for key in table:
        res = func( table[key] )
        if not res > 0:
            continue
        pieces = ['\t %3d  %.2f  %20s ' % ( res, ratio(res, total), table.idToName(key) )]
        for day in daylist:
            day_msgs = eco[ day ][0]
            pieces.append("  %10d %10.2f" % ( res, ratio(res, day_msgs) ))
        yield "".join(pieces)

def userStatsByLine(offset, all_stats, table):
    """Generate one formatted report line per row of all_stats.

    Each row starts with an id and a display name.  `offset` selects where in
    the row the (count, ratio) pair of interest begins; the same pair then
    recurs every four fields after it.  Rows whose value at `offset` is 0 are
    left out.  `table` is accepted but not consulted.
    """
    for row in all_stats:
        name = row[1]
        if row[offset] == 0:
            continue
        stats = row[offset:]
        pieces = ['\t %3d  %.2f  %20s ' % (stats[0], stats[1], name)]
        # Remaining pairs sit four fields apart; the range is empty when the
        # row carries no further fields.
        for pos in range(4, len(stats), 4):
            pieces.append("  %10d %10.2f" % (stats[pos], stats[pos + 1]))
        yield "".join(pieces)

def setup_optparse():
    """Build the ircstats command-line parser and return it.

    The command line is parsed once here and the result thrown away, so any
    optparse complaint about the arguments surfaces immediately; the caller
    is expected to call parse_args() on the returned parser itself.  When the
    script was started without any arguments, the help text is printed and
    the program exits.
    """
    import optparse

    parser = optparse.OptionParser(usage=HELP_USAGE_EN)

    def switch(short_flag, long_flag, dest, help_text, default=False):
        # Boolean on/off option.
        parser.add_option(short_flag, long_flag, dest=dest, action="store_true",
                          default=default, help=help_text)

    def file_option(short_flag, long_flag, dest, help_text):
        # Option taking a FILE argument.
        parser.add_option(short_flag, long_flag, dest=dest, action="store",
                          metavar="FILE", help=help_text)

    switch('-c', "--csv", "csv",
           "Write a CSV-formatted file of all statistics to stdout (the default)",
           default=True)
    switch('-t', "--totals", "totals",
           "Output summary totals of messages and actions")
    switch('-m', "--messages", "messages",
           "Output message counts per username")
    switch('-a', "--actions", "actions",
           "Output action counts per username")
    # A '-l' / "--lurkers" switch ("Output list of lurkers - users who don't
    # say anything") is currently disabled.
    switch('-d', "--debug", "debug",
           "Enable debug mode")
    file_option('-y', '--yaml-file', "yaml_file",
                "Dereference usernames against YAML file FILE")
    file_option('-s', '--stats-cache', "stats_file",
                "Cache calculated stats in cachefile FILE")
    switch('-v', "--verbose", "verbose",
           "Verbose output.  Can be chatty.")

    parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    return parser


#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
if __name__ == "__main__":

    dirty_data = False
    option_parser = setup_optparse()
    options, args = option_parser.parse_args()

    if options.debug: DEBUG = True
    if options.yaml_file:
        mapping_yaml = options.yaml_file
    else:
        sys.stderr.write("Usernames mapping YAML file unspecified; defaulting to 'usernames.yaml'...\n")
        mapping_yaml = "usernames.yaml"
    if options.stats_file:
        stats_cache = options.stats_file
    else:
        sys.stderr.write("Stats cache file unspecified; defaulting to 'irc_users.pickle'...\n")
        stats_cache = "ircusers.pickle"

    # Read in on-disk data stores; set up mapping dictionaries
    userTable = UserTable.UserTable(mapping_yaml, stats_cache, verbose=options.verbose)
    
    # Read in and process user log file
    for filename in args:
        dom = md.parse(filename)                           # should be in InputCollquyIRC.py?
        if dom.documentElement.tagName != u"log":
            sys.stderr.write("'%s' does not appear to be a Colloquy IRC transcript file.  Skipping...\n" % filename)
            continue
        handleLogDOM(dom.documentElement, userTable)
        dirty_data = True
        dom.unlink()

    msgcount, actcount = count_everything(userTable)
    daylist = getDayList(userTable)

    everything_counted_once = {}
    for day in daylist:
        everything_counted_once[day] = count_everything(userTable, day)

    if options.totals:
        options.csv = False
        print "Total messages:", msgcount + actcount
        print "Messages:", msgcount
        print "Actions:", actcount
        print "%10s %10s %10s" % ("Date", "Messages", "Actions")
        for day in daylist:
            m, a = everything_counted_once[day]
            print "%10s %10s %10s" % (str(day), str(m), str(a) )

    if options.messages or options.actions or options.csv: # or options.lurkers:
        allTheStats = [dailyStatsForUser(userTable, id, everything_counted_once, daylist) for id in userTable]

        if options.messages:
            options.csv = False
            print getReportHeader("Messages", daylist)
            for line in userStatsByLine(2, allTheStats, userTable):
                print line

        if options.actions:
            options.csv = False
            print getReportHeader("Actions", daylist)
            for line in userStatsByLine(4, allTheStats, userTable):
                print line

#        if options.lurkers:
#            options.csv = False
#            # FIXME: it thinks people who weren't present on a particular day are lurkers
#            header = "Lurkers and Action-only Users:\n"
#            header += "%20s  " % "Name"
#            for day in daylist:
#                header += "%10s  " % str(day)
#            print header
#            for line in allTheStats:
#                def allTrue(l):
#                    for t in l:
#                        if not t: return False
#                    return True
#                name = line[1]
#                line = line[6:]
#                if allTrue([line[i] > 0 for i in range(0, len(line), 4)]): continue
#                pairs = zip([line[i] for i in range(1, len(line), 4)], [line[i] for i in range(3, len(line), 4)])
#                line = "%20s  " % name
#                for pair in pairs:
#                    if pair[0] == 0 and pair[1] == 0: line += "%10s  " % "L"
#                    elif pair[0] == 0: line += "%10s  " % "s"
#                    else: line += "%10s  " % " "
#                print line

        if options.csv:
            import csv, sys
            csv.register_dialect("ooffice_like", delimiter=',', skipinitialspace=True, 
                                                                lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
            header = ["ID", "COMMON NAME", "ALL MESSAGES", "% TOTAL MESSAGES", "ALL ACTIONS", "% TOTAL ACTIONS"]
            for day in daylist:
                header.extend( [str(day) + " MESSAGES", str(day) + " % TOTAL MESSAGES", str(day) + " ACTIONS", str(day) + " % TOTAL ACTIONS"] )
            header.extend(("NICK1", "NICK2", "NICK3", "NICK4"))
            csv.writer(sys.stdout, dialect="ooffice_like").writerow(header)
            for line in allTheStats:
                line.extend( userTable[line[0]].nicks[:4] )
                try: 
                    csv.writer(sys.stdout, dialect="ooffice_like").writerow(line)
                except UnicodeEncodeError, msg:
                    for i in range(len(line)):
                        if isinstance(line[i], unicode):
                            line[i] = line[i].encode("utf-8")
                    csv.writer(sys.stdout, dialect="ooffice_like").writerow(line)


    if dirty_data: 
        userTable.close()