-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathircstats
executable file
·299 lines (254 loc) · 12.5 KB
/
ircstats
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
#Copyright (C) 2009 Joe Blaylock <[email protected]>
#
#This program is free software: you can redistribute it and/or modify it under
#the terms of the GNU General Public License as published by the Free Software
#Foundation, either version 3 of the License, or (at your option) any later
#version.
#
#This program is distributed in the hope that it will be useful, but WITHOUT
#ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
#details.
#
#You should have received a copy of the GNU General Public License along with
#this program. If not, see <http://www.gnu.org/licenses/>.
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
"""ircstats - a little script to parse XML IRC logs and gather stats from them
Cf. http://forge.blueoxen.net/wiki/IRC_Analytics
Cf. RFC 2812
"""
import os, sys
import xml.dom.minidom as md
from datetime import datetime
import UserTable
from InputColloquyIRC import *
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
DEBUG = False
HELP_USAGE_EN = """usage: %prog [options] [file1.xml] [file2.xml] [...]"""
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
def usersByID(userTable):
    """Yield every entry of userTable, ordered by ascending key.

    userTable only needs to provide keys() and lookup by key.
    """
    ordered_keys = list(userTable.keys())
    ordered_keys.sort()
    for key in ordered_keys:
        yield userTable[key]
def count_messages(user, day = None):
    """Return how many messages are recorded for user.

    user.messages is read through keys(); each key must offer a date()
    method (presumably datetime timestamps -- confirm against UserTable).
    When day is given, only keys whose date() equals day are counted;
    otherwise every key is counted.
    """
    stamps = user.messages.keys()
    if not day:
        return len(stamps)
    return sum(1 for stamp in stamps if stamp.date() == day)
def count_actions(user,day = None):
    """Return how many actions are recorded for user.

    user.actions is read through keys(); each key must offer a date()
    method (presumably datetime timestamps -- confirm against UserTable).
    When day is given, only keys whose date() equals day are counted;
    otherwise every key is counted.
    """
    stamps = user.actions.keys()
    if not day:
        return len(stamps)
    return sum(1 for stamp in stamps if stamp.date() == day)
def dailyStatsForUser(userTable, id, eco, daylist):
    """Returns data structure containing everything interesting about a single user.

    Parameters:
      userTable -- user store; must support userTable[id] and userTable.idToName(id)
      id        -- key of the user to report on
      eco       -- mapping of day -> (message total, action total) across all users
      daylist   -- the days to report on, in output order

    Returns a variable-length list of the form:
      [ id, name,
        total messages, total message ratio, total acts, total acts ratio,
        day1 messages, day1 message ratio, day1 acts, day1 act ratio, ... ]
    Every ratio is the user's count divided by the matching total from eco,
    or 0 when that total is 0.

    FIXME: actually, user's messages and acts should be a dict by day, then by time
           Then most of this infrastructure can go away
    """
    def summary( m, tm, a, ta):
        # A zero denominator yields a ratio of 0 instead of raising.
        r_m = float(m)/tm if tm else 0
        r_a = float(a)/ta if ta else 0
        return (m, r_m, a, r_a)

    name = userTable.idToName(id)
    user_object = userTable[id]

    # Grand totals over the reported days. Explicit sums are used because
    # reduce() is not a builtin on Python 3; an empty daylist gives (0, 0).
    t_msgs = sum( eco[day][0] for day in daylist )
    t_acts = sum( eco[day][1] for day in daylist )

    retVal = [ id, name ]
    retVal.extend( summary( count_messages(user_object), t_msgs, count_actions(user_object), t_acts ) )
    for day in daylist:
        t_msgs, t_acts = eco[day]
        retVal.extend( summary( count_messages(user_object, day), t_msgs, count_actions(user_object, day), t_acts ) )
    return retVal
def statsForUser(user, msgcount = 0, actcount = 0):
    """Summarise one user as (nick, message_count, act_count, message_ratio, act_ratio).

    msgcount -- total number of messages seen overall (denominator for message_ratio)
    actcount -- total number of actions seen overall (denominator for act_ratio)
    A ratio is 0 whenever its denominator is 0 or omitted.
    """
    handle = user.nick
    sent = count_messages(user)
    acted = count_actions(user)

    if msgcount:
        sent_share = float(sent) / msgcount
    else:
        sent_share = 0

    if actcount:
        acted_share = float(acted) / actcount
    else:
        acted_share = 0

    return (handle, sent, acted, sent_share, acted_share)
def count_everything(userTable, day = None):
    """Walk the whole user table and return (message total, action total).

    day is handed through to count_messages/count_actions, so passing a day
    restricts both totals to that day.
    FIXME: this is an unbearably slow and stupid algorithm. We should be computing and caching this data.
    """
    totals = [0, 0]
    for key in userTable.keys():
        person = userTable[key]
        totals[0] += count_messages(person, day)
        totals[1] += count_actions(person, day)
    return tuple(totals)
def getDayList(userTable):
    """Return the sorted list of distinct days on which any user has a message or action.

    Each user in userTable must expose `messages` and `actions` mappings whose
    keys offer a date() method. An empty table yields an empty list.

    This is still a complete pass through the user database, but days are
    de-duplicated through a set, so each membership check is O(1) instead of
    a scan of the list built so far.
    FIXME: we should be computing and caching this data.
    """
    seen_days = set()
    for key in userTable.keys():
        person = userTable[key]
        seen_days.update( t.date() for t in person.messages.keys() )
        seen_days.update( t.date() for t in person.actions.keys() )
    return sorted(seen_days)
def getReportHeader(typeword, daylist, short = " cnt"):
    """Build the two-row column header for a per-user report.

    typeword -- report title word; the header starts with "<typeword> by user:"
    daylist  -- days to show; each day contributes a pair of columns
    short    -- label used for the count columns

    The first row carries "All" plus each day's string form (twice per day);
    the second row carries the count / "%Tot" labels and "Name/Nick".
    The result has no trailing newline.
    """
    pieces = [typeword + " by user:\n"]

    # Row one: overall columns, then one date pair per day.
    pieces.append("\t%4s %4s %20s " % ("All", "All", " "))
    for day in daylist:
        label = str(day)
        pieces.append(" %10s %10s" % (label, label))
    pieces.append('\n')

    # Row two: column captions, repeated once per day.
    pieces.append('\t%4s %4s %15s ' % (short, "%Tot", "Name/Nick"))
    for _ in daylist:
        pieces.append(" %10s %10s" % (short, "%Tot"))

    return "".join(pieces)
def getReportLineByFunc(func, total, table, eco, daylist):
    """Yield one formatted report line per user for whom func(user) is positive.

    func    -- callable applied to each table[id]; returns that user's count
    total   -- overall count used as the denominator of the first ratio column
    table   -- user store; must be iterable over ids and support table[id]
               and table.idToName(id)
    eco     -- mapping of day -> (message total, action total)
    daylist -- days for which a column pair is appended

    Ratios are reported as 0 when their denominator is 0, so a day without
    messages (or an empty overall total) no longer raises ZeroDivisionError.
    """
    def ratio(part, whole):
        # Same zero-denominator convention as dailyStatsForUser's summary().
        return float(part) / whole if whole else 0

    for key in table:
        res = func( table[key] )
        if res > 0:
            line = '\t %3d %.2f %20s ' % ( res, ratio(res, total), table.idToName(key) )
            for day in daylist:
                m, a = eco[ day ]
                # NOTE(review): every day column repeats the overall count `res`
                # and always divides by the day's *message* total; func is never
                # given the day. Looks unintended -- confirm before relying on it.
                line += " %10d %10.2f" % ( res, ratio(res, m) )
            yield line
def userStatsByLine(offset, all_stats, table):
    """Yield a formatted report line for each stats row with a non-zero count at offset.

    offset    -- index in each row where the wanted (count, ratio) pair starts
    all_stats -- rows shaped like dailyStatsForUser's result; row[1] is the name
    table     -- accepted for interface compatibility; not used here

    After the leading pair, every fourth pair that follows (the per-day
    figures of the same kind) is appended as an extra column pair.
    """
    for record in all_stats:
        label = record[1]
        if record[offset] == 0:
            continue
        figures = record[offset:]
        chunks = ['\t %3d %.2f %20s ' % (figures[0], figures[1], label)]
        # Positions 4, 8, ... hold the per-day counts; an empty range covers short rows.
        for pos in range(4, len(figures), 4):
            chunks.append(" %10d %10.2f" % (figures[pos], figures[pos + 1]))
        yield "".join(chunks)
def setup_optparse():
    """Build and return the command-line option parser.

    The command line is parsed once here (result discarded), and when the
    program was started with no arguments at all the help text is printed
    and the process exits. The caller is expected to call parse_args() on
    the returned parser itself.
    """
    import optparse

    # (flags, keyword settings) in the order they should appear in --help.
    option_specs = [
        (('-c', "--csv"),
         dict(dest="csv", action="store_true", default=True,
              help="Write a CSV-formatted file of all statistics to stdout (the default)")),
        (('-t', "--totals"),
         dict(dest="totals", action="store_true", default=False,
              help="Output summary totals of messages and actions")),
        (('-m', "--messages"),
         dict(dest="messages", action="store_true", default=False,
              help="Output message counts per username")),
        (('-a', "--actions"),
         dict(dest="actions", action="store_true", default=False,
              help="Output action counts per username")),
        # The '-l' / "--lurkers" option (list users who don't say anything) is
        # intentionally not registered; the matching report is disabled too.
        (('-d', "--debug"),
         dict(dest="debug", action="store_true", default=False,
              help="Enable debug mode")),
        (('-y', '--yaml-file'),
         dict(dest="yaml_file", action="store", metavar="FILE",
              help="Dereference usernames against YAML file FILE")),
        (('-s', '--stats-cache'),
         dict(dest="stats_file", action="store", metavar="FILE",
              help="Cache calculated stats in cachefile FILE")),
        (('-v', "--verbose"),
         dict(dest="verbose", action="store_true", default=False,
              help="Verbose output. Can be chatty.")),
    ]

    parser = optparse.OptionParser(usage=HELP_USAGE_EN)
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    # Parsed only for its validation side effect; the caller parses again.
    parser.parse_args()
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()
    return parser
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
if __name__ == "__main__":
    # Script entry point: load the user table, merge every transcript named on
    # the command line into it, then print whichever reports were requested
    # (CSV unless -t/-m/-a is given).

    # Set once at least one transcript has been merged; gates userTable.close().
    dirty_data = False

    option_parser = setup_optparse()
    options, args = option_parser.parse_args()
    if options.debug:
        DEBUG = True

    if options.yaml_file:
        mapping_yaml = options.yaml_file
    else:
        sys.stderr.write("Usernames mapping YAML file unspecified; defaulting to 'usernames.yaml'...\n")
        mapping_yaml = "usernames.yaml"

    if options.stats_file:
        stats_cache = options.stats_file
    else:
        # The notice names the file that is actually used as the default.
        sys.stderr.write("Stats cache file unspecified; defaulting to 'ircusers.pickle'...\n")
        stats_cache = "ircusers.pickle"

    # Read in on-disk data stores; set up mapping dictionaries
    userTable = UserTable.UserTable(mapping_yaml, stats_cache, verbose=options.verbose)

    # Read in and process user log file
    for filename in args:
        dom = md.parse(filename) # should be in InputCollquyIRC.py?
        try:
            root = dom.documentElement
            if root.tagName == u"log":
                handleLogDOM(root, userTable)
                dirty_data = True
            else:
                sys.stderr.write("'%s' does not appear to be a Colloquy IRC transcript file. Skipping...\n" % filename)
        finally:
            # Release the DOM for skipped files as well as processed ones.
            dom.unlink()

    msgcount, actcount = count_everything(userTable)
    daylist = getDayList(userTable)
    # Per-day (messages, actions) totals, computed once and shared by all reports.
    everything_counted_once = {}
    for day in daylist:
        everything_counted_once[day] = count_everything(userTable, day)

    # Any explicitly requested report switches the default CSV output off.
    if options.totals:
        options.csv = False
        print("Total messages: %s" % (msgcount + actcount,))
        print("Messages: %s" % (msgcount,))
        print("Actions: %s" % (actcount,))
        print("%10s %10s %10s" % ("Date", "Messages", "Actions"))
        for day in daylist:
            m, a = everything_counted_once[day]
            print("%10s %10s %10s" % (str(day), str(m), str(a) ))

    if options.messages or options.actions or options.csv: # or options.lurkers:
        allTheStats = [dailyStatsForUser(userTable, id, everything_counted_once, daylist) for id in userTable]

    if options.messages:
        options.csv = False
        print(getReportHeader("Messages", daylist))
        # Offset 2 is where the message (count, ratio) pair starts in each row.
        for line in userStatsByLine(2, allTheStats, userTable):
            print(line)

    if options.actions:
        options.csv = False
        print(getReportHeader("Actions", daylist))
        # Offset 4 is where the action (count, ratio) pair starts in each row.
        for line in userStatsByLine(4, allTheStats, userTable):
            print(line)

#    if options.lurkers:
#        options.csv = False
#        # FIXME: it thinks people who weren't present on a particular day are lurkers
#        header = "Lurkers and Action-only Users:\n"
#        header += "%20s " % "Name"
#        for day in daylist:
#            header += "%10s " % str(day)
#        print header
#        for line in allTheStats:
#            def allTrue(l):
#                for t in l:
#                    if not t: return False
#                return True
#            name = line[1]
#            line = line[6:]
#            if allTrue([line[i] > 0 for i in range(0, len(line), 4)]): continue
#            pairs = zip([line[i] for i in range(1, len(line), 4)], [line[i] for i in range(3, len(line), 4)])
#            line = "%20s " % name
#            for pair in pairs:
#                if pair[0] == 0 and pair[1] == 0: line += "%10s " % "L"
#                elif pair[0] == 0: line += "%10s " % "s"
#                else: line += "%10s " % " "
#            print line

    if options.csv:
        import csv
        csv.register_dialect("ooffice_like", delimiter=',', skipinitialspace=True,
                             lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
        header = ["ID", "COMMON NAME", "ALL MESSAGES", "% TOTAL MESSAGES", "ALL ACTIONS", "% TOTAL ACTIONS"]
        for day in daylist:
            label = str(day)
            header.extend( [label + " MESSAGES", label + " % TOTAL MESSAGES", label + " ACTIONS", label + " % TOTAL ACTIONS"] )
        header.extend(("NICK1", "NICK2", "NICK3", "NICK4"))

        # One writer serves the header and every row.
        writer = csv.writer(sys.stdout, dialect="ooffice_like")
        writer.writerow(header)
        for line in allTheStats:
            # Append up to four of the user's nicks after the statistics.
            line.extend( userTable[line[0]].nicks[:4] )
            try:
                writer.writerow(line)
            except UnicodeEncodeError:
                # Python 2 csv cannot write non-ASCII unicode objects; retry
                # with those fields encoded as UTF-8 byte strings.
                for i, value in enumerate(line):
                    if isinstance(value, unicode):
                        line[i] = value.encode("utf-8")
                writer.writerow(line)

    if dirty_data:
        userTable.close()