-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathInputColloquyIRC.py
executable file
·217 lines (184 loc) · 8.1 KB
/
InputColloquyIRC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
# Copyright (C) 2009 Joe Blaylock <[email protected]>
#
#This program is free software: you can redistribute it and/or modify it under
#the terms of the GNU General Public License as published by the Free Software
#Foundation, either version 3 of the License, or (at your option) any later
#version.
#
#This program is distributed in the hope that it will be useful, but WITHOUT
#ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
#details.
#
#You should have received a copy of the GNU General Public License along with
#this program. If not, see <http://www.gnu.org/licenses/>.
"""XML parser for Colloquy XML-formatted IRC transcripts.
Currently just a pile of functions. Safe for "from InputColloquyIRC import *".
Cf. http://forge.blueoxen.net/wiki/IRC_Analytics
Cf. RFC 2812
Cf. http://colloquy.info/project/wiki/Development/Styles/LogFileFormat
"""
import re
from datetime import datetime
from xml.dom import NotSupportedErr as NotSupportedError

from UserStats import UserStats
class PlainTextIRCParser(object):
    """Parser for plain-text IRC transcripts (work in progress).

    Lines are expected to look like::

        [2009-07-21 12::33:42] nickname: text

    Only timestamp and nick extraction are functional.  Booking a message
    to a user (``handle_message``) and handling of server messages, actions
    and events are not implemented yet and raise ``NotImplementedError``.
    """

    # Timestamp prefix, e.g. "[2009-07-21 12::33:42]".
    # NOTE(review): the doubled colon after the hour is kept from the
    # original pattern and sample comment -- confirm against real logs.
    _TIME_PATTERN = r"\[\d{4}-\d{2}-\d{2} \d{2}::\d{2}:\d{2}\]"
    # RFC 2812 nickname: (letter / special) *(letter / digit / special / "-").
    _NICK_PATTERN = r"[a-zA-Z\[\]\\`_^{}|][0-9a-zA-Z\[\]\\`_^{}|\-]*"

    def __init__(self, logfile_object, user_table):
        """Remember the log source and user table and build the regex tables.

        logfile_object -- object providing readline() and readlines()
        user_table     -- stored as self.user_table; not otherwise used yet
        """
        self.log = logfile_object
        self.user_table = user_table
        # Both patterns are anchored at the start of the line so that they
        # can be applied with regex.match() in _match_and_apply.
        self.res = {
            'time': re.compile("^" + self._TIME_PATTERN),
            # group(1) is the nick that follows the timestamp: "[..] nick: text"
            'nick': re.compile(
                "^" + self._TIME_PATTERN + " (" + self._NICK_PATTERN + "): "),
        }
        # key -> [filter function, extra arguments...]; see _apply_filter.
        self.fils = {
            'time': [self._regex2datetime, "[%Y-%m-%d %H::%M:%S]"],
            'nick': [self._getIRCnick, ],
        }
        self.start_time = None
        self.end_time = None

    def _regex2datetime(self, match_obj, timestrl):
        """Convert a timestamp match to a datetime; timestrl[0] is the format."""
        return datetime.strptime(match_obj.group(), timestrl[0])

    def _getIRCnick(self, match_obj, empty=None):
        """Return the IRC-lower-cased nick captured by the 'nick' regex."""
        return self._ircLower(match_obj.group(1))

    def _ircLower(self, s):
        """Convert the input string to all-lower-case, with attempts to respect RFC 2812 s2.2"""
        s = s.lower()
        # str.replace returns a new string, so the result must be re-bound.
        for upper, lower in (('[', '{'), (']', '}'), ('\\', '|'), ('~', '^')):
            s = s.replace(upper, lower)
        return s

    def _match_and_apply(self, key, str):
        """Match the regex registered under key against str.

        Returns the filtered value (see _apply_filter), or None when the
        line does not match.
        """
        m = self.res[key].match(str)
        if m is None:
            return None
        return self._apply_filter(key, m)

    def _apply_filter(self, key, match_object):
        """Call the filter registered under key with the match and its extra args."""
        func = self.fils[key][0]
        args = self.fils[key][1:]
        return func(match_object, args)

    def process(self, userTable):
        """Read the whole log, dispatch every line, then part all users.

        The first line supplies self.start_time; self.end_time tracks the
        value returned for the most recently dispatched line.  Every entry
        of userTable is finally part()ed at self.end_time.
        """
        first = self.log.readline()
        self.start_time = self._match_and_apply('time', first)
        self.end_time = self.dispatch_to_handler(first)
        for line in self.log.readlines():
            self.end_time = self.dispatch_to_handler(line)
        for user in list(userTable.keys()):
            userTable[user].part(self.end_time)

    def dispatch_to_handler(self, line):
        """Given a line of text, figures out what handler to use on it.

        Returns the timestamp parsed from the line (None if it has none).
        Raises NotImplementedError for lines that are not "nick: text"
        messages (server messages, actions, events).
        """
        dt = self._match_and_apply('time', line)
        nick = self._match_and_apply('nick', line)
        if nick is None:
            # server msg, action or event
            raise NotImplementedError("Not implemented. FIXME XXX HACK")  # FIXME XXX HACK
        self.handle_message(dt, nick, line)
        return dt

    def handle_message(self, date, nick, line):
        """Book an irc message to a particular user nick.

        Not implemented yet; always raises NotImplementedError.
        """
        # TODO: look up the user object for nick, slice the message text
        # that follows "nick: " out of line, and call
        # user_object.message(date, msg).
        raise NotImplementedError("Not implemented")  # FIXME XXX HACK
def handleEnvelope(child, userTable, logStartTime):
    """Book every <message> inside one <envelope> node to its sender.

    The sender nick is read from the envelope's <sender> element and
    resolved to a user object via getUserStatsForNick.  Each message is
    routed by its attributes: type="notice" goes to handleMessageNotice,
    a non-empty "action" attribute goes to user.action(), anything else to
    user.message().  Returns the timestamp of the last message processed,
    or None when the envelope holds no messages.
    """
    senders = child.getElementsByTagName('sender')
    speaker = getUserStatsForNick(
        getIrcNickAndValidate(senders), userTable, logStartTime)

    last_seen = None
    for msg_node in child.getElementsByTagName('message'):
        # Only the first 19 characters ("YYYY-MM-DD HH:MM:SS") are parsed;
        # anything after them in the attribute is ignored.
        received = msg_node.getAttribute('received')
        when = datetime.strptime(received[:19], "%Y-%m-%d %H:%M:%S")
        xml_text = msg_node.toprettyxml(encoding="utf-8")

        if msg_node.getAttribute('type') == u"notice":
            handleMessageNotice(speaker, when, xml_text)
        else:
            if msg_node.getAttribute('action'):
                book = speaker.action
            else:
                book = speaker.message
            book(when, xml_text)
        last_seen = when
    return last_seen
def handleLogDOM(dom, userTable):
    """Process the elements in a colloquy log DOM.

    dom       -- the log element; its 'began' attribute supplies the log
                 start time (only the first 19 characters are parsed)
    userTable -- table of user objects, updated by the envelope and event
                 handlers

    <envelope> children are passed to handleEnvelope and <event> children
    to handleEvent.  Text nodes containing anything but whitespace are
    reported on stdout and skipped.  Any other child raises
    NotSupportedError.  After the walk, every user in userTable is
    part()ed at the last timestamp seen (None if there was none).
    """
    logStartTime = datetime.strptime(dom.getAttribute('began')[:19], "%Y-%m-%d %H:%M:%S")
    logEndTime = None
    for child in dom.childNodes:
        if child.nodeName == u"envelope":  # one or more lines from a user
            ended = handleEnvelope(child, userTable, logStartTime)
            # An envelope without messages yields None; keep the last
            # known end time in that case.
            if ended is not None:
                logEndTime = ended
        elif child.nodeName == "#text":
            # Report a stray text node once, not once per character.
            if child.data.strip('\t\n\x0b\x0c\r '):
                print("Unexpected text node: \"%s\"" % child.data)
                print("continuing...")
        elif child.nodeName == u"event":  # IRC server event
            logEndTime = handleEvent(child, userTable, logStartTime)
        else:  # violates log spec
            # nodeName exists on every DOM node; tagName only on elements.
            raise NotSupportedError("Unknown child node " + child.nodeName)
    # Snapshot the keys so the loop does not depend on the table staying
    # unmodified while users are parted.
    for user in list(userTable.keys()):
        userTable[user].part(logEndTime)
def ircLower(s):
    """Convert the input string to all-lower-case, with attempts to respect RFC 2812 s2.2

    Besides ordinary lower-casing, '[', ']', '\\' and '~' are folded to
    '{', '}', '|' and '^' respectively.  Returns the converted string.
    """
    s = s.lower()
    # str.replace returns a new string, so each result must be re-bound.
    for upper, lower in (('[', '{'), (']', '}'), ('\\', '|'), ('~', '^')):
        s = s.replace(upper, lower)
    return s
def getUserStatsForNick(irc_nick, userTable, logStartTime):
    """Return the userTable entry for irc_nick, creating it on first sight.

    A nick that is not yet in userTable gets a fresh UserStats built from
    the nick, logStartTime and userTable.getID().
    """
    if irc_nick in userTable:
        return userTable[irc_nick]
    newcomer = UserStats(irc_nick, logStartTime, userTable.getID())
    userTable[irc_nick] = newcomer
    return userTable[irc_nick]
def handleMessageNotice(user_object, timestamp, message):
    """Report on stdout that a notice message was seen but is unsupported.

    user_object and timestamp are accepted but not used; message is
    printed between two horizontal rules so that it can serve as sample
    input for future support.
    """
    rule = "------------------------------------------------------------------------------"
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(rule)
    print("System notice detected, but currently unsupported by the parser.")
    print("Now that there's some sample input for what notices look like,")
    print("this can be fixed.")
    print("")
    print(message)
    print(rule)
def handleEvent(child, userTable, logStartTime):
    """Apply one <event> node to the user table and return its timestamp.

    child        -- the event element; 'occurred' (first 19 characters) and
                    'name' attributes are read, plus its <who> and, for
                    nick changes, <old> elements
    userTable    -- table of user objects, looked up via getUserStatsForNick
    logStartTime -- passed on when a user has to be created

    memberParted / memberJoined call part() / join() on the user;
    memberNewNickname adds the new nick to the user known under the old
    one; newNickname is ignored; anything else is reported on stdout.
    Returns the event's timestamp as a datetime.
    """
    timestamp = child.getAttribute('occurred')
    event_name = child.getAttribute('name')
    timestamp = datetime.strptime(timestamp[:19], "%Y-%m-%d %H:%M:%S")

    whos = child.getElementsByTagName('who')
    if len(whos) > 0:
        irc_nick = getIrcNickAndValidate(whos)
    else:
        # this is odd, and doesn't it violate the Colloquy spec? yeeagh
        # Bind the name anyway so member events below cannot hit an
        # unbound local.
        irc_nick = None

    if event_name in ("memberParted", "memberJoined", "memberNewNickname"):
        if irc_nick is None:
            # No <who> to attribute the event to: report it and move on.
            print("Event " + event_name + " has no <who> element; ignoring")
        elif event_name == "memberParted":
            user_object = getUserStatsForNick(irc_nick, userTable, logStartTime)
            user_object.part(timestamp)
        elif event_name == "memberJoined":
            user_object = getUserStatsForNick(irc_nick, userTable, logStartTime)
            user_object.join(timestamp)
        else:  # memberNewNickname: <who> is the new nick, <old> the previous one
            old = getIrcNickAndValidate(child.getElementsByTagName('old'))
            user_object = getUserStatsForNick(old, userTable, logStartTime)
            user_object.addNick(irc_nick)
    elif event_name == "newNickname":
        pass
    else:
        print("Unhandled event " + event_name)
    return timestamp
def getIrcNickAndValidate(element_list):
    """Return the IRC-lower-cased nick described by a one-element node list.

    The nick is taken from the element's 'identifier' attribute, or from
    the value of its first child node when that attribute is empty.

    Raises IndexError if element_list is empty (the list is printed first)
    and NotSupportedError if it holds more than one element.
    """
    try:
        first = element_list[0]
    except IndexError:
        # Show what was passed in before letting the error propagate.
        print(element_list)
        raise
    if len(element_list) > 1:
        parent = first.parentNode
        raise NotSupportedError("Multiple senders on node: " + parent.toprettyxml())
    identifier = first.getAttribute('identifier')
    if identifier == '':
        return ircLower(first.firstChild.nodeValue)
    return ircLower(identifier)