-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuser_merges
executable file
·249 lines (210 loc) · 9.17 KB
/
user_merges
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+#########+
# Copyright (C) 2009 Joe Blaylock <[email protected]>
#
#This program is free software: you can redistribute it and/or modify it under
#the terms of the GNU General Public License as published by the Free Software
#Foundation, either version 3 of the License, or (at your option) any later
#version.
#
#This program is distributed in the hope that it will be useful, but WITHOUT
#ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
#details.
#
#You should have received a copy of the GNU General Public License along with
#this program. If not, see <http://www.gnu.org/licenses/>.
"""Checks usernames.yaml for mergable records and suggests them.
FIXME: should also provide merge functionality
"""
# Imports
import os, sys
from UserTable import UserTable
from UserStats import UserStats
# Usage/help text passed as the `usage` argument of optparse.OptionParser in
# the __main__ section; optparse substitutes the program name for "%prog".
HELP_USAGE_EN = """usage: %prog [usernames.yaml] [irc_users.pickle]
Interactively scans user data files, suggesting possible merges and asking for permission."""
def match_report(n1, n1_mergelist, verbose=False):
    """Build the message proposing that the names in n1_mergelist be merged into n1.

    n1 -> the name shown on the left-hand side of the report
    n1_mergelist -> the candidate names shown (via str()) on the right-hand side
    verbose -> accepted but currently ignored (FIXME: make verbose do something)

    Returns the report as one string without a trailing newline.
    """
    pieces = (
        "The records on the right appear to be similar to the record on the left.\n",
        "You may want to consider merging them:\n",
        n1 + ": " + str(n1_mergelist),
    )
    return "".join(pieces)
def id_match_report(userTable, id1, id2):
    """Build the message proposing that the records id1 and id2 be merged.

    userTable -> object whose idToName() is called to get a name for each id
    id1, id2 -> the ids of the two records that look similar

    Returns the report as one string without a trailing newline; each id is
    shown next to the name userTable.idToName() gives for it.
    """
    header = (
        "The records on the right appear to be similar to the record on the left.\n"
        "You may want to consider merging them:\n"
    )
    left_name = userTable.idToName(id1)
    right_name = userTable.idToName(id2)
    return header + "%s: %s <==> %s: %s" % (id1, left_name, id2, right_name)
def ircLower(s):
    """Lower-case a nick following the case mapping of RFC 2812 section 2.2.

    Besides ordinary lower-casing, RFC 2812 treats the characters {}|^ as the
    lower-case equivalents of []\\~, so those are mapped as well.

    s -> the nick, either a byte string (decoded as UTF-8) or a unicode string

    Returns the normalized nick, always as a unicode string, so that two
    results can be compared with each other regardless of the input types.
    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    # type(u"") is `unicode` on Python 2 (and `str` on Python 3); spelling it
    # this way avoids depending on the Python-2-only builtin name.
    text_type = type(u"")
    if not isinstance(s, text_type):
        s = s.decode("utf8")
    s = s.lower()
    # Strings are immutable: str.replace() returns a new string, so the result
    # has to be re-assigned for the mapping to take effect.
    for upper, lower in (('[', '{'), (']', '}'), ('\\', '|'), ('~', '^')):
        s = s.replace(upper, lower)
    return s
def fuzzy_match(n1, n2):
    """Report whether one name is a prefix of the other, ignoring IRC case.

    Both names are first normalized with ircLower(). Returns True when the
    normalized n1 starts with the normalized n2 or the other way round, and
    False otherwise.

    If the comparison raises UnicodeDecodeError, the two normalized names are
    printed to help locate the offending record and the error is re-raised.
    """
    folded1 = ircLower(n1)
    folded2 = ircLower(n2)
    try:
        is_prefix = folded1.startswith(folded2) or folded2.startswith(folded1)
    except UnicodeDecodeError:
        print(folded1)
        print(folded2)
        raise
    return is_prefix
def ircSorted(nick_list):
    """Given a list of lists of nicks, sorts each list respecting IRC order.

    That is, input:
        [ [ nick, nick|away, nicklong ],
          [ bob045, bob_, bob ] ]
    returns an iterator which yields:
        [ nick, nicklong, nick|away ]
        [ bob, bob045, bob_ ]

    Every nick is normalized with ircLower() first (cf. ircLower()). Nicks are
    then compared character by character: the special characters {}|^_ sort
    after every other character and are considered equal to one another, and
    all remaining characters compare normally. A nick that is a prefix of
    another sorts before it.

    nick_list -> an iterable of iterables of nicks
    Yields one new sorted list of normalized nicks per inner iterable.
    """
    specials = '{}|^_'

    def irc_key(name):
        # One (rank, char) pair per character. Specials get rank 1 and a blank
        # char so that they sort last and tie with each other; lists of pairs
        # then compare lexicographically, which gives the per-character order.
        return [(1, '') if ch in specials else (0, ch) for ch in name]

    for names in nick_list:
        yield sorted((ircLower(name) for name in names), key=irc_key)
def confirm(text, aff='y'):
    """Prompt the user with text and report whether the answer is affirmative.

    text -> the prompt passed to raw_input()
    aff -> the character that counts as "yes" (compared against the first
           character of the lower-cased reply)

    Returns False for an empty reply; otherwise True exactly when the first
    character of the lower-cased reply equals aff.
    """
    reply = raw_input(text)
    if not reply:
        return False
    first_char = reply.lower()[0]
    return first_char == aff
def possible_wikiMerge(cnames, userTable):
    """Interactively suggest merges between wiki names and the names in cnames.

    cnames -> a dictionary mapping names to id's
    userTable -> the table whose per-id 'wiki' name lists are read and whose
                 idToName() is used (through id_match_report) for the report

    Each non-empty wiki name is fuzzy-matched against each non-empty name in
    cnames that belongs to a different, not yet visited id. Every distinct
    pair of matching ids is then reported to the user; the generator yields
    the pair, as a tuple of the two ids in sorted order, for each one the user
    confirms.
    """
    records = userTable._UserTable__yaml_data  # XXX: Acknowledged to be dangerous

    # Map each non-empty wiki name to the id of the record listing it
    # (a name listed by several records keeps the last id seen).
    wiki_ids = {}
    for record_id in records:
        for wiki_name in records[record_id]['wiki']:
            if wiki_name != '':
                wiki_ids[wiki_name] = record_id

    candidates = set()
    visited_ids = set()
    for wiki_name in wiki_ids:
        wiki_id = wiki_ids[wiki_name]
        if wiki_id in visited_ids:
            continue
        # Marked before the inner loop, so names belonging to this same id
        # are skipped below as well.
        visited_ids.add(wiki_id)
        for nick in cnames:
            nick_id = cnames[nick]
            if nick_id in visited_ids or nick == '' or nick_id == wiki_id:
                continue
            if fuzzy_match(wiki_name, nick):
                # Sorted tuple: the same two ids always produce the same entry.
                candidates.add(tuple(sorted((wiki_id, nick_id))))

    for pair in candidates:
        print(id_match_report(userTable, pair[0], pair[1]))
        if confirm("merge? y/N -> "):
            yield pair
def possible_merges(c_names, userTable):
    """Does all-against-all fuzzy matching and returns an iterator over similar items.

    c_names -> a dictionary mapping names to id's
    userTable -> a place we can dereference id's to objects (currently unused
                 here; kept so callers need not change)

    For every name that has at least one fuzzy match among the names of other
    id's, a report is printed and the user is asked to confirm. Each confirmed
    group is yielded as a list holding the matching names followed by the name
    they were matched against (that name is the last element).
    """
    # For each name, the set of names it has already been compared with, so
    # every unordered pair is examined only once. Sets keep the membership
    # test O(1); lists here made the whole scan cubic in the number of names.
    already_compared = dict((name, set()) for name in c_names)

    for name1 in c_names:
        n1_mergelist = []
        for name2 in c_names:
            # Skip identical names, pairs already handled, and names that
            # already belong to the same id. Both directions of a pair are
            # always recorded together below, so testing one direction is
            # sufficient.
            if (name1 == name2
                    or name2 in already_compared[name1]
                    or c_names[name1] == c_names[name2]):
                continue
            already_compared[name1].add(name2)
            already_compared[name2].add(name1)
            if fuzzy_match(name1, name2):
                n1_mergelist.append(name2)

        # We've got possible matches: ask the user to confirm, then yield them.
        if n1_mergelist:
            # FIXME: verbose command line option
            print(match_report(name1, n1_mergelist, verbose=False))
            if confirm("merge? y/N -> "):
                n1_mergelist.append(name1)
                yield n1_mergelist
# Command-line entry point: parse the options, load the user table, run the
# requested forced merges and/or interactive scans, then close the table.
if __name__ == "__main__":
    import optparse
    parser = optparse.OptionParser(usage = HELP_USAGE_EN)
    # -n clears options.scan (which defaults to True), turning the IRC scan off.
    parser.add_option('-n', "--no-scan", dest="scan", action="store_false", default=True,
                      help="Disable interactive data scan; useful along with -f.")
    parser.add_option('-w', "--wiki-scan", dest="wiki", action="store_true", default=False,
                      help="Interactive scan using wiki names; disables standard (IRC) scan.")
    # -i and -f each consume two arguments; -i parses them as integers.
    parser.add_option('-i', "--id-merge", action="store", nargs=2, type="int",
                      dest="merge_ids", metavar="id1 id2",
                      help="Force merge of data for id1 and id2, regardless of scan results. Use with care.")
    parser.add_option('-f', "--force-merge", action="store", nargs=2,
                      dest="merge_nicks", metavar="nick1 nick2",
                      help="Force merge of data for nick1 and nick2, regardless of scan results. Use with care.")
    #parser.add_option('-d', "--size_delta", action="store", nargs=1, type="int", default=3,
    #                  dest="size_delta", metavar="delta",
    #                  help="Only suggest two items as matches if they come within delta of each other in length.")
    options, args = parser.parse_args()
    # Invoked with no command-line arguments at all: show the help and exit.
    # As a consequence the all-defaults branch below (len(args) == 0) is only
    # reached when at least one option was given.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()
    # Work out the two data file names, falling back to the default names for
    # whichever positional arguments were left out.
    user_table_file = ''
    user_data_file = ''
    if len(args) == 2:
        user_table_file = args[0]
        user_data_file = args[1]
    elif len(args) == 1:
        user_table_file = args[0]
        user_data_file = 'irc_users.pickle'
    elif len(args) == 0:
        user_table_file = 'usernames.yaml'
        user_data_file = 'irc_users.pickle'
    else:
        parser.error("Zero, one, or two files must be specified.")
    userTable = UserTable(user_table_file, user_data_file)
    # Reaches into UserTable's name-mangled private attribute; per
    # possible_merges() this is expected to map names to id's.
    c_names = userTable._UserTable__commonNames # XXX: Acknowledged to be dangerous
    # Collects (id, 'irc', nick) tuples for nicks added during the scan below;
    # handed to userTable.close() at the end.
    dirty_list = []
    if options.wiki:
        # The wiki scan replaces the standard IRC scan.
        options.scan = False
        # Each confirmed suggestion is a sorted pair of id's (smaller first).
        for less, more in possible_wikiMerge(c_names, userTable):
            userTable.merge(more, less)
    if options.merge_nicks:
        nick1, nick2 = options.merge_nicks
        # The trailing comma keeps the cursor on the same line for " done.".
        print "Forcing merge of " + str(nick1) + " and " + str(nick2) + "...",
        userTable.merge(nick1, nick2)
        print " done."
    if options.merge_ids:
        id1, id2 = options.merge_ids
        print "Forcing merge of " + str(id1) + " and " + str(id2) + "...",
        userTable.merge(id1, id2)
        print " done."
    if options.scan:
        # First pass: for each name whose form without trailing '-'/'_'
        # characters is not itself a known name, offer to add that shorter
        # form as a nick of the same record.
        # NOTE(review): if addNick() changes c_names this mutates the dict
        # being iterated -- confirm against UserTable.
        for name in c_names:
            shorter = name.rstrip('-_')
            if shorter not in c_names:
                print match_report(name, [shorter], verbose=False)
                if confirm("merge? y/N -> "):
                    userTable[name].addNick(shorter)
                    dirty_list.append( (userTable[name].id, 'irc', shorter) )
        # Second pass: all-against-all fuzzy matching. The first name of each
        # confirmed group is passed as the first argument of every merge call.
        #for names in ircSorted( possible_merges(c_names, userTable) ):
        for names in possible_merges(c_names, userTable):
            primary = names[0]
            for secondary in names[1:]:
                userTable.merge(primary, secondary)
    # presumably persists the table, using dirty_list to know what changed --
    # TODO confirm against UserTable.close().
    userTable.close(dirty_list)