forked from IsaacHaze/main_core_retention
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmcr.py
150 lines (110 loc) · 4.02 KB
/
mcr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
import sys
import os
from io import open
from collections import defaultdict, Counter
from heap import heap
import regex as re
from nltk import pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.tag.mapping import map_tag
def load_stopwords():
path = os.path.join(os.path.dirname(__file__), 'english_stopwords.txt')
with open(path, encoding='utf-8') as f:
stopwords = set(f.read().split())
return stopwords
def weight(vertex, neighbors, C):
"""Return the sum of edge weights of the vertex.
The edges under consideration are restricted to those endpoints
which are in set C.
"""
return sum(count for n, count in neighbors[vertex].items() if n in C)
def N(neighbors, C):
"""Neighbors restricted to set C."""
return C.intersection(neighbors.keys())
def p_cores(vertices, neighbors):
"""Generalized cores algorithm from Batagelj & Zaveršnik 2002
Calculates the weighted k-cores decomposition. The weight function
p() is hard-coded to be the edge weights in the graph.
References
==========
Batagelj, Vladimir, and Matjaž Zaveršnik. "Generalized cores."
arXiv preprint cs/0202039 (2002).
Batagelj, Vladimir, and Matjaž Zaveršnik. "An O(m) algorithm for
cores decomposition of networks." arXiv preprint cs/0310049
(2003).
"""
# XXX: either this is buggy, or the numbers in the paper
# rousseau-ecir2015 are incorrect. Check this against networkx'
# k-cores implementation?
cores = {}
C = set(v for v in vertices)
p = {v: weight(v, neighbors, C) for v in vertices}
min_heap = heap.heap([])
for v in vertices:
min_heap[v] = weight(v, neighbors, C)
while len(C) > 0:
top = min_heap.pop()
C.remove(top)
cores[top] = p[top]
# prints "removing '{}' score {}".format(top, p[top])
for v in N(neighbors[top], C):
p[v] = max(p[top], weight(v, neighbors, C))
min_heap[v] = p[v]
return cores
def ngrams(lst, n=4):
N = len(lst)
if N < n:
return
yield
for i in range(N-n+1):
yield lst[i:i+n]
def add_edges(graph, lst):
# N = len(lst)
for i, a in enumerate(lst):
for j, b in enumerate(lst):
if i == j:
continue
graph[a][b] += 1
# graph[a][b] += 1.0/(1+(i-j)**2)
# graph[a][b] += 2.0**-(abs(i-j)-1)
def get_tokens(fname, stopwords):
with open(fname, encoding='utf-8') as f:
text = f.read()
text = re.sub(r'\d', '9', text)
# word_re = re.compile(r'(\p{L}[\p{L}_-]+|\p{P}+)')
word_re = re.compile(r'(\p{L}[\p{L}_-]*|\p{N}+)')
tokens = word_re.findall(text)
# retain = set(['NOUN', 'ADJ', 'ADV', 'PROPN'])
retain = set(['NOUN', 'ADJ'])
# pos_tagged = pos_tag(tokens)
# print "pos_tags:", " ".join("{}_{}".format(*t) for t in pos_tagged)
tokens = [tok for tok, tag in pos_tag(tokens)
if map_tag('en-ptb', 'universal', tag) in retain]
tokens = [tok.lower() for tok in tokens]
tokens = [tok for tok in tokens if tok not in stopwords]
stemmer = PorterStemmer()
tokens = [stemmer.stem(tok) for tok in tokens]
# print "====="
# print "tokens:", " ".join(tokens)
# print "====="
return tokens
if __name__ == '__main__':
fname = sys.argv[1]
stopwords = load_stopwords()
toks = get_tokens(fname, stopwords)
graph = defaultdict(Counter)
for ngram in ngrams(toks, n=4):
# print " ".join(ngram)
add_edges(graph, ngram)
# print "graph({}):".format(len(graph))
# for a, counter in graph.items():
# for b, count in counter.items():
# print "{}\t{}\t{}".format(a, b, count)
# print "====="
cores = p_cores(graph.keys(), graph)
top_k = max(cores.values())
main_core = [(node, val) for node, val in cores.items() if val >= top_k]
for stem, k_core in main_core:
# print u"{}\t{}".format(stem, k_core).encode('utf-8')
print u"{}".format(stem, k_core).encode('utf-8')