#!/usr/bin/env python
# -*- coding: utf-8 -*-
from operator import itemgetter
from functools import reduce
import sys
import argparse
import itertools

import nltk
from os.path import dirname, join, realpath

# Look up NLTK corpora (punkt, stopwords, WordNet) in the nltk_data directory
# bundled next to this script.
dir_path = dirname(realpath(__file__))
nltk.data.path = [join(dir_path, 'nltk_data')]

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
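# Setup note (an assumption, not part of the original script): if the bundled
# nltk_data directory is missing, the required corpora can be fetched once
# into it, e.g.:
#   nltk.download('punkt', download_dir=join(dir_path, 'nltk_data'))
#   nltk.download('stopwords', download_dir=join(dir_path, 'nltk_data'))
#   nltk.download('wordnet', download_dir=join(dir_path, 'nltk_data'))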
# Helpers that apply preprocessing to a list of token strings.
def preprocess(tokens):
    return lemmatize(remove_stopwords(lowercase(tokens)))

def lowercase(tokens):
    return [t.lower() for t in tokens]

def lemmatize(tokens):
    return [wordnet_lemmatizer.lemmatize(t) for t in tokens]

def remove_stopwords(tokens):
    return [t for t in tokens if t.lower() not in stop_words]
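# Illustrative example (assumed input; exact lemmas depend on the local
# WordNet data): preprocess(['The', 'Cats', 'are', 'running'])
# -> ['cat', 'running']   (stopwords 'the'/'are' removed, 'cats' lemmatized)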
def handle_unicode(lines):
    # Input files are opened in text mode, so lines are normally str already;
    # decode only if raw bytes slip through.
    return [l.decode("utf-8") if isinstance(l, bytes) else l for l in lines]

def flatten(nestedList):
    return list(itertools.chain(*nestedList))

def to_sents(lines):
    return flatten([sent_tokenize(line) for line in lines])

def to_tokens(sents):
    return flatten([word_tokenize(sent) for sent in sents])
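# Illustrative example: to_sents(["First sentence. Second sentence.", "Third."])
# -> ['First sentence.', 'Second sentence.', 'Third.']
# to_tokens on that result yields the flat list of word/punctuation tokens.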
def compact(lines):
    """
    Remove empty lines.
    """
    return [x for x in lines if x and not x.isspace()]

def strip(lines):
    """
    Strip whitespace from input lines.
    """
    return [x.strip() for x in lines]

def leading(lines, word_limit):
    # Baseline: take leading sentences in order until the word limit is reached.
    sents = to_sents(lines)
    summary = ""
    while sents and len(word_tokenize(summary)) < word_limit:
        summary += " " + sents.pop(0)
    return summary
# main methods
def orig(lines, word_limit):
    return sum_basic(lines, word_limit, True)

def simplified(lines, word_limit):
    return sum_basic(lines, word_limit, False)

def sum_basic(lines, word_limit, update_non_redundancy=True):
    def weight(sents, distribution):
        # Average word probability of a sentence under the current distribution.
        def _weight_sent(sent):
            tokens = preprocess(word_tokenize(sent))
            if not tokens:
                return 0.0
            return reduce(lambda x, y: x + y, [distribution.get(x, 0) for x in tokens]) / len(tokens)
        return [_weight_sent(sent) for sent in sents]

    def probability_distribution(tokens):
        # Unigram probability of each distinct (preprocessed) word.
        N = len(tokens)
        distinct_words = set(tokens)
        probabilities = [tokens.count(w) / N for w in distinct_words]
        return dict(zip(distinct_words, probabilities))

    sents = to_sents(lines)
    tokens = preprocess(to_tokens(sents))
    pd = probability_distribution(tokens)

    summary = ""
    while sents and len(word_tokenize(summary)) < word_limit:
        weights = weight(sents, pd)
        # Pick the sentence with the highest average word probability.
        highest_weight_sentence = max(zip(sents, weights), key=itemgetter(1))[0]
        summary += " " + highest_weight_sentence
        if update_non_redundancy:
            # Original SumBasic update: square the probability of every word in
            # the chosen sentence so already-covered content is penalised.
            for token in preprocess(word_tokenize(highest_weight_sentence)):
                pd[token] = pd[token] * pd[token]
        else:
            # Simplified variant: drop the chosen sentence instead.
            sents.remove(highest_weight_sentence)
    return summary
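# Illustrative usage sketch (hypothetical input, not invoked by the CLI below):
#   sample = ["The cat sat on the mat. The cat purred.",
#             "A dog barked at the cat."]
#   print(orig(sample, 20))        # original SumBasic update
#   print(simplified(sample, 20))  # simplified variant, no probability update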
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="A4 SumBasic")
    parser.add_argument("method", choices=[
        "orig", "simplified", "leading"
    ], help="summarizer method")
    parser.add_argument('infiles', nargs='*', type=argparse.FileType('r'),
                        default=[sys.stdin])
    args = parser.parse_args()

    nestedlines = [f.readlines() for f in args.infiles]
    lines = compact(strip(handle_unicode(flatten(nestedlines))))

    if args.method == "orig":
        print(orig(lines, 100))
    elif args.method == "simplified":
        print(simplified(lines, 100))
    elif args.method == "leading":
        print(leading(lines, 100))
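# Example invocations (file names are placeholders):
#   python sumbasic.py orig doc1.txt doc2.txt
#   cat doc1.txt | python sumbasic.py leading
# Each prints a roughly 100-word summary produced by the chosen method.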