-
Notifications
You must be signed in to change notification settings - Fork 2
/
langkit.py
132 lines (104 loc) · 4.06 KB
/
langkit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/env python3
# Rui Carmo, 2017
# Miscellaneous helpers for NLTK
from collections import Counter
from logging import getLogger
from operator import itemgetter
from sys import maxunicode
from traceback import format_exc
from unicodedata import category

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import RSLPStemmer
log = getLogger()
STOPWORDS = {'en': stopwords.words('english'),
'pt': stopwords.words('portuguese')}
STEMMERS = {'en': PorterStemmer(),
'pt': RSLPStemmer()}
# RAKE extractor - requires python -m nltk.downloader stopwords punkt
# Build a full unicode punctuation dictionary based on glyph category
# (strings.punctuation doesn't cut it)
PUNCTUATION = dict.fromkeys([i for i in range(maxunicode) if category(chr(i)).startswith('P')])
def _extract_phrases(sentences, language="english"):
    """Extract candidate RAKE phrases from a list of sentences.

    Each sentence is lowercased and word-tokenized; stop words and
    punctuation act as phrase delimiters.

    :param sentences: iterable of sentence strings
    :param language: full language name understood by nltk (e.g. "english")
    :return: list of phrases, each a list of words with unicode
             punctuation characters stripped
    """
    def is_punctuation(word):
        # single glyph whose unicode category is punctuation ('P*')
        return len(word) == 1 and ord(word) in PUNCTUATION

    lang_stopwords = set(stopwords.words(language))
    phrase_list = []
    for sentence in sentences:
        # NOTE: word_tokenize can't quite cope with rich quotes,
        # so we'll need to clean up after it deals with punctuation
        words = map(lambda x: "|" if x in lang_stopwords else x,
                    word_tokenize(sentence.lower(), language))
        phrase = []
        for word in words:
            if word == "|" or is_punctuation(word):
                if len(phrase) > 0:
                    phrase_list.append(phrase)
                phrase = []
            else:
                phrase.append(word.translate(PUNCTUATION))  # remove unicode quotes
        # FIX: flush the trailing phrase of a sentence that doesn't end
        # in punctuation or a stop word — it was silently dropped before
        if phrase:
            phrase_list.append(phrase)
    return phrase_list
def _score_words(phrase_list):
"""Score words based on frequency"""
def is_numeric(word):
# NOTE: this is a quick and dirty way to cope with multi-digit figures
# but will be confused by currency
try:
int(word.replace(',', '').replace('.', ''))
return True
except ValueError:
return False
word_freq = FreqDist()
word_degree = FreqDist()
for phrase in phrase_list:
degree = len(list(filter(lambda x: not is_numeric(x), phrase))) - 1
for word in phrase:
word_freq[word] += 1
word_degree[word] += degree
for word in word_freq.keys():
word_degree[word] = word_degree[word] + word_freq[word] # itself
# word score = deg(w) / freq(w)
word_scores = {}
for word in word_freq.keys():
word_scores[word] = word_degree[word] / word_freq[word]
return word_scores
def _score_phrases(phrase_list, word_scores):
"""Score a phrase by tallying individual word scores"""
phrase_scores = {}
for phrase in phrase_list:
phrase_score = 0
# cumulative score of words
for word in phrase:
phrase_score += word_scores[word]
phrase_scores[" ".join(phrase)] = phrase_score
return phrase_scores
def extract_keywords(text, language="en", scores=False):
    """RAKE keyword extractor.

    :param text: plain text to extract keywords from
    :param language: ISO 639-1 code ("en" or "pt")
    :param scores: when True, return (phrase, score) tuples instead of
                   bare phrases
    :return: phrases sorted by descending RAKE score, or None (with an
             error logged) for an unsupported language code
    """
    languages = {"en": "english", "pt": "portuguese"}
    try:
        lang = languages[language]
    except KeyError:
        log.error(format_exc())
        return
    phrase_list = _extract_phrases(sent_tokenize(text, lang), lang)
    word_scores = _score_words(phrase_list)
    ranked = sorted(_score_phrases(phrase_list, word_scores).items(),
                    key=itemgetter(1), reverse=True)
    if scores:
        return ranked
    return [phrase for phrase, _ in ranked]
def tokenize(plaintext, language):
    """Tokenize plaintext into stemmed tokens.

    Lowercases the text, splits it on word characters, drops stop words
    and stems each remaining token.

    :param plaintext: text to tokenize
    :param language: ISO 639-1 code ("en" or "pt")
    :return: list of stemmed tokens, or None (with an error logged) for
             an unsupported language code
    """
    try:
        # set() gives O(1) membership tests instead of scanning the
        # stop-word list once per token
        stop_words = set(STOPWORDS[language])
        stemmer = STEMMERS[language]
    except KeyError:
        log.error(format_exc())
        return
    # Tokenize, remove stop words and stem
    tokenizer = RegexpTokenizer(r'\w+')
    return [stemmer.stem(word)
            for word in tokenizer.tokenize(plaintext.lower())
            if word not in stop_words]