textpreprocess.py
#!/usr/bin/python
import common.html2text
import common.tokenizer
#from nltk import word_tokenize
from nltk.tokenize import WordPunctTokenizer # This is better for sentences containing unicode, like: u"N\u00faria Espert"
word_tokenize = WordPunctTokenizer().tokenize
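# For example, WordPunctTokenizer().tokenize(u"N\u00faria Espert") should give
# [u"N\u00faria", u"Espert"], keeping the accented word as a single token.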
#from nltk.corpus import stopwords
# Use the PyStemmer stemmer, since it is written in C and is thus much faster than the NLTK Porter stemmer.
import Stemmer
#from nltk.stem.porter import PorterStemmer
import os.path
import re
import string
STOPFILE = os.path.join(os.path.abspath(os.path.dirname(os.path.realpath(__file__))), "english.stop")
stoplist = None    # Lazily loaded from STOPFILE in _removestopwords.
# Matches any run of whitespace (used to strip all whitespace).
_wsre = re.compile(r"\s+")
# Matches the characters we keep: word characters, hyphens, apostrophes, and spaces.
_alphanumre = re.compile(r"[\w\-' ]", re.UNICODE)
#stemmer = PorterStemmer()
stemmer = Stemmer.Stemmer("english")
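# For example, stemmer.stemWords(["running", "foxes"]) should return
# ["run", "fox"] under the Snowball English (Porter2) rules.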

def textpreprocess(txt, converthtml=True, sentencetokenize=True, removeblanklines=True,
                   replacehyphenbyspace=True, wordtokenize=True, lowercase=True,
                   removestopwords=True, stem=True, removenonalphanumericchars=True,
                   stemlastword=False, stripallwhitespace=False):
    """
    Preprocess txt according to the enabled steps, returning one processed
    sentence per line.
    Note: For html2text, one could also use NCleaner (common.html2text.batch_nclean).
    Note: One could improve the sentence tokenization by using the
    original HTML formatting in the tokenization.
    Note: We use PyStemmer's English (Porter-style) stemmer, constructed once
    at module level so it is not rebuilt on every call.
    """
    if converthtml:
        txt = common.html2text.html2text(txt)
    if sentencetokenize:
        txts = common.tokenizer.tokenize(txt)
    else:
        txts = [txt]
    txt = None
    if removeblanklines:
        # Drop sentences that contain only whitespace.
        txts = [t for t in txts if len(t.strip()) > 0]
    if replacehyphenbyspace:
        txts = [t.replace("-", " ") for t in txts]
    if wordtokenize:
        txtwords = [word_tokenize(t) for t in txts]
    else:
        txtwords = [t.split() for t in txts]
    txts = None
    if lowercase:
        txtwords = [[w.lower() for w in t] for t in txtwords]
    if removestopwords:
        txtwords = _removestopwords(txtwords)
    if stem:
        txtwords = _stem(txtwords)
    # TODO: Maybe remove Unicode accents? http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    if removenonalphanumericchars:
        txtwords = _removenonalphanumericchars(txtwords)
        # Discard words that became empty after character filtering.
        txtwords = [[w for w in t if w != ""] for t in txtwords]
    if stemlastword:
        txtwords = _stemlastword(txtwords)
    txts = [" ".join(words) for words in txtwords]
    if stripallwhitespace:
        txts = _stripallwhitespace(txts)
    return "\n".join(txts)

def _removestopwords(txtwords):
    global stoplist
    # stoplist = stopwords.words("english")
    if stoplist is None:
        stoplist = frozenset(l.strip() for l in open(STOPFILE))
    return [[w for w in t if w not in stoplist] for t in txtwords]

def _stem(txtwords):
    # stemmer = PorterStemmer()
    # return [[stemmer.stem(w) for w in t] for t in txtwords]
    return [stemmer.stemWords(t) for t in txtwords]

def _removenonalphanumericchars(txtwords):
    return [["".join(c for c in w if _alphanumre.search(c) is not None) for w in t] for t in txtwords]

def _stemlastword(txtwords):
    # return [t[:-1] + [stemmer.stem(t[-1])] for t in txtwords if len(t) > 0]
    return [t[:-1] + [stemmer.stemWord(t[-1])] for t in txtwords if len(t) > 0]

def _stripallwhitespace(txts):
    return [_wsre.sub("", txt) for txt in txts]

if __name__ == "__main__":
    import sys
    print(textpreprocess(sys.stdin.read()))
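
# Example usage from a shell (assumes the common/ package, PyStemmer, NLTK,
# and english.stop are available alongside this script):
#
#   echo "<p>The quick brown foxes were running.</p>" | python textpreprocess.py
#
# With a typical English stoplist this would print something like:
#
#   quick brown fox run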