-
Notifications
You must be signed in to change notification settings - Fork 20
/
clean.py
37 lines (35 loc) · 1.05 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
import re
from sys import stdin
from json import loads
from pybloom import BloomFilter
from HTMLParser import HTMLParser
# Shared state for the whole run: one HTML entity decoder and one Bloom
# filter that remembers every cleaned text that has already been printed.
hp = HTMLParser()
bloom = BloomFilter(capacity=100000000)

# Patterns are compiled once up front instead of being looked up again for
# every input line.
_URL_RE = re.compile(r'http[s]?://[^\s<>"]+|www\.[^\s<>"]+')
_MENTION_RE = re.compile(r'(^|[^@\w])@(\w{1,15})\b')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_SPACES_RE = re.compile(' +')

# Elongated spellings that get folded back to one canonical token.  The last
# two patterns only match when the word is surrounded by spaces.
_ELONGATED = (
    (re.compile('happ+y'), ' happy '),
    (re.compile('crazy+'), ' crazy '),
    (re.compile('lmao+'), ' lmao '),
    (re.compile('lmfao+'), ' lmfao '),
    (re.compile(' fu+ck '), ' fuck '),
    (re.compile(' shi+t '), ' shit '),
)


def _normalize(text):
    """Return `text` lower-cased and reduced to space-separated word tokens.

    HTML entities are decoded, the text is lower-cased and UTF-8 encoded,
    apostrophes are dropped, URLs become the token ``URL``, @mentions become
    the token ``USER``, every remaining character that is neither ``\\w`` nor
    whitespace becomes a space, a few elongated words are canonicalised and
    runs of spaces are collapsed.  The result may still start or end with a
    single space.
    """
    s = hp.unescape(text)
    s = s.lower().encode('utf-8')
    s = s.replace("'", '')
    s = s.replace("’", '')
    for whitespace in ("\n", "\r", "\t"):
        s = s.replace(whitespace, " ")
    s = _URL_RE.sub(' URL ', s)
    s = _MENTION_RE.sub(' USER ', s)
    # Underscores are turned into spaces only AFTER URLs and mentions have
    # been replaced.  Doing it earlier cut "@foo_bar" down to "@foo", so the
    # output contained " USER bar" (and " URL b" for "http://x/a_b").
    s = s.replace('_', ' ')
    s = _NON_WORD_RE.sub(' ', s)
    for pattern, canonical in _ELONGATED:
        s = pattern.sub(canonical, s)
    return _SPACES_RE.sub(' ', s)


# Main loop: one JSON document per stdin line, one cleaned text per stdout
# line, each distinct cleaned text printed at most once.
for line in stdin:
    if not line.strip():
        # Blank separator lines are not JSON documents; skip them instead of
        # letting loads() abort the whole run.
        continue
    j = loads(line)
    # Records without a usable 'text' field are skipped rather than raising
    # KeyError.  (Presumably the input is a tweet stream that also carries
    # non-tweet notices -- TODO confirm against the producer.)
    text = j.get('text') if isinstance(j, dict) else None
    if not text:
        continue
    s = _normalize(text)
    if len(s) > 4 and 'gameinsight' not in s:
        # Trim the padding space on BOTH ends so that texts differing only by
        # a trailing space are recognised as duplicates by the filter.
        s = s.strip(' ')
        if s not in bloom:
            bloom.add(s)
            # Single-argument parenthesised form prints the same bytes as
            # the bare Python 2 print statement.
            print(s)