# Dataset: https://archive.org/download/archiveteam-twitter-stream-2018-04/archiveteam-twitter-stream-2018-04.tar
# Script to extract only the relevant tweets (and the features we need from each) from the entire dataset, using a list of keywords.
import os
import pandas as pd
import time
import re
import multiprocessing
def clean(_text):
    # Strip @mentions, #hashtags, URLs and all non-alphanumeric characters,
    # then normalise whitespace and lowercase the result.
    _cleaned = ' '.join(re.sub(r'(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z\t])|(http\S+)', ' ', _text).split())
    return _cleaned.strip().lower()
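# Illustrative example: clean('Check https://t.co/x #refugees @UN!') returns
# 'check'; the URL, hashtag, mention and punctuation are all stripped before
# lowercasing.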
def _readJson(_file):
    print(_file)
    keyWords = ['migration', 'immigration', 'immigrants', 'migrants', 'immigrate', 'migrate', 'refugee', 'asylum', 'emigration', 'UNHCR']
    wordRe = re.compile('|'.join(keyWords), re.IGNORECASE)
    keep_columns = ['extended_tweet', 'user', 'lang']
    with open('all_data.csv', 'a') as f:
        try:
            data = pd.read_json(_file, lines=True)
            # Keep only tweets that carry all three columns we need.
            data = data[keep_columns].dropna().reset_index(drop=True)
            for uid, lang in enumerate(data['lang']):
                if lang == 'en':
                    user_desc = clean(str(data['user'][uid]['description']))
                    text = clean(str(data['extended_tweet'][uid]['full_text']))
                    # clean() strips commas, so a bare comma is a safe field separator here.
                    line = user_desc + ',' + text
                    if wordRe.search(text):
                        f.write(line + '\n')
        except Exception:
            # Files with malformed JSON or missing columns are skipped outright.
            pass
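# Caveat: every worker appends to the same 'all_data.csv'. Buffered writes from
# multiple processes can interleave, so lines may occasionally be corrupted.
# Below is a minimal sketch of a safer pattern (an assumption, not part of the
# original script): each worker writes to its own part-file keyed by PID, and
# the parts are merged once the pool has finished. The names 'PART_DIR',
# 'part_path' and 'merge_parts' are illustrative; the script as written does
# not call them.
PART_DIR = './parts'
def part_path():
    # One output file per worker process, so no file handle is ever shared.
    os.makedirs(PART_DIR, exist_ok=True)
    return os.path.join(PART_DIR, 'part-{}.csv'.format(os.getpid()))
def merge_parts(out_path='all_data.csv'):
    # Concatenate every part-file into the final CSV after pool.map() returns.
    with open(out_path, 'w') as merged:
        for name in sorted(os.listdir(PART_DIR)):
            with open(os.path.join(PART_DIR, name)) as p:
                merged.write(p.read())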
if __name__ == '__main__':
    start = time.time()
    rootDir = './dataset'
    files = []
    # Walk the extracted dataset and collect every file path.
    for dirName, subdirList, fileList in os.walk(rootDir):
        for fname in fileList:
            files.append(dirName + '/' + fname)
    # Filter the files in parallel, one worker per CPU core by default.
    pool = multiprocessing.Pool()
    pool.map(_readJson, files)
    pool.close()
    end = time.time()
    # No header row is ever written, so read headerless and name the columns here.
    df = pd.read_csv('all_data.csv', header=None, names=['user_desc', 'tweet'])
    print('\nNumber of tweets: {}'.format(df.shape[0]))
    print('Total time: {} h'.format((end - start) / 3600))
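# Usage sketch (assuming the archive tar has been extracted under ./dataset):
#   python filter.py
# The matching tweets accumulate in all_data.csv in the working directory.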