-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck.py
68 lines (51 loc) · 2.19 KB
/
check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
__author__ = 'gilax'
# Occurrence threshold read by get_common_words: a word is reported only if
# at least one group's count for it is strictly greater than this value.
MAX = 0
# Percentage cutoff handed to get_common_words as `promise` by the script at
# the bottom of this file: every group's share of a word must be below it.
PERCENT = 15
from load_tweets import *
from collections import Counter
import numpy as np
def get_common_words(X, y, labels, promise):
    """
    Find the words whose usage is spread across the groups.

    Every tweet is lower-cased and split on whitespace. For each distinct
    word, the share of its occurrences that falls in each group is computed
    as a percentage rounded to two decimals (``0`` for a group that never
    used the word). A word is kept when every group's share is below
    ``promise`` and at least one group used it more than ``MAX`` times.

    Kept words are also written to ``check.txt`` in the current working
    directory (UTF-8, overwritten on every call), one ``word[percent, ...]``
    entry per line.

    :param X: tweets (sequence of strings)
    :param y: names indexes; ``y[i]`` is the group of ``X[i]`` and is used as
        a position among the ``len(labels)`` groups
    :param labels: list of names indexes; only its length is used
    :param promise: exclusive upper bound, in percent, on each group's share
    :return: a list of ``(word, [percent, ..., percent])`` tuples, ordered by
        descending total count of the word
    """
    group_counts = [Counter() for _ in range(len(labels))]
    total_counts = Counter()

    # Feed the counters tweet by tweet instead of concatenating every tweet
    # into one huge string first; the resulting counts (and the first-seen
    # order used to break ties in most_common) are the same.
    for i, raw_tweet in enumerate(X):
        words = raw_tweet.lower().split()
        group_counts[y[i]].update(words)
        total_counts.update(words)

    all_counter = []
    # The context manager guarantees the report is flushed and closed even if
    # something below raises.
    with open('check.txt', 'w', encoding='utf-8') as check:
        for word, amount in total_counts.most_common():
            # Counter lookups return 0 for a missing word (and do not insert
            # it), so a group that never used the word cannot raise KeyError.
            occurrences = [counts[word] for counts in group_counts]
            to_write = [
                round(n / amount * 100, 2) if n else 0
                for n in occurrences
            ]
            evenly_spread = all(percent < promise for percent in to_write)
            used_enough = any(n > MAX for n in occurrences)
            if evenly_spread and used_enough:
                check.write(f"{word}{to_write}\n")
                all_counter.append((word, to_write))
    return all_counter
# Script entry: load the data set (load_dataset is presumably provided by the
# star import from load_tweets -- confirm), then print how many words
# get_common_words reports for ten groups at the PERCENT cutoff.
tweets, names = load_dataset()
politician_labels = list(range(10))
print(
    len(
        get_common_words(tweets, names, politician_labels, PERCENT)
    )
)