-
Notifications
You must be signed in to change notification settings - Fork 0
/
getReplies.py
130 lines (114 loc) · 4.22 KB
/
getReplies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import sys
import os
import re
import pandas as pd
from collections import Counter
def getReplies(df, output, top=0, text_column='body', date=''):
    """Count how often each post is referenced and write the result to a csv.

    Every string in ``df[text_column]`` is scanned for ``>>`` followed by
    7 to 10 digits. The number of times each referenced number occurs is
    stored, per row, in a new ``mentions`` column by looking up the row's
    ``id`` value; rows whose id is never referenced get 0. References are
    compared numerically, so references that differ only by leading zeros
    are counted together.

    Args:
        df: pandas DataFrame with an ``id`` column and the column named by
            ``text_column``. The ``mentions`` column is added to this
            object in place.
        output: base name used for the csv file that is written.
        top: when greater than 0, only the ``top`` rows with the highest
            ``mentions`` value are written, sorted in descending order.
            Otherwise all rows are written in their existing order.
        text_column: name of the column holding the post texts. Values
            that are not strings (e.g. NaN for empty posts) are skipped.
        date: text appended to the file name, used by the caller to tell
            per-day or per-month files apart.

    Returns:
        None. The result is written to
        ``output/<output>-most-replied-to-<label><date>.csv`` where
        ``<label>`` is ``all`` or ``top-<top>-``.
    """
    # Extract all the mentions from the posts
    print('Counting all >> mentions in the posts...')
    # Compiled once, outside the loop; the group captures only the digits.
    mention_pattern = re.compile(r'>>([0-9]{7,10})')
    mention_counts = Counter()
    for post in df[text_column].tolist():
        if isinstance(post, str):
            mention_counts.update(
                int(number) for number in mention_pattern.findall(post)
            )

    # Add mentions to the dataframe. A single dict lookup per id replaces
    # the comparison of every id against every counted reference.
    print('Adding mentions to a new csv...')
    df['mentions'] = [mention_counts.get(post_id, 0) for post_id in df['id']]

    # If only the top n should be returned, sort and slice the DataFrame
    top_label = 'all'
    if top > 0:
        df = df.sort_values(by='mentions', ascending=False)
        df = df[:top]
        top_label = 'top-' + str(top) + '-'

    print('Writing to csv...')
    os.makedirs('output/', exist_ok=True)
    # NOTE(review): with top=0 and a non-empty date the label and the date
    # are joined without a separator ('all2019-01'). Kept as is so existing
    # file names do not change.
    file_name = output + '-most-replied-to-' + top_label + date + '.csv'
    df.to_csv('output/' + file_name, encoding='utf-8')
    print('Done! ' + file_name + ' created.')
# Show manual if needed
def _print_manual():
    """Print the command line manual for this script."""
    manual = [
        "",
        "Counts which posts have been replied to the most.",
        "Adds a 'mentions' column and makes a new csv.",
        "Use the 'top' parameters to only save the top n replied to posts.",
        "",
        "Usage: python3 getReplies.py [--source] [--textcol] [--top] [--timespan] [--timecol]",
        "",
        "--source: the relative path to a csv file from 4CAT (e.g. 'data/datasheet.csv').",
        "--top (optional): default 0 - input a number, and if provided, saves the csv only with the top n most replies to posts.",
        "--textcol (optional): default 'body' - the csv column with the body texts.",
        "--timespan (optional): if provided, make separate csvs with the most replied to posts per month or day. Use 'months' or 'days'.",
        "--timecol (optional): default 'timestamp' - the csv column in which the time values are stored. Should start with format yyyy-mm-dd.",
        "",
        "Example: python getReplies.py --source=input/datasheet.csv --textcol=body --top=50 --timespan=days --timecol=timestamp",
        "",
    ]
    print('\n'.join(manual))


def _parse_args(argv):
    """Interpret the ``--name=value`` command line arguments in ``argv``.

    ``argv[0]`` (the script name) is skipped and unknown arguments are
    ignored. ``--textcol=`` is the documented name of the text column
    option; ``--text=`` is still accepted because it is the only spelling
    the script used to recognise.

    Returns:
        dict with the keys ``source``, ``output``, ``top``, ``text_column``,
        ``time_column`` and ``timespan`` (``False`` when not provided).

    Exits with status 1 when ``--top`` is not a whole number.
    """
    options = {
        'source': '',
        'output': '',
        'top': 0,
        'text_column': 'body',
        'time_column': 'timestamp',
        'timespan': False,
    }
    li_args = []
    for arg in argv[1:]:
        name, separator, value = arg.partition('=')
        if not separator:
            continue
        if name == '--source':
            options['source'] = value
            # File name without directories and extension, whatever the
            # length of the extension.
            options['output'] = os.path.splitext(os.path.basename(value))[0]
            li_args.append(options['source'])
            li_args.append(options['output'])
        elif name in ('--textcol', '--text'):
            options['text_column'] = value
            li_args.append(value)
        elif name == '--top':
            try:
                options['top'] = int(value)
            except ValueError:
                print("Please provide a whole number for --top, like this: --top=50")
                sys.exit(1)
            li_args.append(options['top'])
        elif name == '--timespan':
            options['timespan'] = value
            li_args.append(value)
        elif name == '--timecol':
            options['time_column'] = value
            li_args.append(value)
    # Echo the recognised values, as the script has always done.
    print(li_args)
    return options


def main(argv=None):
    """Run the script: parse the arguments, read the csv, write the result(s).

    Args:
        argv: argument list including the script name; defaults to
            ``sys.argv``.

    Exits with status 1 after printing the manual when no arguments are
    given, and with status 1 plus a message when the source file or the
    timespan value is invalid.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        _print_manual()
        sys.exit(1)

    options = _parse_args(argv)
    source = options['source']
    output = options['output']
    top = options['top']
    text_column = options['text_column']
    time_column = options['time_column']
    timespan = options['timespan']

    if source == '' or not os.path.isfile(source):
        print("Please provide a valid input file like this: --source=data/datasheet.csv")
        sys.exit(1)

    df = pd.read_csv(source)
    if timespan is False:
        getReplies(df, output, top=top, text_column=text_column)
        return

    print('Getting most replied to posts per ' + timespan)
    # Number of leading characters of a 'yyyy-mm-dd...' value that
    # identify the day or the month.
    prefix_lengths = {'days': 10, 'months': 7}
    if timespan not in prefix_lengths:
        print('Please provide a valid date format (\'days\' or \'months\')')
        sys.exit(1)

    # Get the dates to make files for. Rows without a time value cannot be
    # assigned to a date and are left out.
    has_time = df[time_column].notna()
    date_prefixes = df[time_column].astype(str).str[:prefix_lengths[timespan]]
    li_dates = sorted(set(date_prefixes[has_time]))
    print('Dates to check: ', li_dates)
    for date_slice in li_dates:
        print('Getting most replied to for ' + date_slice)
        # Compare the prefix exactly instead of searching the whole value,
        # and pass a copy because getReplies adds a column to its input.
        rows = df[has_time & (date_prefixes == date_slice)].copy()
        getReplies(rows, output, top=top, text_column=text_column, date=date_slice)


if __name__ == '__main__':
    main()