# utils.py
import re
import pickle
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
from sklearn import preprocessing
from fkscore import fkscore
import numpy as np
'''
Interaction Dynamics utils
'''
# refined_sample = pickle.load(open('samples/interaction_dynamics_sample.pkl', mode='rb'))
# returns all comment authors that received a delta
def deltas(submission):
    delta_authors = []
    id_map = {}
    for comment in submission['comments']:
        if 'author' in comment.keys():
            id_map[comment['id']] = comment['author']
            if comment['author'] == 'DeltaBot':
                author = re.search(r"(?<=\/u\/).+(?=\.)", comment['body'])
                if author is not None:
                    delta_authors.append(author.group(0))
    return delta_authors
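# Usage sketch (hypothetical data): the submission dicts assumed here carry an 'author'
# field and a 'comments' list of dicts with 'id', 'author', and 'body' keys; the example
# below is illustrative, not taken from the actual sample files.
# example_submission = {'author': 'op_user', 'comments': [
#     {'id': 'c1', 'author': 'challenger_a', 'body': 'Counter-argument ...'},
#     {'id': 'c2', 'author': 'DeltaBot', 'body': 'Confirmed: 1 delta awarded to /u/challenger_a.'}]}
# deltas(example_submission)  # -> ['challenger_a']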
# sorts all comments in a submission by creation time
def sort_comments(submission):
    comments = []
    for comment in submission['comments']:
        if 'created' in comment.keys() and 'author' in comment.keys():
            comments.append(comment)
    return sorted(comments, key=lambda comment: comment['created'])
# counts the number of comments made by each challenger
def num_author_comments(submission):
    id_map = {}
    op = submission['author']
    for comment in submission['comments']:
        if 'author' in comment.keys():
            id_map[comment['id']] = comment['author']
    author_comment_map = defaultdict(int)
    for comment in submission['comments']:
        if 'author' in comment.keys():
            if comment['author'] not in ['DeltaBot', op]:
                author_comment_map[comment['author']] += 1
    return author_comment_map
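# Usage sketch: counts exclude the OP and DeltaBot, so for a hypothetical thread where
# 'challenger_a' commented twice, the OP replied once, and DeltaBot posted once, the
# result would be defaultdict(int, {'challenger_a': 2}).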
# counts the length of back-and-forths between challenger and op
def op_challenger_chain_counts(submission):
    id_map = {}
    op = submission['author']
    for comment in submission['comments']:
        if 'author' in comment.keys():
            id_map[comment['id']] = comment['author']
    op_chal_chains = defaultdict(int)
    for comment in submission['comments']:
        if 'author' in comment.keys() and comment['replies']:
            for reply_id in comment['replies']['data']['children']:
                if reply_id in id_map.keys():
                    if id_map[reply_id] == op:
                        op_chal_chains[comment['author']] += 1
                    elif comment['author'] == op:
                        op_chal_chains[id_map[reply_id]] += 1
    return op_chal_chains
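# Reading of the traversal above (assumed data layout): each comment's 'replies' field is
# taken to follow the raw Reddit listing shape, with reply ids stored under
# comment['replies']['data']['children']. Every OP reply to a challenger's comment, and
# every challenger reply to an OP comment, adds one to that challenger's chain count.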
'''
Language Indicator utils
'''
stop_words = set(stopwords.words('english'))
# simple implementation of jaccard score and other measures for two lists
def jaccard(A, O):
    set1 = set(A)
    set2 = set(O)
    return len(set1.intersection(set2)) / len(set1.union(set2))

def common_words(A, O):
    return len(set(A).intersection(set(O)))

def reply_fraction(A, O):
    set1 = set(A)
    set2 = set(O)
    return len(set1.intersection(set2)) / len(set1)

def op_fraction(A, O):
    set1 = set(A)
    set2 = set(O)
    return len(set1.intersection(set2)) / len(set2)
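# Worked example (hypothetical token lists; the fraction names suggest A is the reply and
# O the original post):
# A = ['the', 'cat', 'sat'], O = ['the', 'dog', 'sat', 'down']
# common_words(A, O)    # -> 2 ({'the', 'sat'})
# jaccard(A, O)         # -> 2 / 5 = 0.4
# reply_fraction(A, O)  # -> 2 / 3
# op_fraction(A, O)     # -> 2 / 4 = 0.5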
# tokenizing and cleaning of comments text, replacing quotes and links with special tokens
def tokenize(comment):
    comment = comment.replace('>', 'QUOTE')
    comment = re.sub(r'http\S+', 'LINK', comment)
    comment = comment.replace('(LINK', 'LINK')
    tokens = word_tokenize(comment)
    return tokens
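# Example (hypothetical comment text): markdown quote markers and URLs are collapsed into
# the special tokens QUOTE and LINK before NLTK tokenization.
# tokenize('> I disagree, see https://example.com')
# # -> ['QUOTE', 'I', 'disagree', ',', 'see', 'LINK']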
# identity function (just return)
def id(obj):
    return obj
# function to remove stop words from tokenized
def remove_stop_words(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

# function to only retrieve the stop words from tokenized
def only_stop_words(tokens):
    return [word for word in tokens if word.lower() in stop_words]
# separate a root-path unit into root reply and full path
def separate(rooted_path_unit):
    return rooted_path_unit['comments'][0]['body'], '\n\n'.join([comment['body'] for comment in rooted_path_unit['comments']])
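# Usage sketch (assumed structure): a rooted path unit appears to be a dict whose
# 'comments' list holds the root reply first and then the rest of the path in order, so
# the call returns (root reply body, all bodies in the path joined by blank lines).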
interplay_features = ['# common ', 'jaccard ', 'op frac ', 'reply frac ']
interplay_bases = ['all', 'content', 'stop']
interplay_cols = []
for feature in interplay_features:
    for base in interplay_bases:
        interplay_cols.append(feature + base)
INTERPLAY_COLS = interplay_cols
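# Resulting column names (derived from the loops above): '# common all', '# common content',
# '# common stop', 'jaccard all', 'jaccard content', 'jaccard stop', 'op frac all',
# 'op frac content', 'op frac stop', 'reply frac all', 'reply frac content', 'reply frac stop'.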
# load VAD and concreteness csv as pd, keep the mean columns for each word
CONCRETENESS = pd.read_csv('docs/concreteness.csv')[['Word', 'Rating.Mean']]
VAD = pd.read_csv('docs/VAD.csv')[['Word', 'V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum']]
ARG_COLS = ['# indefinite articles', '# definite articles', '# 1st person pronouns', '# 1st person plural pronouns',
            '# 2nd person pronouns', '# of links', '# of quotes', '# questions',
            'V', 'A', 'D', 'C',
            '# of sentences', '# of paragraphs', 'Flesch-Kincaid Readability']
def word_count(tokens):
    return len([w for w in tokens if w.isalnum()])
# return info for word category-based features in tokenized
def word_category_info(tokens):
    quote_link_counter = Counter(tokens)
    num_links = quote_link_counter['LINK']
    num_quotes = quote_link_counter['QUOTE']
    lowered = [token.lower() for token in tokens]
    counter = Counter(lowered)
    features_map = {'# indefinite articles': ['a', 'an'], '# definite articles': ['the'],
                    '# 1st person pronouns': ['i', 'me', 'my', 'mine', 'myself'],
                    '# 1st person plural pronouns': ['us', 'we', 'ours', 'ourselves', 'our'],
                    '# 2nd person pronouns': ['you', 'your', 'yours', 'yourself', 'yourselves'],
                    '# questions': ['?']}
    count_map = {feature: sum([counter[token] for token in value]) for feature, value in features_map.items()}
    count_map['# of links'] = num_links
    count_map['# of quotes'] = num_quotes
    return count_map
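# Example (hypothetical tokens): for tokens = ['I', 'think', 'the', 'answer', 'is', 'a', 'LINK', '?']
# the counts come out as {'# indefinite articles': 1, '# definite articles': 1,
# '# 1st person pronouns': 1, '# 1st person plural pronouns': 0, '# 2nd person pronouns': 0,
# '# questions': 1, '# of links': 1, '# of quotes': 0}.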
# returns average V,A,D,C for content words in tokenized passage
def word_score_info(tokens):
    results = {feature: 0.0 for feature in ['V', 'A', 'D', 'C']}
    tokens = remove_stop_words(tokens)
    lowered_tokens = [token.lower() for token in tokens]
    new_length = word_count(lowered_tokens)
    for token in lowered_tokens:
        if token in VAD['Word'].values:
            results['V'] += VAD.loc[VAD['Word'] == token, 'V.Mean.Sum'].item()
            results['A'] += VAD.loc[VAD['Word'] == token, 'A.Mean.Sum'].item()
            results['D'] += VAD.loc[VAD['Word'] == token, 'D.Mean.Sum'].item()
        if token in CONCRETENESS['Word'].values:
            results['C'] += CONCRETENESS.loc[CONCRETENESS['Word'] == token, 'Rating.Mean'].item()
    return {key: value / new_length for key, value in results.items()}
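# Note on the averaging above (a reading of the code, not a documented contract): scores
# are summed only for content words found in the VAD / concreteness lexicons, then divided
# by the count of alphanumeric content tokens, so out-of-lexicon words pull the averages
# toward zero and an input with no content words would divide by zero.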
def num_sentences(text):
    return len(sent_tokenize(text))

def num_paragraphs(text):
    return len(text.split('\n\n'))

def flesch_kincaid_score(text):
    return fkscore(text).score['readability']
# entire text information for comments
def entire_text_features(text):
    results = {}
    results['# of sentences'] = num_sentences(text)
    results['# of paragraphs'] = num_paragraphs(text)
    results['Flesch-Kincaid Readability'] = flesch_kincaid_score(text)
    return results
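# Usage sketch (hypothetical text): for a two-sentence, one-paragraph comment,
# entire_text_features would return something like
# {'# of sentences': 2, '# of paragraphs': 1, 'Flesch-Kincaid Readability': <fkscore value>};
# the readability number depends on the fkscore package and is not reproduced here.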
def quarters(text):
    tokens = tokenize(text)
    return [list(array) for array in np.array_split(tokens, 4)]
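# Example (hypothetical text): np.array_split yields four roughly equal chunks, so an
# eight-token comment such as 'one two three four five six seven eight' comes back as
# four two-token lists (['one', 'two'], ['three', 'four'], and so on).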
'''
Persuasion utils
'''
# CMV posts append a footer at the end of the post; strip it and prepend the title to the post body
def remove_footer(submission):
    lines = [line for line in submission['selftext'].splitlines()
             if not line.lstrip().startswith('>') and not line.lstrip().startswith('_____')
             and "edit" not in " ".join(line.lower().split())
             ]
    return submission['title'] + "\n\n" + "\n".join(lines)
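# Usage sketch (hypothetical submission): with submission['title'] = 'CMV: Cats make better pets'
# and a selftext whose trailing lines are the CMV footer (quoted '>' lines, a '_____'
# separator, or edit notes), remove_footer returns the title, a blank line, and only the
# non-footer body lines.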
def remove_CMV(text):
    return text.replace('CMV:', '', 1)
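# Example: remove_CMV('CMV: Cats make better pets')  # -> ' Cats make better pets'
# (only the first 'CMV:' marker is removed; the leading space is left in place)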