1
1
#!/usr/bin/env python3
2
2
3
- import re
4
- import pandas as pd
5
3
import argparse as arg
6
4
import itertools
5
+ import re
6
+ import pandas as pd
7
7
8
8
9
9
# Announce startup on the console so long-running processing is visible.
print ("Starting process_polis_data.py program" )
13
13
14
14
def getargs():
    """Parse command-line arguments for the Polis openData processing script.

    Returns:
        argparse.Namespace with attributes:
            export_directory (str): path to the Polis export directory.
            participants_votes (str): participants-votes CSV path; defaults to
                ``<export_directory>/participants-votes.csv`` when not given.
            comments (str): comments CSV path; defaults to
                ``<export_directory>/comments.csv`` when not given.
            output_file (str): required output CSV path.
            exclude_ungrouped_participants (bool): True when the flag is set.
    """
    parser = arg.ArgumentParser(
        description="Process Polis data from the openData export data."
    )
    parser.add_argument("export_directory", help="Path to export directory.")
    parser.add_argument(
        "--participants-votes", help="Participants votes file (override)."
    )
    parser.add_argument(
        "--comments", help="Path to the comments file (override)."
    )
    parser.add_argument(
        "-o", "--output_file", help="Path to the output CSV file.", required=True
    )
    parser.add_argument(
        "--exclude-ungrouped-participants",
        # Fixed help text: this store_true flag EXCLUDES ungrouped
        # participants (the old text said "include", contradicting the name).
        help="Exclude ungrouped participants from the output.",
        action="store_true",
    )

    args = parser.parse_args()
    # Fall back to the conventional file locations inside the export directory
    # when no explicit override was supplied.
    args.participants_votes = (
        args.participants_votes
        or f"{args.export_directory}/participants-votes.csv"
    )
    args.comments = args.comments or f"{args.export_directory}/comments.csv"
    return args
30
40
@@ -49,29 +59,29 @@ def getargs():
49
59
print ("Args processed" )

# Comment ids must be integers so the merges/joins below line up.
comments["comment-id"] = comments["comment-id"].astype(int)

# Participants with no group assignment are either dropped entirely or kept
# under a sentinel group, depending on the command-line flag.
if args.exclude_ungrouped_participants:
    print ("Filtering out ungrouped participants" )
    votes = votes[~votes["group-id"].isna()]
else:
    # Sentinel -1 marks the "ungrouped" records; after the +1 shift below it
    # becomes pseudo-group 0, which is dealt with later in the pipeline.
    votes["group-id"] = votes["group-id"].fillna(-1)

# Shift group ids to be 1-based (the ungrouped sentinel moves from -1 to 0).
votes["group-id"] = votes["group-id"].astype(int) + 1

# Sorted so the per-group columns come out in a stable order in the header.
group_ids = sorted(votes["group-id"].unique())
print ("Group ids:" , group_ids )

# Comment columns in the votes matrix are exactly the all-digit column names.
comment_ids = [name for name in votes.columns if re.match(r"^\d+$", name)]
75
85
76
86
# Create a dictionary for mapping comment to total vote count for each column in
77
87
# the votes table, for later verification
@@ -80,23 +90,28 @@ def getargs():
80
90
comment_vote_counts [int (comment_id )] = votes [comment_id ].value_counts ().sum ()
81
91
82
92
# Reshape the wide votes matrix to long form: one row per
# (group-id, comment-id, vote value) observation.
melted_votes = votes.melt(
    id_vars=["group-id"],
    value_vars=comment_ids,
    var_name="comment-id",
    value_name="value",
)
melted_votes["comment-id"] = melted_votes["comment-id"].astype(int)

# Count each vote value per (comment, group), spread the counts into columns,
# and give them readable names in a single pipeline.
result = (
    melted_votes.groupby(["comment-id", "group-id"])["value"]
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns={-1: "disagree-count", 0: "pass-count", 1: "agree-count"})
)

# Pivot out the group-id column so each vote count column can later be named
# like "group-N-VOTE-count".
pivoted = result.pivot(index="comment-id", columns="group-id")
100
115
101
116
# A function for naming groups based on group id.
102
117
# Note that for the group_id == 0, the "ungrouped" pseudo-group, this returns "Group-none"
@@ -107,19 +122,20 @@ def group_name(group_id):
107
122
108
123
109
124
# Build a frame of per-group tally columns, keyed by comment id, for merging.
for_merge = pd.DataFrame({"comment-id": pivoted.index})
for gid, tally_col in itertools.product(
    group_ids, ["disagree-count", "pass-count", "agree-count"]
):
    for_merge[f"{group_name(gid)}-{tally_col}"] = pivoted[tally_col][gid].values

# Zero out the total vote tallies, since the exported values can be wrong due
# to filtering or database caching; they are recomputed from the matrix below.
for tally in ("agrees", "disagrees", "passes"):
    comments[tally] = 0

# Merge in the per-group tallies built above.
comments = comments.merge(for_merge, on="comment-id")
123
139
124
140
# add up from the votes matrix for consistency
125
141
for group_id in group_ids :
@@ -128,14 +144,18 @@ def group_name(group_id):
128
144
comments ["agrees" ] += comments [group + "-agree-count" ]
129
145
comments ["passes" ] += comments [group + "-pass-count" ]
130
146
131
# Total votes per comment is just the sum of the three tallies.
comments["votes"] = (
    comments["agrees"] + comments["disagrees"] + comments["passes"]
)

# Per-comment rates for each vote kind (tally column is the plural form).
for kind in ("agree", "disagree", "pass"):
    comments[kind + "_rate"] = comments[kind + "s"] / comments["votes"]

# High when agree/disagree are balanced and few people passed — i.e. a real
# difference of opinion rather than apathy.
comments["difference_of_opinion_rank"] = (
    1
    - abs(comments["agree_rate"] - comments["disagree_rate"])
    - comments["pass_rate"]
)
139
159
140
160
141
161
# Go through and check that all of our output comment["votes"] counts are no
@@ -144,10 +164,16 @@ def group_name(group_id):
144
164
# a result of filters applied based on who was grouped in the conversation analysis.
145
165
print ("Validating aggregate vote counts..." )
failed_validations = 0
for comment_id in comments["comment-id"]:
    # Compare the pre-melt per-column tally against the recomputed total;
    # the recomputed total should never exceed the original.
    original_count = comment_vote_counts[comment_id]
    new_count = comments[comments["comment-id"] == int(comment_id)]["votes"].iloc[0]
    if original_count < new_count:
        print(
            f"WARNING: Vote count mismatch for comment {comment_id}. "
            f"Original count: {original_count}, New count: {new_count}"
        )
        failed_validations += 1
152
178
if failed_validations == 0 :
153
179
print ("All validations passed!" )
@@ -157,14 +183,17 @@ def group_name(group_id):
157
183
# to non-strict moderation)
158
184
print ("N comments total:" , len (comments ))
print ("N votes total:" , comments ["votes" ].sum ())

# Keep explicitly approved comments (moderated == 1) plus unmoderated ones
# (moderated == 0) that attracted more than one vote.
keep_mask = (comments["moderated"] == 1) | (
    (comments["moderated"] == 0) & (comments["votes"] > 1)
)
moderated_comments = comments[keep_mask]
print ("N comments included after moderation:" , len (moderated_comments ))
print ("N votes after moderation:" , moderated_comments ["votes" ].sum ())

# Write the moderated set out as CSV, renaming the body column on the way.
moderated_comments = moderated_comments.rename(
    columns={"comment-body": "comment_text"}
)
moderated_comments.to_csv(args.output_file, index=False)
169
198
170
199
# Exit with non-zero error code if any validations failed
0 commit comments