1
1
#!/usr/bin/env python3
2
2
3
- import re
4
- import pandas as pd
5
3
import argparse as arg
6
4
import itertools
5
+ import re
6
+ import pandas as pd
7
7
8
8
9
9
# Announce startup on the console so long-running processing is visible.
print ("Starting process_polis_data.py program" )
13
13
14
14
def getargs():
    """Parse command-line arguments for the Polis openData processing script.

    Returns:
        argparse.Namespace with attributes:
            export_directory (str): path to the Polis export directory.
            participants_votes (str): participants-votes CSV path; defaults to
                ``<export_directory>/participants-votes.csv`` when not given.
            comments (str): comments CSV path; defaults to
                ``<export_directory>/comments.csv`` when not given.
            output_file (str): required output CSV path.
            exclude_ungrouped_participants (bool): True when the flag is set.
    """
    parser = arg.ArgumentParser(
        description="Process Polis data from the openData export data."
    )
    parser.add_argument("export_directory", help="Path to export directory.")
    parser.add_argument(
        "--participants-votes", help="Participants votes file (override)."
    )
    parser.add_argument(
        "--comments", help="Path to the comments file (override)."
    )
    parser.add_argument(
        "-o", "--output_file", help="Path to the output CSV file.", required=True
    )
    parser.add_argument(
        "--exclude-ungrouped-participants",
        # Fixed help text: this store_true flag EXCLUDES ungrouped
        # participants (the old text said "include", contradicting the name).
        help="Exclude ungrouped participants from the output.",
        action="store_true",
    )

    args = parser.parse_args()
    # Fall back to the conventional file locations inside the export directory
    # when no explicit override was supplied.
    args.participants_votes = (
        args.participants_votes
        or f"{args.export_directory}/participants-votes.csv"
    )
    args.comments = args.comments or f"{args.export_directory}/comments.csv"
    return args
30
40
@@ -49,29 +59,29 @@ def getargs():
49
59
print ("Args processed" )

# Comment ids must be integers so the merges/joins below line up.
comments["comment-id"] = comments["comment-id"].astype(int)

# Participants with no group assignment are either dropped entirely or kept
# under a sentinel group, depending on the command-line flag.
if args.exclude_ungrouped_participants:
    print ("Filtering out ungrouped participants" )
    votes = votes[~votes["group-id"].isna()]
else:
    # Sentinel -1 marks the "ungrouped" records; after the +1 shift below it
    # becomes pseudo-group 0, which is dealt with later in the pipeline.
    votes["group-id"] = votes["group-id"].fillna(-1)

# Shift group ids to be 1-based (the ungrouped sentinel moves from -1 to 0).
votes["group-id"] = votes["group-id"].astype(int) + 1

# Sorted so the per-group columns come out in a stable order in the header.
group_ids = sorted(votes["group-id"].unique())
print ("Group ids:" , group_ids )

# Comment columns in the votes matrix are exactly the all-digit column names.
comment_ids = [name for name in votes.columns if re.match(r"^\d+$", name)]
75
85
76
86
# Create a dictionary for mapping comment to total vote count for each column in
77
87
# the votes table, for later verification
@@ -80,23 +90,28 @@ def getargs():
80
90
comment_vote_counts [int (comment_id )] = votes [comment_id ].value_counts ().sum ()
81
91
82
92
# Reshape the wide votes matrix to long form: one row per
# (group-id, comment-id, vote value) observation.
melted_votes = votes.melt(
    id_vars=["group-id"],
    value_vars=comment_ids,
    var_name="comment-id",
    value_name="value",
)
melted_votes["comment-id"] = melted_votes["comment-id"].astype(int)

# Count each vote value per (comment, group), spread the counts into columns,
# and give them readable names in a single pipeline.
result = (
    melted_votes.groupby(["comment-id", "group-id"])["value"]
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns={-1: "disagree-count", 0: "pass-count", 1: "agree-count"})
)

# Pivot out the group-id column so each vote count column can later be named
# like "group-N-VOTE-count".
pivoted = result.pivot(index="comment-id", columns="group-id")
100
115
101
116
# A function for naming groups based on group id.
102
117
# Note that for the group_id == 0, the "ungrouped" pseudo-group, this returns "Group-none"
@@ -107,19 +122,20 @@ def group_name(group_id):
107
122
108
123
109
124
# Build a frame of per-group tally columns, keyed by comment id, for merging.
for_merge = pd.DataFrame({"comment-id": pivoted.index})
for gid, tally_col in itertools.product(
    group_ids, ["disagree-count", "pass-count", "agree-count"]
):
    for_merge[f"{group_name(gid)}-{tally_col}"] = pivoted[tally_col][gid].values

# Zero out the total vote tallies, since the exported values can be wrong due
# to filtering or database caching; they are recomputed from the matrix below.
for tally in ("agrees", "disagrees", "passes"):
    comments[tally] = 0

# Merge in the per-group tallies built above.
comments = comments.merge(for_merge, on="comment-id")
123
139
124
140
# add up from the votes matrix for consistency
125
141
for group_id in group_ids :
@@ -128,14 +144,18 @@ def group_name(group_id):
128
144
comments ["agrees" ] += comments [group + "-agree-count" ]
129
145
comments ["passes" ] += comments [group + "-pass-count" ]
130
146
131
# Total votes per comment is just the sum of the three tallies.
comments["votes"] = (
    comments["agrees"] + comments["disagrees"] + comments["passes"]
)

# Per-comment rates for each vote kind (tally column is the plural form).
for kind in ("agree", "disagree", "pass"):
    comments[kind + "_rate"] = comments[kind + "s"] / comments["votes"]

# High when agree/disagree are balanced and few people passed — i.e. a real
# difference of opinion rather than apathy.
comments["difference_of_opinion_rank"] = (
    1
    - abs(comments["agree_rate"] - comments["disagree_rate"])
    - comments["pass_rate"]
)
139
159
140
160
141
161
# Go through and check that all of our output comment["votes"] counts are no
@@ -144,10 +164,16 @@ def group_name(group_id):
144
164
# a result of filters applied based on who was grouped in the conversation analysis.
145
165
print ("Validating aggregate vote counts..." )
failed_validations = 0
for comment_id in comments["comment-id"]:
    # Compare the pre-melt per-column tally against the recomputed total;
    # the recomputed total should never exceed the original.
    original_count = comment_vote_counts[comment_id]
    new_count = comments[comments["comment-id"] == int(comment_id)]["votes"].iloc[0]
    if original_count < new_count:
        print(
            f"WARNING: Vote count mismatch for comment {comment_id}. "
            f"Original count: {original_count}, New count: {new_count}"
        )
        failed_validations += 1
152
178
if failed_validations == 0 :
153
179
print ("All validations passed!" )
@@ -157,14 +183,17 @@ def group_name(group_id):
157
183
# to non-strict moderation)
158
184
print ("N comments total:" , len (comments ))
print ("N votes total:" , comments ["votes" ].sum ())

# Keep explicitly approved comments (moderated == 1) plus unmoderated ones
# (moderated == 0) that attracted more than one vote.
keep_mask = (comments["moderated"] == 1) | (
    (comments["moderated"] == 0) & (comments["votes"] > 1)
)
moderated_comments = comments[keep_mask]
print ("N comments included after moderation:" , len (moderated_comments ))
print ("N votes after moderation:" , moderated_comments ["votes" ].sum ())

# Write the moderated set out as CSV, renaming the body column on the way.
moderated_comments = moderated_comments.rename(
    columns={"comment-body": "comment_text"}
)
moderated_comments.to_csv(args.output_file, index=False)
169
198
170
199
# Exit with non-zero error code if any validations failed
0 commit comments