started documenting analysis scripts
karacolada committed Jul 4, 2024
1 parent d89bf1c commit 7e9f180
Showing 4 changed files with 43 additions and 17 deletions.
11 changes: 11 additions & 0 deletions src/analysis/README.md
@@ -0,0 +1,11 @@
# `src/analysis`

The scripts in this directory were used to produce derived data and plots.

- [`aggregate_datasets.py`](./aggregate_datasets.py) aggregates all data mined from GitHub into four datasets described in the wiki. Crucially, for three of those datasets the data is reshaped into a time-indexed format (a rough sketch of that reshaping is given at the end of this README).
- [`mention_type_timeline.py`](./mention_type_timeline.py) visualises how the way a repository is cited relates to the gap between its creation date and the date of the citing publication.
- [`repo_intent.py`](./repo_intent.py) creates a dataset with all repositories mined from ePrints for which we manually determined the citation type. The resulting dataset contains data from ePrints as well as a label indicating whether the software was cited as created software.
- [`overall.py`](./overall.py)
- [`repository_timeline.py`](./repository_timeline.py)

The schemas of all produced datasets are documented in the wiki.
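
For illustration, here is a minimal sketch of the time-indexed reshaping mentioned above, assuming weekly bucketing and a toy star-event table (the `github_user_cleaned_url` key is taken from the scripts; everything else here is made up, and the real schemas are the ones in the wiki):

```python
import pandas as pd

# Hypothetical input: one row per star event, as mined from GitHub.
stars = pd.DataFrame({
    "github_user_cleaned_url": ["org/repo-a", "org/repo-a", "org/repo-b"],
    "date": pd.to_datetime(["2021-01-04", "2021-01-18", "2021-02-01"]),
})

# Bucket events into weeks relative to each repository's first event,
# then count events per repository per week.
first_event = stars.groupby("github_user_cleaned_url")["date"].transform("min")
stars["week"] = (stars["date"] - first_event).dt.days // 7
timeline = (
    stars.groupby(["github_user_cleaned_url", "week"])
    .size()
    .rename("new_stars")
    .reset_index()
)
print(timeline)
```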
15 changes: 8 additions & 7 deletions src/analysis/aggregate_datasets.py
@@ -376,7 +376,7 @@ def merge_min_max_weeks(min_max_week_df, df, week_col, name):
timelines_df = timelines_df.set_index("github_user_cleaned_url")
return timelines_df

def main(dir, verbose):
def main(dir, outdir, verbose):
info(verbose, f"Loading data...")
metadata = load_data(dir, "metadata.csv", "created_at")
contents = load_data(dir, "contents.csv", ["citation_added", "contributing_added"])
@@ -418,16 +418,16 @@ def main(dir, verbose):
left_on="github_user_cleaned_url",
right_index=True
)
overall_df.to_csv(os.path.join(dir, "aggregated_overall.csv"))
overall_df.to_csv(os.path.join(outdir, "aggregated_overall.csv"))
info(verbose, "Overall aggregation complete.")

info(verbose, "Aggregating timelines...")
readme_history = analyse_headings(readme_history)
timelines_df = timelines_init(metadata, contents, contributions, forks, stars, issues, readme_history)
issue_users_timeline_df = user_type_wrt_issues(issues, timelines_df)
commit_authors_timeline_df = user_type_wrt_commits(contributions)
issue_users_timeline_df.to_csv(os.path.join(dir, "aggregated_issue_user_timeline.csv"))
commit_authors_timeline_df.to_csv(os.path.join(dir, "aggregated_commit_author_timeline.csv"))
issue_users_timeline_df.to_csv(os.path.join(outdir, "aggregated_issue_user_timeline.csv"))
commit_authors_timeline_df.to_csv(os.path.join(outdir, "aggregated_commit_author_timeline.csv"))
issue_counts_df = no_open_and_closed_issues(issues, timelines_df)
engagement_df = engagement(forks, stars, timelines_df)
highlights_df = date_highlights(readme_history, contents, paper_data, timelines_df)
@@ -449,15 +449,16 @@ def main(dir, verbose):
right_index=True,
how="left"
)
overall_timeline_df.to_csv(os.path.join(dir, "aggregated_timeline.csv"))
overall_timeline_df.to_csv(os.path.join(outdir, "aggregated_timeline.csv"))
info(verbose, "Timeline aggregation complete.")

if __name__=="__main__":
parser = argparse.ArgumentParser(
prog="aggregate_datasets",
description="Aggregate crawled data into output datasets."
)
parser.add_argument("--dir", default="../data/analysis", type=str, help="path to data directory")
parser.add_argument("--datadir", default="../../data/raw/github", type=str, help="path to data directory")
parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to use for output data")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.dir, args.verbose)
main(args.datadir, args.outdir, args.verbose)
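
`main` above calls a `load_data` helper that is defined earlier in the script and not shown in this diff. Judging from the call sites, e.g. `load_data(dir, "metadata.csv", "created_at")`, it plausibly reads a CSV from the data directory and parses the named column(s) as dates; the body below is an assumption, not the actual implementation:

```python
import os
import pandas as pd

def load_data(datadir, filename, date_cols):
    """Hypothetical sketch of the load_data helper used by aggregate_datasets.py."""
    df = pd.read_csv(os.path.join(datadir, filename))
    if isinstance(date_cols, str):  # accept a single column name or a list of names
        date_cols = [date_cols]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors="coerce")
    return df
```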
18 changes: 13 additions & 5 deletions src/analysis/mention_type_timeline.py
@@ -1,17 +1,22 @@
import argparse
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import timedelta

def main():
eprints_df = pd.read_csv("../data/outputs/eprints_w_intent.csv", index_col=0)
metadata_df = pd.read_csv("../data/analysis/metadata.csv", index_col=0)
def main(githubdir, outdir):
# load data mapping ePrints publication data to intent (produced by repo_intent.py)
eprints_df = pd.read_csv(os.path.join(outdir, "eprints_w_intent.csv"), index_col=0)
# load GitHub metadata
metadata_df = pd.read_csv(os.path.join(githubdir, "metadata.csv"), index_col=0)
# combine data
df = eprints_df.merge(metadata_df[["github_user_cleaned_url", "created_at"]], left_on="github_repo_id", right_on="github_user_cleaned_url")
df["eprints_date"] = pd.to_datetime(df.eprints_date)
df["created_at"] = pd.to_datetime(df.created_at)
df["mention type"] = np.where(df["mention_created"], "created", "not created")
# plot repo creation date against date listed in ePrints entry (assumed to be publication date)
ax = plt.axes()
ax.grid(True)

@@ -34,11 +39,14 @@ def main():
ylabel="publication date") # it's usually the publication date, though not always
ax.set_title("Mention type depending on distance between repo creation and publication date")
plt.tight_layout()
plt.savefig("../data/analysis/overall/mention_type_timeline.png")
plt.savefig(os.path.join(outdir, "plots/overall/mention_type_timeline.png"))

if __name__=="__main__":
parser = argparse.ArgumentParser(
prog="mention_type_timeline",
description="Plot mention type (created/not created) against the repo creation date and publication date."
)
main()
parser.add_argument("--githubdir", default="../../data/raw/github", type=str, help="path to GitHub data directory")
parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to use for output data")
args = parser.parse_args()
main(args.githubdir, args.outdir)
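
The plotting lines between the two hunks are collapsed in this view. Based on the surrounding code (the `seaborn` import, the axis labels and the title), the scatter plot is plausibly built along the following lines; the `sns.scatterplot` call and the three-row example frame are assumptions:

```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Tiny stand-in for the merged frame built in main().
df = pd.DataFrame({
    "created_at": pd.to_datetime(["2019-05-01", "2021-03-15", "2020-07-20"]),
    "eprints_date": pd.to_datetime(["2020-01-01", "2020-06-01", "2021-02-01"]),
    "mention type": ["created", "not created", "created"],
})

ax = plt.axes()
ax.grid(True)
sns.scatterplot(data=df, x="created_at", y="eprints_date", hue="mention type", ax=ax)
ax.set(xlabel="repository creation date", ylabel="publication date")
ax.set_title("Mention type depending on distance between repo creation and publication date")
plt.tight_layout()
plt.show()
```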
16 changes: 11 additions & 5 deletions src/analysis/repo_intent.py
@@ -1,16 +1,21 @@
import os
import pandas as pd
import argparse

def main(filepath, true_list_path, false_list_path):
def main(filepath, true_list_path, false_list_path, outdir):
# read ePrints data
df = pd.read_csv(filepath)
# filter to those we have manually collected intent labels for
df = df[df.page_no < 2]
# for each GitHub repository cited in a publication, find out whether it was mentioned as created software
created_map = {"github_repo_id": [], "mention_created": []}
for path, v in [(true_list_path, True), (false_list_path, False)]:
with open(path, "r") as f:
for line in f.readlines():
created_map["github_repo_id"].append(line.rstrip("\n"))
created_map["mention_created"].append(v)
created_df = pd.DataFrame(created_map)
# construct the final dataset
merged_df = created_df.merge(df, how="left", left_on="github_repo_id", right_on="github_user_cleaned_url")
print(f"[INFO] Candidates: {len(df)}\n Mapped: {len(created_df)}\n Result: {len(merged_df)}")
merged_df["date"] = pd.to_datetime(merged_df["date"])
@@ -45,16 +50,17 @@ def main(filepath, true_list_path, false_list_path):
merged_df.drop(columns=["github_user_cleaned_url"], inplace=True)
print("Schema:")
merged_df.info()
merged_df.to_csv("../data/outputs/eprints_w_intent.csv")
merged_df.to_csv(os.path.join(outdir, "eprints_w_intent.csv"))

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="repo_intent",
description="Load cleaned links from multiple ePrints repositories and merge them into one file."
description="Add repository intent to the ePrints publication data."
)
parser.add_argument("-f", "--file", required=True, type=str, help="data file")
parser.add_argument("-f", "--file", required=True, type=str, help="data file containing info extracted from all ePrints repositories")
parser.add_argument("--true", required=True, type=str, help="list of repo links cited as created")
parser.add_argument("--false", required=True, type=str, help="list of repo links that are just mentioned")
parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to derived data directory")
args = parser.parse_args()
main(args.file, args.true, args.false)
main(args.file, args.true, args.false, args.outdir)
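
For context, the `--true` and `--false` arguments point to plain-text files with one repository link per line. A small sketch of how those lists become the `mention_created` label column (the links below are made up):

```python
import pandas as pd

# Hypothetical contents of the --true / --false files: one repo link per line.
true_links = ["github.com/org/created-tool", "github.com/org/other-created-tool"]
false_links = ["github.com/org/merely-mentioned"]

created_map = {"github_repo_id": [], "mention_created": []}
for links, label in [(true_links, True), (false_links, False)]:
    for link in links:
        created_map["github_repo_id"].append(link)
        created_map["mention_created"].append(label)

created_df = pd.DataFrame(created_map)
print(created_df)  # three rows mapping each link to True/False
```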
