started documenting analysis scripts
karacolada committed Jul 4, 2024
1 parent d89bf1c commit 7e9f180
Showing 4 changed files with 43 additions and 17 deletions.
11 changes: 11 additions & 0 deletions src/analysis/README.md
@@ -0,0 +1,11 @@
# `src/analysis`

The scripts in this directory were used to produce derived data and plots.

- [`aggregate_datasets.py`](./aggregate_datasets.py) aggregates all data mined from GitHub into four datasets described in the wiki. Crucially, for three of those datasets the data is reshaped into a time-indexed format (a rough sketch of that reshaping is given at the end of this README).
- [`mention_type_timeline.py`](./mention_type_timeline.py) visualises how the way a repository is cited relates to the gap between its creation date and the date of the citing publication.
- [`repo_intent.py`](./repo_intent.py) creates a dataset with all repositories mined from ePrints for which we manually determined the citation type. The resulting dataset contains data from ePrints as well as a label indicating whether the software was cited as created software.
- [`overall.py`](./overall.py)
- [`repository_timeline.py`](./repository_timeline.py)

The schemas of all produced datasets are documented in the wiki.
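
For illustration, here is a minimal sketch of the time-indexed reshaping mentioned above, assuming weekly bucketing and a toy star-event table (the `github_user_cleaned_url` key is taken from the scripts; everything else here is made up, and the real schemas are the ones in the wiki):

```python
import pandas as pd

# Hypothetical input: one row per star event, as mined from GitHub.
stars = pd.DataFrame({
    "github_user_cleaned_url": ["org/repo-a", "org/repo-a", "org/repo-b"],
    "date": pd.to_datetime(["2021-01-04", "2021-01-18", "2021-02-01"]),
})

# Bucket events into weeks relative to each repository's first event,
# then count events per repository per week.
first_event = stars.groupby("github_user_cleaned_url")["date"].transform("min")
stars["week"] = (stars["date"] - first_event).dt.days // 7
timeline = (
    stars.groupby(["github_user_cleaned_url", "week"])
    .size()
    .rename("new_stars")
    .reset_index()
)
print(timeline)
```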
15 changes: 8 additions & 7 deletions src/analysis/aggregate_datasets.py
@@ -376,7 +376,7 @@ def merge_min_max_weeks(min_max_week_df, df, week_col, name):
timelines_df = timelines_df.set_index("github_user_cleaned_url")
return timelines_df

def main(dir, verbose):
def main(dir, outdir, verbose):
info(verbose, f"Loading data...")
metadata = load_data(dir, "metadata.csv", "created_at")
contents = load_data(dir, "contents.csv", ["citation_added", "contributing_added"])
@@ -418,16 +418,16 @@ def main(dir, verbose):
left_on="github_user_cleaned_url",
right_index=True
)
overall_df.to_csv(os.path.join(dir, "aggregated_overall.csv"))
overall_df.to_csv(os.path.join(outdir, "aggregated_overall.csv"))
info(verbose, "Overall aggregation complete.")

info(verbose, "Aggregating timelines...")
readme_history = analyse_headings(readme_history)
timelines_df = timelines_init(metadata, contents, contributions, forks, stars, issues, readme_history)
issue_users_timeline_df = user_type_wrt_issues(issues, timelines_df)
commit_authors_timeline_df = user_type_wrt_commits(contributions)
issue_users_timeline_df.to_csv(os.path.join(dir, "aggregated_issue_user_timeline.csv"))
commit_authors_timeline_df.to_csv(os.path.join(dir, "aggregated_commit_author_timeline.csv"))
issue_users_timeline_df.to_csv(os.path.join(outdir, "aggregated_issue_user_timeline.csv"))
commit_authors_timeline_df.to_csv(os.path.join(outdir, "aggregated_commit_author_timeline.csv"))
issue_counts_df = no_open_and_closed_issues(issues, timelines_df)
engagement_df = engagement(forks, stars, timelines_df)
highlights_df = date_highlights(readme_history, contents, paper_data, timelines_df)
@@ -449,15 +449,16 @@ def main(dir, verbose):
right_index=True,
how="left"
)
overall_timeline_df.to_csv(os.path.join(dir, "aggregated_timeline.csv"))
overall_timeline_df.to_csv(os.path.join(outdir, "aggregated_timeline.csv"))
info(verbose, "Timeline aggregation complete.")

if __name__=="__main__":
parser = argparse.ArgumentParser(
prog="aggregate_datasets",
description="Aggregate crawled data into output datasets."
)
parser.add_argument("--dir", default="../data/analysis", type=str, help="path to data directory")
parser.add_argument("--datadir", default="../../data/raw/github", type=str, help="path to data directory")
parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to use for output data")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
args = parser.parse_args()
main(args.dir, args.verbose)
main(args.datadir, args.outdir, args.verbose)
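
`main` above calls a `load_data` helper that is defined earlier in the script and not shown in this diff. Judging from the call sites, e.g. `load_data(dir, "metadata.csv", "created_at")`, it plausibly reads a CSV from the data directory and parses the named column(s) as dates; the body below is an assumption, not the actual implementation:

```python
import os
import pandas as pd

def load_data(datadir, filename, date_cols):
    """Hypothetical sketch of the load_data helper used by aggregate_datasets.py."""
    df = pd.read_csv(os.path.join(datadir, filename))
    if isinstance(date_cols, str):  # accept a single column name or a list of names
        date_cols = [date_cols]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors="coerce")
    return df
```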
18 changes: 13 additions & 5 deletions src/analysis/mention_type_timeline.py
@@ -1,17 +1,22 @@
import argparse
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import timedelta

def main():
eprints_df = pd.read_csv("../data/outputs/eprints_w_intent.csv", index_col=0)
metadata_df = pd.read_csv("../data/analysis/metadata.csv", index_col=0)
def main(githubdir, outdir):
# load data mapping ePrints publication data to intent (produced by repo_intent.py)
eprints_df = pd.read_csv(os.path.join(outdir, "eprints_w_intent.csv"), index_col=0)
# load GitHub metadata
metadata_df = pd.read_csv(os.path.join(githubdir, "metadata.csv"), index_col=0)
# combine data
df = eprints_df.merge(metadata_df[["github_user_cleaned_url", "created_at"]], left_on="github_repo_id", right_on="github_user_cleaned_url")
df["eprints_date"] = pd.to_datetime(df.eprints_date)
df["created_at"] = pd.to_datetime(df.created_at)
df["mention type"] = np.where(df["mention_created"], "created", "not created")
# plot repo creation date against date listed in ePrints entry (assumed to be publication date)
ax = plt.axes()
ax.grid(True)

@@ -34,11 +39,14 @@ def main():
ylabel="publication date") # it's usually the publication date, though not always
ax.set_title("Mention type depending on distance between repo creation and publication date")
plt.tight_layout()
plt.savefig("../data/analysis/overall/mention_type_timeline.png")
plt.savefig(os.path.join(outdir, "plots/overall/mention_type_timeline.png"))

if __name__=="__main__":
parser = argparse.ArgumentParser(
prog="mention_type_timeline",
description="Plot mention type (created/not created) against the repo creation date and publication date."
)
main()
parser.add_argument("--githubdir", default="../../data/raw/github", type=str, help="path to GitHub data directory")
parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to use for output data")
args = parser.parse_args()
main(args.githubdir, args.outdir)
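
The plotting lines between the two hunks are collapsed in this view. Based on the surrounding code (the `seaborn` import, the axis labels and the title), the scatter plot is plausibly built along the following lines; the `sns.scatterplot` call and the three-row example frame are assumptions:

```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Tiny stand-in for the merged frame built in main().
df = pd.DataFrame({
    "created_at": pd.to_datetime(["2019-05-01", "2021-03-15", "2020-07-20"]),
    "eprints_date": pd.to_datetime(["2020-01-01", "2020-06-01", "2021-02-01"]),
    "mention type": ["created", "not created", "created"],
})

ax = plt.axes()
ax.grid(True)
sns.scatterplot(data=df, x="created_at", y="eprints_date", hue="mention type", ax=ax)
ax.set(xlabel="repository creation date", ylabel="publication date")
ax.set_title("Mention type depending on distance between repo creation and publication date")
plt.tight_layout()
plt.show()
```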
16 changes: 11 additions & 5 deletions src/analysis/repo_intent.py
@@ -1,16 +1,21 @@
import os
import pandas as pd
import argparse

def main(filepath, true_list_path, false_list_path):
def main(filepath, true_list_path, false_list_path, outdir):
# read ePrints data
df = pd.read_csv(filepath)
# filter to those we have manually collected intent labels for
df = df[df.page_no < 2]
# for each GitHub repository cited in a publication, find out whether it was mentioned as created software
created_map = {"github_repo_id": [], "mention_created": []}
for path, v in [(true_list_path, True), (false_list_path, False)]:
with open(path, "r") as f:
for line in f.readlines():
created_map["github_repo_id"].append(line.rstrip("\n"))
created_map["mention_created"].append(v)
created_df = pd.DataFrame(created_map)
# construct the final dataset
merged_df = created_df.merge(df, how="left", left_on="github_repo_id", right_on="github_user_cleaned_url")
print(f"[INFO] Candidates: {len(df)}\n Mapped: {len(created_df)}\n Result: {len(merged_df)}")
merged_df["date"] = pd.to_datetime(merged_df["date"])
@@ -45,16 +50,17 @@ def main(filepath, true_list_path, false_list_path):
merged_df.drop(columns=["github_user_cleaned_url"], inplace=True)
print("Schema:")
merged_df.info()
merged_df.to_csv("../data/outputs/eprints_w_intent.csv")
merged_df.to_csv(os.path.join(outdir, "eprints_w_intent.csv"))

if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="repo_intent",
description="Load cleaned links from multiple ePrints repositories and merge them into one file."
description="Add repository intent to the ePrints publication data."
)
parser.add_argument("-f", "--file", required=True, type=str, help="data file")
parser.add_argument("-f", "--file", required=True, type=str, help="data file containing info extracted from all ePrints repositories")
parser.add_argument("--true", required=True, type=str, help="list of repo links cited as created")
parser.add_argument("--false", required=True, type=str, help="list of repo links that are just mentioned")
parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to derived data directory")
args = parser.parse_args()
main(args.file, args.true, args.false)
main(args.file, args.true, args.false, args.outdir)
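
For context, the `--true` and `--false` arguments point to plain-text files with one repository link per line. A small sketch of how those lists become the `mention_created` label column (the links below are made up):

```python
import pandas as pd

# Hypothetical contents of the --true / --false files: one repo link per line.
true_links = ["github.com/org/created-tool", "github.com/org/other-created-tool"]
false_links = ["github.com/org/merely-mentioned"]

created_map = {"github_repo_id": [], "mention_created": []}
for links, label in [(true_links, True), (false_links, False)]:
    for link in links:
        created_map["github_repo_id"].append(link)
        created_map["mention_created"].append(label)

created_df = pd.DataFrame(created_map)
print(created_df)  # three rows mapping each link to True/False
```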
