|
| 1 | +import ast |
| 2 | +import json |
| 3 | +from pathlib import Path |
| 4 | +import pandas as pd |
| 5 | + |
| 6 | +UNKNOWN_COUNTRY = "unknown" |
| 7 | + |
| 8 | +def _parse_affiliations_with_country(raw) -> list[tuple[str, str]]: |
| 9 | + """ |
| 10 | + Parse a JSON/Python-literal dict into [(name, country), …]. |
| 11 | + Always returns a country (missing ⇒ 'unknown'). Returns [] if unparseable. |
| 12 | + Expected shape (examples): |
| 13 | + '{"0": {"name": "MIT", "country": "United States"}}' |
| 14 | + '{0: {"name": "LANL"}}' |
| 15 | + """ |
| 16 | + if raw is None: |
| 17 | + return [] |
| 18 | + try: |
| 19 | + if pd.isna(raw): # safe even if raw isn't a pandas scalar |
| 20 | + return [] |
| 21 | + except Exception: |
| 22 | + pass |
| 23 | + |
| 24 | + if isinstance(raw, dict): |
| 25 | + parsed = raw |
| 26 | + else: |
| 27 | + s = str(raw).strip() |
| 28 | + if s in ("", "{}", "[]"): |
| 29 | + return [] |
| 30 | + try: |
| 31 | + parsed = json.loads(s) |
| 32 | + except Exception: |
| 33 | + try: |
| 34 | + parsed = ast.literal_eval(s) |
| 35 | + except Exception: |
| 36 | + return [] |
| 37 | + |
| 38 | + if not isinstance(parsed, dict): |
| 39 | + return [] |
| 40 | + |
| 41 | + out: list[tuple[str, str]] = [] |
| 42 | + for info in parsed.values(): |
| 43 | + if not isinstance(info, dict): |
| 44 | + continue |
| 45 | + name = info.get("name") |
| 46 | + if not isinstance(name, str) or not name.strip(): |
| 47 | + continue |
| 48 | + country = info.get("country") |
| 49 | + if isinstance(country, str): |
| 50 | + country = country.strip() or UNKNOWN_COUNTRY |
| 51 | + elif country is None: |
| 52 | + country = UNKNOWN_COUNTRY |
| 53 | + else: |
| 54 | + country = str(country).strip() or UNKNOWN_COUNTRY |
| 55 | + out.append((name.strip(), country)) |
| 56 | + return out |
| 57 | + |
def generate_top_affiliations_with_country(
    df_path: str | Path,
    affils_output_path: str | Path,
    min_total_papers: int = 20,
    country_filter: str | None = None,  # filter to exactly this country (or 'unknown')
    partition_by_year: bool = False,  # also emit one CSV per year
    per_year_output_dir: str | Path | None = None,
) -> None:
    """
    Write per-(affiliation, country, year) counts for the most frequent affiliations.

    Reads the CSV at df_path (must have 'year' and 'affiliations' columns), parses each
    row's affiliations, and counts one occurrence per affiliation entry per row. Rows
    whose 'year' is not numeric are dropped; remaining years are cast to int.
    Only (affiliation, country) pairs whose total count within the current filter is
    ≥ min_total_papers are kept. A 'country' value is always present; if it is missing
    in the source, 'unknown' is used.

    NOTE(review): a row that lists the same (name, country) under several keys is
    counted once per entry — confirm this is the intended meaning of "paper count".

    Args:
        df_path: Input CSV path.
        affils_output_path: Consolidated output CSV path; '.csv' is appended when the
            path has no suffix. Parent directories are created as needed.
        min_total_papers: Minimum total count for an affiliation to be kept.
        country_filter: If provided, restricts to that single country (exact string
            match, including 'unknown').
        partition_by_year: If True and the result is non-empty, also write one CSV per
            year named '<stem>.year=<year><suffix>'.
        per_year_output_dir: Directory for the per-year files; defaults to the
            directory of the consolidated output.

    Raises:
        KeyError: If the 'year' or 'affiliations' column is missing.

    Output columns are affiliation_name, country, year, paper_count. Progress and a
    top-20 summary are printed to stdout.
    """
    df = pd.read_csv(df_path)

    if 'year' not in df.columns:
        raise KeyError("Expected a 'year' column.")
    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    df = df[df['year'].notna()].copy()
    df['year'] = df['year'].astype(int)

    if 'affiliations' not in df.columns:
        raise KeyError("Expected an 'affiliations' column.")

    # 1) Parse affiliations → list of (name, country or 'unknown')
    #    (df is already a private copy at this point, so it is safe to add a column.)
    df['affil_tuples'] = df['affiliations'].apply(_parse_affiliations_with_country)

    # 2) Explode into one row per (paper, affiliation_name, country);
    #    rows with an empty list explode to NaN and are dropped.
    exploded_aff = df.explode('affil_tuples')
    exploded_aff = exploded_aff[exploded_aff['affil_tuples'].notna()].copy()
    # Explicit column names keep this working when nothing was parsed: without them
    # an empty list yields a zero-column frame and the two-column assignment raises.
    exploded_aff[['affiliation_name', 'country']] = pd.DataFrame(
        exploded_aff['affil_tuples'].tolist(),
        index=exploded_aff.index,
        columns=['affiliation_name', 'country'],
    )
    # Defensive fill (should already be set by parser)
    exploded_aff['country'] = exploded_aff['country'].fillna(UNKNOWN_COUNTRY)

    # 3) Optional: restrict to one specific country
    if country_filter is not None:
        exploded_aff = exploded_aff[exploded_aff['country'] == country_filter].copy()

    # 4) Totals per affiliation (within current filter scope)
    total_per_aff = (
        exploded_aff
        .groupby(['affiliation_name', 'country'])
        .size()
        .reset_index(name='total_papers')
        .sort_values('total_papers', ascending=False)
    )

    print("=== Affiliations",
          f"in [{country_filter}]" if country_filter is not None else "(all countries)",
          "with their total paper counts ===")
    print(total_per_aff.head(20).to_string(index=False))
    print("───────────────────────────────────────────────────────────────────────────\n")

    # 5) Keep affiliations with ≥ min_total_papers
    top_affils = total_per_aff[total_per_aff['total_papers'] >= min_total_papers][
        ['affiliation_name', 'country']
    ]

    if top_affils.empty:
        # Use the same `is not None` test as everywhere else in this function.
        print(f"No affiliation{' in ' + country_filter if country_filter is not None else ''} "
              f"meets ≥ {min_total_papers} total papers.")
        aff_year_counts = pd.DataFrame(columns=['affiliation_name', 'country', 'year', 'paper_count'])
    else:
        # 6) Per-year counts for top affiliations
        exploded_aff_top = exploded_aff.merge(top_affils, on=['affiliation_name', 'country'], how='inner')
        aff_year_counts = (
            exploded_aff_top
            .groupby(['affiliation_name', 'country', 'year'])
            .size()
            .reset_index(name='paper_count')
            .sort_values(['affiliation_name', 'country', 'year'])
            .reset_index(drop=True)
        )

    # 7) Write consolidated CSV
    affils_output_path = Path(affils_output_path)
    affils_output_path.parent.mkdir(parents=True, exist_ok=True)
    if affils_output_path.suffix == "":
        affils_output_path = affils_output_path.with_suffix(".csv")
    aff_year_counts.to_csv(affils_output_path, index=False, encoding="utf-8-sig")

    print(f"Wrote {len(aff_year_counts)} rows to {affils_output_path} "
          f"(≥ {min_total_papers} papers"
          f"{', country=' + country_filter if country_filter is not None else ', all countries'})")

    # 8) Optional: one file per year (same columns)
    if partition_by_year and not aff_year_counts.empty:
        out_dir = Path(per_year_output_dir) if per_year_output_dir else affils_output_path.parent
        out_dir.mkdir(parents=True, exist_ok=True)
        stem = affils_output_path.stem
        suffix = affils_output_path.suffix or ".csv"
        for yr in sorted(aff_year_counts['year'].unique()):
            yr_df = aff_year_counts[aff_year_counts['year'] == yr]
            yr_path = out_dir / f"{stem}.year={yr}{suffix}"
            yr_df.to_csv(yr_path, index=False, encoding="utf-8-sig")
            print(f"→ Wrote {len(yr_df)} rows for year {yr} to {yr_path}")
| 159 | + |
# Usage examples
#
# All countries in output (missing → 'unknown'), consolidated CSV only:
# generate_top_affiliations_with_country(
#     "papers.csv", "out/affiliations_top.csv", min_total_papers=20
# )
#
# Only the United States (others excluded), plus per-year files:
# generate_top_affiliations_with_country(
#     "papers.csv", "out/affiliations_top.csv",
#     min_total_papers=10,
#     country_filter="United States",
#     partition_by_year=True,
#     per_year_output_dir="out/by_year"
# )
#
# Only entries whose country was missing in the source (now labeled 'unknown'):
# generate_top_affiliations_with_country(
#     "papers.csv", "out/affiliations_unknown.csv",
#     min_total_papers=5,
#     country_filter="unknown"
# )
0 commit comments