Skip to content

Commit b8dd18d

Browse files
author
Ryan Calvin Barron
committed
NER entities to termite KG injection
1 parent 5948485 commit b8dd18d

15 files changed

+1726
-1767
lines changed

TELF/pipeline/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,6 @@
7070

7171
from .blocks.collect_hnmfk_leaf_block import CollectHNMFkLeafBlock
7272
from .blocks.termite_neo4j_block import TermiteNeo4jBlock
73-
from .blocks.termite_vector_block import TermiteVectorBlock
73+
from .blocks.termite_vector_block import TermiteVectorBlock
74+
75+
from .blocks.author_affiliation_tables import AffiliationsAndAuthorsBlock

TELF/pipeline/blocks/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,4 @@
6363
from .collect_hnmfk_leaf_block import CollectHNMFkLeafBlock
6464
from .termite_neo4j_block import TermiteNeo4jBlock
6565
from .termite_vector_block import TermiteVectorBlock
66+
from .author_affiliation_tables import AffiliationsAndAuthorsBlock

TELF/pipeline/blocks/artic_fox_block.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def run(self, bundle: DataBundle) -> None:
6868
model.load_model()
6969

7070
pipeline = ArcticFox(model=model, **self.init_settings)
71-
# pipeline.run_full_pipeline(data_df=df, vocab=vocabulary, **self.call_settings)
71+
pipeline.run_full_pipeline(data_df=df, vocab=vocabulary, **self.call_settings)
7272

7373
status_value = "Done"
7474

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# blocks/affiliations_and_authors_block.py
2+
from __future__ import annotations
3+
4+
from pathlib import Path
5+
from typing import Dict, Sequence, Any, Optional, List, Tuple, Union
6+
7+
import pandas as pd
8+
9+
from .base_block import AnimalBlock
10+
from .data_bundle import DataBundle, SAVE_DIR_BUNDLE_KEY
11+
12+
# your helpers live in blocks/block_helpers/
13+
from .block_helpers.affiliation_partition import generate_top_affiliations_with_country
14+
from .block_helpers.author_partition import write_top_authors_by_cluster
15+
16+
17+
class AffiliationsAndAuthorsBlock(AnimalBlock):
    """
    Compute (affiliation, country, year) paper counts and top authors by cluster.

    ─────────────────────────────────────────────────────────────
    always needs : ('df',) – accepts a CSV path OR a pandas.DataFrame
    provides     : ('affiliations_df', 'affiliations_csv',
                    'authors_df', 'authors_csv')
    tag          : 'AffilsAndAuthors' (namespace for its outputs)

    Results are written under <bundle[SAVE_DIR_BUNDLE_KEY]>/<tag>/ .
    Checkpoints persist the two CSV paths so the block can be skipped on re-run.
    """

    # Canonical bundle key this block consumes; callers may override via `needs`.
    CANONICAL_NEEDS = ("df",)

    def __init__(
        self,
        *,
        needs: Sequence[str] = CANONICAL_NEEDS,
        provides: Sequence[str] = ("affiliations_df", "affiliations_csv",
                                   "authors_df", "authors_csv"),
        # Persist only the CSVs; DataFrames are rebuilt on load if needed.
        checkpoint_keys: Sequence[str] = ("affiliations_csv", "authors_csv"),
        conditional_needs: Sequence[tuple[str, Any]] = (),
        tag: str = "AffilsAndAuthors",
        # Defaults mirror the helper signatures in block_helpers/.
        init_settings: Dict[str, Any] | None = None,
        call_settings: Dict[str, Any] | None = None,
        **kw: Any,
    ) -> None:
        """
        Configure the block and register its needs/provides with AnimalBlock.

        Parameters
        ----------
        needs : bundle keys read by run(); defaults to ('df',).
        provides : bundle keys written by run() (namespaced under `tag`).
        checkpoint_keys : subset of provides persisted for re-run skipping.
        tag : namespace for outputs and the on-disk subdirectory name.
        init_settings / call_settings : user overrides merged over the
            defaults below via the base class's `_merge`.
            NOTE(review): `_merge` is assumed to give caller-supplied values
            precedence over defaults — confirm against AnimalBlock.
        """
        default_init: Dict[str, Any] = {}

        default_call: Dict[str, Any] = {
            # generate_top_affiliations_with_country(...)
            "min_total_papers": 20,
            "country_filter": None,       # exact match (including 'unknown') or None
            "partition_by_year": False,
            "per_year_output_dir": None,  # if None and partition_by_year=True → use <tag>/by_year

            # write_top_authors_by_cluster(...)
            "countries": None,            # list[str] or None
            "top_n": 10,
        }

        super().__init__(
            needs=needs,
            provides=provides,
            conditional_needs=list(conditional_needs or []),
            checkpoint_keys=checkpoint_keys,
            tag=tag,
            init_settings=self._merge(default_init, init_settings),
            call_settings=self._merge(default_call, call_settings),
            **kw,
        )

    # ─────────────────────────────────────────────────────────────
    # helpers
    # ─────────────────────────────────────────────────────────────
    def _ensure_input_csv(self, bundle: DataBundle) -> Path:
        """
        Accepts either a DataFrame or a path in bundle['df'].
        If a DataFrame, persist it to <save_dir>/<tag>/input.csv and return that path.
        """
        src = bundle[self.needs[0]]
        save_dir = Path(bundle[SAVE_DIR_BUNDLE_KEY]) / self.tag
        save_dir.mkdir(parents=True, exist_ok=True)

        if isinstance(src, pd.DataFrame):
            inp = save_dir / "input.csv"
            # utf-8-sig keeps the CSV readable in Excel (BOM prefix).
            src.to_csv(inp, index=False, encoding="utf-8-sig")
            return inp

        # Not a DataFrame: treat as a path. Let AnimalBlock's path rewriter
        # handle legacy numbered dirs.
        return Path(src)

    # ─────────────────────────────────────────────────────────────
    # work
    # ─────────────────────────────────────────────────────────────
    def run(self, bundle: DataBundle) -> None:
        """
        Produce the two output CSVs under <save_dir>/<tag>/ and publish
        both the CSV paths and the loaded DataFrames into the bundle
        (keys namespaced as '<tag>.<name>'). Registers the CSV paths as
        checkpoints so the block can be skipped on re-run.
        """
        df_path = self._ensure_input_csv(bundle)

        out_dir = Path(bundle[SAVE_DIR_BUNDLE_KEY]) / self.tag
        out_dir.mkdir(parents=True, exist_ok=True)

        # === 1) Affiliations with country (and per-year optional) ===
        affils_csv = out_dir / "affiliations_top.csv"
        per_year_dir = self.call_settings.get("per_year_output_dir")
        # Default per-year output location when partitioning is requested
        # but no explicit directory was configured.
        if self.call_settings.get("partition_by_year") and not per_year_dir:
            per_year_dir = out_dir / "by_year"

        generate_top_affiliations_with_country(
            df_path=df_path,
            affils_output_path=affils_csv,
            min_total_papers=int(self.call_settings["min_total_papers"]),
            country_filter=self.call_settings.get("country_filter"),
            partition_by_year=bool(self.call_settings.get("partition_by_year")),
            per_year_output_dir=per_year_dir,
        )

        # Read the CSV back for the bundle; fall back to an empty frame with
        # the expected schema if the helper produced no file.
        aff_df = pd.read_csv(affils_csv) if affils_csv.is_file() else pd.DataFrame(
            columns=["affiliation_name", "country", "year", "paper_count"]
        )

        # register checkpoints / provide
        self.register_checkpoint("affiliations_csv", affils_csv)
        bundle[f"{self.tag}.affiliations_csv"] = str(affils_csv)
        bundle[f"{self.tag}.affiliations_df"] = aff_df

        # === 2) Top authors by cluster =============================
        authors_csv = out_dir / "top_authors_by_cluster.csv"
        # Note: helper param name is COUNTY_NAMES (kept as-is to match the
        # helper's signature; presumably a typo for COUNTRY_NAMES upstream).
        result_df = write_top_authors_by_cluster(
            df_path=str(df_path),
            output_path=str(authors_csv),
            COUNTY_NAMES=self.call_settings.get("countries"),
            top_n=int(self.call_settings.get("top_n", 10)),
        )

        # register checkpoints / provide
        self.register_checkpoint("authors_csv", authors_csv)
        bundle[f"{self.tag}.authors_csv"] = str(authors_csv)
        bundle[f"{self.tag}.authors_df"] = result_df

TELF/pipeline/blocks/block_helpers/__init__.py

Whitespace-only changes.
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
import ast
2+
import json
3+
from pathlib import Path
4+
import pandas as pd
5+
6+
UNKNOWN_COUNTRY = "unknown"
7+
8+
def _parse_affiliations_with_country(raw) -> list[tuple[str, str]]:
9+
"""
10+
Parse a JSON/Python-literal dict into [(name, country), …].
11+
Always returns a country (missing ⇒ 'unknown'). Returns [] if unparseable.
12+
Expected shape (examples):
13+
'{"0": {"name": "MIT", "country": "United States"}}'
14+
'{0: {"name": "LANL"}}'
15+
"""
16+
if raw is None:
17+
return []
18+
try:
19+
if pd.isna(raw): # safe even if raw isn't a pandas scalar
20+
return []
21+
except Exception:
22+
pass
23+
24+
if isinstance(raw, dict):
25+
parsed = raw
26+
else:
27+
s = str(raw).strip()
28+
if s in ("", "{}", "[]"):
29+
return []
30+
try:
31+
parsed = json.loads(s)
32+
except Exception:
33+
try:
34+
parsed = ast.literal_eval(s)
35+
except Exception:
36+
return []
37+
38+
if not isinstance(parsed, dict):
39+
return []
40+
41+
out: list[tuple[str, str]] = []
42+
for info in parsed.values():
43+
if not isinstance(info, dict):
44+
continue
45+
name = info.get("name")
46+
if not isinstance(name, str) or not name.strip():
47+
continue
48+
country = info.get("country")
49+
if isinstance(country, str):
50+
country = country.strip() or UNKNOWN_COUNTRY
51+
elif country is None:
52+
country = UNKNOWN_COUNTRY
53+
else:
54+
country = str(country).strip() or UNKNOWN_COUNTRY
55+
out.append((name.strip(), country))
56+
return out
57+
58+
def generate_top_affiliations_with_country(
    df_path: str | Path,
    affils_output_path: str | Path,
    min_total_papers: int = 20,
    country_filter: str | None = None,   # ← filter to exactly this country (or 'unknown')
    partition_by_year: bool = False,     # ← also emit one CSV per year
    per_year_output_dir: str | Path | None = None,
):
    """
    Compute per-(affiliation, country, year) paper counts from a papers CSV.

    Reads ``df_path`` (must have 'year' and 'affiliations' columns), keeps
    affiliations whose total paper count (within the current filter scope)
    is >= ``min_total_papers``, and writes the consolidated counts to
    ``affils_output_path``. Every row carries a 'country' value; entries
    missing a country in the source are labeled 'unknown'.

    Parameters
    ----------
    df_path : input CSV path.
    affils_output_path : output CSV path ('.csv' appended if no suffix).
    min_total_papers : minimum total papers for an affiliation to be kept.
    country_filter : exact country string to restrict to (including
        'unknown'), or None for all countries.
    partition_by_year : if True, also write one CSV per year.
    per_year_output_dir : directory for per-year files; defaults to the
        consolidated CSV's parent directory.

    Raises
    ------
    KeyError : if the 'year' or 'affiliations' column is missing.
    """
    df = pd.read_csv(df_path)

    if 'year' not in df.columns:
        raise KeyError("Expected a 'year' column.")
    # Coerce to numeric years and drop rows whose year is unparseable.
    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    df = df[df['year'].notna()].copy()
    df['year'] = df['year'].astype(int)

    if 'affiliations' not in df.columns:
        raise KeyError("Expected an 'affiliations' column.")

    # 1) Parse affiliations → list of (name, country or 'unknown')
    df_aff = df.copy()
    df_aff['affil_tuples'] = df_aff['affiliations'].apply(_parse_affiliations_with_country)

    # 2) Explode into one row per (paper, affiliation_name, country)
    exploded_aff = df_aff.explode('affil_tuples')
    exploded_aff = exploded_aff[exploded_aff['affil_tuples'].notna()].copy()
    if exploded_aff.empty:
        # FIX: with zero parsed tuples, building a DataFrame from an empty
        # list yields 0 columns and the two-column assignment below raises
        # "Columns must be same length as key". Create the columns
        # explicitly so the empty case flows through and still writes an
        # (empty) output CSV.
        exploded_aff['affiliation_name'] = pd.Series(dtype=object)
        exploded_aff['country'] = pd.Series(dtype=object)
    else:
        exploded_aff[['affiliation_name', 'country']] = pd.DataFrame(
            exploded_aff['affil_tuples'].tolist(), index=exploded_aff.index
        )
    # Defensive fill (should already be set by parser)
    exploded_aff['country'] = exploded_aff['country'].fillna(UNKNOWN_COUNTRY)

    # 3) Optional: restrict to one specific country
    if country_filter is not None:
        exploded_aff = exploded_aff[exploded_aff['country'] == country_filter].copy()

    # 4) Totals per affiliation (within current filter scope)
    total_per_aff = (
        exploded_aff
        .groupby(['affiliation_name', 'country'])
        .size()
        .reset_index(name='total_papers')
        .sort_values('total_papers', ascending=False)
    )

    print("=== Affiliations",
          f"in [{country_filter}]" if country_filter is not None else "(all countries)",
          "with their total paper counts ===")
    print(total_per_aff.head(20).to_string(index=False))
    print("───────────────────────────────────────────────────────────────────────────\n")

    # 5) Keep affiliations with ≥ min_total_papers
    top_affils = total_per_aff[total_per_aff['total_papers'] >= min_total_papers][
        ['affiliation_name', 'country']
    ]

    if top_affils.empty:
        print(f"No affiliation{' in ' + country_filter if country_filter else ''} "
              f"meets ≥ {min_total_papers} total papers.")
        aff_year_counts = pd.DataFrame(columns=['affiliation_name','country','year','paper_count'])
    else:
        # 6) Per-year counts for top affiliations
        exploded_aff_top = exploded_aff.merge(top_affils, on=['affiliation_name','country'], how='inner')
        aff_year_counts = (
            exploded_aff_top
            .groupby(['affiliation_name','country','year'])
            .size()
            .reset_index(name='paper_count')
            .sort_values(['affiliation_name','country','year'])
            .reset_index(drop=True)
        )

    # 7) Write consolidated CSV (utf-8-sig keeps it Excel-friendly)
    affils_output_path = Path(affils_output_path)
    affils_output_path.parent.mkdir(parents=True, exist_ok=True)
    if affils_output_path.suffix == "":
        affils_output_path = affils_output_path.with_suffix(".csv")
    aff_year_counts.to_csv(affils_output_path, index=False, encoding="utf-8-sig")

    print(f"Wrote {len(aff_year_counts)} rows to {affils_output_path} "
          f"(≥ {min_total_papers} papers"
          f"{', country=' + country_filter if country_filter is not None else ', all countries'})")

    # 8) Optional: one file per year (same columns)
    if partition_by_year and not aff_year_counts.empty:
        out_dir = Path(per_year_output_dir) if per_year_output_dir else affils_output_path.parent
        out_dir.mkdir(parents=True, exist_ok=True)
        stem = affils_output_path.stem
        suffix = affils_output_path.suffix or ".csv"
        for yr in sorted(aff_year_counts['year'].unique()):
            yr_df = aff_year_counts[aff_year_counts['year'] == yr]
            yr_path = out_dir / f"{stem}.year={yr}{suffix}"
            yr_df.to_csv(yr_path, index=False, encoding="utf-8-sig")
            print(f"→ Wrote {len(yr_df)} rows for year {yr} to {yr_path}")
159+
160+
# # All countries in output (missing → 'unknown'), consolidated CSV only
161+
# generate_top_affiliations_with_country(
162+
# "papers.csv", "out/affiliations_top.csv", min_total_papers=20
163+
# )
164+
165+
# # Only the United States (others excluded), plus per-year files
166+
# generate_top_affiliations_with_country(
167+
# "papers.csv", "out/affiliations_top.csv",
168+
# min_total_papers=10,
169+
# country_filter="United States",
170+
# partition_by_year=True,
171+
# per_year_output_dir="out/by_year"
172+
# )
173+
174+
# # Only entries whose country was missing in the source (now labeled 'unknown')
175+
# generate_top_affiliations_with_country(
176+
# "papers.csv", "out/affiliations_unknown.csv",
177+
# min_total_papers=5,
178+
# country_filter="unknown"
179+
# )

0 commit comments

Comments
 (0)