
Commit 9899d14

Merge pull request #189 from lanl/v0.0.41 (V0.0.41)
2 parents: 77dc571 + 88b1145

File tree: 118 files changed, +4853 / -347 lines


CITATION.cff

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-version: 0.0.40
+version: 0.0.41
 message: "If you use this software, please cite it as below."
 authors:
   - family-names: Eren
@@ -20,7 +20,7 @@ authors:
   - family-names: Alexandrov
     given-names: Boian
 title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.40
+version: 0.0.41
 url: https://github.com/lanl/T-ELF
 doi: 10.5281/zenodo.10257897
 date-released: 2023-12-04

README.md

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ python post_install.py # use the following, for example, for GPU system: <python
 | Beaver | :heavy_check_mark: | :heavy_check_mark: | Fast matrix and tensor building tool for text mining | [Link](examples/Beaver) |
 | iPenguin | :heavy_check_mark: | | Online information retrieval tool for Scopus, SemanticScholar, and OSTI | [Link](examples/iPenguin) |
 | Orca | :heavy_check_mark: | | Duplicate author detector for text mining and information retrieval | [Link](examples/Orca) |
+| Squirrel | Dataset pruning tool for documents | [Link](examples/Squirrel) |

 ### TELF.post_processing

TELF/applications/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 sys.path += ["Bunny"]
 sys.path += ["Penguin"]

+
 # Cheetah
 from .Cheetah.cheetah import Cheetah
 from .Cheetah.term_formatter import CheetahTermFormatter, convert_txt_to_cheetah_markdown
@@ -13,4 +14,4 @@
 from .Bunny.auto_bunny import AutoBunny, AutoBunnyStep

 # Penguin
-from .Penguin.penguin import Penguin
+from .Penguin.penguin import Penguin

TELF/helpers/embeddings.py

Lines changed: 1 addition & 1 deletion
@@ -88,4 +88,4 @@ def compute_centroids(embeddings_dict, df):
             centroids[cluster_id] = np.mean(embs, axis=0)
         else:
             centroids[cluster_id] = None
-    return centroids
+    return centroids
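
For clarity, the pattern visible in this fragment is a per-cluster mean of embedding vectors, with None for empty clusters. A minimal stand-alone illustration of that pattern (not the TELF API itself; the data and variable names below are made up):

# Toy illustration of the centroid pattern above; inputs are fabricated for the example.
import numpy as np

cluster_embeddings = {0: [np.array([1.0, 0.0]), np.array([0.0, 1.0])], 1: []}
centroids = {}
for cluster_id, embs in cluster_embeddings.items():
    centroids[cluster_id] = np.mean(embs, axis=0) if embs else None
print(centroids)  # {0: array([0.5, 0.5]), 1: None}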

TELF/helpers/figures.py

Lines changed: 66 additions & 1 deletion
@@ -9,11 +9,19 @@
 from wordcloud import WordCloud
 import random
 import os
+from pathlib import Path
+from typing import Optional
+import matplotlib.pyplot as plt
+from matplotlib.cm import get_cmap
+from matplotlib.colors import to_hex
+from scipy.spatial import ConvexHull
+import logging
+log = logging.getLogger(__name__)
 from .file_system import check_path
 from .data_structures import sum_dicts
 from .maps import get_id_to_name
 from .graphs import create_authors_graph
-import math
+

 def plot_authors_graph(df, id_col='s2_author_ids', name_col='s2_authors', title='Co-Authors Graph',
                        width=900, height=900, max_node_size=50, min_node_size=3):
@@ -478,3 +486,60 @@ def plot_H_clustering(H, name="filename"):
     plt.close()

     return fig
+
+def plot_umap(
+    coords: np.ndarray,
+    labels: list,
+    output_path: Path,
+    label_column: str,
+    model_name: str,
+    accepted_mask: Optional[np.ndarray] = None
+) -> None:
+    """
+    Save a UMAP scatterplot with optional accepted-hull overlay.
+
+    Parameters
+    ----------
+    coords : np.ndarray
+        2D UMAP coordinates (n_samples, 2).
+    labels : list
+        Original labels corresponding to each coordinate.
+    output_path : Path
+        Filepath to save the resulting plot.
+    label_column : str
+        Name of the label column for legend entries.
+    model_name : str
+        Embedding model identifier for plot title.
+    accepted_mask : Optional[np.ndarray]
+        Boolean mask for accepted points; if provided, draws convex hull.
+    """
+    uniq = sorted(set(labels))
+    color_map = {v: to_hex(get_cmap("tab20")(i % 20)) for i, v in enumerate(uniq)}
+
+    fig, ax = plt.subplots(figsize=(6, 6))
+    ax.scatter(coords[:, 0], coords[:, 1],
+               c=[color_map[v] for v in labels],
+               s=25, alpha=0.85)
+
+    if accepted_mask is not None:
+        accepted = coords[accepted_mask]
+        if accepted.shape[0] >= 3:
+            hull = ConvexHull(accepted)
+            verts = accepted[hull.vertices]
+            verts = np.vstack([verts, verts[0]])
+            ax.fill(verts[:, 0], verts[:, 1],
+                    facecolor="none", edgecolor="green", lw=2, alpha=0.8,
+                    label="accepted hull")
+
+    handles = [plt.Line2D([], [], marker="o", ls="", color=color_map[v]) for v in uniq]
+    labels_legend = [f"{label_column}={v}" for v in uniq]
+    if accepted_mask is not None:
+        handles.append(plt.Line2D([], [], color="green", lw=2))
+        labels_legend.append("accepted hull")
+
+    ax.legend(handles, labels_legend, fontsize=8, loc="upper right")
+    ax.set(xticks=[], yticks=[], title=f"UMAP – {model_name} embeddings")
+    fig.tight_layout()
+    fig.savefig(output_path, dpi=300)
+    plt.close(fig)
+    log.info("Saved UMAP plot to %s", output_path)
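
For orientation, a minimal usage sketch of the new plot_umap helper with synthetic 2D coordinates. The import path follows the file location above; the output file, label column, and model name are hypothetical placeholders, not values from this commit:

# Assumed usage sketch for plot_umap (synthetic data; names are placeholders).
import numpy as np
from pathlib import Path
from TELF.helpers.figures import plot_umap

coords = np.random.rand(100, 2)                      # stand-in for real UMAP output
labels = ["kept" if i % 2 else "pruned" for i in range(100)]
accepted = np.array([lab == "kept" for lab in labels])

plot_umap(
    coords=coords,
    labels=labels,
    output_path=Path("umap_example.png"),            # hypothetical output file
    label_column="status",                           # hypothetical label column name
    model_name="all-MiniLM-L6-v2",                   # hypothetical embedding model id
    accepted_mask=accepted,                          # >= 3 accepted points, so the hull is drawn
)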

TELF/helpers/llm.py

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+import json
+import re
+import logging
+from langchain_ollama import OllamaLLM
+import subprocess
+
+log = logging.getLogger(__name__)
+from typing import Iterable
+
+def build_json_vote_prompt(candidate: str, contexts: Iterable[str]) -> str:
+    """
+    Build a JSON-only prompt from example contexts and a candidate string.
+    """
+    ctx_block = "\n----\n".join(contexts)
+    return (
+        "You are an expert researcher. Output ONLY valid JSON.\n"
+        f"Target context examples:\n{ctx_block}\n\n"
+        f"Candidate abstract:\n{candidate}\n"
+        "Given the context, is the candidate about any of the concepts? "
+        'Respond {"answer":"yes|no","reason":"..."}'
+    )
+
+
+def get_ollama_llm(model: str, base_url: str, temperature: float) -> OllamaLLM:
+    """
+    Create and return a configured OllamaLLM instance.
+    If `model` isn't yet pulled locally, this will shell out to:
+        ollama pull <model>
+    """
+    try:
+        available = subprocess.check_output(
+            ["ollama", "list"], text=True, stderr=subprocess.DEVNULL
+        )
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"Could not list Ollama models: {e}")
+
+    if model not in available:
+        try:
+            print(f"Model '{model}' not found locally – pulling…")
+            subprocess.run(
+                ["ollama", "pull", model],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Failed to pull Ollama model '{model}': {e.stderr.strip()}")
+
+    return OllamaLLM(model=model, base_url=base_url, temperature=temperature)
+
+
+def vote_once(llm: OllamaLLM, prompt: str) -> tuple[bool, str]:
+    """
+    Invoke the given OllamaLLM instance once, strip markdown fences,
+    parse its JSON response, and return (yes_flag, reason).
+    """
+    raw = llm.invoke(prompt)
+    txt = re.sub(r"```(?:json)?", "", raw).strip()
+    try:
+        obj = json.loads(txt)
+        yes = obj.get("answer", "").lower() == "yes"
+        reason = str(obj.get("reason", "")).strip()
+        return yes, reason
+    except Exception:
+        log.warning("Bad JSON from LLM: %s", raw)
+        return False, ""
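
Taken together, these helpers form a simple yes/no voting step against a local Ollama model. A hedged usage sketch, where the model tag, base URL, and texts are placeholders and the import path follows the file location above:

# Assumed usage sketch for the new llm.py helpers (placeholder model, URL, and texts).
from TELF.helpers.llm import build_json_vote_prompt, get_ollama_llm, vote_once

llm = get_ollama_llm(
    model="llama3",                      # hypothetical model tag
    base_url="http://localhost:11434",   # default local Ollama endpoint
    temperature=0.0,
)

contexts = [
    "Abstract about tensor factorization for text mining.",
    "Abstract about topic modeling of scientific papers.",
]
candidate = "We propose a nonnegative tensor decomposition for document clustering."

prompt = build_json_vote_prompt(candidate, contexts)
keep, reason = vote_once(llm, prompt)    # (bool, str) parsed from the model's JSON reply
print(keep, reason)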

TELF/helpers/ml.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+import umap
+import numpy as np
+
+def get_umap_reducer(
+    n_neighbors: int,
+    min_dist: float,
+    metric: str = "cosine",
+    random_state: int = 42
+) -> umap.UMAP:
+    """
+    Initialize and return a UMAP reducer with specified parameters.
+    """
+    return umap.UMAP(
+        n_neighbors=n_neighbors,
+        min_dist=min_dist,
+        metric=metric,
+        random_state=random_state
+    )
+
+
+def compute_umap(
+    reducer: umap.UMAP,
+    embeddings: np.ndarray
+) -> np.ndarray:
+    """
+    Project high-dimensional embeddings to 2D using the provided UMAP reducer.
+
+    Returns
+    -------
+    np.ndarray
+        2D coordinates for each sample.
+    """
+    return reducer.fit_transform(embeddings)
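
A short assumed example of how these two helpers compose; the embedding matrix is random stand-in data and the import path follows the file location above:

# Assumed usage sketch: random vectors stand in for real document embeddings.
import numpy as np
from TELF.helpers.ml import get_umap_reducer, compute_umap

embeddings = np.random.rand(200, 384)        # e.g. 384-dimensional sentence embeddings
reducer = get_umap_reducer(n_neighbors=15, min_dist=0.1)
coords = compute_umap(reducer, embeddings)   # (200, 2) array of 2D coordinates
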
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .squirrel import Squirrel

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+from .embed_prune import EmbeddingPruner
+from .llm_prune import LLMPruner

0 commit comments
