
Commit 9899d14

Merge pull request #189 from lanl/v0.0.41 (V0.0.41)
2 parents: 77dc571 + 88b1145

File tree: 118 files changed, +4853 / -347 lines


CITATION.cff

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-version: 0.0.40
+version: 0.0.41
 message: "If you use this software, please cite it as below."
 authors:
   - family-names: Eren
@@ -20,7 +20,7 @@ authors:
   - family-names: Alexandrov
     given-names: Boian
 title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.40
+version: 0.0.41
 url: https://github.com/lanl/T-ELF
 doi: 10.5281/zenodo.10257897
 date-released: 2023-12-04

README.md

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ python post_install.py # use the following, for example, for GPU system: <python
 | Beaver | :heavy_check_mark: | :heavy_check_mark: | Fast matrix and tensor building tool for text mining | [Link](examples/Beaver) |
 | iPenguin | :heavy_check_mark: | | Online information retrieval tool for Scopus, SemanticScholar, and OSTI | [Link](examples/iPenguin) |
 | Orca | :heavy_check_mark: | | Duplicate author detector for text mining and information retrieval | [Link](examples/Orca) |
+| Squirrel | Dataset pruning tool for documents | [Link](examples/Squirrel) |

 ### TELF.post_processing

TELF/applications/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 sys.path += ["Bunny"]
 sys.path += ["Penguin"]

+
 # Cheetah
 from .Cheetah.cheetah import Cheetah
 from .Cheetah.term_formatter import CheetahTermFormatter, convert_txt_to_cheetah_markdown
@@ -13,4 +14,4 @@
 from .Bunny.auto_bunny import AutoBunny, AutoBunnyStep

 # Penguin
-from .Penguin.penguin import Penguin
+from .Penguin.penguin import Penguin

TELF/helpers/embeddings.py

Lines changed: 1 addition & 1 deletion
@@ -88,4 +88,4 @@ def compute_centroids(embeddings_dict, df):
             centroids[cluster_id] = np.mean(embs, axis=0)
         else:
             centroids[cluster_id] = None
-    return centroids
+    return centroids
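
For clarity, the pattern visible in this fragment is a per-cluster mean of embedding vectors, with None for empty clusters. A minimal stand-alone illustration of that pattern (not the TELF API itself; the data and variable names below are made up):

# Toy illustration of the centroid pattern above; inputs are fabricated for the example.
import numpy as np

cluster_embeddings = {0: [np.array([1.0, 0.0]), np.array([0.0, 1.0])], 1: []}
centroids = {}
for cluster_id, embs in cluster_embeddings.items():
    centroids[cluster_id] = np.mean(embs, axis=0) if embs else None
print(centroids)  # {0: array([0.5, 0.5]), 1: None}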

TELF/helpers/figures.py

Lines changed: 66 additions & 1 deletion
@@ -9,11 +9,19 @@
 from wordcloud import WordCloud
 import random
 import os
+from pathlib import Path
+from typing import Optional
+import matplotlib.pyplot as plt
+from matplotlib.cm import get_cmap
+from matplotlib.colors import to_hex
+from scipy.spatial import ConvexHull
+import logging
+log = logging.getLogger(__name__)
 from .file_system import check_path
 from .data_structures import sum_dicts
 from .maps import get_id_to_name
 from .graphs import create_authors_graph
-import math
+

 def plot_authors_graph(df, id_col='s2_author_ids', name_col='s2_authors', title='Co-Authors Graph',
                        width=900, height=900, max_node_size=50, min_node_size=3):
@@ -478,3 +486,60 @@ def plot_H_clustering(H, name="filename"):
     plt.close()

     return fig
+
+def plot_umap(
+    coords: np.ndarray,
+    labels: list,
+    output_path: Path,
+    label_column: str,
+    model_name: str,
+    accepted_mask: Optional[np.ndarray] = None
+) -> None:
+    """
+    Save a UMAP scatterplot with optional accepted-hull overlay.
+
+    Parameters
+    ----------
+    coords : np.ndarray
+        2D UMAP coordinates (n_samples, 2).
+    labels : list
+        Original labels corresponding to each coordinate.
+    output_path : Path
+        Filepath to save the resulting plot.
+    label_column : str
+        Name of the label column for legend entries.
+    model_name : str
+        Embedding model identifier for plot title.
+    accepted_mask : Optional[np.ndarray]
+        Boolean mask for accepted points; if provided, draws convex hull.
+    """
+    uniq = sorted(set(labels))
+    color_map = {v: to_hex(get_cmap("tab20")(i % 20)) for i, v in enumerate(uniq)}
+
+    fig, ax = plt.subplots(figsize=(6, 6))
+    ax.scatter(coords[:, 0], coords[:, 1],
+               c=[color_map[v] for v in labels],
+               s=25, alpha=0.85)
+
+    if accepted_mask is not None:
+        accepted = coords[accepted_mask]
+        if accepted.shape[0] >= 3:
+            hull = ConvexHull(accepted)
+            verts = accepted[hull.vertices]
+            verts = np.vstack([verts, verts[0]])
+            ax.fill(verts[:, 0], verts[:, 1],
+                    facecolor="none", edgecolor="green", lw=2, alpha=0.8,
+                    label="accepted hull")
+
+    handles = [plt.Line2D([], [], marker="o", ls="", color=color_map[v]) for v in uniq]
+    labels_legend = [f"{label_column}={v}" for v in uniq]
+    if accepted_mask is not None:
+        handles.append(plt.Line2D([], [], color="green", lw=2))
+        labels_legend.append("accepted hull")
+
+    ax.legend(handles, labels_legend, fontsize=8, loc="upper right")
+    ax.set(xticks=[], yticks=[], title=f"UMAP – {model_name} embeddings")
+    fig.tight_layout()
+    fig.savefig(output_path, dpi=300)
+    plt.close(fig)
+    log.info("Saved UMAP plot to %s", output_path)
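
For orientation, a minimal usage sketch of the new plot_umap helper with synthetic 2D coordinates. The import path follows the file location above; the output file, label column, and model name are hypothetical placeholders, not values from this commit:

# Assumed usage sketch for plot_umap (synthetic data; names are placeholders).
import numpy as np
from pathlib import Path
from TELF.helpers.figures import plot_umap

coords = np.random.rand(100, 2)                      # stand-in for real UMAP output
labels = ["kept" if i % 2 else "pruned" for i in range(100)]
accepted = np.array([lab == "kept" for lab in labels])

plot_umap(
    coords=coords,
    labels=labels,
    output_path=Path("umap_example.png"),            # hypothetical output file
    label_column="status",                           # hypothetical label column name
    model_name="all-MiniLM-L6-v2",                   # hypothetical embedding model id
    accepted_mask=accepted,                          # >= 3 accepted points, so the hull is drawn
)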

TELF/helpers/llm.py

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+import json
+import re
+import logging
+from langchain_ollama import OllamaLLM
+import subprocess
+
+log = logging.getLogger(__name__)
+from typing import Iterable
+
+def build_json_vote_prompt(candidate: str, contexts: Iterable[str]) -> str:
+    """
+    Build a JSON-only prompt from example contexts and a candidate string.
+    """
+    ctx_block = "\n----\n".join(contexts)
+    return (
+        "You are an expert researcher. Output ONLY valid JSON.\n"
+        f"Target context examples:\n{ctx_block}\n\n"
+        f"Candidate abstract:\n{candidate}\n"
+        "Given the context, is the candidate about any of the concepts? "
+        'Respond {"answer":"yes|no","reason":"..."}'
+    )
+
+
+def get_ollama_llm(model: str, base_url: str, temperature: float) -> OllamaLLM:
+    """
+    Create and return a configured OllamaLLM instance.
+    If `model` isn't yet pulled locally, this will shell out to:
+        ollama pull <model>
+    """
+    try:
+        available = subprocess.check_output(
+            ["ollama", "list"], text=True, stderr=subprocess.DEVNULL
+        )
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"Could not list Ollama models: {e}")
+
+    if model not in available:
+        try:
+            print(f"Model '{model}' not found locally – pulling…")
+            subprocess.run(
+                ["ollama", "pull", model],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Failed to pull Ollama model '{model}': {e.stderr.strip()}")
+
+    return OllamaLLM(model=model, base_url=base_url, temperature=temperature)
+
+
+def vote_once(llm: OllamaLLM, prompt: str) -> tuple[bool, str]:
+    """
+    Invoke the given OllamaLLM instance once, strip markdown fences,
+    parse its JSON response, and return (yes_flag, reason).
+    """
+    raw = llm.invoke(prompt)
+    txt = re.sub(r"```(?:json)?", "", raw).strip()
+    try:
+        obj = json.loads(txt)
+        yes = obj.get("answer", "").lower() == "yes"
+        reason = str(obj.get("reason", "")).strip()
+        return yes, reason
+    except Exception:
+        log.warning("Bad JSON from LLM: %s", raw)
+        return False, ""
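
Taken together, these helpers form a simple yes/no voting step against a local Ollama model. A hedged usage sketch, where the model tag, base URL, and texts are placeholders and the import path follows the file location above:

# Assumed usage sketch for the new llm.py helpers (placeholder model, URL, and texts).
from TELF.helpers.llm import build_json_vote_prompt, get_ollama_llm, vote_once

llm = get_ollama_llm(
    model="llama3",                      # hypothetical model tag
    base_url="http://localhost:11434",   # default local Ollama endpoint
    temperature=0.0,
)

contexts = [
    "Abstract about tensor factorization for text mining.",
    "Abstract about topic modeling of scientific papers.",
]
candidate = "We propose a nonnegative tensor decomposition for document clustering."

prompt = build_json_vote_prompt(candidate, contexts)
keep, reason = vote_once(llm, prompt)    # (bool, str) parsed from the model's JSON reply
print(keep, reason)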

TELF/helpers/ml.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+import umap
+import numpy as np
+
+def get_umap_reducer(
+    n_neighbors: int,
+    min_dist: float,
+    metric: str = "cosine",
+    random_state: int = 42
+) -> umap.UMAP:
+    """
+    Initialize and return a UMAP reducer with specified parameters.
+    """
+    return umap.UMAP(
+        n_neighbors=n_neighbors,
+        min_dist=min_dist,
+        metric=metric,
+        random_state=random_state
+    )
+
+
+def compute_umap(
+    reducer: umap.UMAP,
+    embeddings: np.ndarray
+) -> np.ndarray:
+    """
+    Project high-dimensional embeddings to 2D using the provided UMAP reducer.
+
+    Returns
+    -------
+    np.ndarray
+        2D coordinates for each sample.
+    """
+    return reducer.fit_transform(embeddings)
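
A short assumed example of how these two helpers compose; the embedding matrix is random stand-in data and the import path follows the file location above:

# Assumed usage sketch: random vectors stand in for real document embeddings.
import numpy as np
from TELF.helpers.ml import get_umap_reducer, compute_umap

embeddings = np.random.rand(200, 384)        # e.g. 384-dimensional sentence embeddings
reducer = get_umap_reducer(n_neighbors=15, min_dist=0.1)
coords = compute_umap(reducer, embeddings)   # (200, 2) array of 2D coordinates
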
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .squirrel import Squirrel

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+from .embed_prune import EmbeddingPruner
+from .llm_prune import LLMPruner

0 commit comments
