
Commit 0dcae74

Merge pull request #197 from lanl/develop
spacey Ner block, hnmfk block error stack trace, termite package
2 parents 6e7c3e4 + b715690 commit 0dcae74

File tree

263 files changed: +17481, −7478 lines

Note: large commits have some content hidden by default, so not every changed file appears in full below.


.gitignore

Lines changed: 4 additions & 1 deletion
```diff
@@ -1,3 +1,5 @@
+*.bak
+projects
 example_out
 example_results
 hidden_keys.py
@@ -15,7 +17,8 @@ EXAMPLE_OUT
 result_example/
 search_terms.md
 scopus_cache/
-
+examples/Termite/01_termite_output
+examples/Termite/DOCKERDATA/
 # mac
 results/
 poetry.lock
```

CITATION.cff

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,4 +1,4 @@
-version: 0.0.43
+version: 0.0.44
 message: "If you use this software, please cite it as below."
 authors:
 - family-names: Eren
@@ -20,7 +20,7 @@ authors:
 - family-names: Alexandrov
   given-names: Boian
 title: "Tensor Extraction of Latent Features (T-ELF)"
-version: 0.0.43
+version: 0.0.44
 url: https://github.com/lanl/T-ELF
 doi: 10.5281/zenodo.10257897
 date-released: 2023-12-04
```

README.md

Lines changed: 5 additions & 5 deletions
````diff
@@ -58,11 +58,11 @@ conda develop .
 Next, we need to install the optional and additional dependencies. These include optional dependencies for GPU and HPC capabilities, as well as required dependencies like the SpaCy language models.
 To view all available options, please run:
 ```shell
-python post_install.py --help
+telf-post-install --help
 ```
 Install the additional dependencies:
 ```shell
-python post_install.py # use the following, for example, for GPU system: <python post_install.py --gpu>
+telf-post-install # use the following, for example, for GPU system: < telf-post-install --gpu>
 ```
 
 #### Jupyter Setup Tutorial for using the examples ([Link](https://www.maksimeren.com/post/conda-and-jupyter-setup-for-research/))
@@ -129,7 +129,7 @@ python post_install.py # use the following, for example, for GPU system: <python
 | Bunny | Dataset generation tool for documents and their citations/references | [Link](examples/Bunny) |
 | Penguin | Text storage tool | [Link](examples/Penguin) |
 | Lynx | Streamlit UI | [Link](examples/Lynx) |
-| Termite | Knowladge graph building tool | :soon: |
+| Termite | Knowladge graph building tool | [Link](examples/Termite) |
 
 
 ## Use Cases
@@ -228,7 +228,7 @@ Developer test suites are located under [```tests/```](tests/) directory. Tests
 conda create --prefix=<path to your conda environments under projects> python=3.11.10
 source activate <path to your conda environments under projects> # or use conda activate <...>
 pip install .
-python post_install.py --gpu --hpc-conda
+telf-post-install --gpu --hpc-conda
 ```
 
 ### Darwin
@@ -239,5 +239,5 @@ module load miniconda3
 conda create --name TELF python=3.11.10
 conda activate TELF # or <source activate TELF>
 pip install .
-python post_install.py --gpu --hpc
+telf-post-install --gpu --hpc
 ```
````

TELF/applications/Bunny/auto_bunny.py

File mode changed: 100755 → 100644.

TELF/applications/Lynx/frontend/pages/doc_view.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -181,6 +181,7 @@
         for ii in selected_nodes["checked"]:
             directory = st.session_state.data_map[ii]["path"]
             peacock_dir = os.path.join(directory, "peacock")
+            print( peacock_dir )
             files = find_files_by_extensions(peacock_dir, extensions=("html", "png"))
             with st.expander(st.session_state.data_map[ii]["label"]):
                 open_explorer_button(peacock_dir, key=f"button_tab2_{ii}")
```

TELF/applications/Lynx/frontend/pages/helpers/load_project_data.py

Lines changed: 199 additions & 20 deletions
```diff
@@ -9,6 +9,8 @@
 import torch
 import pickle
 import json
+import csv
+from collections import defaultdict
 
 @st.cache_resource
 def load_link_data(path):
@@ -181,36 +183,213 @@ def add_children(node_name):
     tree.append(add_children(node["name"]))
     return tree
 
-def parse_topic_folder_name(folder_name, path=None):
+DEBUG = False  # prints to terminal
+
+def _debug(msg):
+    if DEBUG:
+        print(msg, flush=True)
+
+# Matches cluster_for_k=N*.csv (case-insensitive, with optional extra suffix)
+_K_FILE_PAT = re.compile(r'^cluster_for_k=(\d+)(?:[^/]*)\.csv$', re.IGNORECASE)
+
+# Cache per root path
+_COUNTS_CACHE = {}
+_COUNTS_SRC_CACHE = {}
+
+def _list_topic_dirs(root_path: str):
+    try:
+        dirs = [
+            d for d in os.listdir(root_path)
+            if os.path.isdir(os.path.join(root_path, d))
+        ]
+        # Heuristic: folders that start with digits are topic dirs (e.g., "3-foo", "12", etc.)
+        topic_dirs = [d for d in dirs if re.match(r'^\d+\b', d)]
+        return sorted(topic_dirs)
+    except FileNotFoundError:
+        return []
+
+def _iter_candidate_csvs(root_path: str):
+    """Yield (abs_path, k) for cluster_for_k=*.csv at root and one level below."""
+    if not root_path or not os.path.isdir(root_path):
+        _debug(f"[scan] Not a dir: {root_path}")
+        return
+
+    _debug(f"[scan] Looking for cluster_for_k=*.csv under: {root_path}")
+
+    # Top-level files
+    try:
+        for entry in os.scandir(root_path):
+            if entry.is_file():
+                m = _K_FILE_PAT.match(entry.name)
+                if m:
+                    _debug(f"[scan] Found CSV: {entry.name} (k={m.group(1)})")
+                    yield entry.path, int(m.group(1))
+    except FileNotFoundError:
+        pass
+
+    # One level down (subfolders only)
+    try:
+        for entry in os.scandir(root_path):
+            if entry.is_dir():
+                for sub in os.scandir(entry.path):
+                    if sub.is_file():
+                        m = _K_FILE_PAT.match(sub.name)
+                        if m:
+                            _debug(f"[scan] Found CSV (subdir): {entry.name}/{sub.name} (k={m.group(1)})")
+                            yield sub.path, int(m.group(1))
+    except FileNotFoundError:
+        pass
+
+def _norm_cluster_key(val):
+    """Normalize cluster key to a stringified integer if possible (e.g., '3', 3, '3.0', 'cluster_3')."""
+    if val is None:
+        return None
+    if isinstance(val, (int, float)):
+        try:
+            return str(int(val))
+        except Exception:
+            return str(val).strip()
+    s = str(val).strip()
+    m = re.search(r'(\d+)', s)
+    return m.group(1) if m else s  # fallback to raw string
+
+def _read_counts_map_from_csv(csv_path: str):
+    """Build counts map {cluster_id(str): count(int)} using the 'cluster' column (case-insensitive)."""
+    _debug(f"[counts] Reading CSV: {csv_path}")
+    counts = defaultdict(int)
+    try:
+        with open(csv_path, "r", newline="", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            if not reader.fieldnames:
+                _debug("[counts] No header/fieldnames found.")
+                return {}
+
+            # find 'cluster' column case-insensitively
+            cluster_col = None
+            for name in reader.fieldnames:
+                if name and name.strip().lower() == "cluster":
+                    cluster_col = name
+                    break
+
+            if not cluster_col:
+                _debug(f"[counts] 'cluster' column not found in {reader.fieldnames}")
+                return {}
+
+            row_total = 0
+            for row in reader:
+                row_total += 1
+                key = _norm_cluster_key(row.get(cluster_col))
+                if key is not None and key != "":
+                    counts[key] += 1
+
+            _debug(f"[counts] Total rows read: {row_total}. Unique clusters: {len(counts)}")
+    except Exception as e:
+        _debug(f"[counts] Error reading {csv_path}: {e}")
+        return {}
+    return dict(counts)
+
+def _choose_best_cluster_csv(root_path: str):
+    """Choose the best cluster_for_k=*.csv. Prefer k == number of topic dirs; else largest k."""
+    candidates = list(_iter_candidate_csvs(root_path))
+    if not candidates:
+        _debug("[choose] No cluster_for_k=*.csv candidates found.")
+        return None
+
+    topic_dirs = _list_topic_dirs(root_path)
+    k_target = len(topic_dirs)
+    _debug(f"[choose] Topic dirs detected: {k_target}")
+
+    # Prefer exact match to number of topic dirs
+    exact = [p for p in candidates if p[1] == k_target and k_target > 0]
+    if exact:
+        # If multiple with same k, pick the shortest path (heuristic)
+        chosen = sorted(exact, key=lambda x: (len(x[0]), x[0]))[0]
+        _debug(f"[choose] Using exact k match: {os.path.basename(chosen[0])} (k={chosen[1]})")
+        return chosen[0]
+
+    # Else pick largest k
+    candidates.sort(key=lambda x: x[1], reverse=True)
+    chosen = candidates[0]
+    _debug(f"[choose] Using largest k: {os.path.basename(chosen[0])} (k={chosen[1]})")
+    return chosen[0]
+
+def _get_counts_map(root_path: str):
+    """Get (and cache) the counts map for this root path."""
+    if root_path in _COUNTS_CACHE:
+        return _COUNTS_CACHE[root_path], _COUNTS_SRC_CACHE.get(root_path)
+
+    csv_path = _choose_best_cluster_csv(root_path)
+    if not csv_path:
+        _COUNTS_CACHE[root_path] = {}
+        _COUNTS_SRC_CACHE[root_path] = None
+        return {}, None
+
+    counts_map = _read_counts_map_from_csv(csv_path)
+    _COUNTS_CACHE[root_path] = counts_map
+    _COUNTS_SRC_CACHE[root_path] = csv_path
+    return counts_map, csv_path
+
+def parse_topic_folder_name(folder_name: str, path: str | None = None):
     """
     Extracts topic number, label, and document count from folder name.
-
-    Expected format: topic_number-label_with_underscores-documents_count-documents
-    Example: '3-label_of_the_topic_9-documents' -> ('3', 'Label of the topic', '9')
-
-    Args:
-        folder_name (str): Folder name formatted as: <topic_number>-<label_with_underscores>_<document_count>-documents
-
-    Returns:
-        tuple: (str, str, str) -> (topic_number, cleaned_label, document_count)
-               or (None, None, None) if parsing fails.
+    If the count isn't in the name, read it from cluster_for_k=N.csv at the root (covers all clusters),
+    using the 'cluster' column to count rows per cluster ID.
     """
-    match = re.match(r'(\d+)-([^-]+)_(\d+)-documents', folder_name)
+    _debug(f"\n[parse] Folder: {folder_name}")
+    match = re.match(r'^(\d+)-([^-]+?)(?:[_-](\d+)-documents)?$', folder_name)
     if match:
-        topic_number = match.group(1)  # Extract topic number
-        label = match.group(2).replace("_", " ").strip()  # Replace underscores with spaces
-        document_count = match.group(3)  # Extract document count
+        topic_number = match.group(1)
+        label = match.group(2).replace("_", " ").strip()
+        document_count = match.group(3)
+        _debug(f"[parse] Parsed -> number={topic_number}, label='{label}', name_count={document_count}")
+
+        if document_count is None and path:
+            counts_map, csv_src = _get_counts_map(path)
+            key = _norm_cluster_key(topic_number)
+            inferred = counts_map.get(key)
+            if inferred is not None:
+                document_count = str(inferred)
+                _debug(f"[parse] Inferred from CSV ({os.path.basename(csv_src) if csv_src else 'n/a'}): {document_count}")
+            else:
+                _debug(f"[parse] No count for cluster={key} in CSV.")
+                document_count = "Unknown"
+
+        if document_count is None:
+            document_count = "Unknown"
+
+        _debug(f"[parse] Result -> ({topic_number}, '{label}', {document_count})")
         return topic_number, label, document_count
+
     elif folder_name.isdigit():
-        labels = load_csv_file_items(path, suffix="cluster_summaries", ends=False, column="label")
-        if path is None or labels is None:
-            return folder_name, "Topic", "Unknown"
-        else:
-            return folder_name, labels[int(folder_name)], "Unknown"
+        topic_number = folder_name
+        # Try label mapping (your existing helper)
+        label = "Topic"
+        try:
+            labels = load_csv_file_items(path, suffix="cluster_summaries", ends=False, column="label")
+            if path is not None and labels is not None:
+                label = labels[int(folder_name)]
+        except Exception as e:
+            _debug(f"[label] Could not map label: {e}")
+
+        document_count = "Unknown"
+        if path:
+            counts_map, csv_src = _get_counts_map(path)
+            key = _norm_cluster_key(topic_number)
+            inferred = counts_map.get(key)
+            if inferred is not None:
+                document_count = str(inferred)
+                _debug(f"[parse] Inferred (numeric folder) from CSV ({os.path.basename(csv_src) if csv_src else 'n/a'}): {document_count}")
+            else:
+                _debug(f"[parse] No count for cluster={key} in CSV (numeric folder).")
+
+        _debug(f"[parse] Numeric folder -> ({topic_number}, '{label}', {document_count})")
+        return topic_number, label, document_count
 
+    _debug("[parse] Unrecognized folder name format.")
     return None, None, None
 
 
+
 def map_folder_to_logical_name(folder_name: str, root_name) -> str:
     """
     Convert a physical folder name (like '*_0', '*_1_2', or '0_1')
```
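To make the new parsing behavior concrete, here is a small illustration of what the updated regex accepts; the folder names are hypothetical:

```python
# Hypothetical folder names run against the new pattern from parse_topic_folder_name.
import re

pat = re.compile(r'^(\d+)-([^-]+?)(?:[_-](\d+)-documents)?$')

print(pat.match("3-laser_ablation_9-documents").groups())  # ('3', 'laser_ablation', '9')
print(pat.match("3-laser_ablation").groups())              # ('3', 'laser_ablation', None); count then read from cluster_for_k=N.csv
print(pat.match("not-a-topic"))                            # None; falls through to the isdigit()/failure branches
```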
Lines changed: 30 additions & 0 deletions
New file:

```python
# Termite/VectorInjector.py
import torch
from transformers import AutoTokenizer, AutoModel
from typing import Iterable, List

class Vectorizer:
    """Computes embeddings; does NOT talk to any DB."""

    def __init__(self, model_name: str = "malteos/scincl", device: str = None, max_length: int = 512):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval().to(self.device)
        self.max_length = max_length

    @torch.inference_mode()
    def encode(self, texts: Iterable[str]) -> List[List[float]]:
        outs = []
        for t in texts:
            tokens = self.tokenizer(
                t if isinstance(t, str) else "",
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            ).to(self.device)
            hidden = self.model(**tokens).last_hidden_state  # [1, L, H]
            emb = hidden.mean(dim=1).squeeze(0)  # mean pool
            outs.append(emb.detach().cpu().tolist())
        return outs
```
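For orientation, a minimal sketch of calling this class; the sample texts are made up, and the model weights download from Hugging Face on first use:

```python
# Hypothetical usage of the Vectorizer above; the example texts are illustrative only.
vec = Vectorizer()  # defaults: malteos/scincl, CUDA if available, max_length=512
embs = vec.encode([
    "Tensor factorization for topic modeling.",
    "Building knowledge graphs from scientific text.",
])
print(len(embs), len(embs[0]))  # 2 vectors; SciNCL's BERT-base backbone gives 768 dims
```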
Lines changed: 5 additions & 0 deletions
New file:

```python
# Keep this minimal: just re-export the public Termite class

from .termite import Termite
from .neo4j_termite.constants import *
__all__ = ["Termite"]
```
Lines changed: 19 additions & 0 deletions
New file:

```python
# Termite/embedding_store/__init__.py
import os
from .base import EmbeddingStore
from .opensearch_store import OpenSearchStore
from .milvus_store import MilvusStore

def make_store() -> EmbeddingStore:
    backend = os.getenv("EMBEDDING_STORE", "opensearch").lower()
    if backend == "milvus":
        uri = os.getenv("MILVUS_URI", "http://localhost:19530")
        return MilvusStore(uri=uri)
    # default: OpenSearch
    host = os.getenv("OS_HOST", "localhost")
    port = int(os.getenv("OS_PORT", "9200"))
    use_ssl = os.getenv("OS_USE_SSL", "false").lower() == "true"
    username = os.getenv("OS_USERNAME")
    password = os.getenv("OS_PASSWORD")
    return OpenSearchStore(host=host, port=port, use_ssl=use_ssl,
                           username=username, password=password)
```
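A short sketch of how the factory's environment-driven backend selection might be exercised; the variable values are illustrative:

```python
# Hypothetical usage: pick the Milvus backend via environment variables, then build the store.
import os

os.environ["EMBEDDING_STORE"] = "milvus"
os.environ["MILVUS_URI"] = "http://localhost:19530"  # illustrative local endpoint

store = make_store()  # MilvusStore here; with EMBEDDING_STORE unset it falls back to OpenSearchStore
```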
