9 | 9 | import torch |
10 | 10 | import pickle |
11 | 11 | import json |
| 12 | +import csv |
| 13 | +from collections import defaultdict |
12 | 14 |
13 | 15 | @st.cache_resource |
14 | 16 | def load_link_data(path): |
@@ -181,36 +183,213 @@ def add_children(node_name): |
181 | 183 | tree.append(add_children(node["name"])) |
182 | 184 | return tree |
183 | 185 |
184 | | -def parse_topic_folder_name(folder_name, path=None): |
| 186 | +DEBUG = False  # set True to print debug traces to the terminal |
| 187 | + |
| 188 | +def _debug(msg): |
| 189 | + if DEBUG: |
| 190 | + print(msg, flush=True) |
| 191 | + |
| 192 | +# Matches cluster_for_k=N*.csv (case-insensitive, with optional extra suffix) |
| 193 | +_K_FILE_PAT = re.compile(r'^cluster_for_k=(\d+)(?:[^/]*)\.csv$', re.IGNORECASE) |
| 194 | + |
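A quick self-contained sketch of what `_K_FILE_PAT` does and does not accept; the filenames are hypothetical:

```python
import re

_K_FILE_PAT = re.compile(r'^cluster_for_k=(\d+)(?:[^/]*)\.csv$', re.IGNORECASE)

assert _K_FILE_PAT.match("cluster_for_k=12.csv").group(1) == "12"      # plain k
assert _K_FILE_PAT.match("cluster_for_k=7_final.csv").group(1) == "7"  # extra suffix
assert _K_FILE_PAT.match("CLUSTER_FOR_K=3.CSV") is not None            # case-insensitive
assert _K_FILE_PAT.match("cluster_for_k=.csv") is None                 # digits required
```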
| 195 | +# Cache per root path |
| 196 | +_COUNTS_CACHE = {} |
| 197 | +_COUNTS_SRC_CACHE = {} |
| 198 | + |
| 199 | +def _list_topic_dirs(root_path: str): |
| 200 | + try: |
| 201 | + dirs = [ |
| 202 | + d for d in os.listdir(root_path) |
| 203 | + if os.path.isdir(os.path.join(root_path, d)) |
| 204 | + ] |
| 205 | + # Heuristic: folders that start with digits are topic dirs (e.g., "3-foo", "12", etc.) |
| 206 | + topic_dirs = [d for d in dirs if re.match(r'^\d+\b', d)] |
| 207 | + return sorted(topic_dirs) |
| 208 | + except FileNotFoundError: |
| 209 | + return [] |
| 210 | + |
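The digit-prefix heuristic in isolation, run on made-up folder names:

```python
import re

names = ["3-foo", "12", "0-bar_5-documents", "misc", "v2-notes"]
topic_dirs = sorted(d for d in names if re.match(r'^\d+\b', d))
print(topic_dirs)  # ['0-bar_5-documents', '12', '3-foo']
```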
| 211 | +def _iter_candidate_csvs(root_path: str): |
| 212 | + """Yield (abs_path, k) for cluster_for_k=*.csv at root and one level below.""" |
| 213 | + if not root_path or not os.path.isdir(root_path): |
| 214 | + _debug(f"[scan] Not a dir: {root_path}") |
| 215 | + return |
| 216 | + |
| 217 | + _debug(f"[scan] Looking for cluster_for_k=*.csv under: {root_path}") |
| 218 | + |
| 219 | + # Top-level files |
| 220 | + try: |
| 221 | + for entry in os.scandir(root_path): |
| 222 | + if entry.is_file(): |
| 223 | + m = _K_FILE_PAT.match(entry.name) |
| 224 | + if m: |
| 225 | + _debug(f"[scan] Found CSV: {entry.name} (k={m.group(1)})") |
| 226 | + yield entry.path, int(m.group(1)) |
| 227 | + except FileNotFoundError: |
| 228 | + pass |
| 229 | + |
| 230 | + # One level down (subfolders only) |
| 231 | + try: |
| 232 | + for entry in os.scandir(root_path): |
| 233 | + if entry.is_dir(): |
| 234 | + for sub in os.scandir(entry.path): |
| 235 | + if sub.is_file(): |
| 236 | + m = _K_FILE_PAT.match(sub.name) |
| 237 | + if m: |
| 238 | + _debug(f"[scan] Found CSV (subdir): {entry.name}/{sub.name} (k={m.group(1)})") |
| 239 | + yield sub.path, int(m.group(1)) |
| 240 | + except FileNotFoundError: |
| 241 | + pass |
| 242 | + |
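A hedged usage sketch; the root path is a placeholder, and the generator simply yields nothing if the directory does not exist:

```python
for csv_path, k in _iter_candidate_csvs("/data/clusters"):  # hypothetical root
    print(f"k={k}: {csv_path}")
```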
| 243 | +def _norm_cluster_key(val): |
| 244 | + """Normalize cluster key to a stringified integer if possible (e.g., '3', 3, '3.0', 'cluster_3').""" |
| 245 | + if val is None: |
| 246 | + return None |
| 247 | + if isinstance(val, (int, float)): |
| 248 | + try: |
| 249 | + return str(int(val)) |
| 250 | + except Exception: |
| 251 | + return str(val).strip() |
| 252 | + s = str(val).strip() |
| 253 | + m = re.search(r'(\d+)', s) |
| 254 | + return m.group(1) if m else s # fallback to raw string |
| 255 | + |
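Assuming the function above, these normalizations should hold (a small self-check, not an exhaustive test):

```python
assert _norm_cluster_key(3) == "3"
assert _norm_cluster_key(3.0) == "3"
assert _norm_cluster_key(" 3 ") == "3"
assert _norm_cluster_key("cluster_3") == "3"
assert _norm_cluster_key("noise") == "noise"  # no digits: stripped raw string
assert _norm_cluster_key(None) is None
```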
| 256 | +def _read_counts_map_from_csv(csv_path: str): |
| 257 | + """Build counts map {cluster_id(str): count(int)} using the 'cluster' column (case-insensitive).""" |
| 258 | + _debug(f"[counts] Reading CSV: {csv_path}") |
| 259 | + counts = defaultdict(int) |
| 260 | + try: |
| 261 | + with open(csv_path, "r", newline="", encoding="utf-8") as f: |
| 262 | + reader = csv.DictReader(f) |
| 263 | + if not reader.fieldnames: |
| 264 | + _debug("[counts] No header/fieldnames found.") |
| 265 | + return {} |
| 266 | + |
| 267 | + # find 'cluster' column case-insensitively |
| 268 | + cluster_col = None |
| 269 | + for name in reader.fieldnames: |
| 270 | + if name and name.strip().lower() == "cluster": |
| 271 | + cluster_col = name |
| 272 | + break |
| 273 | + |
| 274 | + if not cluster_col: |
| 275 | + _debug(f"[counts] 'cluster' column not found in {reader.fieldnames}") |
| 276 | + return {} |
| 277 | + |
| 278 | + row_total = 0 |
| 279 | + for row in reader: |
| 280 | + row_total += 1 |
| 281 | + key = _norm_cluster_key(row.get(cluster_col)) |
| 282 | + if key is not None and key != "": |
| 283 | + counts[key] += 1 |
| 284 | + |
| 285 | + _debug(f"[counts] Total rows read: {row_total}. Unique clusters: {len(counts)}") |
| 286 | + except Exception as e: |
| 287 | + _debug(f"[counts] Error reading {csv_path}: {e}") |
| 288 | + return {} |
| 289 | + return dict(counts) |
| 290 | + |
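To make the counting concrete, here is the same logic applied to a hypothetical CSV; real files may carry more columns, but only the 'cluster' column matters:

```python
import csv
import io
from collections import defaultdict

sample = "doc_id,Cluster\na,0\nb,0\nc,1\n"  # hypothetical file contents

reader = csv.DictReader(io.StringIO(sample))
# Case-insensitive lookup of the 'cluster' column, as in the function above.
cluster_col = next(n for n in reader.fieldnames if n.strip().lower() == "cluster")
counts = defaultdict(int)
for row in reader:
    counts[row[cluster_col]] += 1
print(dict(counts))  # {'0': 2, '1': 1}
```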
| 291 | +def _choose_best_cluster_csv(root_path: str): |
| 292 | + """Choose the best cluster_for_k=*.csv. Prefer k == number of topic dirs; else largest k.""" |
| 293 | + candidates = list(_iter_candidate_csvs(root_path)) |
| 294 | + if not candidates: |
| 295 | + _debug("[choose] No cluster_for_k=*.csv candidates found.") |
| 296 | + return None |
| 297 | + |
| 298 | + topic_dirs = _list_topic_dirs(root_path) |
| 299 | + k_target = len(topic_dirs) |
| 300 | + _debug(f"[choose] Topic dirs detected: {k_target}") |
| 301 | + |
| 302 | + # Prefer exact match to number of topic dirs |
| 303 | + exact = [p for p in candidates if p[1] == k_target and k_target > 0] |
| 304 | + if exact: |
| 305 | + # If multiple with same k, pick the shortest path (heuristic) |
| 306 | + chosen = sorted(exact, key=lambda x: (len(x[0]), x[0]))[0] |
| 307 | + _debug(f"[choose] Using exact k match: {os.path.basename(chosen[0])} (k={chosen[1]})") |
| 308 | + return chosen[0] |
| 309 | + |
| 310 | + # Else pick largest k |
| 311 | + candidates.sort(key=lambda x: x[1], reverse=True) |
| 312 | + chosen = candidates[0] |
| 313 | + _debug(f"[choose] Using largest k: {os.path.basename(chosen[0])} (k={chosen[1]})") |
| 314 | + return chosen[0] |
| 315 | + |
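The selection policy on hypothetical (path, k) candidates; an exact match on the topic-dir count wins, with ties broken by shortest path:

```python
candidates = [("/r/cluster_for_k=5.csv", 5), ("/r/old/cluster_for_k=8.csv", 8)]
k_target = 5  # pretend five topic dirs were detected

exact = [c for c in candidates if c[1] == k_target]
chosen = (sorted(exact, key=lambda x: (len(x[0]), x[0]))[0]
          if exact else max(candidates, key=lambda x: x[1]))
print(chosen[0])  # /r/cluster_for_k=5.csv
```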
| 316 | +def _get_counts_map(root_path: str): |
| 317 | + """Get (and cache) the counts map for this root path.""" |
| 318 | + if root_path in _COUNTS_CACHE: |
| 319 | + return _COUNTS_CACHE[root_path], _COUNTS_SRC_CACHE.get(root_path) |
| 320 | + |
| 321 | + csv_path = _choose_best_cluster_csv(root_path) |
| 322 | + if not csv_path: |
| 323 | + _COUNTS_CACHE[root_path] = {} |
| 324 | + _COUNTS_SRC_CACHE[root_path] = None |
| 325 | + return {}, None |
| 326 | + |
| 327 | + counts_map = _read_counts_map_from_csv(csv_path) |
| 328 | + _COUNTS_CACHE[root_path] = counts_map |
| 329 | + _COUNTS_SRC_CACHE[root_path] = csv_path |
| 330 | + return counts_map, csv_path |
| 331 | + |
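Callers only touch this entry point; a minimal sketch with a placeholder root:

```python
counts, src = _get_counts_map("/data/clusters")  # hypothetical root
print(src, counts.get("3", 0))
_ = _get_counts_map("/data/clusters")  # second call is a cache hit, no rescan
```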
| 332 | +def parse_topic_folder_name(folder_name: str, path: str | None = None): |
185 | 333 | """ |
186 | 334 | Extracts topic number, label, and document count from folder name. |
187 | | -
188 | | - Expected format: topic_number-label_with_underscores-documents_count-documents |
189 | | - Example: '3-label_of_the_topic_9-documents' -> ('3', 'Label of the topic', '9') |
190 | | -
191 | | - Args: |
192 | | - folder_name (str): Folder name formatted as: <topic_number>-<label_with_underscores>_<document_count>-documents |
193 | | -
194 | | - Returns: |
195 | | - tuple: (str, str, str) -> (topic_number, cleaned_label, document_count) |
196 | | - or (None, None, None) if parsing fails. |
| 335 | + If the count isn't encoded in the name, infer it from cluster_for_k=N.csv at the root (which |
| 336 | + covers all clusters) by counting rows per cluster ID in the 'cluster' column. |
197 | 337 | """ |
198 | | - match = re.match(r'(\d+)-([^-]+)_(\d+)-documents', folder_name) |
| 338 | + _debug(f"\n[parse] Folder: {folder_name}") |
| 339 | + match = re.match(r'^(\d+)-([^-]+?)(?:[_-](\d+)-documents)?$', folder_name) |
199 | 340 | if match: |
200 | | - topic_number = match.group(1) # Extract topic number |
201 | | - label = match.group(2).replace("_", " ").strip() # Replace underscores with spaces |
202 | | - document_count = match.group(3) # Extract document count |
| 341 | + topic_number = match.group(1) |
| 342 | + label = match.group(2).replace("_", " ").strip() |
| 343 | + document_count = match.group(3) |
| 344 | + _debug(f"[parse] Parsed -> number={topic_number}, label='{label}', name_count={document_count}") |
| 345 | + |
| 346 | + if document_count is None and path: |
| 347 | + counts_map, csv_src = _get_counts_map(path) |
| 348 | + key = _norm_cluster_key(topic_number) |
| 349 | + inferred = counts_map.get(key) |
| 350 | + if inferred is not None: |
| 351 | + document_count = str(inferred) |
| 352 | + _debug(f"[parse] Inferred from CSV ({os.path.basename(csv_src) if csv_src else 'n/a'}): {document_count}") |
| 353 | + else: |
| 354 | + _debug(f"[parse] No count for cluster={key} in CSV.") |
| 355 | + document_count = "Unknown" |
| 356 | + |
| 357 | + if document_count is None: |
| 358 | + document_count = "Unknown" |
| 359 | + |
| 360 | + _debug(f"[parse] Result -> ({topic_number}, '{label}', {document_count})") |
203 | 361 | return topic_number, label, document_count |
| 362 | + |
204 | 363 | elif folder_name.isdigit(): |
205 | | - labels = load_csv_file_items(path, suffix="cluster_summaries", ends=False, column="label") |
206 | | - if path is None or labels is None: |
207 | | - return folder_name, "Topic", "Unknown" |
208 | | - else: |
209 | | - return folder_name, labels[int(folder_name)], "Unknown" |
| 364 | + topic_number = folder_name |
| 365 | + # Try label mapping via the existing cluster_summaries helper |
| 366 | + label = "Topic" |
| 367 | + try: |
| 368 | + labels = load_csv_file_items(path, suffix="cluster_summaries", ends=False, column="label") |
| 369 | + if path is not None and labels is not None: |
| 370 | + label = labels[int(folder_name)] |
| 371 | + except Exception as e: |
| 372 | + _debug(f"[label] Could not map label: {e}") |
| 373 | + |
| 374 | + document_count = "Unknown" |
| 375 | + if path: |
| 376 | + counts_map, csv_src = _get_counts_map(path) |
| 377 | + key = _norm_cluster_key(topic_number) |
| 378 | + inferred = counts_map.get(key) |
| 379 | + if inferred is not None: |
| 380 | + document_count = str(inferred) |
| 381 | + _debug(f"[parse] Inferred (numeric folder) from CSV ({os.path.basename(csv_src) if csv_src else 'n/a'}): {document_count}") |
| 382 | + else: |
| 383 | + _debug(f"[parse] No count for cluster={key} in CSV (numeric folder).") |
| 384 | + |
| 385 | + _debug(f"[parse] Numeric folder -> ({topic_number}, '{label}', {document_count})") |
| 386 | + return topic_number, label, document_count |
210 | 387 |
| 388 | + _debug("[parse] Unrecognized folder name format.") |
211 | 389 | return None, None, None |
212 | 390 |
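The parser end to end, on hypothetical folder names (no `path` argument, so CSV inference is skipped):

```python
print(parse_topic_folder_name("3-label_of_the_topic_9-documents"))
# -> ('3', 'label of the topic', '9')

print(parse_topic_folder_name("5-some_topic"))
# -> ('5', 'some topic', 'Unknown')  (no count in the name, no path to infer from)

print(parse_topic_folder_name("notes"))
# -> (None, None, None)  (unrecognized format)
```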
213 | 391 |
| 392 | + |
214 | 393 | def map_folder_to_logical_name(folder_name: str, root_name) -> str: |
215 | 394 | """ |
216 | 395 | Convert a physical folder name (like '*_0', '*_1_2', or '0_1') |