
Commit d979218

Merge pull request #21 from idaholab/wangc/architecture

DACKAR Architecture Design

2 parents e9e6889 + bb5345d

25 files changed: 1515 additions, 304 deletions

.github/workflows/github-actions.yml

Lines changed: 3 additions & 3 deletions
@@ -41,7 +41,7 @@ jobs:
 pwd
 conda create -n dackar_libs python=3.11
 conda init bash && source ~/.bashrc && conda activate dackar_libs
-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml
 pip install neo4j jupyterlab
 pip install pytest
 # python -m spacy download en_core_web_lg [for some reason, GitHub machine complains this command]

@@ -86,7 +86,7 @@ jobs:
 pwd
 conda create -n dackar_libs python=3.11
 conda init zsh && source ~/.zshrc && conda activate dackar_libs
-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml
 pip install neo4j jupyterlab
 pip install pytest

@@ -134,7 +134,7 @@ jobs:
 echo " Conda information"
 conda info
 echo " Activate Dackar conda environment"
-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml
 pip install neo4j jupyterlab
 pip install pytest
 pip uninstall numba llvmlite

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -70,6 +70,7 @@ instance/

 # Sphinx documentation
 docs/_build/
+docs/notebooks/

 # PyBuilder
 target/

@@ -144,3 +145,8 @@ tmp/
 Profile.prof
 .vscode
 .sass-cache
+
+
+*.csv
+*.bk
+*.png

README.md

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ and ``jupyterlab`` is used to execute notebook examples under ``./examples/`` fo

 ## Test

-### Test functions with ```__pytest__```
+### Test functions with ```pytest```

 - Run the following command in your command line to install pytest:

docs/install_spacy3.5.rst

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ Install the Required Libraries

 conda activate dackar_libs

-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml

 .. conda install -c conda-forge pandas
 .. scikit-learn 1.2.2 is required for quantulum3

examples/KG_demo/KG_Demo.ipynb

Lines changed: 47 additions & 185 deletions
Large diffs are not rendered by default.

src/dackar/main.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+# Copyright 2024, Battelle Energy Alliance, LLC ALL RIGHTS RESERVED
+
+"""
+Created on July 31, 2025
+@author: wangc, mandd
+"""
+import os
+import argparse
+import logging
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+from dackar.utils.utils import readToml
+from dackar.workflows.WorkflowManager import WorkflowManager
+
+logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)
+logger = logging.getLogger('DACKAR')
+# create file handler which logs messages
+fh = logging.FileHandler(filename='dackar.log', mode='w')
+fh.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s %(name)-20s %(levelname)-8s %(message)s')
+fh.setFormatter(formatter)
+# add the handlers to the logger
+logger.addHandler(fh)
+
+def main():
+  logger.info('Welcome to use DACKAR!')
+  # set up argument parser
+  parser = argparse.ArgumentParser(description='DACKAR Input ArgumentParser')
+  parser.add_argument('-i', '--file_path', type=str, default='../../system_tests/test_opm.toml', help='The path to the input file.')
+  parser.add_argument('-o', '--output-file', type=str, default='output.txt', help='The file to save the output to.')
+  # parse the arguments
+  args = parser.parse_args()
+  logger.info('Input file: %s', args.file_path)
+  # read the TOML file
+  cwd = os.getcwd()
+  configFile = os.path.join(cwd, args.file_path)
+  configDict = readToml(configFile)
+
+  module = WorkflowManager(configDict)
+  module.run()
+  logger.info(' ... Complete!')
+
+if __name__ == '__main__':
+  sys.exit(main())
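The new src/dackar/main.py provides a command-line entry point, invoked as, for example, python src/dackar/main.py -i path/to/workflow.toml, where the TOML file configures the workflow. As a rough sketch of the equivalent programmatic use (the file name workflow.toml is a hypothetical placeholder, and the sketch assumes the dackar package is importable):

    # Minimal sketch: run a DACKAR workflow from Python instead of the CLI entry point.
    # 'workflow.toml' is a hypothetical placeholder for a valid workflow configuration.
    import os

    from dackar.utils.utils import readToml
    from dackar.workflows.WorkflowManager import WorkflowManager

    configFile = os.path.join(os.getcwd(), 'workflow.toml')  # path to the TOML configuration
    configDict = readToml(configFile)                         # parse the configuration into a dict
    manager = WorkflowManager(configDict)                     # build the workflow from the configuration
    manager.run()                                             # execute the configured analysis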

src/dackar/pipelines/EmergentActivityEntity.py

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ def __init__(self, nlp):
     self.name = 'EmergentActivity'
     # work order identification should always before other identification
     # This is because when spans overlap, the (first) longest span is preferred over shorter spans.
-    woPattern = [[{"LOWER": "wo"}, {"IS_PUNCT": True, "OP":"*"}, {"IS_DIGIT": True}], [{"TEXT":{"REGEX":"(?<=wo)\d+"}}]]
-    idPattern = [[{"TEXT":{"REGEX":"(?=\S*[a-zA-Z])(?=\S*[0-9])"}}]]
+    woPattern = [[{"LOWER": "wo"}, {"IS_PUNCT": True, "OP":"*"}, {"IS_DIGIT": True}], [{"TEXT":{"REGEX":r"(?<=wo)\d+"}}]]
+    idPattern = [[{"TEXT":{"REGEX":r"(?=\S*[a-zA-Z])(?=\S*[0-9])"}}]]
     # idPattern = [[{"TEXT":{"REGEX":"^(?=.*\b(?=\S*[a-zA-Z])(?=\S*[0-9]))"}}]]

     self.matcher = SimpleEntityMatcher(nlp, label='WO', patterns=woPattern)
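The only change here is the r prefix on the regular expressions embedded in the spaCy token patterns: without it, escapes such as \d and \S are first interpreted as Python string escapes and trigger invalid-escape warnings on recent interpreters. A standalone illustration with the re module (the sample strings are made up; the actual matching in DACKAR goes through SimpleEntityMatcher):

    # Illustration only: raw strings keep regex escapes intact.
    import re

    wo_regex = r"(?<=wo)\d+"                    # digits preceded by 'wo', e.g. work-order IDs
    id_regex = r"(?=\S*[a-zA-Z])(?=\S*[0-9])"   # token containing both letters and digits

    print(re.findall(wo_regex, "wo12345"))      # ['12345']
    print(bool(re.search(id_regex, "pmp-1a")))  # True
    # Without the r prefix, "\d" and "\S" are invalid string escapes and emit
    # SyntaxWarning/DeprecationWarning on newer Python versions.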

src/dackar/pipelines/TemporalEntity.py

Lines changed: 3 additions & 3 deletions
@@ -134,7 +134,7 @@ def __init__(self, nlp):
 |
 # Ordinal-Day-Month-Year
 (?:
-""" + ordinalPattern + """
+""" + ordinalPattern + r"""
 \s+
 (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
 (?: # Year is optional

@@ -144,7 +144,7 @@ def __init__(self, nlp):
 )
 |
 (?:
-""" + ordinalPattern + """
+""" + ordinalPattern + r"""
 \s+
 of
 \s+

@@ -159,7 +159,7 @@ def __init__(self, nlp):
 (?:
 (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
 \s+
-""" + ordinalPattern + """
+""" + ordinalPattern + r"""
 (?: # Year is optional
 \s+
 \d{4} # Year
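The same raw-string fix is applied to the fragments concatenated around ordinalPattern in the verbose date regex: the r prefix only covers the string literal it is attached to, so each fragment following a concatenation needs its own prefix. A simplified sketch of the construction (the ordinalPattern definition below is an assumption for illustration, not the exact DACKAR pattern):

    # Sketch: concatenating raw-string fragments in a verbose date regex.
    import re

    ordinalPattern = r"\d{1,2}(?:st|nd|rd|th)"  # simplified stand-in for DACKAR's definition

    datePattern = re.compile(r"""
        """ + ordinalPattern + r"""                                 # e.g. '21st'
        \s+
        (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*   # month name
        (?:\s+\d{4})?                                               # optional year
    """, re.VERBOSE | re.IGNORECASE)

    print(bool(datePattern.search("21st January 2024")))  # True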

src/dackar/text_processing/Preprocessing.py

Lines changed: 4 additions & 0 deletions
@@ -8,6 +8,9 @@
 import re
 import textacy.preprocessing as preprocessing
 from numerizer import numerize
+import logging
+
+logger = logging.getLogger('DACKAR.Preprocessing')

 # list of available preprocessors in textacy.preprocessing.normalize
 textacyNormalize = ['bullet_points',

@@ -209,6 +212,7 @@ def __call__(self, text):
     Returns:
       processed: str, string of processed text
     """
+    logger.info('Preprocess raw text data')
     processed = text.strip('\n')
     processed = re.sub(r'&', ' and ', processed)
     # processed = re.sub(r'/', ' and ', processed)
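The logger added here is named 'DACKAR.Preprocessing', which makes it a child of the 'DACKAR' logger configured in the new main.py, so its records propagate to the dackar.log file handler attached there. A minimal standalone sketch of this hierarchical-logging behavior:

    # Sketch: records from 'DACKAR.<module>' child loggers propagate to the
    # handlers attached to the parent 'DACKAR' logger.
    import logging

    parent = logging.getLogger('DACKAR')
    parent.setLevel(logging.INFO)
    parent.addHandler(logging.FileHandler('dackar.log', mode='w'))

    child = logging.getLogger('DACKAR.Preprocessing')
    child.info('Preprocess raw text data')  # ends up in dackar.log via the parent handler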

src/dackar/utils/nlp/nlp_utils.py

Lines changed: 26 additions & 5 deletions
@@ -17,7 +17,7 @@
 import matplotlib.pyplot as plt


-logger = logging.getLogger(__name__)
+logger = logging.getLogger('DACKAR.utils')

 ###########################################################################


@@ -28,7 +28,7 @@ def displayNER(doc, includePunct=False):
    Args:

      doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines
-     includePunct: bool, True if the punctuaction is included
+     includePunct: bool, True if the punctuation is included

    Returns:


@@ -38,9 +38,9 @@ def displayNER(doc, includePunct=False):
  for i, t in enumerate(doc):
    if not t.is_punct or includePunct:
      row = {'token': i,
-            'text': t.text, 'lemma': t.lemma_,
-            'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
-            'ent_iob_': t.ent_iob_}
+            'text': t.text, 'lemma': t.lemma_,
+            'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
+            'ent_iob_': t.ent_iob_}
      if doc.has_extension('coref_chains'):
        if t.has_extension('coref_chains') and t._.coref_chains: # neuralcoref attributes
          row['coref_chains'] = t._.coref_chains.pretty_representation

@@ -263,3 +263,24 @@ def customTokenizer(nlp):
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions)
  return nlp
+
+
+def extractNER(doc):
+  """
+    Generate data frame for visualization of spaCy doc with custom NER.
+
+    Args:
+
+      doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines
+
+    Returns:
+
+      df: pandas.DataFrame, data frame contains attributes of NER tokens
+  """
+  rows = []
+  for ent in doc.ents:
+    row = {'entity':ent, 'label': ent.label_, 'id': ent.ent_id_, 'alias':ent._.alias, 'start': ent.start, 'end': ent.end}
+    rows.append(row)
+  df = pd.DataFrame(rows)
+  df.index.name = None
+  return df
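The new extractNER helper tabulates the custom entities on a processed doc, including the entity id and the custom alias span extension. A usage sketch, assuming nlp is a spaCy pipeline with DACKAR's custom components (which register the alias extension) and using an illustrative sentence:

    # Sketch: collect custom named entities from a processed doc into a DataFrame.
    # Assumes 'nlp' already includes DACKAR pipeline components that set ent._.alias.
    from dackar.utils.nlp.nlp_utils import extractNER

    doc = nlp("WO 12345: replace pump bearing next Monday.")  # illustrative text
    df = extractNER(doc)
    print(df[['label', 'id', 'start', 'end']])  # one row per entity span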
