
Commit d979218

Merge pull request #21 from idaholab/wangc/architecture

DACKAR Architecture Design

2 parents e9e6889 + bb5345d

25 files changed: 1515 additions, 304 deletions

.github/workflows/github-actions.yml

Lines changed: 3 additions & 3 deletions
@@ -41,7 +41,7 @@ jobs:
 pwd
 conda create -n dackar_libs python=3.11
 conda init bash && source ~/.bashrc && conda activate dackar_libs
-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml
 pip install neo4j jupyterlab
 pip install pytest
 # python -m spacy download en_core_web_lg [for some reason, GitHub machine complains this command]

@@ -86,7 +86,7 @@ jobs:
 pwd
 conda create -n dackar_libs python=3.11
 conda init zsh && source ~/.zshrc && conda activate dackar_libs
-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml
 pip install neo4j jupyterlab
 pip install pytest

@@ -134,7 +134,7 @@ jobs:
 echo " Conda information"
 conda info
 echo " Activate Dackar conda environment"
-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml
 pip install neo4j jupyterlab
 pip install pytest
 pip uninstall numba llvmlite

.gitignore

Lines changed: 6 additions & 0 deletions
@@ -70,6 +70,7 @@ instance/

 # Sphinx documentation
 docs/_build/
+docs/notebooks/

 # PyBuilder
 target/

@@ -144,3 +145,8 @@ tmp/
 Profile.prof
 .vscode
 .sass-cache
+
+
+*.csv
+*.bk
+*.png

README.md

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ and ``jupyterlab`` is used to execute notebook examples under ``./examples/`` fo

 ## Test

-### Test functions with ```__pytest__```
+### Test functions with ```pytest```

 - Run the following command in your command line to install pytest:

docs/install_spacy3.5.rst

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ Install the Required Libraries

 conda activate dackar_libs

-pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas
+pip install spacy==3.5 stumpy textacy matplotlib nltk coreferee beautifulsoup4 networkx pysbd tomli numerizer autocorrect pywsd openpyxl quantulum3[classifier] numpy==1.26 scikit-learn pyspellchecker contextualSpellCheck pandas wordcloud jsonschema toml

 .. conda install -c conda-forge pandas
 .. scikit-learn 1.2.2 is required for quantulum3

examples/KG_demo/KG_Demo.ipynb

Lines changed: 47 additions & 185 deletions
Large diffs are not rendered by default.

src/dackar/main.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+# Copyright 2024, Battelle Energy Alliance, LLC ALL RIGHTS RESERVED
+
+"""
+Created on July 31, 2025
+@author: wangc, mandd
+"""
+import os
+import argparse
+import logging
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+from dackar.utils.utils import readToml
+from dackar.workflows.WorkflowManager import WorkflowManager
+
+logging.basicConfig(format='%(asctime)s %(name)-20s %(levelname)-8s %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)
+logger = logging.getLogger('DACKAR')
+# create file handler which logs messages
+fh = logging.FileHandler(filename='dackar.log', mode='w')
+fh.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s %(name)-20s %(levelname)-8s %(message)s')
+fh.setFormatter(formatter)
+# add the handlers to the logger
+logger.addHandler(fh)
+
+def main():
+  logger.info('Welcome to use DACKAR!')
+  # set up argument parser
+  parser = argparse.ArgumentParser(description='DACKAR Input ArgumentParser')
+  parser.add_argument('-i', '--file_path', type=str, default='../../system_tests/test_opm.toml', help='The path to the input file.')
+  parser.add_argument('-o', '--output-file', type=str, default='output.txt', help='The file to save the output to.')
+  # parse the arguments
+  args = parser.parse_args()
+  logger.info('Input file: %s', args.file_path)
+  # read the TOML file
+  cwd = os.getcwd()
+  configFile = os.path.join(cwd, args.file_path)
+  configDict = readToml(configFile)
+
+  module = WorkflowManager(configDict)
+  module.run()
+  logger.info(' ... Complete!')
+
+if __name__ == '__main__':
+  sys.exit(main())
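The new src/dackar/main.py provides a command-line entry point, invoked as, for example, python src/dackar/main.py -i path/to/workflow.toml, where the TOML file configures the workflow. As a rough sketch of the equivalent programmatic use (the file name workflow.toml is a hypothetical placeholder, and the sketch assumes the dackar package is importable):

    # Minimal sketch: run a DACKAR workflow from Python instead of the CLI entry point.
    # 'workflow.toml' is a hypothetical placeholder for a valid workflow configuration.
    import os

    from dackar.utils.utils import readToml
    from dackar.workflows.WorkflowManager import WorkflowManager

    configFile = os.path.join(os.getcwd(), 'workflow.toml')  # path to the TOML configuration
    configDict = readToml(configFile)                         # parse the configuration into a dict
    manager = WorkflowManager(configDict)                     # build the workflow from the configuration
    manager.run()                                             # execute the configured analysis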

src/dackar/pipelines/EmergentActivityEntity.py

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ def __init__(self, nlp):
     self.name = 'EmergentActivity'
     # work order identification should always before other identification
     # This is because when spans overlap, the (first) longest span is preferred over shorter spans.
-    woPattern = [[{"LOWER": "wo"}, {"IS_PUNCT": True, "OP":"*"}, {"IS_DIGIT": True}], [{"TEXT":{"REGEX":"(?<=wo)\d+"}}]]
-    idPattern = [[{"TEXT":{"REGEX":"(?=\S*[a-zA-Z])(?=\S*[0-9])"}}]]
+    woPattern = [[{"LOWER": "wo"}, {"IS_PUNCT": True, "OP":"*"}, {"IS_DIGIT": True}], [{"TEXT":{"REGEX":r"(?<=wo)\d+"}}]]
+    idPattern = [[{"TEXT":{"REGEX":r"(?=\S*[a-zA-Z])(?=\S*[0-9])"}}]]
     # idPattern = [[{"TEXT":{"REGEX":"^(?=.*\b(?=\S*[a-zA-Z])(?=\S*[0-9]))"}}]]

     self.matcher = SimpleEntityMatcher(nlp, label='WO', patterns=woPattern)
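The only change here is the r prefix on the regular expressions embedded in the spaCy token patterns: without it, escapes such as \d and \S are first interpreted as Python string escapes and trigger invalid-escape warnings on recent interpreters. A standalone illustration with the re module (the sample strings are made up; the actual matching in DACKAR goes through SimpleEntityMatcher):

    # Illustration only: raw strings keep regex escapes intact.
    import re

    wo_regex = r"(?<=wo)\d+"                    # digits preceded by 'wo', e.g. work-order IDs
    id_regex = r"(?=\S*[a-zA-Z])(?=\S*[0-9])"   # token containing both letters and digits

    print(re.findall(wo_regex, "wo12345"))      # ['12345']
    print(bool(re.search(id_regex, "pmp-1a")))  # True
    # Without the r prefix, "\d" and "\S" are invalid string escapes and emit
    # SyntaxWarning/DeprecationWarning on newer Python versions.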

src/dackar/pipelines/TemporalEntity.py

Lines changed: 3 additions & 3 deletions
@@ -134,7 +134,7 @@ def __init__(self, nlp):
 |
 # Ordinal-Day-Month-Year
 (?:
-""" + ordinalPattern + """
+""" + ordinalPattern + r"""
 \s+
 (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
 (?: # Year is optional

@@ -144,7 +144,7 @@ def __init__(self, nlp):
 )
 |
 (?:
-""" + ordinalPattern + """
+""" + ordinalPattern + r"""
 \s+
 of
 \s+

@@ -159,7 +159,7 @@ def __init__(self, nlp):
 (?:
 (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* # Month name
 \s+
-""" + ordinalPattern + """
+""" + ordinalPattern + r"""
 (?: # Year is optional
 \s+
 \d{4} # Year
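The same raw-string fix is applied to the fragments concatenated around ordinalPattern in the verbose date regex: the r prefix only covers the string literal it is attached to, so each fragment following a concatenation needs its own prefix. A simplified sketch of the construction (the ordinalPattern definition below is an assumption for illustration, not the exact DACKAR pattern):

    # Sketch: concatenating raw-string fragments in a verbose date regex.
    import re

    ordinalPattern = r"\d{1,2}(?:st|nd|rd|th)"  # simplified stand-in for DACKAR's definition

    datePattern = re.compile(r"""
        """ + ordinalPattern + r"""                                 # e.g. '21st'
        \s+
        (?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*   # month name
        (?:\s+\d{4})?                                               # optional year
    """, re.VERBOSE | re.IGNORECASE)

    print(bool(datePattern.search("21st January 2024")))  # True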

src/dackar/text_processing/Preprocessing.py

Lines changed: 4 additions & 0 deletions
@@ -8,6 +8,9 @@
 import re
 import textacy.preprocessing as preprocessing
 from numerizer import numerize
+import logging
+
+logger = logging.getLogger('DACKAR.Preprocessing')

 # list of available preprocessors in textacy.preprocessing.normalize
 textacyNormalize = ['bullet_points',

@@ -209,6 +212,7 @@ def __call__(self, text):
     Returns:
       processed: str, string of processed text
     """
+    logger.info('Preprocess raw text data')
     processed = text.strip('\n')
     processed = re.sub(r'&', ' and ', processed)
     # processed = re.sub(r'/', ' and ', processed)
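The logger added here is named 'DACKAR.Preprocessing', which makes it a child of the 'DACKAR' logger configured in the new main.py, so its records propagate to the dackar.log file handler attached there. A minimal standalone sketch of this hierarchical-logging behavior:

    # Sketch: records from 'DACKAR.<module>' child loggers propagate to the
    # handlers attached to the parent 'DACKAR' logger.
    import logging

    parent = logging.getLogger('DACKAR')
    parent.setLevel(logging.INFO)
    parent.addHandler(logging.FileHandler('dackar.log', mode='w'))

    child = logging.getLogger('DACKAR.Preprocessing')
    child.info('Preprocess raw text data')  # ends up in dackar.log via the parent handler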

src/dackar/utils/nlp/nlp_utils.py

Lines changed: 26 additions & 5 deletions
@@ -17,7 +17,7 @@
 import matplotlib.pyplot as plt


-logger = logging.getLogger(__name__)
+logger = logging.getLogger('DACKAR.utils')

 ###########################################################################


@@ -28,7 +28,7 @@ def displayNER(doc, includePunct=False):
    Args:

      doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines
-     includePunct: bool, True if the punctuaction is included
+     includePunct: bool, True if the punctuation is included

    Returns:


@@ -38,9 +38,9 @@ def displayNER(doc, includePunct=False):
  for i, t in enumerate(doc):
    if not t.is_punct or includePunct:
      row = {'token': i,
-            'text': t.text, 'lemma': t.lemma_,
-            'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
-            'ent_iob_': t.ent_iob_}
+            'text': t.text, 'lemma': t.lemma_,
+            'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
+            'ent_iob_': t.ent_iob_}
      if doc.has_extension('coref_chains'):
        if t.has_extension('coref_chains') and t._.coref_chains: # neuralcoref attributes
          row['coref_chains'] = t._.coref_chains.pretty_representation

@@ -263,3 +263,24 @@ def customTokenizer(nlp):
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions)
  return nlp
+
+
+def extractNER(doc):
+  """
+    Generate data frame for visualization of spaCy doc with custom NER.
+
+    Args:
+
+      doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines
+
+    Returns:
+
+      df: pandas.DataFrame, data frame contains attributes of NER tokens
+  """
+  rows = []
+  for ent in doc.ents:
+    row = {'entity':ent, 'label': ent.label_, 'id': ent.ent_id_, 'alias':ent._.alias, 'start': ent.start, 'end': ent.end}
+    rows.append(row)
+  df = pd.DataFrame(rows)
+  df.index.name = None
+  return df
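The new extractNER helper tabulates the custom entities on a processed doc, including the entity id and the custom alias span extension. A usage sketch, assuming nlp is a spaCy pipeline with DACKAR's custom components (which register the alias extension) and using an illustrative sentence:

    # Sketch: collect custom named entities from a processed doc into a DataFrame.
    # Assumes 'nlp' already includes DACKAR pipeline components that set ent._.alias.
    from dackar.utils.nlp.nlp_utils import extractNER

    doc = nlp("WO 12345: replace pump bearing next Monday.")  # illustrative text
    df = extractNER(doc)
    print(df[['label', 'id', 'start', 'end']])  # one row per entity span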
