276 changes: 276 additions & 0 deletions examples/demo/CR_NLP_workflow_F.ipynb
@@ -0,0 +1,276 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0169862e",
"metadata": {},
"source": [
"# Processing of Condition Reports (CRs): Filters F1 and F2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6946cad1",
"metadata": {},
"outputs": [],
"source": [
"import os, sys, time\n",
"import re\n",
"import pandas as pd\n",
"import numpy as np\n",
"import spacy\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"from matplotlib.ticker import MaxNLocator\n",
"\n",
"# Load language model\n",
"nlp = spacy.load(\"en_core_web_lg\", exclude=[])\n",
"\n",
"cwd = os.getcwd()\n",
"pathToDACKAR = os.path.join(cwd, '..', '..', 'src')\n",
"sys.path.append(pathToDACKAR)\n",
"\n",
"from dackar.text_processing.Preprocessing import Preprocessing\n",
"from dackar.utils.utils import getOnlyWords, getShortAcronym\n",
"from dackar.text_processing import Abbreviation \n",
"from dackar.utils.nlp.nlp_utils import resetPipeline\n",
"from dackar.text_processing.AbbrExpander import AbbrExpander\n",
"\n",
"from dackar.utils.nlp.nlp_utils import generatePatternList\n",
"from dackar.pipelines.GeneralEntity import GeneralEntity"
]
},
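{
"cell_type": "markdown",
"id": "a1f2c3d4",
"metadata": {},
"source": [
"If the `en_core_web_lg` model is not installed, it can be fetched with spaCy's standard downloader (left commented out so the notebook can run offline):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2e3f4a5",
"metadata": {},
"outputs": [],
"source": [
"# Download the large English model once, if needed\n",
"# !python -m spacy download en_core_web_lg"
]
},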
{
"cell_type": "markdown",
"id": "b8da0ac3",
"metadata": {},
"source": [
"## Import Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73f6188f",
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('raw_data/textual/F_CR.csv')\n",
"data = data.applymap(lambda x: x.lower() if isinstance(x, str) else x)\n",
"data"
]
},
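{
"cell_type": "markdown",
"id": "c3d4e5f6",
"metadata": {},
"source": [
"The downstream cells reference the columns `date`, `Report ID`, `Risk Impact`, `Equipment`, `Component`, `Issue Observed`, and `Detection Method`. A minimal guard that fails early if the CSV layout differs (a sketch; the column names are taken from the cells below):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e5f6a7",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: confirm the columns used by the downstream cells are present\n",
"expected = ['date', 'Report ID', 'Risk Impact', 'Equipment',\n",
"            'Component', 'Issue Observed', 'Detection Method']\n",
"missing = [col for col in expected if col not in data.columns]\n",
"assert not missing, f'F_CR.csv is missing expected columns: {missing}'"
]
},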
{
"cell_type": "markdown",
"id": "4bfc0d5d",
"metadata": {},
"source": [
"# Identify equipment IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57f9fc3b",
"metadata": {},
"outputs": [],
"source": [
"idLabel = \"equip_ID\"\n",
"ID = \"equip_ID\" \n",
"\n",
"ID_list = pd.read_csv('processed_data/customMBSEobject_ID.csv')['ID'].to_list()\n",
"patterns_IDs = generatePatternList(ID_list, label=idLabel, id=ID, nlp=nlp, attr=\"LEMMA\")\n",
"\n",
"pipelines = []\n",
"resetPipeline(nlp, pipelines)\n",
"nlp.disable_pipes(\"ner\")\n",
"\n",
"ID_ents = GeneralEntity(nlp, patterns_IDs)\n",
"\n",
"ents = []\n",
"for index, act in data['Equipment'].to_frame().iterrows():\n",
" doc = nlp(data['Equipment'].iloc[index].lower())\n",
" newDoc = ID_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['identifiedID'] = ents\n",
"data\n"
]
},
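{
"cell_type": "markdown",
"id": "e5f6a7b8",
"metadata": {},
"source": [
"A quick check on the matching step: rows whose `Equipment` text matched none of the known IDs end up with an empty tuple in `identifiedID`. A minimal sketch listing them:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a7b8c9",
"metadata": {},
"outputs": [],
"source": [
"# Reports with no recognized equipment ID; candidates for new patterns\n",
"unmatched = data[data['identifiedID'].apply(len) == 0]\n",
"print(f'{len(unmatched)} of {len(data)} reports have no identified equipment ID')\n",
"unmatched[['Equipment']]"
]
},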
{
"cell_type": "markdown",
"id": "f3284e29",
"metadata": {},
"source": [
"# Identify nuclear related entities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2f52d6a",
"metadata": {},
"outputs": [],
"source": [
"from dackar.utils import tagKeywordListReader as tklr\n",
"\n",
"tagDict = tklr.entityLibrary('../../../DACKAR/data/tag_keywords_lists.xlsx')\n",
"tagDict.checker()\n",
"\n",
"nuc_ent_dict = tagDict.getLibrary()\n",
"\n",
"patterns_ents =[]\n",
"\n",
"for key in nuc_ent_dict.keys():\n",
" entLabel = str(key)\n",
" entId = str(key) \n",
" patterns_ents.extend(generatePatternList(nuc_ent_dict[key], label=entLabel, id=entId, nlp=nlp, attr=\"LEMMA\"))"
]
},
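{
"cell_type": "markdown",
"id": "a7b8c9d0",
"metadata": {},
"source": [
"`generatePatternList` is expected to emit spaCy-style pattern dictionaries keyed on the token `LEMMA` attribute (an assumption about the DACKAR helper, not verified here). Inspecting a few entries confirms the shape:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c9d0e1",
"metadata": {},
"outputs": [],
"source": [
"# Peek at the generated patterns; the exact schema comes from DACKAR's helper\n",
"print(f'{len(patterns_ents)} entity patterns generated')\n",
"patterns_ents[:3]"
]
},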
{
"cell_type": "code",
"execution_count": null,
"id": "d8dbacb8",
"metadata": {},
"outputs": [],
"source": [
"pipelines = []\n",
"resetPipeline(nlp, pipelines)\n",
"nlp.disable_pipes(\"ner\")\n",
"\n",
"# General entity object\n",
"generalEntity_ents = GeneralEntity(nlp, patterns_ents)\n",
"\n",
"ents = []\n",
"for index, act in data['Component'].to_frame().iterrows():\n",
" doc = nlp(data['Component'].iloc[index].lower())\n",
" newDoc = generalEntity_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['Component NER entities'] = ents\n",
"\n",
"ents = []\n",
"for index, act in data['Issue Observed'].to_frame().iterrows():\n",
" doc = nlp(data['Issue Observed'].iloc[index].lower())\n",
" newDoc = generalEntity_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['Issue NER entities'] = ents\n",
"\n",
"ents = []\n",
"for index, act in data['Detection Method'].to_frame().iterrows():\n",
" doc = nlp(data['Detection Method'].iloc[index].lower())\n",
" newDoc = generalEntity_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['Detection NER entities'] = ents\n",
"\n",
"data"
]
},
{
"cell_type": "markdown",
"id": "55cc9afa",
"metadata": {},
"source": [
"# Print on files: Nodes and Edges"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80e2d849",
"metadata": {},
"outputs": [],
"source": [
"# CR nodes\n",
"data= data.rename(columns={'Report ID': 'Report_ID', 'Risk Impact': 'Risk_Impact'})\n",
"data[['date', 'Report_ID', 'Risk_Impact']].to_csv('processed_data/CR_F_nodes.csv', index=False)\n",
"\n",
"# NER entity nodes\n",
"compEnt = data['Component NER entities'].tolist()\n",
"issueEnt = data['Issue NER entities'].tolist()\n",
"compEnt = data['Detection NER entities'].tolist()\n",
"selected = compEnt + issueEnt + compEnt\n",
"selected = list(set(selected))\n",
"temp_dict = {'entities':[i for sub in selected for i in sub]}\n",
"pd.DataFrame(temp_dict).to_csv('processed_data/entities_F_nodes.csv', index=False)\n",
"\n",
"# Edges\n",
"id_edges_orig = []\n",
"id_edges_dest = []\n",
"ent_edges_orig = []\n",
"ent_edges_dest = []\n",
"ent_edges_attr = []\n",
"\n",
"for index, row in data.iterrows():\n",
" if row['identifiedID']:\n",
" for id in row['identifiedID']:\n",
" id_edges_orig.append(id)\n",
" id_edges_dest.append(row['Report_ID'])\n",
"\n",
" if row['Component NER entities']:\n",
" for ent in row['Component NER entities']:\n",
" ent_edges_orig.append(row['Report_ID'])\n",
" ent_edges_dest.append(ent)\n",
" ent_edges_attr.append('component')\n",
"\n",
" if row['Issue NER entities']:\n",
" for ent in row['Issue NER entities']:\n",
" ent_edges_orig.append(row['Report_ID'])\n",
" ent_edges_dest.append(ent)\n",
" ent_edges_attr.append('issue')\n",
"\n",
" if row['Detection NER entities']:\n",
" for ent in row['Detection NER entities']:\n",
" ent_edges_orig.append(row['Report_ID'])\n",
" ent_edges_dest.append(ent)\n",
" ent_edges_attr.append('inspection')\n",
"\n",
"edges_ent_dict = {'orig':ent_edges_orig, 'dest':ent_edges_dest, 'attribute': ent_edges_attr}\n",
"edges_id_dict = {'orig':id_edges_orig, 'dest':id_edges_dest}\n",
"\n",
"pd.DataFrame(edges_ent_dict).to_csv('processed_data/F_edges_ent.csv', index=False)\n",
"pd.DataFrame(edges_id_dict).to_csv('processed_data/F_edges_id.csv', index=False)\n"
]
}
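,
{
"cell_type": "markdown",
"id": "c9d0e1f2",
"metadata": {},
"source": [
"The four CSV files describe one graph: CR and entity nodes, plus ID -> report and report -> entity edges. A minimal assembly sketch, assuming `networkx` is available in the environment (it is not otherwise used by this notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0e1f2a3",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical follow-up: rebuild the CR graph from the exported files\n",
"import networkx as nx\n",
"\n",
"G = nx.Graph()\n",
"G.add_nodes_from(pd.read_csv('processed_data/CR_F_nodes.csv')['Report_ID'])\n",
"G.add_nodes_from(pd.read_csv('processed_data/entities_F_nodes.csv')['entities'])\n",
"\n",
"entEdges = pd.read_csv('processed_data/F_edges_ent.csv')\n",
"G.add_edges_from((o, d, {'attribute': a}) for o, d, a in\n",
"                 zip(entEdges['orig'], entEdges['dest'], entEdges['attribute']))\n",
"\n",
"idEdges = pd.read_csv('processed_data/F_edges_id.csv')\n",
"G.add_edges_from(zip(idEdges['orig'], idEdges['dest']))\n",
"\n",
"print(G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges')"
]
}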
],
"metadata": {
"kernelspec": {
"display_name": "dackar_libs",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}