276 changes: 276 additions & 0 deletions examples/demo/CR_NLP_workflow_F.ipynb
@@ -0,0 +1,276 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0169862e",
"metadata": {},
"source": [
"# Processing of Condition Reports (CRs): Filters F1 and F2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6946cad1",
"metadata": {},
"outputs": [],
"source": [
"import os, sys, time\n",
"import re\n",
"import pandas as pd\n",
"import numpy as np\n",
"import spacy\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"from matplotlib.ticker import MaxNLocator\n",
"\n",
"# Load language model\n",
"nlp = spacy.load(\"en_core_web_lg\", exclude=[])\n",
"\n",
"cwd = os.getcwd()\n",
"pathToDACKAR = os.path.join(cwd, '..', '..', 'src')\n",
"sys.path.append(pathToDACKAR)\n",
"\n",
"from dackar.text_processing.Preprocessing import Preprocessing\n",
"from dackar.utils.utils import getOnlyWords, getShortAcronym\n",
"from dackar.text_processing import Abbreviation \n",
"from dackar.utils.nlp.nlp_utils import resetPipeline\n",
"from dackar.text_processing.AbbrExpander import AbbrExpander\n",
"\n",
"from dackar.utils.nlp.nlp_utils import generatePatternList\n",
"from dackar.pipelines.GeneralEntity import GeneralEntity"
]
},
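{
"cell_type": "markdown",
"id": "a1f2c3d4",
"metadata": {},
"source": [
"If the `en_core_web_lg` model is not installed, it can be fetched with spaCy's standard downloader (left commented out so the notebook can run offline):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2e3f4a5",
"metadata": {},
"outputs": [],
"source": [
"# Download the large English model once, if needed\n",
"# !python -m spacy download en_core_web_lg"
]
},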
{
"cell_type": "markdown",
"id": "b8da0ac3",
"metadata": {},
"source": [
"## Import Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73f6188f",
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('raw_data/textual/F_CR.csv')\n",
"data = data.applymap(lambda x: x.lower() if isinstance(x, str) else x)\n",
"data"
]
},
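{
"cell_type": "markdown",
"id": "c3d4e5f6",
"metadata": {},
"source": [
"The downstream cells reference the columns `date`, `Report ID`, `Risk Impact`, `Equipment`, `Component`, `Issue Observed`, and `Detection Method`. A minimal guard that fails early if the CSV layout differs (a sketch; the column names are taken from the cells below):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e5f6a7",
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: confirm the columns used by the downstream cells are present\n",
"expected = ['date', 'Report ID', 'Risk Impact', 'Equipment',\n",
"            'Component', 'Issue Observed', 'Detection Method']\n",
"missing = [col for col in expected if col not in data.columns]\n",
"assert not missing, f'F_CR.csv is missing expected columns: {missing}'"
]
},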
{
"cell_type": "markdown",
"id": "4bfc0d5d",
"metadata": {},
"source": [
"# Identify equipment IDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57f9fc3b",
"metadata": {},
"outputs": [],
"source": [
"idLabel = \"equip_ID\"\n",
"ID = \"equip_ID\" \n",
"\n",
"ID_list = pd.read_csv('processed_data/customMBSEobject_ID.csv')['ID'].to_list()\n",
"patterns_IDs = generatePatternList(ID_list, label=idLabel, id=ID, nlp=nlp, attr=\"LEMMA\")\n",
"\n",
"pipelines = []\n",
"resetPipeline(nlp, pipelines)\n",
"nlp.disable_pipes(\"ner\")\n",
"\n",
"ID_ents = GeneralEntity(nlp, patterns_IDs)\n",
"\n",
"ents = []\n",
"for index, act in data['Equipment'].to_frame().iterrows():\n",
" doc = nlp(data['Equipment'].iloc[index].lower())\n",
" newDoc = ID_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['identifiedID'] = ents\n",
"data\n"
]
},
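{
"cell_type": "markdown",
"id": "e5f6a7b8",
"metadata": {},
"source": [
"A quick check on the matching step: rows whose `Equipment` text matched none of the known IDs end up with an empty tuple in `identifiedID`. A minimal sketch listing them:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a7b8c9",
"metadata": {},
"outputs": [],
"source": [
"# Reports with no recognized equipment ID; candidates for new patterns\n",
"unmatched = data[data['identifiedID'].apply(len) == 0]\n",
"print(f'{len(unmatched)} of {len(data)} reports have no identified equipment ID')\n",
"unmatched[['Equipment']]"
]
},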
{
"cell_type": "markdown",
"id": "f3284e29",
"metadata": {},
"source": [
"# Identify nuclear related entities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2f52d6a",
"metadata": {},
"outputs": [],
"source": [
"from dackar.utils import tagKeywordListReader as tklr\n",
"\n",
"tagDict = tklr.entityLibrary('../../../DACKAR/data/tag_keywords_lists.xlsx')\n",
"tagDict.checker()\n",
"\n",
"nuc_ent_dict = tagDict.getLibrary()\n",
"\n",
"patterns_ents =[]\n",
"\n",
"for key in nuc_ent_dict.keys():\n",
" entLabel = str(key)\n",
" entId = str(key) \n",
" patterns_ents.extend(generatePatternList(nuc_ent_dict[key], label=entLabel, id=entId, nlp=nlp, attr=\"LEMMA\"))"
]
},
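{
"cell_type": "markdown",
"id": "a7b8c9d0",
"metadata": {},
"source": [
"`generatePatternList` is expected to emit spaCy-style pattern dictionaries keyed on the token `LEMMA` attribute (an assumption about the DACKAR helper, not verified here). Inspecting a few entries confirms the shape:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c9d0e1",
"metadata": {},
"outputs": [],
"source": [
"# Peek at the generated patterns; the exact schema comes from DACKAR's helper\n",
"print(f'{len(patterns_ents)} entity patterns generated')\n",
"patterns_ents[:3]"
]
},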
{
"cell_type": "code",
"execution_count": null,
"id": "d8dbacb8",
"metadata": {},
"outputs": [],
"source": [
"pipelines = []\n",
"resetPipeline(nlp, pipelines)\n",
"nlp.disable_pipes(\"ner\")\n",
"\n",
"# General entity object\n",
"generalEntity_ents = GeneralEntity(nlp, patterns_ents)\n",
"\n",
"ents = []\n",
"for index, act in data['Component'].to_frame().iterrows():\n",
" doc = nlp(data['Component'].iloc[index].lower())\n",
" newDoc = generalEntity_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['Component NER entities'] = ents\n",
"\n",
"ents = []\n",
"for index, act in data['Issue Observed'].to_frame().iterrows():\n",
" doc = nlp(data['Issue Observed'].iloc[index].lower())\n",
" newDoc = generalEntity_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['Issue NER entities'] = ents\n",
"\n",
"ents = []\n",
"for index, act in data['Detection Method'].to_frame().iterrows():\n",
" doc = nlp(data['Detection Method'].iloc[index].lower())\n",
" newDoc = generalEntity_ents(doc)\n",
" \n",
" newTuple = ()\n",
" for ent in newDoc.ents:\n",
" newTuple = newTuple + (ent.lemma_,)\n",
"\n",
" ents.append(newTuple) \n",
"data['Detection NER entities'] = ents\n",
"\n",
"data"
]
},
{
"cell_type": "markdown",
"id": "55cc9afa",
"metadata": {},
"source": [
"# Print on files: Nodes and Edges"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80e2d849",
"metadata": {},
"outputs": [],
"source": [
"# CR nodes\n",
"data= data.rename(columns={'Report ID': 'Report_ID', 'Risk Impact': 'Risk_Impact'})\n",
"data[['date', 'Report_ID', 'Risk_Impact']].to_csv('processed_data/CR_F_nodes.csv', index=False)\n",
"\n",
"# NER entity nodes\n",
"compEnt = data['Component NER entities'].tolist()\n",
"issueEnt = data['Issue NER entities'].tolist()\n",
"compEnt = data['Detection NER entities'].tolist()\n",
"selected = compEnt + issueEnt + compEnt\n",
"selected = list(set(selected))\n",
"temp_dict = {'entities':[i for sub in selected for i in sub]}\n",
"pd.DataFrame(temp_dict).to_csv('processed_data/entities_F_nodes.csv', index=False)\n",
"\n",
"# Edges\n",
"id_edges_orig = []\n",
"id_edges_dest = []\n",
"ent_edges_orig = []\n",
"ent_edges_dest = []\n",
"ent_edges_attr = []\n",
"\n",
"for index, row in data.iterrows():\n",
" if row['identifiedID']:\n",
" for id in row['identifiedID']:\n",
" id_edges_orig.append(id)\n",
" id_edges_dest.append(row['Report_ID'])\n",
"\n",
" if row['Component NER entities']:\n",
" for ent in row['Component NER entities']:\n",
" ent_edges_orig.append(row['Report_ID'])\n",
" ent_edges_dest.append(ent)\n",
" ent_edges_attr.append('component')\n",
"\n",
" if row['Issue NER entities']:\n",
" for ent in row['Issue NER entities']:\n",
" ent_edges_orig.append(row['Report_ID'])\n",
" ent_edges_dest.append(ent)\n",
" ent_edges_attr.append('issue')\n",
"\n",
" if row['Detection NER entities']:\n",
" for ent in row['Detection NER entities']:\n",
" ent_edges_orig.append(row['Report_ID'])\n",
" ent_edges_dest.append(ent)\n",
" ent_edges_attr.append('inspection')\n",
"\n",
"edges_ent_dict = {'orig':ent_edges_orig, 'dest':ent_edges_dest, 'attribute': ent_edges_attr}\n",
"edges_id_dict = {'orig':id_edges_orig, 'dest':id_edges_dest}\n",
"\n",
"pd.DataFrame(edges_ent_dict).to_csv('processed_data/F_edges_ent.csv', index=False)\n",
"pd.DataFrame(edges_id_dict).to_csv('processed_data/F_edges_id.csv', index=False)\n"
]
}
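,
{
"cell_type": "markdown",
"id": "c9d0e1f2",
"metadata": {},
"source": [
"The four CSV files describe one graph: CR and entity nodes, plus ID -> report and report -> entity edges. A minimal assembly sketch, assuming `networkx` is available in the environment (it is not otherwise used by this notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0e1f2a3",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical follow-up: rebuild the CR graph from the exported files\n",
"import networkx as nx\n",
"\n",
"G = nx.Graph()\n",
"G.add_nodes_from(pd.read_csv('processed_data/CR_F_nodes.csv')['Report_ID'])\n",
"G.add_nodes_from(pd.read_csv('processed_data/entities_F_nodes.csv')['entities'])\n",
"\n",
"entEdges = pd.read_csv('processed_data/F_edges_ent.csv')\n",
"G.add_edges_from((o, d, {'attribute': a}) for o, d, a in\n",
"                 zip(entEdges['orig'], entEdges['dest'], entEdges['attribute']))\n",
"\n",
"idEdges = pd.read_csv('processed_data/F_edges_id.csv')\n",
"G.add_edges_from(zip(idEdges['orig'], idEdges['dest']))\n",
"\n",
"print(G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges')"
]
}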
],
"metadata": {
"kernelspec": {
"display_name": "dackar_libs",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}