1717import matplotlib .pyplot as plt
1818
1919
20- logger = logging .getLogger (__name__ )
20+ logger = logging .getLogger ('DACKAR.utils' )
2121
2222###########################################################################
2323
@@ -28,7 +28,7 @@ def displayNER(doc, includePunct=False):
2828 Args:
2929
3030 doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines
31- includePunct: bool, True if the punctuaction is included
31+ includePunct: bool, True if the punctuation is included
3232
3333 Returns:
3434
@@ -38,9 +38,9 @@ def displayNER(doc, includePunct=False):
3838 for i , t in enumerate (doc ):
3939 if not t .is_punct or includePunct :
4040 row = {'token' : i ,
41- 'text' : t .text , 'lemma' : t .lemma_ ,
42- 'pos' : t .pos_ , 'dep' : t .dep_ , 'ent_type' : t .ent_type_ ,
43- 'ent_iob_' : t .ent_iob_ }
41+ 'text' : t .text , 'lemma' : t .lemma_ ,
42+ 'pos' : t .pos_ , 'dep' : t .dep_ , 'ent_type' : t .ent_type_ ,
43+ 'ent_iob_' : t .ent_iob_ }
4444 if doc .has_extension ('coref_chains' ):
4545 if t .has_extension ('coref_chains' ) and t ._ .coref_chains : # neuralcoref attributes
4646 row ['coref_chains' ] = t ._ .coref_chains .pretty_representation
@@ -263,3 +263,24 @@ def customTokenizer(nlp):
263263 token_match = nlp .tokenizer .token_match ,
264264 rules = nlp .Defaults .tokenizer_exceptions )
265265 return nlp
266+
267+
def extractNER(doc):
  """
  Generate data frame for visualization of spaCy doc with custom NER.

  Args:

    doc: spacy.tokens.doc.Doc, the processed document using nlp pipelines

  Returns:

    df: pandas.DataFrame, data frame contains attributes of NER tokens,
        one row per entity with columns
        ['entity', 'label', 'id', 'alias', 'start', 'end']
  """
  # Fixed column order so the frame has a stable schema even when doc.ents
  # is empty (pd.DataFrame([]) alone would produce a frame with no columns).
  cols = ['entity', 'label', 'id', 'alias', 'start', 'end']
  # NOTE(review): assumes the custom Span extension 'alias' (ent._.alias) is
  # registered by an upstream pipeline component -- confirm before reuse.
  rows = [{'entity': ent,
           'label': ent.label_,
           'id': ent.ent_id_,
           'alias': ent._.alias,
           'start': ent.start,
           'end': ent.end}
          for ent in doc.ents]
  df = pd.DataFrame(rows, columns=cols)
  df.index.name = None  # keep a plain, unnamed RangeIndex
  return df
0 commit comments