import spacy

from text2story.core.exceptions import UninstalledModel, InvalidLanguage
from text2story.core.utils import normalize_tag, chunknize_actors

# this stores the pipeline of models used to extract narrative components
# for a given language (whose code is the key of this dictionary)
pipeline = {}
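# A sketch of the state this dictionary is expected to reach (not enforced anywhere
# in the code): after load("fr") has run, it should hold roughly
#   {"fr": <spaCy fr_core_news_lg pipeline>, "fr_time": <fr_tei2go pipeline>}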
def load(lang: str):
    """
    Defining a load method is mandatory; otherwise the package raises errors.
    If no setup is needed, define an empty method whose body is just pass.

    @param lang: the language code of the models to load, e.g. pt, en, fr
    @return: None
    """
    if not spacy.util.is_package('fr_core_news_lg'):
        spacy.cli.download('fr_core_news_lg')
    pipeline['fr'] = spacy.load('fr_core_news_lg')

    # the TEI2GO model handles temporal expressions and must be installed separately
    try:
        pipeline['fr_time'] = spacy.load(lang + "_tei2go")
    except OSError:
        model_name = lang + "_tei2go"
        command = f"pip install https://huggingface.co/hugosousa/{lang}_tei2go/resolve/main/{lang}_tei2go-any-py3-none-any.whl"
        raise UninstalledModel(model_name, command)
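
# A minimal usage sketch (hedged: the first call needs network access, since
# fr_core_news_lg is downloaded on demand, and fr_tei2go must already be
# installed or UninstalledModel is raised with the pip command to run):
#
#   load("fr")
#   doc = pipeline["fr"]("Emmanuel Macron a visité Lisbonne en mars 2022.")
#   print([(ent.text, ent.label_) for ent in doc.ents])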


def extract_participants(lang, text):
    """
    Parameters
    ----------
    lang : str
        the language of the text to be annotated
    text : str
        the text to be annotated

    Returns
    -------
    list[tuple[tuple[int, int], str, str]]
        the list of actors identified, where each actor is represented by a tuple
        with its character span, normalized POS tag and normalized entity type

    Raises
    ------
    InvalidLanguage if the given language is invalid/unsupported
    """
    if lang not in ['fr']:
        raise InvalidLanguage(lang)

    doc = pipeline[lang](text)

    # build an IOB-tagged token list: (character span, normalized POS, normalized NE tag)
    iob_token_list = []
    for token in doc:
        start_character_offset = token.idx
        end_character_offset = token.idx + len(token)
        character_span = (start_character_offset, end_character_offset)
        pos = normalize_tag(token.pos_)
        ne = token.ent_iob_ + "-" + normalize_tag(token.ent_type_) if token.ent_iob_ != 'O' else 'O'

        iob_token_list.append((character_span, pos, ne))

    # merge contiguous B-/I- tagged tokens into actor-level chunks
    actor_list = chunknize_actors(iob_token_list)

    return actor_list
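
# Example call (a sketch; the exact spans and tag strings depend on the
# fr_core_news_lg model and on normalize_tag / chunknize_actors):
#
#   load("fr")
#   extract_participants("fr", "Marie a rencontré Paul à Paris.")
#   # -> a list like [((0, 5), <normalized POS>, <normalized entity type>), ...]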


def extract_times(lang, text, publication_time=None):
    """
    Parameters
    ----------
    lang : str
        the language of the text to be annotated
    text : str
        the text to be annotated
    publication_time : str, optional
        the publication date of the text (currently unused by this annotator)

    Returns
    -------
    list[tuple[tuple[int, int], str, str]]
        a list consisting of the times identified, where each time is represented by a tuple
        with the start and end character offsets, the temporal expression type and its text,
        respectively

    Raises
    ------
    InvalidLanguage if the given language is invalid/unsupported
    """
    if lang not in ["fr"]:
        raise InvalidLanguage(lang)

    # temporal expressions are extracted with the dedicated TEI2GO model
    timex_lst = pipeline["fr_time"](text).ents

    ans = []
    for timex in timex_lst:
        start = timex.start_char
        end = timex.end_char
        label = timex.label_
        timex_text = timex.text

        ans.append(((start, end), label, timex_text))
    return ans
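
# A minimal, hedged end-to-end sketch: it only runs when this module is executed
# directly, the sample sentence is illustrative, and the actual output depends on
# the fr_core_news_lg and fr_tei2go model versions installed.
if __name__ == "__main__":
    sample = "Le président a rencontré les syndicats à Paris le 3 mars 2020."
    load("fr")
    print(extract_participants("fr", sample))
    print(extract_times("fr", sample))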