# script.py
from flask import Flask, request, jsonify
import bjoern
import stanza
import json
import os

from stanza.resources.common import DEFAULT_MODEL_DIR


def verify_inputs(data):
    """Check that 'language', 'text', and 'processors' are all present as strings."""
    language = data.get('language')
    text = data.get('text')
    processors = data.get('processors')
    if not all(isinstance(value, str) for value in [language, text, processors]):
        raise ValueError('All inputs must be strings')
    return True
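
# For illustration, a request body that passes verify_inputs might look like
# this (hypothetical values):
#   {"language": "en", "text": "Stanza was built at Stanford.",
#    "processors": "tokenize,pos,lemma,depparse,ner"}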


# Make sure the model directory exists before downloading anything into it.
os.makedirs(DEFAULT_MODEL_DIR, exist_ok=True)

# Cache of initialized Stanza pipelines, keyed by "<language>_<processors>".
pipelinesCache = dict()


def ensure_stanza(language):
    """Download the Stanza model for `language` unless it is already on disk."""
    if not os.path.exists(os.path.join(DEFAULT_MODEL_DIR, language)):
        print(f"Downloading Stanza model for '{language}'...")
        stanza.download(language, model_dir=DEFAULT_MODEL_DIR)
    else:
        print(f"Stanza model for '{language}' already exists. Skipping download.")


def get_pipeline(language, processors):
    """Return a cached pipeline for this language/processors pair, building it on first use."""
    cacheKey = language + "_" + processors
    ensure_stanza(language)
    if cacheKey not in pipelinesCache:
        print(f"cacheKey: {cacheKey} NOT FOUND! building")
        pipelinesCache[cacheKey] = stanza.Pipeline(
            lang=language,
            processors=processors,
            use_gpu=False
        )
    return pipelinesCache[cacheKey]
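
# Example (hypothetical values): get_pipeline("en", "tokenize,pos") builds a
# pipeline on the first call and stores it under the key "en_tokenize,pos";
# later calls with the same language/processors reuse the cached instance
# instead of re-initializing the models.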


def parse_doc(doc):
    """Convert a Stanza Document into JSON-serializable sentences and entities."""
    print("Parsing doc...")
    print(doc)
    serializable_entities = [
        {
            "text": entity.text,
            "type": entity.type,
            "start_char": entity.start_char,
            "end_char": entity.end_char
        }
        for entity in doc.entities
    ]
    annotated_sentences = []
    for sentence in doc.sentences:
        tokens = []
        deps = []
        for word in sentence.words:
            tokens.append({
                'index': word.id,
                'token': word.text,
                'lemma': word.lemma,
                'xpos': word.xpos,
                'upos': word.upos,
                'feats': word.feats,
                'start_char': word.start_char,
                'end_char': word.end_char,
            })
            deps.append({
                'dep': word.deprel,
                'governor': word.head,
                # head == 0 means the word attaches to the artificial ROOT node,
                # so there is no governor word to look up.
                'governorGloss': sentence.words[word.head - 1].text if word.head > 0 else 'ROOT',
                'dependent': word.id,
                'dependentGloss': word.text
            })
        annotated_sentences.append({'basicDependencies': deps, 'tokens': tokens})
        # The constituency parse is only present if the 'constituency' processor ran.
        if hasattr(sentence, 'constituency') and sentence.constituency is not None:
            annotated_sentences[-1]['parse'] = str(sentence.constituency)
    return annotated_sentences, serializable_entities
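
# Sketch of the return shape (values are illustrative, not real output):
#   annotated_sentences = [{'basicDependencies': [{'dep': 'nsubj', 'governor': 2, ...}],
#                           'tokens': [{'index': 1, 'token': 'Stanza', ...}],
#                           'parse': '(ROOT (S ...))'}]   # 'parse' only if constituency ran
#   serializable_entities = [{'text': 'Stanford', 'type': 'ORG',
#                             'start_char': ..., 'end_char': ...}]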


app = Flask(__name__, static_url_path='', static_folder=os.path.abspath(os.path.dirname(__file__)))


@app.route('/ping', methods=['GET'])
def ping():
    return jsonify({"message": "Service is alive"})
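
# Quick liveness check (assuming the default host/port configured below):
#   curl http://localhost:5000/ping
#   -> {"message": "Service is alive"}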


@app.route('/nlp', methods=['POST'])
def get_data():
    try:
        data = request.get_json()
        try:
            verify_inputs(data)
        except ValueError as error:
            print(f"Error: {error}")
            # A malformed request body is a client error, so report 400 rather than 500.
            return jsonify({"error": "Input Validation Error", "err": str(error)}), 400
        language = data['language']
        stringnlp = data['text']
        processors = data['processors']
        pipeline = get_pipeline(language, processors)
        doc = pipeline(stringnlp)
        annotated_sentences, serializable_entities = parse_doc(doc)
        return json.dumps({
            'sentences': annotated_sentences,
            'entities': serializable_entities
        })
    except Exception as e:
        print(e)
        return jsonify({"error": "Internal Server Error", "err": str(e)}), 500
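
# Example request against this endpoint (hypothetical text/processors):
#   curl -X POST http://localhost:5000/nlp \
#        -H "Content-Type: application/json" \
#        -d '{"language": "en", "text": "Stanza was built at Stanford.",
#             "processors": "tokenize,pos,lemma,depparse,ner"}'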


if __name__ == '__main__':
    # bjoern is a fast, lightweight WSGI server; serve on all interfaces, port 5000.
    bjoern.run(app, "0.0.0.0", 5000)
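
# For local debugging without bjoern, Flask's built-in development server
# would also work:
#   app.run(host="0.0.0.0", port=5000)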