improve web-interface and parameter-helper

jacons · jacons · commit 2da48eda3365 · 2023-01-12T16:32:32.000+01:00
diff --git a/Parsing/parser_utils.py b/Parsing/parser_utils.py
@@ -140,9 +140,9 @@ def buildDataset(path_file: str, verbose=True) -> EntityHandler:
     sentences, list_of_labels = [], []
     set_entities = set()  # set of unique entity found (incrementally updated)
 
-    for field in read_conll(path_file):  # generator
+    for fields in read_conll(path_file):  # generator
 
-        tokens, labels = field[0], field[1]
+        tokens, labels = fields[0], fields[1]
 
         sentences.append(" ".join(tokens))
         list_of_labels.append(" ".join(labels))
@@ -206,22 +206,33 @@ def random_chars(y):
 
 
 def parse_args():
-    p = argparse.ArgumentParser(description='Model configuration.', add_help=False)
+    p = argparse.ArgumentParser(description='Model configuration.', add_help=True)
 
-    p.add_argument('--datasets', type=str, nargs='+', help='Path to the datasets', default=None)
-    p.add_argument('--models', type=str, nargs='+', help='Models in the same order of datasets', default=None)
+    p.add_argument('--datasets', type=str, nargs='+',
+                   help='Dataset used for training, it will split in training, validation and test', default=None)
 
-    p.add_argument('--model_name', type=str, help='Name of trained model', default=None)
-    p.add_argument('--path_model', type=str, help='Directory to save the model', default=".")
+    p.add_argument('--models', type=str, nargs='+',
+                   help='Model trained ready to evaluate or use, if list, the order must follow the same of datasets',
+                   default=None)
 
-    p.add_argument('--bert', type=str, help='Huggingface model', default="dbmdz/bert-base-italian-xxl-cased")
+    p.add_argument('--model_name', type=str,
+                   help='Name to give to a trained model', default=None)
+
+    p.add_argument('--path_model', type=str,
+                   help='Directory to save the model', default=".")
+
+    p.add_argument('--bert', type=str,
+                   help='Bert model provided by Huggingface', default="dbmdz/bert-base-italian-xxl-cased")
+
+    p.add_argument('--save_model', type=int,
+                   help='set 1 if you want save the model otherwise set 0', default=1)
 
     p.add_argument('--lr', type=float, help='Learning rate', default=0.010)
     p.add_argument('--momentum', type=float, help='Momentum', default=0.9)
     p.add_argument('--weight_decay', type=float, help='Weight decay', default=0.0002)
     p.add_argument('--batch_size', type=int, help='Batch size', default=2)
     p.add_argument('--max_epoch', type=int, help='Max number of epochs', default=20)
     p.add_argument('--early_stopping', type=float, help='Patience in early stopping', default=3)
-    p.add_argument('--save_model', type=int, help='1 to save the model', default=1)
+
 
     return p.parse_known_args()
diff --git a/Prediction/Predictor.py b/Prediction/Predictor.py
@@ -1,3 +1,5 @@
+from typing import Tuple
+
 from torch import IntTensor, BoolTensor, masked_select
 from transformers import BertTokenizerFast
 
@@ -61,7 +63,7 @@ def add_model(self, group: str, model: NERClassifier, dictionary: dict):
         model.eval()
         self.models[group] = (model, dictionary)
 
-    def predict(self, string: str) -> list:
+    def predict(self, string: str) -> Tuple[list, list]:
 
         token_text = self.tokenizer(string)
 
@@ -84,4 +86,13 @@ def predict(self, string: str) -> list:
                 [lbl[2:] if lbl != "O" else "O" for lbl in self.map_id2lab(dictionary, logits)])
 
         results = self.unify_labels(results[0], results[1]) if len(results) == 2 else results[0]
-        return results
+
+        # The mask marks where each entity tag should be shown exactly once: it is True on the last
+        # word of every run of consecutive words that share the same non-empty tag.
+        mask = [False] * len(results)
+        for idx in range(len(results) - 1):
+            if results[idx] != results[idx + 1] and results[idx] != "":
+                mask[idx] = True
+        mask[-1] = True if results[-1] != "" else False
+
+        return results, mask
diff --git a/server.py b/server.py
@@ -35,22 +35,24 @@
 predictor.add_model("a", modelA, id2lab_group_a)
 predictor.add_model("b", modelB, id2lab_group_b)
 
+list_of_result = []
+
 
 @app.route('/', methods=('GET', 'POST'))
 def create():
     if request.method == 'POST':
+
         sentence = request.form['Sentence']
-        tag_pred = predictor.predict(sentence)
 
-        mask = [False] * len(tag_pred)
-        for idx in range(len(tag_pred)-1):
-            if tag_pred[idx] != tag_pred[idx+1] and tag_pred[idx] != "":
-                mask[idx] = True
+        if "predict" in request.form and sentence != "":
+            tag_pred, mask = predictor.predict(sentence)
+            result_ = [*zip(sentence.split(), tag_pred, mask)]
+            list_of_result.append(result_)
+
+        elif "clear" in request.form:
+            list_of_result.clear()
 
-        result_ = [*zip(sentence.split(), tag_pred, mask)]
-    else:
-        result_ = []
-    return render_template('main.html', result=result_)
+    return render_template('main.html', list_of_result=list_of_result)
 
 
 """
diff --git a/templates/main.html b/templates/main.html
@@ -64,17 +64,22 @@
                     <form method="POST">
                       <div style="padding:10px">
                         <label class="form-label"><b>Sentence</b></label>
-                        <textarea type="text" style="padding:8px;" class="form-control" name="Sentence" value=""></textarea>
+                        <textarea style="padding:8px;" class="form-control" name="Sentence"></textarea>
                       </div>
-                      <button type="submit" class="btn" style="background-color:#AD8E70"><b>Entity extraction</b></button>
+                      <button type="submit" class="btn"  name="predict" style="background-color:#AD8E70"><b>Entity extraction</b></button>
+                      <button type="submit" class="btn"  name="clear" style="background-color:#AD8E70"><b>Clear</b></button>
                     </form>
                 </div>
               </div>
               <div class="grid-item3 text_result">
                   <br>
-                  {% for (token, tag, mask) in result %}
-                  {% if tag %}<i><b>{{ token }}</b></i>{% else %}{{ token }}{% endif %}
-                  {% if tag and mask %}<sub style="color:red"> ({{ tag }})</sub> {% endif %}
+                  {% for result in list_of_result %}
+                        {% for (token, tag, mask) in result %}
+                            {% if tag %}<i><b>{{ token }}</b></i>{% else %}{{ token }}{% endif %}
+                            {% if tag and mask %}<sub style="color:red"> ({{ tag }})</sub> {% endif %}
+                        {% endfor %}
+                        <br>
+                        <hr style="width:100%">
                   {% endfor %}
               </div>
               <div class="grid-item2">
diff --git a/train_model.py b/train_model.py
@@ -18,14 +18,14 @@
 
     handler = buildDataset(args.datasets[0], verbose=True)
     df_train, df_val, df_test = holdout(handler.dt)
+
     model = NERClassifier(conf.bert, len(handler.set_entities), frozen=False)
-    # model.load_state_dict(torch.load(conf.folder + "tmp/modelA2.pt"))
 
     if conf.cuda:
         model = model.to(conf.gpu)
 
     train(model, handler, df_train, df_val, conf)
 
     """
-    C:\ProgramData\Anaconda3\envs\deeplearning\python.exe train_model.py --model_name prova.pt --max_epoch 1 --datasets .\Source\dataset.a.conll
+    C:\ProgramData\Anaconda3\envs\deeplearning\python.exe train_model.py --model_name prova --max_epoch 1 --datasets .\Source\dataset.a.conll
     """