Enable SDP (semantic dependency parsing) to run as the first task in the multi-task (MTL) pipeline

hankcs · hankcs · commit 4befbd160533 · 2021-08-28T13:43:56.000-04:00
diff --git a/hanlp/components/mtl/tasks/sdp.py b/hanlp/components/mtl/tasks/sdp.py
@@ -112,19 +112,20 @@ def build_metric(self, **kwargs):
 
     def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                          logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
-        if isinstance(data, list):
-            data = BiaffineSemanticDependencyParser.build_samples(self, data, self.config.use_pos)
         dataset = BiaffineSemanticDependencyParser.build_dataset(self, data, transform)
         if isinstance(data, str):
             dataset.purge_cache()
+            length_field = 'token'
+        else:
+            length_field = 'FORM'
         if self.vocabs.mutable:
             BiaffineSemanticDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
         if dataset.cache:
             timer = CountdownTimer(len(dataset))
             BiaffineSemanticDependencyParser.cache_dataset(self, dataset, timer, training, logger)
         return PadSequenceDataLoader(
-            batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
-                                                     gradient_accumulation=gradient_accumulation),
+            batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, length_field=length_field),
+                                                     shuffle=training, gradient_accumulation=gradient_accumulation),
             device=device,
             dataset=dataset,
             pad=self.get_pad_dict())
@@ -167,3 +168,6 @@ def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]
                 deprels = [vocab[r[i]] for i in range(sent_len + 1) if a[i]]
                 result.append(list(zip(heads, deprels)))
             yield result
+
+    def build_samples(self, inputs, cls_is_bos=False, sep_is_eos=False):
+        return BiaffineSemanticDependencyParser.build_samples(self, inputs, self.config.use_pos)
diff --git a/hanlp/version.py b/hanlp/version.py
@@ -2,5 +2,5 @@
 # Author: hankcs
 # Date: 2019-12-28 19:26
 
-__version__ = '2.1.0-alpha.55'
+__version__ = '2.1.0-alpha.56'
 """HanLP version"""
diff --git a/tests/test_mtl.py b/tests/test_mtl.py
@@ -30,6 +30,24 @@ def test_skip_tok(self):
         doc: Document = mtl(pre_tokenized_sents, skip_tasks='tok*')
         self.assertSequenceEqual(doc['tok'], pre_tokenized_sents)
 
+    def test_sdp_as_the_first_task(self):
+        doc: Document = mtl(['人', '吃', '鱼'], tasks='sdp', skip_tasks='tok*')
+        self.assertDictEqual(
+            doc.to_dict(),
+            {
+                "sdp": [
+                    [(2, "Agt")],
+                    [(0, "Root")],
+                    [(2, "Pat")]
+                ],
+                "tok": [
+                    "人",
+                    "吃",
+                    "鱼"
+                ]
+            }
+        )
+
     def test_threading(self):
         num_proc = 8
         with Pool(num_proc) as pool: