Skip to content

Commit 4eaf7ee

Browse files
committed
Improve the merge rule for NER dict_whitelist
1 parent 7e36bc4 commit 4eaf7ee

File tree

3 files changed

+14
-3
lines changed

3 files changed

+14
-3
lines changed

hanlp/components/ner/transformer_ner.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,7 @@ def tag_to_span(self, batch_tags, batch):
6060
for tags, tokens in zip(batch_tags, sents):
6161
if dict_whitelist:
6262
for start, end, label in dict_whitelist.tokenize(tokens):
63-
if (tags[start].startswith('B') or tags[start].startswith('S')) and (
64-
tags[end - 1].startswith('E') or tags[end - 1].startswith('S')):
63+
if (not tags[start][0] in 'ME') and (not tags[end - 1][0] in 'BM'):
6564
if end - start == 1:
6665
tags[start] = 'S-' + label
6766
else:

hanlp/version.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,5 +2,5 @@
22
# Author: hankcs
33
# Date: 2019-12-28 19:26
44

5-
__version__ = '2.1.0-alpha.36'
5+
__version__ = '2.1.0-alpha.38'
66
"""HanLP version"""
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
# -*- coding:utf-8 -*-
2+
# Author: hankcs
3+
# Date: 2021-04-29 11:06
4+
import hanlp
5+
6+
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)
7+
HanLP['ner/msra'].dict_whitelist = {'午饭后': 'TIME'}
8+
doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra')
9+
doc.pretty_print()
10+
print(doc['ner/msra'])
11+
12+
# See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html

0 commit comments

Comments
 (0)