Skip to content

Commit fc379e9

Browse files
authored
Fix: change create dataset HTTP API delimiter default value to r'\n' (#7434)
### What problem does this PR solve?

Change the create dataset delimiter default value to r'\n'.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
1 parent fea9d97 commit fc379e9

File tree

4 files changed

+7
-7
lines changed

4 files changed

+7
-7
lines changed

api/utils/api_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ def get_parser_config(chunk_method, parser_config):
353353
if not chunk_method:
354354
chunk_method = "naive"
355355
key_mapping = {
356-
"naive": {"chunk_token_num": 128, "delimiter": "\\n!?;。;!?", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
356+
"naive": {"chunk_token_num": 128, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
357357
"qa": {"raptor": {"use_raptor": False}},
358358
"tag": None,
359359
"resume": None,
@@ -364,7 +364,7 @@ def get_parser_config(chunk_method, parser_config):
364364
"laws": {"raptor": {"use_raptor": False}},
365365
"presentation": {"raptor": {"use_raptor": False}},
366366
"one": None,
367-
"knowledge_graph": {"chunk_token_num": 8192, "delimiter": "\\n!?;。;!?", "entity_types": ["organization", "person", "location", "event", "time"]},
367+
"knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]},
368368
"email": None,
369369
"picture": None,
370370
}

api/utils/validation_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class ParserConfig(Base):
9696
auto_keywords: int = Field(default=0, ge=0, le=32)
9797
auto_questions: int = Field(default=0, ge=0, le=10)
9898
chunk_token_num: int = Field(default=128, ge=1, le=2048)
99-
delimiter: str = Field(default=r"\n!?;。;!?", min_length=1)
99+
delimiter: str = Field(default=r"\n", min_length=1)
100100
graphrag: Optional[GraphragConfig] = None
101101
html4excel: bool = False
102102
layout_recognize: str = "DeepDOC"

sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ def test_valid_parser_config(self, get_http_api_auth, name, parser_config):
400400
if parser_config is None:
401401
assert res["data"]["parser_config"] == {
402402
"chunk_token_num": 128,
403-
"delimiter": r"\n!?;。;!?",
403+
"delimiter": r"\n",
404404
"html4excel": False,
405405
"layout_recognize": "DeepDOC",
406406
"raptor": {"use_raptor": False},
@@ -410,7 +410,7 @@ def test_valid_parser_config(self, get_http_api_auth, name, parser_config):
410410
"auto_keywords": 0,
411411
"auto_questions": 0,
412412
"chunk_token_num": 128,
413-
"delimiter": r"\n!?;。;!?",
413+
"delimiter": r"\n",
414414
"filename_embd_weight": None,
415415
"graphrag": None,
416416
"html4excel": False,

sdk/python/test/test_http_api/test_file_management_within_dataset/test_update_document.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ class TestUpdateDocumentParserConfig:
303303
"chunk_token_num": 128,
304304
"layout_recognize": "DeepDOC",
305305
"html4excel": False,
306-
"delimiter": "\\n!?;。;!?",
306+
"delimiter": r"\n",
307307
"task_page_size": 12,
308308
"raptor": {"use_raptor": False},
309309
},
@@ -530,7 +530,7 @@ def test_parser_config(
530530
else:
531531
assert res["data"]["docs"][0]["parser_config"] == {
532532
"chunk_token_num": 128,
533-
"delimiter": "\\n!?;。;!?",
533+
"delimiter": r"\n",
534534
"html4excel": False,
535535
"layout_recognize": "DeepDOC",
536536
"raptor": {"use_raptor": False},

0 commit comments

Comments
 (0)