@@ -9,27 +9,67 @@ def analyze_connectors(doc, token):
9
9
Analyze whether a token is a connector that should trigger a sentence split.
10
10
11
11
Processing logic and order:
12
- 1. Check if the token is one of the target connectors (that, which, where, when).
13
- 2. For 'that', check if it's part of a contraction (e.g., that's, that'll).
14
- 3. For all connectors, check if they function as a 'mark' dependent of a verb.
15
- 4. For 'which', 'where', 'when', check if they function as determiners or pronouns
16
- for nouns or proper nouns.
17
- 5. Default to splitting for 'which', 'where', 'when' if no other conditions are met.
18
- 6. For 'and', 'or', 'but', check if they connect two independent clauses.
12
+ 1. Check if the token is one of the target connectors based on the language.
13
+ 2. For 'that' (English), check if it's part of a contraction (e.g., that's, that'll).
14
+ 3. For all connectors, check if they function as a specific dependency of a verb or noun.
15
+ 4. Default to splitting for certain connectors if no other conditions are met.
16
+ 5. For coordinating conjunctions, check if they connect two independent clauses.
19
17
"""
20
- # Check if the token is one of the target connectors
21
- if token .text .lower () not in ["that" , "which" , "where" , "when" , "because" , "but" , "and" , "or" ]:
18
+ lang = doc .lang_
19
+ if lang == "en" :
20
+ connectors = ["that" , "which" , "where" , "when" , "because" , "but" , "and" , "or" ]
21
+ mark_dep = "mark"
22
+ det_pron_deps = ["det" , "pron" ]
23
+ verb_pos = "VERB"
24
+ noun_pos = ["NOUN" , "PROPN" ]
25
+ elif lang == "ja" :
26
+ connectors = ["けれども" , "しかし" , "だから" , "それで" , "ので" , "のに" , "ため" ]
27
+ mark_dep = "mark"
28
+ det_pron_deps = ["case" ]
29
+ verb_pos = "VERB"
30
+ noun_pos = ["NOUN" , "PROPN" ]
31
+ elif lang == "fr" :
32
+ connectors = ["que" , "qui" , "où" , "quand" , "parce que" , "mais" , "et" , "ou" ]
33
+ mark_dep = "mark"
34
+ det_pron_deps = ["det" , "pron" ]
35
+ verb_pos = "VERB"
36
+ noun_pos = ["NOUN" , "PROPN" ]
37
+ elif lang == "ru" :
38
+ connectors = ["что" , "который" , "где" , "когда" , "потому что" , "но" , "и" , "или" ]
39
+ mark_dep = "mark"
40
+ det_pron_deps = ["det" ]
41
+ verb_pos = "VERB"
42
+ noun_pos = ["NOUN" , "PROPN" ]
43
+ elif lang == "es" :
44
+ connectors = ["que" , "cual" , "donde" , "cuando" , "porque" , "pero" , "y" , "o" ]
45
+ mark_dep = "mark"
46
+ det_pron_deps = ["det" , "pron" ]
47
+ verb_pos = "VERB"
48
+ noun_pos = ["NOUN" , "PROPN" ]
49
+ elif lang == "de" :
50
+ connectors = ["dass" , "welche" , "wo" , "wann" , "weil" , "aber" , "und" , "oder" ]
51
+ mark_dep = "mark"
52
+ det_pron_deps = ["det" , "pron" ]
53
+ verb_pos = "VERB"
54
+ noun_pos = ["NOUN" , "PROPN" ]
55
+ elif lang == "it" :
56
+ connectors = ["che" , "quale" , "dove" , "quando" , "perché" , "ma" , "e" , "o" ]
57
+ mark_dep = "mark"
58
+ det_pron_deps = ["det" , "pron" ]
59
+ verb_pos = "VERB"
60
+ noun_pos = ["NOUN" , "PROPN" ]
61
+ else :
62
+ return False , False
63
+
64
+ if token .text .lower () not in connectors :
22
65
return False , False
23
66
24
- if token .text .lower () == "that" :
25
- if token .dep_ == "mark" and token .head .pos_ == "VERB" :
26
- # Split if 'that' is a 'mark' dependent of a verb
67
+ if lang == "en" and token .text .lower () == "that" :
68
+ if token .dep_ == mark_dep and token .head .pos_ == verb_pos :
27
69
return True , False
28
70
else :
29
- # Don't split for other uses of 'that'
30
71
return False , False
31
- elif token .text .lower () != "that" and token .dep_ in ["det" , "pron" ] and token .head .pos_ in ["NOUN" , "PROPN" ]:
32
- # Don't split if 'which', 'where', 'when' are determiners or pronouns for nouns
72
+ elif token .dep_ in det_pron_deps and token .head .pos_ in noun_pos :
33
73
return False , False
34
74
else :
35
75
return True , False
0 commit comments