Skip to content

Commit 156ae23

Browse files
committed
add lemmatizer improvement. Remove E088
1 parent 5dc1f2a commit 156ae23

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

_test_sent_explode.py

+7-9
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@
4848
-- url like '%t.me%' or
4949
-- url like '%nesslabs.com%' or
5050
url like '%habr.com%'
51-
)
52-
LIMIT 100
51+
)
52+
-- and c.id = 36947
53+
LIMIT 200
5354
""", sqlite3.connect(r'C:\MyFiles\Code\PDB-tools\PDB-tools\results\articles.db') )
5455

5556

@@ -177,8 +178,10 @@ def expl_sent(text,num_of_sent = 3,max_sent_len = 100):
177178

178179
def lemmatize(text: str):
179180
# TODO maybe I should exclude some submodules of nlp spacy?
181+
180182
nlp.max_length = len(text) + 1000
181-
doc = nlp(text.lower())
183+
with nlp.select_pipes(enable=['lemmatizer']):
184+
doc = nlp(text.lower())
182185

183186
lemmas = []
184187
for token in doc:
@@ -236,9 +239,4 @@ def lemmatize(text: str):
236239
res_stat.to_excel(r'out/BERTopic.xlsx')
237240

238241

239-
# SOLVED
240-
# [E088] Text of length 1894734 exceeds maximum of 1000000.
241-
# The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input.
242-
# This means long texts may cause memory allocation errors.
243-
# If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit.
244-
# The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
242+

0 commit comments

Comments
 (0)