Skip to content

Commit 156ae23

Browse files
committed
add lemmatizer improvement. Remove E088
1 parent 5dc1f2a commit 156ae23

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

_test_sent_explode.py

+7-9
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@
4848
-- url like '%t.me%' or
4949
-- url like '%nesslabs.com%' or
5050
url like '%habr.com%'
51-
)
52-
LIMIT 100
51+
)
52+
-- and c.id = 36947
53+
LIMIT 200
5354
""", sqlite3.connect(r'C:\MyFiles\Code\PDB-tools\PDB-tools\results\articles.db') )
5455

5556

@@ -177,8 +178,10 @@ def expl_sent(text,num_of_sent = 3,max_sent_len = 100):
177178

178179
def lemmatize(text: str):
179180
# TODO maybe I should exclude some submodules of nlp spacy?
181+
180182
nlp.max_length = len(text) + 1000
181-
doc = nlp(text.lower())
183+
with nlp.select_pipes(enable=['lemmatizer']):
184+
doc = nlp(text.lower())
182185

183186
lemmas = []
184187
for token in doc:
@@ -236,9 +239,4 @@ def lemmatize(text: str):
236239
res_stat.to_excel(r'out/BERTopic.xlsx')
237240

238241

239-
# SOLVED
240-
# [E088] Text of length 1894734 exceeds maximum of 1000000.
241-
# The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input.
242-
# This means long texts may cause memory allocation errors.
243-
# If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit.
244-
# The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
242+

0 commit comments

Comments
 (0)