66 @date:2023/10/20 14:01
77 @desc:
88"""
9- import datetime
109import logging
1110import os
1211import threading
13- import time
1412import traceback
1513from typing import List
1614
1715import django .db .models
18- from django .db import models , transaction
1916from django .db .models import QuerySet
2017from django .db .models .functions import Substr , Reverse
2118from langchain_core .embeddings import Embeddings
2219
2320from common .config .embedding_config import VectorStore
2421from common .db .search import native_search , get_dynamics_model , native_update
25- from common .db .sql_execute import sql_execute , update_execute
2622from common .util .file_util import get_file_content
2723from common .util .lock import try_lock , un_lock
28- from common .util .page_utils import page
24+ from common .util .page_utils import page_desc
2925from dataset .models import Paragraph , Status , Document , ProblemParagraphMapping , TaskType , State
3026from embedding .models import SourceType , SearchMode
3127from smartdoc .conf import PROJECT_DIR
@@ -167,7 +163,7 @@ def embedding_paragraph_apply(paragraph_list):
167163 if is_the_task_interrupted ():
168164 break
169165 ListenerManagement .embedding_by_paragraph (str (paragraph .get ('id' )), embedding_model )
170- post_apply ()
166+ post_apply ()
171167
172168 return embedding_paragraph_apply
173169
@@ -246,13 +242,16 @@ def update_status(query_set: QuerySet, taskType: TaskType, state: State):
246242 lock .release ()
247243
248244 @staticmethod
249- def embedding_by_document (document_id , embedding_model : Embeddings ):
245+ def embedding_by_document (document_id , embedding_model : Embeddings , state_list = None ):
250246 """
251247 向量化文档
248+ @param state_list:
252249 @param document_id: 文档id
253250 @param embedding_model 向量模型
254251 :return: None
255252 """
253+ if state_list is None :
254+ state_list = [State .PENDING , State .SUCCESS , State .FAILURE , State .REVOKE , State .REVOKED ]
256255 if not try_lock ('embedding' + str (document_id )):
257256 return
258257 try :
@@ -274,11 +273,17 @@ def is_the_task_interrupted():
274273 VectorStore .get_embedding_vector ().delete_by_document_id (document_id )
275274
276275 # 根据段落进行向量化处理
277- page (QuerySet (Paragraph ).filter (document_id = document_id ).values ('id' ), 5 ,
278- ListenerManagement .get_embedding_paragraph_apply (embedding_model , is_the_task_interrupted ,
279- ListenerManagement .get_aggregation_document_status (
280- document_id )),
281- is_the_task_interrupted )
276+ page_desc (QuerySet (Paragraph )
277+ .annotate (
278+ reversed_status = Reverse ('status' ),
279+ task_type_status = Substr ('reversed_status' , TaskType .EMBEDDING .value ,
280+ 1 ),
281+ ).filter (task_type_status__in = state_list , document_id = document_id )
282+ .values ('id' ), 5 ,
283+ ListenerManagement .get_embedding_paragraph_apply (embedding_model , is_the_task_interrupted ,
284+ ListenerManagement .get_aggregation_document_status (
285+ document_id )),
286+ is_the_task_interrupted )
282287 except Exception as e :
283288 max_kb_error .error (_ ('Vectorized document: {document_id} error {error} {traceback}' ).format (
284289 document_id = document_id , error = str (e ), traceback = traceback .format_exc ()))
0 commit comments