1
1
from collections .abc import Callable
2
+ from typing import Any
2
3
3
4
import pymupdf4llm # type: ignore[import-untyped]
4
5
from langchain_community .embeddings .fastembed import FastEmbedEmbeddings
5
6
from langchain_core .documents import Document
6
- from langchain_core .vectorstores import VectorStoreRetriever
7
7
from langchain_qdrant import Qdrant
8
8
from langchain_text_splitters import MarkdownTextSplitter
9
+ from qdrant_client import QdrantClient
9
10
from qdrant_client .http import models as rest
10
11
11
12
from app .core .config import settings
13
+ from app .core .graph .rag .qdrant_retriever import QdrantRetriever
12
14
13
15
14
16
class QdrantStore :
15
17
"""
16
18
A class to handle uploading and searching documents in a Qdrant vector store.
17
19
"""
18
20
19
- embeddings = FastEmbedEmbeddings (model_name = settings .EMBEDDING_MODEL ) # type: ignore[call-arg]
21
+ embeddings = FastEmbedEmbeddings (model_name = settings .DENSE_EMBEDDING_MODEL ) # type: ignore[call-arg]
20
22
collection_name = settings .QDRANT_COLLECTION
21
23
url = settings .QDRANT_URL
22
24
23
- def create (
25
def __init__(self) -> None:
    """Set up the store, keeping the client returned by ``_create_collection``."""
    qdrant_client = self._create_collection()
    self.client = qdrant_client
27
+
28
+ def add (
24
29
self ,
25
30
file_path : str ,
26
31
upload_id : int ,
@@ -46,22 +51,46 @@ def create(
46
51
[md_text ],
47
52
[{"user_id" : user_id , "upload_id" : upload_id }],
48
53
)
49
- Qdrant .from_documents (
50
- documents = docs ,
51
- embedding = self .embeddings ,
52
- url = self .url ,
53
- prefer_grpc = True ,
54
+
55
+ doc_texts : list [str ] = []
56
+ metadata : list [dict [Any , Any ]] = []
57
+ for doc in docs :
58
+ doc_texts .append (doc .page_content )
59
+ metadata .append (doc .metadata )
60
+
61
+ self .client .add (
54
62
collection_name = self .collection_name ,
55
- api_key = settings .QDRANT__SERVICE__API_KEY ,
63
+ documents = doc_texts ,
64
+ metadata = metadata ,
56
65
)
66
+
57
67
callback () if callback else None
58
68
69
def _create_collection(self) -> QdrantClient:
    """
    Build a Qdrant client and make sure the configured collection exists.

    The client is created for ``self.url`` with the API key from settings,
    and the dense and sparse embedding models named in settings are set on
    it. When the collection named ``self.collection_name`` is missing, it
    is created with both a dense and a sparse vector configuration taken
    from the client; an existing collection is left as it is.

    Returns:
        QdrantClient: The configured client instance.
    """
    qdrant = QdrantClient(url=self.url, api_key=settings.QDRANT__SERVICE__API_KEY)

    # Set both models before asking the client for its vector params.
    qdrant.set_model(settings.DENSE_EMBEDDING_MODEL)
    qdrant.set_sparse_model(settings.SPARSE_EMBEDDING_MODEL)

    # Nothing to create when the collection is already present.
    if qdrant.collection_exists(self.collection_name):
        return qdrant

    qdrant.create_collection(
        collection_name=self.collection_name,
        vectors_config=qdrant.get_fastembed_vector_params(),
        sparse_vectors_config=qdrant.get_fastembed_sparse_vector_params(),
    )
    return qdrant
88
+
59
89
def _get_collection(self) -> Qdrant:
    """
    Return a ``langchain_qdrant`` ``Qdrant`` wrapper for the existing collection.

    The wrapper is built from the class-level embeddings, URL and collection
    name, together with the API key read from settings.

    Returns:
        Qdrant: The vector store bound to ``self.collection_name``.
    """
    vector_store = Qdrant.from_existing_collection(
        collection_name=self.collection_name,
        embedding=self.embeddings,
        url=self.url,
        api_key=settings.QDRANT__SERVICE__API_KEY,
    )
    return vector_store
@@ -73,11 +102,11 @@ def delete(self, upload_id: int, user_id: int) -> bool | None:
73
102
ids = rest .Filter ( # type: ignore[arg-type]
74
103
must = [
75
104
rest .FieldCondition (
76
- key = "metadata. user_id" ,
105
+ key = "user_id" ,
77
106
match = rest .MatchValue (value = user_id ),
78
107
),
79
108
rest .FieldCondition (
80
- key = "metadata. upload_id" ,
109
+ key = "upload_id" ,
81
110
match = rest .MatchValue (value = upload_id ),
82
111
),
83
112
]
@@ -95,10 +124,10 @@ def update(
95
124
) -> None :
96
125
"""Delete and re-upload the new PDF document to the Qdrant vector store"""
97
126
self .delete (user_id , upload_id )
98
- self .create (file_path , upload_id , user_id , chunk_size , chunk_overlap )
127
+ self .add (file_path , upload_id , user_id , chunk_size , chunk_overlap )
99
128
callback () if callback else None
100
129
101
- def retriever (self , user_id : int , upload_id : int ) -> VectorStoreRetriever :
130
+ def retriever (self , user_id : int , upload_id : int ) -> QdrantRetriever :
102
131
"""
103
132
Creates a QdrantRetriever that retrieves results containing the specified user_id and upload_id in the metadata.
104
133
@@ -109,9 +138,21 @@ def retriever(self, user_id: int, upload_id: int) -> VectorStoreRetriever:
109
138
Returns:
110
139
QdrantRetriever: A QdrantRetriever instance.
111
140
"""
112
- qdrant = self ._get_collection ()
113
- retriever = qdrant .as_retriever (
114
- search_kwargs = {"filter" : {"user_id" : user_id , "upload_id" : upload_id }}
141
+ retriever = QdrantRetriever (
142
+ client = self .client ,
143
+ collection_name = self .collection_name ,
144
+ search_kwargs = rest .Filter (
145
+ must = [
146
+ rest .FieldCondition (
147
+ key = "user_id" ,
148
+ match = rest .MatchValue (value = user_id ),
149
+ ),
150
+ rest .FieldCondition (
151
+ key = "upload_id" ,
152
+ match = rest .MatchValue (value = upload_id ),
153
+ ),
154
+ ],
155
+ ),
115
156
)
116
157
return retriever
117
158
@@ -127,20 +168,27 @@ def search(self, user_id: int, upload_ids: list[int], query: str) -> list[Docume
127
168
Returns:
128
169
List[Document]: A list of documents matching the search criteria.
129
170
"""
130
- qdrant = self ._get_collection ()
131
- found_docs = qdrant . similarity_search (
132
- query ,
133
- filter = rest .Filter (
171
+ search_results = self .client . query (
172
+ collection_name = self . collection_name ,
173
+ query_text = query ,
174
+ query_filter = rest .Filter (
134
175
must = [
135
176
rest .FieldCondition (
136
- key = "metadata. user_id" ,
177
+ key = "user_id" ,
137
178
match = rest .MatchValue (value = user_id ),
138
179
),
139
180
rest .FieldCondition (
140
- key = "metadata. upload_id" ,
181
+ key = "upload_id" ,
141
182
match = rest .MatchAny (any = upload_ids ),
142
183
),
143
- ]
184
+ ],
144
185
),
145
186
)
146
- return found_docs
187
+ documents : list [Document ] = []
188
+ for result in search_results :
189
+ document = Document (
190
+ page_content = result .document ,
191
+ metadata = {"score" : result .score },
192
+ )
193
+ documents .append (document )
194
+ return documents
0 commit comments