-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathqdrant-single.py
157 lines (134 loc) · 4.38 KB
/
qdrant-single.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/python3
"""
Fetch blog data from jasonacox.com and embed into a qdrant vector database.
This uses a sentence transformer for the embedding calculations.
SINGLE DOCUMENT VERSION - This version uploads a single document at a time.
Author: Jason A. Cox
10 October 2023
https://github.com/jasonacox/TinyLLM/
Requirements:
* pip install qdrant-client sentence-transformers
Credits:
* Jacob Marks - How I Turned My Company’s Docs into a Searchable Database with OpenAI
https://towardsdatascience.com/how-i-turned-my-companys-docs-into-a-searchable-database-with-openai-4f2d34bd8736
* Jason Fan - How to connect Llama 2 to your own data, privately
https://jfan001.medium.com/how-to-connect-llama-2-to-your-own-data-privately-3e14a73e82a2
"""
import os
import re
import string
import uuid
from html import unescape
import httpx
import qdrant_client as qc
import qdrant_client.http.models as qmodels
from sentence_transformers import SentenceTransformer
# Configuration Settings (all overridable via environment variables)
MODEL = os.environ.get("MY_MODEL", "all-MiniLM-L6-v2")            # sentence-transformer model name
DEBUG = os.environ.get("DEBUG", "False") == "True"                # enable debug output
COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "mylibrary")  # qdrant collection to (re)build
QDRANT_HOST = os.environ.get("QDRANT_HOST", "localhost")          # qdrant server host/url
DEVICE = os.environ.get("DEVICE", "cuda")                         # embedding device, e.g. "cuda" or "cpu"
RESULTS = 5  # number of documents returned by the test query
# Sentence Transformer Setup
print("Sentence Transformer starting...")
model = SentenceTransformer(MODEL, device=DEVICE)
# Qdrant Setup
print("Connecting to Qdrant DB...")
client = qc.QdrantClient(url=QDRANT_HOST)
METRIC = qmodels.Distance.DOT  # dot-product similarity between embeddings
DIMENSION = model.get_sentence_embedding_dimension()  # vector size is dictated by the model
# Create embeddings for text
def embed_text(text):
    """Encode *text* with the sentence-transformer model.

    Returns the embedding as a tensor (convert_to_tensor=True).
    """
    return model.encode(text, convert_to_tensor=True)
# Initialize qdrant collection (will erase!)
def create_index():
    """(Re)initialize the qdrant collection.

    WARNING: this erases any existing collection of the same name.
    """
    vector_params = qmodels.VectorParams(size=DIMENSION, distance=METRIC)
    client.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_params,
    )
# Creates vector for content with attributes
def create_vector(content, title, page_url, doc_type="text"):
    """Build a (uid, vector, payload) triple for one document.

    The uid is derived from a time-based UUID, the vector is the
    embedding of *content*, and the payload carries the document
    attributes stored alongside the vector.
    """
    embedding = embed_text(content)
    doc_id = str(uuid.uuid1().int)[:32]
    # Document attributes
    attributes = {
        "text": content,
        "title": title,
        "url": page_url,
        "doc_type": doc_type,
    }
    return doc_id, embedding, attributes
# Adds document vector to qdrant database
def add_doc_to_index(text, title, url, doc_type="text"):
    """Embed one document and upsert it into the qdrant collection.

    Args:
        text: document body to embed and store.
        title: document title (stored in the payload).
        url: source url (stored in the payload).
        doc_type: payload type tag, defaults to "text".
    """
    uid, vector, payload = create_vector(text, title, url, doc_type)
    # Upsert a single-point batch; no need for accumulator lists here.
    client.upsert(
        collection_name=COLLECTION_NAME,
        points=qmodels.Batch(
            ids=[uid],
            vectors=[vector],
            payloads=[payload],
        ),
    )
# Find document closely related to query
def query_index(query, top_k=5):
    """Search the collection for documents similar to *query*.

    Args:
        query: natural-language query string.
        top_k: maximum number of hits to return (default 5).

    Returns:
        A list of dicts with keys "title", "text", "url" and "score".
    """
    query_vector = embed_text(query)
    hits = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=top_k,
        with_payload=True,
    )
    return [
        {
            "title": hit.payload["title"],
            "text": hit.payload["text"],
            "url": hit.payload["url"],
            "score": hit.score,
        }
        for hit in hits
    ]
#
# Main - Index Blog Articles
#
tag_re = re.compile('<.*?>') # regex to remove html tags
# blog address - rss feed in json format
feed = "https://www.jasonacox.com/wordpress/feed/json"
# pull blog content
print(f"Pulling blog json feed content from {feed}...")
data = httpx.get(feed).json()
# First time - create index and import data (erases any existing collection)
create_index()
# Loop to read in all articles - skip any article that fails to index
print("Indexing blog articles...")
for n, item in enumerate(data["items"], start=1):
    title = item["title"]
    url = item["url"]
    # Strip html tags, decode entities, and drop non-printable characters
    body = tag_re.sub('', item["content_html"])
    body = unescape(body)
    body = ''.join(char for char in body if char in string.printable)
    try:
        print(f"Adding: {n} : {title} [size={len(body)}]")
        add_doc_to_index(body, title, url, doc_type="text")
    except Exception as err:
        # Narrowed from a bare except: report the cause but keep indexing.
        print(f" - ERROR: Ignoring ({err})")
# Query the collection - TEST
prompt = "Give me some facts about solar."
query_result = query_index(prompt, top_k=RESULTS)
# Print results
print("")
print("Prompt: " + prompt)
print(f"Top {RESULTS} Documents found:")
for result in query_result:
    print(" * " + result['title'])
# Done