Skip to content

Commit cee36c9

Browse files
authored
Merge pull request #450 from realpython/chromadb
Materials for Embeddings and Vector Databases With ChromaDB
2 parents 32e1ce0 + 18f88a5 commit cee36c9

11 files changed

+468
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Embeddings and Vector Databases With ChromaDB
2+
3+
Supporting code for the Real Python tutorial [Embeddings and Vector Databases With ChromaDB](https://realpython.com/embeddings-and-vector-databases-with-chromadb/).
4+
5+
To run the code in this tutorial, you should have `numpy`, `spacy`, `sentence-transformers`, `chromadb`, `polars`, `more-itertools`, and `openai` installed in your environment.
6+
7+
You can install the dependencies manually, or by running:
8+
9+
```
10+
(venv) $ python -m pip install -r requirements.txt
11+
```
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import pathlib
2+
3+
import polars as pl
4+
5+
6+
def prepare_car_reviews_data(
    data_path: "str | pathlib.Path",
    vehicle_years: "list[int] | None" = None,
) -> dict:
    """Prepare the car reviews dataset for ChromaDB.

    Args:
        data_path: Location of the CSV file(s) to scan. It is handed straight
            to ``pl.scan_csv``, so a glob pattern string such as
            ``"data/archive/*"`` is accepted as well as a path object.
        vehicle_years: Vehicle years to keep. When omitted, only reviews for
            2017 vehicles are kept.

    Returns:
        A dict with three equally long lists: ``"ids"`` (``"review0"``,
        ``"review1"``, ...), ``"documents"`` (the review texts) and
        ``"metadatas"`` (one dict per review holding every selected column
        except ``"Review"``).
    """

    # A None sentinel replaces the former ``[2017]`` default: a list default
    # is created once and shared by every call, so any mutation would leak
    # into later calls.
    if vehicle_years is None:
        vehicle_years = [2017]

    # Define the schema to ensure proper data types are enforced
    dtypes = {
        "": pl.Int64,
        "Review_Date": pl.Utf8,
        "Author_Name": pl.Utf8,
        "Vehicle_Title": pl.Utf8,
        "Review_Title": pl.Utf8,
        "Review": pl.Utf8,
        "Rating": pl.Float64,
    }

    # Scan the car reviews dataset(s) lazily; nothing is read until collect()
    car_reviews = pl.scan_csv(data_path, dtypes=dtypes)

    # The first space-separated token of the title is used as the year and
    # the second one as the model.
    title_parts = pl.col("Vehicle_Title").str.split(by=" ")

    # Extract the vehicle year and model as new columns, filter on the
    # selected years, and keep only the columns ChromaDB will store.
    car_review_db_data = (
        car_reviews.with_columns(
            [
                title_parts.list.get(0).cast(pl.Int64).alias("Vehicle_Year"),
                title_parts.list.get(1).alias("Vehicle_Model"),
            ]
        )
        .filter(pl.col("Vehicle_Year").is_in(vehicle_years))
        .select(
            [
                "Review_Title",
                "Review",
                "Rating",
                "Vehicle_Year",
                "Vehicle_Model",
            ]
        )
        .sort(["Vehicle_Model", "Rating"])
        .collect()
    )

    # Create ids, documents, and metadatas data in the format chromadb expects
    ids = [f"review{i}" for i in range(car_review_db_data.shape[0])]
    documents = car_review_db_data["Review"].to_list()
    metadatas = car_review_db_data.drop("Review").to_dicts()

    chroma_data = {"ids": ids, "documents": documents, "metadatas": metadatas}

    return chroma_data
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import pathlib
2+
3+
import chromadb
4+
from chromadb.utils import embedding_functions
5+
from more_itertools import batched
6+
7+
8+
def build_chroma_collection(
    chroma_path: "str | pathlib.Path",
    collection_name: str,
    embbeding_func_name: str,
    ids: list[str],
    documents: list[str],
    metadatas: list[dict],
    distance_func_name: str = "cosine",
    batch_size: int = 166,
):
    """Create a ChromaDB collection and load documents into it in batches.

    Args:
        chroma_path: Directory handed to ``chromadb.PersistentClient``.
        collection_name: Name of the collection to create.
        embbeding_func_name: Sentence-transformers model name used to embed
            the documents.
        ids: One unique id per document.
        documents: The texts to embed and store.
        metadatas: One metadata dict per document.
        distance_func_name: Value stored under the ``"hnsw:space"`` collection
            metadata key.
        batch_size: Maximum number of documents sent per ``collection.add``
            call. The default keeps the previously hard-coded value.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chroma_client = chromadb.PersistentClient(chroma_path)

    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embbeding_func_name
    )

    # NOTE(review): create_collection (rather than get_or_create_collection)
    # presumably fails when the collection already exists - confirm against
    # the installed chromadb version before re-running on the same path.
    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": distance_func_name},
    )

    # Walk the data in half-open [start, end) windows. The earlier version
    # used the index of a batch's last element as the slice end, which is
    # exclusive, so the final document of every batch was never added.
    for start_idx in range(0, len(documents), batch_size):
        end_idx = start_idx + batch_size

        collection.add(
            ids=ids[start_idx:end_idx],
            documents=documents[start_idx:end_idx],
            metadatas=metadatas[start_idx:end_idx],
        )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"openai-secret-key": "your-api-key"
3+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import numpy as np
2+
3+
4+
def compute_cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
    """Compute the cosine similarity between two vectors.

    Args:
        u: First vector. Besides NumPy arrays, plain sequences such as lists
            or tuples are accepted and converted with ``np.asarray``.
        v: Second vector, with the same length as ``u``.

    Returns:
        The dot product of ``u`` and ``v`` divided by the product of their
        Euclidean norms.

    A zero-magnitude vector is not handled specially; the division by a zero
    norm is left to NumPy.
    """

    # asarray is a no-op for ndarray inputs, so existing callers get exactly
    # the same result; it only adds support for list/tuple inputs.
    u = np.asarray(u)
    v = np.asarray(v)

    return u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import chromadb
2+
from chromadb.utils import embedding_functions
3+
4+
from car_data_etl import prepare_car_reviews_data
5+
from chroma_utils import build_chroma_collection
6+
7+
# Input data location, on-disk ChromaDB directory, embedding model, and
# collection name shared by the build and query steps below.
DATA_PATH = "data/archive/*"
CHROMA_PATH = "car_review_embeddings"
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

# Step 1: turn the raw review files into ids/documents/metadatas lists.
chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)

# Step 2: embed the reviews and persist them as a ChromaDB collection.
build_chroma_collection(
    CHROMA_PATH,
    COLLECTION_NAME,
    EMBEDDING_FUNC_NAME,
    *(
        chroma_car_reviews_dict[key]
        for key in ("ids", "documents", "metadatas")
    ),
)

# Step 3: reopen the persisted collection with the same embedding function
# so that query text is embedded the same way as the stored documents.
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)
collection = client.get_collection(
    name=COLLECTION_NAME, embedding_function=embedding_func
)

# Step 4: run a single natural-language query and show the closest review.
query_text = "Find me some positive reviews that discuss the car's performance"
great_reviews = collection.query(
    query_texts=[query_text],
    n_results=5,
    include=["documents", "distances", "metadatas"],
)

# Results are grouped per query text; index 0 selects the only query.
top_documents = great_reviews["documents"][0]
print(top_documents[0])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import numpy as np
2+
3+
# Build two 2-element vectors with NumPy and display them
vector1 = np.array([1, 0])
vector2 = np.array([0, 1])
for vec in (vector1, vector2):
    print(vec)

# Shorter names for the same two vectors, plus a third one whose
# components are both sqrt(2)
v1, v2 = np.array([1, 0]), np.array([0, 1])
v3 = np.array([np.sqrt(2), np.sqrt(2)])

# Dimension
print(v1.shape)

# Magnitude: once spelled out by hand, then with the built-in norm
print(np.sqrt((v1**2).sum()))
for vec in (v1, v3):
    print(np.linalg.norm(vec))

# Dot product: element-wise multiply-and-sum, then the dedicated routine
print((v1 * v2).sum())
print(np.dot(v1, v3))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import json
2+
import os
3+
4+
import chromadb
5+
import openai
6+
from chromadb.utils import embedding_functions
7+
8+
# Set before the embedding model is created so the tokenizers library picks
# the setting up.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

DATA_PATH = "data/archive/*"
CHROMA_PATH = "car_review_embeddings"
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "car_reviews"

# The OpenAI key is read from a local JSON file rather than hard-coded here.
with open("config.json", "r") as json_file:
    config_data = json.load(json_file)

openai.api_key = config_data.get("openai-secret-key")

# Open the collection with the same embedding function it is queried with.
client = chromadb.PersistentClient(CHROMA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)

collection = client.get_collection(
    name=COLLECTION_NAME, embedding_function=embedding_func
)


def _request_chat_completion(context_template: str, reviews: str, question: str):
    """Ask the chat model ``question`` with ``reviews`` filled into the system prompt.

    ``context_template`` must contain one ``{}`` placeholder, which receives
    ``reviews``. The raw ``openai.ChatCompletion.create`` response is returned.
    """
    return openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": context_template.format(reviews)},
            {"role": "user", "content": question},
        ],
        temperature=0,
        n=1,
    )


# ---- Part 1: summarize what drives satisfaction in well-rated reviews ----

context = """
You are a customer success employee at a large
car dealership. Use the following car reviews
to answer questions: {}
"""

question = """
What's the key to great customer satisfaction
based on detailed positive reviews?
"""

# Retrieve the reviews closest to the question, restricted to Rating >= 3.
good_reviews = collection.query(
    query_texts=[question],
    n_results=10,
    include=["documents"],
    where={"Rating": {"$gte": 3}},
)

# Results are grouped per query text; index 0 selects the only query.
reviews_str = ",".join(good_reviews["documents"][0])

print("Good reviews: ")
print(reviews_str)
print("###########################################")

# The request is sent once. An earlier version issued the identical call
# twice and discarded the first response, doubling the API usage.
good_review_summaries = _request_chat_completion(context, reviews_str, question)

print("AI-Generated summary of good reviews: ")
print(good_review_summaries["choices"][0]["message"]["content"])
print("###########################################")


# ---- Part 2: pick out the most damaging of the poorly rated reviews ----

# "reviews" was misspelled as "reivews" in this prompt before.
context = """
You are a customer success employee at a large car dealership.
Use the following car reviews to answer questions: {}
"""
question = """
Which of these poor reviews has the worst implications about
our dealership? Explain why.
"""

# Retrieve the reviews closest to the question, restricted to Rating <= 3.
poor_reviews = collection.query(
    query_texts=[question],
    n_results=5,
    include=["documents"],
    where={"Rating": {"$lte": 3}},
)

reviews_str = ",".join(poor_reviews["documents"][0])

# Only the closest match is printed, although all five are sent to the model.
print("Worst reviews: ")
print(poor_reviews["documents"][0][0])
print("###########################################")

poor_review_analysis = _request_chat_completion(context, reviews_str, question)

print("AI-Generated summary of the single worst review: ")
print(poor_review_analysis["choices"][0]["message"]["content"])
print("###########################################")

0 commit comments

Comments
 (0)