Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Zilliz free tier example #1192

Merged
merged 1 commit into from
Nov 21, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 186 additions & 38 deletions notebooks/llms/langchain/readthedocs_rag_zilliz.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "d7570b2e",
"metadata": {},
"outputs": [],
Expand All @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "20dcdaf7",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -93,10 +93,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "0806d2db",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Type of server: zilliz_cloud\n"
]
}
],
"source": [
"from pymilvus import connections, utility\n",
"\n",
Expand Down Expand Up @@ -134,10 +142,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "dd2be7fd",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"device: cpu\n",
"<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>\n",
"SentenceTransformer(\n",
" (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel \n",
" (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})\n",
")\n",
"model_name: BAAI/bge-base-en-v1.5\n",
"EMBEDDING_LENGTH: 768\n",
"MAX_SEQ_LENGTH: 512\n"
]
}
],
"source": [
"# Import torch.\n",
"import torch\n",
Expand Down Expand Up @@ -188,9 +212,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding length: 768\n",
"Created collection: MIlvusDocs\n",
"Schema: {'auto_id': True, 'description': 'The schema for docs pages', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': True}\n"
]
}
],
"source": [
"from pymilvus import (\n",
" FieldSchema, DataType, \n",
Expand Down Expand Up @@ -246,9 +280,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'loading_progress': '100%'}\n"
]
}
],
"source": [
"# 5. Drop the index, in case it already exists.\n",
"mc.drop_index()\n",
Expand All @@ -266,19 +308,26 @@
" field_name=\"vector\", \n",
" index_params=index_params)\n",
"\n",
"# collection.load()\n",
"\n",
"# Get loading progress\n",
"mc.load()\n",
"progress = utility.loading_progress(COLLECTION_NAME)\n",
"print(progress)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "6861beb7",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loaded 15 documents\n"
]
}
],
"source": [
"## Read docs into LangChain\n",
"#!pip install langchain \n",
Expand Down Expand Up @@ -309,9 +358,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"chunking time: 0.01805710792541504\n",
"docs: 15, split into: 15\n",
"split into chunks: 159, type: list of <class 'langchain.schema.document.Document'>\n",
"\n",
"Looking at a sample chunk...\n",
"{'h1': 'Installation', 'h2': 'Installing via pip', 'source': 'rtdocs/pymilvus.readthedocs.io/en/latest/install.html'}\n",
"demonstrate how to install and using PyMilvus in a virtual environment. See virtualenv for more info\n"
]
}
],
"source": [
"from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter\n",
"\n",
Expand Down Expand Up @@ -384,10 +447,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "512130a3",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'h1': 'Installation', 'h2': 'Installing via pip', 'source': 'https://pymilvus.readthedocs.io/en/latest/install.html'}\n",
"Installation¶ Installing via pip¶ PyMilvus is in the Python Package Index. PyMilvus only support pyt\n"
]
}
],
"source": [
"# Clean up the metadata urls\n",
"for doc in chunks:\n",
Expand All @@ -413,7 +485,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -445,10 +517,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "b51ff139",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start inserting entities\n",
"Milvus insert time for 159 vectors: 1.0154786109924316 seconds\n",
"(insert count: 159, delete count: 0, upsert count: 0, timestamp: 445785288603074562, success count: 159, err count: 0)\n",
"[{\"name\":\"_default\",\"collection_name\":\"MIlvusDocs\",\"description\":\"\"}]\n"
]
}
],
"source": [
"# Insert a batch of data into the Milvus collection.\n",
"\n",
Expand Down Expand Up @@ -503,10 +586,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "5e7f41f4",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query length: 54\n"
]
}
],
"source": [
"# Define a sample question about your data.\n",
"question = \"what is the default distance metric used in AUTOINDEX?\"\n",
Expand Down Expand Up @@ -534,10 +625,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"id": "89642119",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded milvus collection into memory.\n",
"Milvus search time: 0.06506514549255371 sec\n",
"type: <class 'pymilvus.client.abstract.SearchResult'>, count: 5\n"
]
}
],
"source": [
"# RETRIEVAL USING MILVUS.\n",
"\n",
Expand Down Expand Up @@ -587,9 +688,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2267\n"
]
}
],
"source": [
"# # TODO - remove this before saving in github.\n",
"# for n, hits in enumerate(results):\n",
Expand Down Expand Up @@ -617,10 +726,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"id": "3e7fa0b6",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Question: what is the default distance metric used in AUTOINDEX?\n",
"Answer: lazy dog\n"
]
}
],
"source": [
"# BASELINING THE LLM: ASK A QUESTION WITHOUT ANY RETRIEVED CONTEXT.\n",
"\n",
Expand Down Expand Up @@ -649,10 +767,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"id": "a68e87b1",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Question: what is the default distance metric used in AUTOINDEX?\n",
"Answer: MetricType.L2\n"
]
}
],
"source": [
"# NOW ASK THE SAME LLM THE SAME QUESTION USING THE RETRIEVED CONTEXT.\n",
"QA_input = {\n",
Expand All @@ -673,7 +800,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"id": "d0e81e68",
"metadata": {},
"outputs": [],
Expand All @@ -684,10 +811,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"id": "c777937e",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Author: Christy Bergman\n",
"\n",
"Python implementation: CPython\n",
"Python version : 3.10.12\n",
"IPython version : 8.15.0\n",
"\n",
"torch : 2.0.1\n",
"transformers: 4.34.1\n",
"milvus : 2.3.3\n",
"pymilvus : 2.3.3\n",
"langchain : 0.0.322\n",
"\n",
"conda environment: py310\n",
"\n"
]
}
],
"source": [
"# Props to Sebastian Raschka for this handy watermark.\n",
"# !pip install watermark\n",
Expand Down