milvus-io · christy · Nov 21, 2023 · Nov 21, 2023
diff --git a/notebooks/llms/langchain/readthedocs_rag_zilliz.ipynb b/notebooks/llms/langchain/readthedocs_rag_zilliz.ipynb
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "d7570b2e",
    "metadata": {},
    "outputs": [],
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "20dcdaf7",
    "metadata": {},
    "outputs": [],
@@ -93,10 +93,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "0806d2db",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Type of server: zilliz_cloud\n"
+     ]
+    }
+   ],
    "source": [
     "from pymilvus import connections, utility\n",
     "\n",
@@ -134,10 +142,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "dd2be7fd",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "device: cpu\n",
+      "<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>\n",
+      "SentenceTransformer(\n",
+      "  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel \n",
+      "  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})\n",
+      ")\n",
+      "model_name: BAAI/bge-base-en-v1.5\n",
+      "EMBEDDING_LENGTH: 768\n",
+      "MAX_SEQ_LENGTH: 512\n"
+     ]
+    }
+   ],
    "source": [
     "# Import torch.\n",
     "import torch\n",
@@ -188,9 +212,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Embedding length: 768\n",
+      "Created collection: MIlvusDocs\n",
+      "Schema: {'auto_id': True, 'description': 'The schema for docs pages', 'fields': [{'name': 'pk', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': True}\n"
+     ]
+    }
+   ],
    "source": [
     "from pymilvus import (\n",
     "    FieldSchema, DataType, \n",
@@ -246,9 +280,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'loading_progress': '100%'}\n"
+     ]
+    }
+   ],
    "source": [
     "# 5. Drop the index, in case it already exists.\n",
     "mc.drop_index()\n",
@@ -266,19 +308,26 @@
     "    field_name=\"vector\", \n",
     "    index_params=index_params)\n",
     "\n",
-    "# collection.load()\n",
-    "\n",
     "# Get loading progress\n",
+    "mc.load()\n",
     "progress = utility.loading_progress(COLLECTION_NAME)\n",
     "print(progress)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "6861beb7",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loaded 15 documents\n"
+     ]
+    }
+   ],
    "source": [
     "## Read docs into LangChain\n",
     "#!pip install langchain \n",
@@ -309,9 +358,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "chunking time: 0.01805710792541504\n",
+      "docs: 15, split into: 15\n",
+      "split into chunks: 159, type: list of <class 'langchain.schema.document.Document'>\n",
+      "\n",
+      "Looking at a sample chunk...\n",
+      "{'h1': 'Installation', 'h2': 'Installing via pip', 'source': 'rtdocs/pymilvus.readthedocs.io/en/latest/install.html'}\n",
+      "demonstrate how to install and using PyMilvus in a virtual environment. See virtualenv for more info\n"
+     ]
+    }
+   ],
    "source": [
     "from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter\n",
     "\n",
@@ -384,10 +447,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "512130a3",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'h1': 'Installation', 'h2': 'Installing via pip', 'source': 'https://pymilvus.readthedocs.io/en/latest/install.html'}\n",
+      "Installation¶ Installing via pip¶ PyMilvus is in the Python Package Index. PyMilvus only support pyt\n"
+     ]
+    }
+   ],
    "source": [
     "# Clean up the metadata urls\n",
     "for doc in chunks:\n",
@@ -413,7 +485,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -445,10 +517,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "b51ff139",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Start inserting entities\n",
+      "Milvus insert time for 159 vectors: 1.0154786109924316 seconds\n",
+      "(insert count: 159, delete count: 0, upsert count: 0, timestamp: 445785288603074562, success count: 159, err count: 0)\n",
+      "[{\"name\":\"_default\",\"collection_name\":\"MIlvusDocs\",\"description\":\"\"}]\n"
+     ]
+    }
+   ],
    "source": [
     "# Insert a batch of data into the Milvus collection.\n",
     "\n",
@@ -503,10 +586,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "5e7f41f4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "query length: 54\n"
+     ]
+    }
+   ],
    "source": [
     "# Define a sample question about your data.\n",
     "question = \"what is the default distance metric used in AUTOINDEX?\"\n",
@@ -534,10 +625,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "id": "89642119",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded milvus collection into memory.\n",
+      "Milvus search time: 0.06506514549255371 sec\n",
+      "type: <class 'pymilvus.client.abstract.SearchResult'>, count: 5\n"
+     ]
+    }
+   ],
    "source": [
     "# RETRIEVAL USING MILVUS.\n",
     "\n",
@@ -587,9 +688,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2267\n"
+     ]
+    }
+   ],
    "source": [
     "# # TODO - remove this before saving in github.\n",
     "# for n, hits in enumerate(results):\n",
@@ -617,10 +726,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "id": "3e7fa0b6",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Question: what is the default distance metric used in AUTOINDEX?\n",
+      "Answer: lazy dog\n"
+     ]
+    }
+   ],
    "source": [
     "# BASELINING THE LLM: ASK A QUESTION WITHOUT ANY RETRIEVED CONTEXT.\n",
     "\n",
@@ -649,10 +767,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "id": "a68e87b1",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Question: what is the default distance metric used in AUTOINDEX?\n",
+      "Answer: MetricType.L2\n"
+     ]
+    }
+   ],
    "source": [
     "# NOW ASK THE SAME LLM THE SAME QUESTION USING THE RETRIEVED CONTEXT.\n",
     "QA_input = {\n",
@@ -673,7 +800,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "id": "d0e81e68",
    "metadata": {},
    "outputs": [],
@@ -684,10 +811,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "id": "c777937e",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Author: Christy Bergman\n",
+      "\n",
+      "Python implementation: CPython\n",
+      "Python version       : 3.10.12\n",
+      "IPython version      : 8.15.0\n",
+      "\n",
+      "torch       : 2.0.1\n",
+      "transformers: 4.34.1\n",
+      "milvus      : 2.3.3\n",
+      "pymilvus    : 2.3.3\n",
+      "langchain   : 0.0.322\n",
+      "\n",
+      "conda environment: py310\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "# Props to Sebastian Raschka for this handy watermark.\n",
     "# !pip install watermark\n",