From 0ee081bc010d680daf4b6fb7b6b62dfe29b0904f Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Fri, 28 Feb 2025 17:05:59 -0500 Subject: [PATCH] Update notebook --- .../how-to-create-pinecone-datasets.ipynb | 1887 ++++++++--------- 1 file changed, 875 insertions(+), 1012 deletions(-) diff --git a/docs/assets/how-to-create-pinecone-datasets.ipynb b/docs/assets/how-to-create-pinecone-datasets.ipynb index 8f533640..3bc47615 100644 --- a/docs/assets/how-to-create-pinecone-datasets.ipynb +++ b/docs/assets/how-to-create-pinecone-datasets.ipynb @@ -1,1042 +1,905 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "cdN6QOXIUaUq" - }, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", - "[![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", - "\n", - "# Creating Pinecone Datasets" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8Fiobs_oUaUr" - }, - "source": [ - "This notebook will walk you through the process of creating a Pinecone dataset from a pandas Dataframe." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DLuQirtzUaUs" - }, - "source": [ - "## Step 1: create a simple sample dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "bVW2DlVQUaUs", - "outputId": "bd3c9438-7c67-4097-b580-4bfdd695ab92", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [], - "source": [ - "!pip install -qU pandas==2.0.2" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "cdN6QOXIUaUq" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", + "[![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/docs/assets/how-to-create-pinecone-datasets.ipynb)\n", + "\n", + "# Creating Pinecone Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Fiobs_oUaUr" + }, + "source": [ + "This notebook will walk you through the process of creating a Pinecone dataset from a pandas Dataframe." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DLuQirtzUaUs" + }, + "source": [ + "## Step 1: Create a simple sample dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "fPebr9XNUaUs" - }, - "outputs": [], - "source": [ - "import pandas as pd" - ] + "id": "bVW2DlVQUaUs", + "outputId": "bd3c9438-7c67-4097-b580-4bfdd695ab92" + }, + "outputs": [], + "source": [ + "!pip install -qU pinecone==6.0.1 pinecone-datasets==1.0.1 pandas==2.2.3" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "I_WRSqY8UaUs", + "outputId": "36348ad8-38ef-40b2-8b0c-fc7e34e12575" + }, + "outputs": [], + "source": [ + "def build_doc(sentence, category):\n", + " return {\n", + " 'text': sentence,\n", + " 'category': category\n", + " }\n", + "\n", + "planet_sentences = [\n", + " \"The smallest planet in our solar system, Mercury, holds the title for being closest to the Sun.\",\n", + " \"Orbiting the Sun in just 88 Earth days, Mercury zips around at an incredible pace.\",\n", + " \"A rocky, cratered surface defines this planet, offering clues about its tumultuous history.\",\n", + " \"Extreme temperature fluctuations mark its days and nights, creating a challenging environment.\",\n", + "]\n", + "planet_sentence_objs = [build_doc(s, \"astronomy\") for s in planet_sentences]\n", + "\n", + "mythology_sentences = [\n", + " \"In Roman mythology, the messenger god Mercury is celebrated for his swift movement.\",\n", + " \"With winged sandals propelling him, Mercury traverses the skies effortlessly.\",\n", + " \"Renowned as a mediator between gods and mortals, the deity Mercury also guides souls to the underworld.\",\n", + " \"Carrying the caduceus, Mercury symbolizes both commerce and communication.\",\n", + "]\n", + 
"mythology_sentence_objs = [build_doc(s, 'mythology') for s in mythology_sentences]\n", + "\n", + "chemistry_sentences = [\n", + " \"Known as quicksilver, mercury is the only metal that remains liquid at room temperature.\",\n", + " \"With the symbol Hg, derived from the Greek 'hydrargyrum' meaning 'water-silver', mercury has captivated chemists for centuries.\",\n", + " \"Utilized in devices like thermometers and barometers, mercury’s unique properties make it invaluable in scientific instruments.\",\n", + " \"Once a common component in industrial processes, mercury now faces strict regulation due to its toxic effects.\",\n", + "]\n", + "chemistry_sentence_objs = [build_doc(s, 'chemistry') for s in chemistry_sentences]\n", + "\n", + "sentences = planet_sentence_objs + mythology_sentence_objs + chemistry_sentence_objs\n" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "I_WRSqY8UaUs", - "outputId": "36348ad8-38ef-40b2-8b0c-fc7e34e12575", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " id values sparse_values \\\n", - "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", - "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", - "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", - "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", - "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... 
\n", - "\n", - " metadata blob \n", - "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", - "1 {'title': 'title2', 'url': 'url2'} None \n", - "2 {'title': 'title3', 'url': 'url3'} None \n", - "3 {'title': 'title4', 'url': 'url4'} None \n", - "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 3 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textcategory
0The smallest planet in our solar system, Mercu...astronomy
1Orbiting the Sun in just 88 Earth days, Mercur...astronomy
2A rocky, cratered surface defines this planet,...astronomy
3Extreme temperature fluctuations mark its days...astronomy
4In Roman mythology, the messenger god Mercury ...mythology
5With winged sandals propelling him, Mercury tr...mythology
6Renowned as a mediator between gods and mortal...mythology
7Carrying the caduceus, Mercury symbolizes both...mythology
8Known as quicksilver, mercury is the only meta...chemistry
9With the symbol Hg, derived from the Greek 'hy...chemistry
10Utilized in devices like thermometers and baro...chemistry
11Once a common component in industrial processe...chemistry
\n", + "
" ], - "source": [ - "documents = [\n", - " {\n", - " \"id\": \"1\",\n", - " \"values\": [0.1, 0.2, 0.3],\n", - " \"sparse_values\": {\"indices\": [1, 2, 3], \"values\": [0.1, 0.2, 0.3]},\n", - " \"metadata\": {\"title\": \"title1\", \"url\": \"url1\"},\n", - " \"blob\": {\"extra_field\": \"extra_value\"},\n", - " },\n", - " {\n", - " \"id\": \"2\",\n", - " \"values\": [0.4, 0.5, 0.6],\n", - " \"sparse_values\": {\"indices\": [4, 5, 6], \"values\": [0.4, 0.5, 0.6]},\n", - " \"metadata\": {\"title\": \"title2\", \"url\": \"url2\"},\n", - " \"blob\": None,\n", - " },\n", - " {\n", - " \"id\": \"3\",\n", - " \"values\": [0.7, 0.8, 0.9],\n", - " \"sparse_values\": {\"indices\": [7, 8, 9], \"values\": [0.7, 0.8, 0.9]},\n", - " \"metadata\": {\"title\": \"title3\", \"url\": \"url3\"},\n", - " \"blob\": None,\n", - " },\n", - " {\n", - " \"id\": \"4\",\n", - " \"values\": [1.0, 1.1, 1.2],\n", - " \"sparse_values\": {\"indices\": [10, 11, 12], \"values\": [1.0, 1.1, 1.2]},\n", - " \"metadata\": {\"title\": \"title4\", \"url\": \"url4\"},\n", - " \"blob\": None,\n", - " },\n", - " {\n", - " \"id\": \"5\",\n", - " \"values\": [1.3, 1.4, 1.5],\n", - " \"sparse_values\": {\"indices\": [13, 14, 15], \"values\": [1.3, 1.4, 1.5]},\n", - " \"metadata\": {\"title\": \"title5\", \"url\": \"url5\"},\n", - " \"blob\": {\"another_field\": \"another_value\"},\n", - " }\n", - "]\n", - "\n", - "df = pd.DataFrame(documents)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c_zwxJ_OUaUt" - }, - "source": [ - "Some notes:\n", - "* Note that we have both metadata field and 'blob' field, the metadata field is the acutal pinecone metadata we will use in our index, blob, is an additional field that we can use to store any additional information we want to store along with the Dataset.\n", - "* here we used both 'values' and 'sparse_values', however, sparse_values is not a mandatory field, if you don't have sparse values keep it empty." 
+ "text/plain": [ + " text category\n", + "0 The smallest planet in our solar system, Mercu... astronomy\n", + "1 Orbiting the Sun in just 88 Earth days, Mercur... astronomy\n", + "2 A rocky, cratered surface defines this planet,... astronomy\n", + "3 Extreme temperature fluctuations mark its days... astronomy\n", + "4 In Roman mythology, the messenger god Mercury ... mythology\n", + "5 With winged sandals propelling him, Mercury tr... mythology\n", + "6 Renowned as a mediator between gods and mortal... mythology\n", + "7 Carrying the caduceus, Mercury symbolizes both... mythology\n", + "8 Known as quicksilver, mercury is the only meta... chemistry\n", + "9 With the symbol Hg, derived from the Greek 'hy... chemistry\n", + "10 Utilized in devices like thermometers and baro... chemistry\n", + "11 Once a common component in industrial processe... chemistry" ] - }, + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.DataFrame(sentences)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Create embeddings\n", + "\n", + "Next, we'll create some embeddings to go along with this data. There are many ways you could do this, but for this demo we'll use a Pinecone Inference hosted model." + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "BcFx1wFqUaUt" - }, - "source": [ - "## Pinecone Dataset\n", - "\n", - "Now that we have our data Ready, we can create a Pinecone Dataset. A Pinecone Dataset is a collection of documtents, queries and Metadata. We can create a Pinecone\n", - "* Documents: a collection of records with Id, Vectors (dense, sparse) and metadata\n", - "* Queries: a collection of queries with Vectors (dense, sparse), metadata filter and top_k\n", - "* Metadata: a defintion of the dataset: Name, dimension, metric, embedding models, etc." 
+ "data": { + "text/plain": [ + "EmbeddingsList(\n", + " model='multilingual-e5-large',\n", + " vector_type='dense',\n", + " data=[\n", + " {'vector_type': dense, 'values': [0.0259246826171875, 0.0168609619140625, ..., -0.0272369384765625, -0.033203125]},\n", + " {'vector_type': dense, 'values': [0.0198974609375, 0.00733184814453125, ..., -0.0400390625, -0.005207061767578125]},\n", + " ... (26 more embeddings) ...,\n", + " {'vector_type': dense, 'values': [0.01123809814453125, -0.02001953125, ..., -0.0277252197265625, -0.0232696533203125]},\n", + " {'vector_type': dense, 'values': [-0.0018777847290039062, 0.004322052001953125, ..., -0.019012451171875, 0.022186279296875]}\n", + " ],\n", + " usage={'total_tokens': 834}\n", + ")" ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pinecone import Pinecone\n", + "\n", + "pc = Pinecone()\n", + "sentence_embeddings = pc.inference.embed(\n", + " model='multilingual-e5-large',\n", + " inputs=[s['text'] for s in sentences],\n", + " parameters={\"input_type\": \"passage\", \"truncate\": \"END\"}\n", + ")\n", + "sentence_embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c_zwxJ_OUaUt" + }, + "source": [ + "## Step 3: Format data\n", + "\n", + "Now we need to do a bit of work to format our data into the schema expected by the pinecone-datasets package.\n", + "\n", + "Some notes:\n", + "* Note that we have both a `metadata` field and a `blob` field. The `metadata` field is the actual Pinecone metadata we will use in our index; `blob` is an additional field that we can use to store any other information we want to keep alongside the Dataset.\n", + "* Here we used only `values`, but `sparse_values` is also available as an optional field." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "DCGFhTtyUaUt" - }, - "outputs": [], - "source": [ - "!pip install -qU \\\n", - " pinecone-client==2.2.2 \\\n", - " pinecone-datasets==0.6.0" + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluesmetadatablob
00c55e3ab-6f8b-43de-a759-fe2d1e884b56[0.0259246826171875, 0.0168609619140625, -0.02...{'category': 'astronomy'}{'text': 'The smallest planet in our solar sys...
147c6c690-26ef-491e-82f1-ace0d7f29a30[0.0198974609375, 0.00733184814453125, -0.0004...{'category': 'astronomy'}{'text': 'Orbiting the Sun in just 88 Earth da...
21e5a2b22-6bf7-4466-878b-619e573fb268[0.006023406982421875, 0.0005712509155273438, ...{'category': 'astronomy'}{'text': 'A rocky, cratered surface defines th...
339843338-ef61-425a-9a70-6a4d1ae1fc75[0.01971435546875, -0.0247802734375, -0.024307...{'category': 'astronomy'}{'text': 'Extreme temperature fluctuations mar...
41a294212-a879-4a4c-842f-083570a19f8e[0.01239013671875, -0.007007598876953125, -0.0...{'category': 'mythology'}{'text': 'In Roman mythology, the messenger go...
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 0c55e3ab-6f8b-43de-a759-fe2d1e884b56 \n", + "1 47c6c690-26ef-491e-82f1-ace0d7f29a30 \n", + "2 1e5a2b22-6bf7-4466-878b-619e573fb268 \n", + "3 39843338-ef61-425a-9a70-6a4d1ae1fc75 \n", + "4 1a294212-a879-4a4c-842f-083570a19f8e \n", + "\n", + " values \\\n", + "0 [0.0259246826171875, 0.0168609619140625, -0.02... \n", + "1 [0.0198974609375, 0.00733184814453125, -0.0004... \n", + "2 [0.006023406982421875, 0.0005712509155273438, ... \n", + "3 [0.01971435546875, -0.0247802734375, -0.024307... \n", + "4 [0.01239013671875, -0.007007598876953125, -0.0... \n", + "\n", + " metadata \\\n", + "0 {'category': 'astronomy'} \n", + "1 {'category': 'astronomy'} \n", + "2 {'category': 'astronomy'} \n", + "3 {'category': 'astronomy'} \n", + "4 {'category': 'mythology'} \n", + "\n", + " blob \n", + "0 {'text': 'The smallest planet in our solar sys... \n", + "1 {'text': 'Orbiting the Sun in just 88 Earth da... \n", + "2 {'text': 'A rocky, cratered surface defines th... \n", + "3 {'text': 'Extreme temperature fluctuations mar... \n", + "4 {'text': 'In Roman mythology, the messenger go... " ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import uuid\n", + "\n", + "docs = [{\n", + " \"id\": str(uuid.uuid4()),\n", + " \"values\": e.values,\n", + " \"metadata\": {\n", + " \"category\": t['category']\n", + " },\n", + " \"blob\": {\"text\": t['text']}\n", + "} for e, t in zip(sentence_embeddings.data, sentences)]\n", + "\n", + "docs_df = pd.DataFrame(docs)\n", + "docs_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BcFx1wFqUaUt" + }, + "source": [ + "## Pinecone Dataset\n", + "\n", + "Now that we have our data ready, we can create a Pinecone Dataset. A Pinecone Dataset is a collection of documents, queries and metadata. 
We can create a Pinecone Dataset from these three parts:\n", + "* Documents: a collection of records with Id, Vectors (dense, sparse) and metadata\n", + "* Queries: a collection of queries with Vectors (dense, sparse), metadata filter and top_k\n", + "* Metadata: a definition of the dataset: Name, dimension, metric, embedding models, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "Eaiy3IjIUaUt", + "outputId": "4ff727bd-1a56-42bb-8cd2-e645b5ab390c" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "S9NCQyTqUaUt" - }, - "outputs": [], - "source": [ - "from pinecone_datasets import Dataset, DatasetMetadata" + "data": { + "text/plain": [ + "{'name': 'mercury-sentences',\n", + " 'created_at': '2025-02-28 21:59:43.146979',\n", + " 'documents': 12,\n", + " 'queries': 0,\n", + " 'source': None,\n", + " 'license': None,\n", + " 'bucket': None,\n", + " 'task': None,\n", + " 'dense_model': {'name': 'multilingual-e5-large',\n", + " 'tokenizer': None,\n", + " 'dimension': 1024},\n", + " 'sparse_model': None,\n", + " 'description': None,\n", + " 'tags': None,\n", + " 'args': None}" ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pinecone_datasets import Dataset, DatasetMetadata, DenseModelMetadata\n", + "from datetime import datetime\n", + "\n", + "metadata = DatasetMetadata(\n", + " name=\"mercury-sentences\",\n", + " documents=len(docs_df),\n", + " queries=0,\n", + " created_at=datetime.now().strftime(\"%Y-%m-%d %H:%M:%S.%f\"),\n", + " dense_model=DenseModelMetadata(\n", + " name='multilingual-e5-large',\n", + " dimension=1024\n", + " )\n", + ")\n", + "metadata.model_dump()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "g_ACjKDOUaUt", + "outputId": 
"bc47c7d1-a3ef-4cf1-9e4b-7da6f82e111c" + }, 
+ "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "Eaiy3IjIUaUt", - "outputId": "4ff727bd-1a56-42bb-8cd2-e645b5ab390c", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'name': '',\n", - " 'created_at': '2023-08-14 09:18:50.196514',\n", - " 'documents': 0,\n", - " 'queries': 0,\n", - " 'source': None,\n", - " 'license': None,\n", - " 'bucket': None,\n", - " 'task': None,\n", - " 'dense_model': {'name': '', 'tokenizer': None, 'dimension': 0},\n", - " 'sparse_model': None,\n", - " 'description': None,\n", - " 'tags': None,\n", - " 'args': None}" - ] - }, - "metadata": {}, - "execution_count": 6 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
095290eba-a884-43e9-8f25-fb56b581eca0[0.0259246826171875, 0.0168609619140625, -0.02...None{'category': 'astronomy'}{'text': 'The smallest planet in our solar sys...
15eec80c1-0f39-4cf8-8826-d1098d44e8f7[0.0198974609375, 0.00733184814453125, -0.0004...None{'category': 'astronomy'}{'text': 'Orbiting the Sun in just 88 Earth da...
2efedb64b-8e3a-421d-b956-9e7c6f3774f5[0.006023406982421875, 0.0005712509155273438, ...None{'category': 'astronomy'}{'text': 'A rocky, cratered surface defines th...
3d3e5ac3e-5a74-4bc4-b016-ebcaed352fc0[0.01971435546875, -0.0247802734375, -0.024307...None{'category': 'astronomy'}{'text': 'Extreme temperature fluctuations mar...
427ebe053-dda3-424d-baa8-0784fa5f9672[0.01239013671875, -0.007007598876953125, -0.0...None{'category': 'astronomy'}{'text': 'Unlike many other planets, it has no...
\n", + "
" ], - "source": [ - "# creating a new empty metadata\n", - "metadata = DatasetMetadata.empty()\n", - "metadata.dict()" + "text/plain": [ + " id \\\n", + "0 95290eba-a884-43e9-8f25-fb56b581eca0 \n", + "1 5eec80c1-0f39-4cf8-8826-d1098d44e8f7 \n", + "2 efedb64b-8e3a-421d-b956-9e7c6f3774f5 \n", + "3 d3e5ac3e-5a74-4bc4-b016-ebcaed352fc0 \n", + "4 27ebe053-dda3-424d-baa8-0784fa5f9672 \n", + "\n", + " values sparse_values \\\n", + "0 [0.0259246826171875, 0.0168609619140625, -0.02... None \n", + "1 [0.0198974609375, 0.00733184814453125, -0.0004... None \n", + "2 [0.006023406982421875, 0.0005712509155273438, ... None \n", + "3 [0.01971435546875, -0.0247802734375, -0.024307... None \n", + "4 [0.01239013671875, -0.007007598876953125, -0.0... None \n", + "\n", + " metadata \\\n", + "0 {'category': 'astronomy'} \n", + "1 {'category': 'astronomy'} \n", + "2 {'category': 'astronomy'} \n", + "3 {'category': 'astronomy'} \n", + "4 {'category': 'astronomy'} \n", + "\n", + " blob \n", + "0 {'text': 'The smallest planet in our solar sys... \n", + "1 {'text': 'Orbiting the Sun in just 88 Earth da... \n", + "2 {'text': 'A rocky, cratered surface defines th... \n", + "3 {'text': 'Extreme temperature fluctuations mar... \n", + "4 {'text': 'Unlike many other planets, it has no... 
" ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = Dataset.from_pandas(documents=docs_df, q=None, metadata=metadata)  # docs_df: the formatted DataFrame built in Step 3\n", + "ds.documents.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CGzdg2sZUaUt" + }, + "source": [ + "## Save dataset to local path\n" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "IVkK6fJUUaUt", + "outputId": "943ff58d-91d6-4a75-e218-d833214fee1b" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "g_ACjKDOUaUt", - "outputId": "bc47c7d1-a3ef-4cf1-9e4b-7da6f82e111c", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " id values sparse_values \\\n", - "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", - "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", - "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", - "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", - "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... \n", - "\n", - " metadata blob \n", - "0 {'title': 'title1', 'url': 'url1'} {'extra_field': 'extra_value'} \n", - "1 {'title': 'title2', 'url': 'url2'} None \n", - "2 {'title': 'title3', 'url': 'url3'} None \n", - "3 {'title': 'title4', 'url': 'url4'} None \n", - "4 {'title': 'title5', 'url': 'url5'} {'another_field': 'another_value'} " - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'extra_field': 'extra_value'}
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value'}
\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 7 - } - ], - "source": [ - "ds = Dataset.from_pandas(documents=df, q=None, metadata=metadata)\n", - "ds.documents" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Files saved to /tmp/tmpsgv_o4bj\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "CGzdg2sZUaUt" - }, - "source": [ - "## Save dataset to local path\n" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.12/site-packages/pinecone_datasets/dataset_fswriter.py:54: UserWarning: Queries are empty, not saving queries\n", + " warnings.warn(\"Queries are empty, not saving queries\")\n" + ] + } + ], + "source": [ + "import tempfile\n", + "from pinecone_datasets import Catalog\n", + "\n", + "catalog_path = tempfile.mkdtemp()\n", + "catalog = Catalog(base_path=catalog_path)\n", + "catalog.save_dataset(ds)\n", + "\n", + "print('Files saved to ' + catalog_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "id": "pLEhwSaRUaUu" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "IVkK6fJUUaUt", - "outputId": "943ff58d-91d6-4a75-e218-d833214fee1b", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.10/dist-packages/pinecone_datasets/dataset.py:433: UserWarning: Queries are empty, not saving queries\n", - " warnings.warn(\"Queries are empty, not saving queries\")\n" - ] - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecreated_atdocumentsqueriessourcelicensebuckettaskdense_modelsparse_modeldescriptiontagsargs
0mercury-sentences2025-02-28 21:44:48.510318300NoneNoneNoneNone{'name': 'multilingual-e5-large', 'tokenizer':...NoneNoneNoneNone
\n", + "
" ], - "source": [ - "ds.to_path('/tmp/ds')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B5tvJlnSUaUu" - }, - "source": [ - "### Reload dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "pLEhwSaRUaUu" - }, - "outputs": [], - "source": [ - "new_ds = Dataset.from_path('/tmp/ds')" + "text/plain": [ + " name created_at documents queries source \\\n", + "0 mercury-sentences 2025-02-28 21:44:48.510318 30 0 None \n", + "\n", + " license bucket task dense_model \\\n", + "0 None None None {'name': 'multilingual-e5-large', 'tokenizer':... \n", + "\n", + " sparse_model description tags args \n", + "0 None None None None " ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog.list_datasets(as_df=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B5tvJlnSUaUu" + }, + "source": [ + "### Reload dataset (or another dataset from your local catalog)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 }, + "id": "J5LJGYqxUaUu", + "outputId": "120f1ebf-e30a-4913-a84f-727e52e2add8" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "J5LJGYqxUaUu", - "outputId": "120f1ebf-e30a-4913-a84f-727e52e2add8", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " id values sparse_values \\\n", - "0 1 [0.1, 0.2, 0.3] {'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]} \n", - "1 2 [0.4, 0.5, 0.6] {'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]} \n", - "2 3 [0.7, 0.8, 0.9] {'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]} \n", - "3 4 [1.0, 1.1, 1.2] {'indices': [10, 11, 12], 'values': [1.0, 1.1,... \n", - "4 5 [1.3, 1.4, 1.5] {'indices': [13, 14, 15], 'values': [1.3, 1.4,... 
\n", - "\n", - " metadata \\\n", - "0 {'title': 'title1', 'url': 'url1'} \n", - "1 {'title': 'title2', 'url': 'url2'} \n", - "2 {'title': 'title3', 'url': 'url3'} \n", - "3 {'title': 'title4', 'url': 'url4'} \n", - "4 {'title': 'title5', 'url': 'url5'} \n", - "\n", - " blob \n", - "0 {'another_field': None, 'extra_field': 'extra_... \n", - "1 None \n", - "2 None \n", - "3 None \n", - "4 {'another_field': 'another_value', 'extra_fiel... " - ], - "text/html": [ - "\n", - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluessparse_valuesmetadatablob
01[0.1, 0.2, 0.3]{'indices': [1, 2, 3], 'values': [0.1, 0.2, 0.3]}{'title': 'title1', 'url': 'url1'}{'another_field': None, 'extra_field': 'extra_...
12[0.4, 0.5, 0.6]{'indices': [4, 5, 6], 'values': [0.4, 0.5, 0.6]}{'title': 'title2', 'url': 'url2'}None
23[0.7, 0.8, 0.9]{'indices': [7, 8, 9], 'values': [0.7, 0.8, 0.9]}{'title': 'title3', 'url': 'url3'}None
34[1.0, 1.1, 1.2]{'indices': [10, 11, 12], 'values': [1.0, 1.1,...{'title': 'title4', 'url': 'url4'}None
45[1.3, 1.4, 1.5]{'indices': [13, 14, 15], 'values': [1.3, 1.4,...{'title': 'title5', 'url': 'url5'}{'another_field': 'another_value', 'extra_fiel...
\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - "
\n", - " \n", - "
\n", - "\n", - "\n", - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 10 - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idvaluessparse_valuesmetadatablob
095290eba-a884-43e9-8f25-fb56b581eca0[0.0259246826171875, 0.0168609619140625, -0.02...None{'category': 'astronomy'}{'text': 'The smallest planet in our solar sys...
15eec80c1-0f39-4cf8-8826-d1098d44e8f7[0.0198974609375, 0.00733184814453125, -0.0004...None{'category': 'astronomy'}{'text': 'Orbiting the Sun in just 88 Earth da...
2efedb64b-8e3a-421d-b956-9e7c6f3774f5[0.006023406982421875, 0.0005712509155273438, ...None{'category': 'astronomy'}{'text': 'A rocky, cratered surface defines th...
3d3e5ac3e-5a74-4bc4-b016-ebcaed352fc0[0.01971435546875, -0.0247802734375, -0.024307...None{'category': 'astronomy'}{'text': 'Extreme temperature fluctuations mar...
427ebe053-dda3-424d-baa8-0784fa5f9672[0.01239013671875, -0.007007598876953125, -0.0...None{'category': 'astronomy'}{'text': 'Unlike many other planets, it has no...
\n", + "
" ], - "source": [ - "new_ds.documents" + "text/plain": [ + " id \\\n", + "0 95290eba-a884-43e9-8f25-fb56b581eca0 \n", + "1 5eec80c1-0f39-4cf8-8826-d1098d44e8f7 \n", + "2 efedb64b-8e3a-421d-b956-9e7c6f3774f5 \n", + "3 d3e5ac3e-5a74-4bc4-b016-ebcaed352fc0 \n", + "4 27ebe053-dda3-424d-baa8-0784fa5f9672 \n", + "\n", + " values sparse_values \\\n", + "0 [0.0259246826171875, 0.0168609619140625, -0.02... None \n", + "1 [0.0198974609375, 0.00733184814453125, -0.0004... None \n", + "2 [0.006023406982421875, 0.0005712509155273438, ... None \n", + "3 [0.01971435546875, -0.0247802734375, -0.024307... None \n", + "4 [0.01239013671875, -0.007007598876953125, -0.0... None \n", + "\n", + " metadata \\\n", + "0 {'category': 'astronomy'} \n", + "1 {'category': 'astronomy'} \n", + "2 {'category': 'astronomy'} \n", + "3 {'category': 'astronomy'} \n", + "4 {'category': 'astronomy'} \n", + "\n", + " blob \n", + "0 {'text': 'The smallest planet in our solar sys... \n", + "1 {'text': 'Orbiting the Sun in just 88 Earth da... \n", + "2 {'text': 'A rocky, cratered surface defines th... \n", + "3 {'text': 'Extreme temperature fluctuations mar... \n", + "4 {'text': 'Unlike many other planets, it has no... 
" ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - }, - "orig_nbformat": 4, - "colab": { - "provenance": [] - } + ], + "source": [ + "new_ds = catalog.load_dataset('mercury-sentences')\n", + "new_ds.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Save to bucket\n", + "\n", + "Saving to a bucket in the cloud is basically the same except you will need to auth and specify the bucket location using the Catalog `base_path`. \n", + "\n", + "To authenticate:\n", + "- For Google Storage, run `gcloud auth login` to set auth credentials where the underlying `gcsfs` library can find them. Or, alternatively, set `GOOGLE_APPLICATION_CREDENTIALS` environment variable with a path to your service account json credentials. 
\n", + "- For S3, set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables.\n", + "\n", + "Once you've done that, you instantiate your Catalog with the bucket path and load/save datasets.\n", + "\n", + "```python\n", + "from pinecone_datasets import Catalog\n", + "\n", + "catalog = Catalog(\"gs://bucket-name\")\n", + "catalog.list_datasets(as_df=True)\n", + "```" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}