From 0dc3aef007b80dbf6d4984e9ab7ed06fee631acd Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 2 Jan 2025 16:48:30 +0100 Subject: [PATCH] install nltk --- tutorials/42_Sentence_Window_Retriever.ipynb | 50 +++++++++++--------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/tutorials/42_Sentence_Window_Retriever.ipynb b/tutorials/42_Sentence_Window_Retriever.ipynb index 0aa7bf9..56fbb16 100644 --- a/tutorials/42_Sentence_Window_Retriever.ipynb +++ b/tutorials/42_Sentence_Window_Retriever.ipynb @@ -55,7 +55,7 @@ "%%bash\n", "\n", "pip install --upgrade pip\n", - "pip install haystack-ai" + "pip install haystack-ai nltk" ] }, { @@ -98,17 +98,20 @@ "source": [ "from haystack import Document\n", "from haystack.components.preprocessors import DocumentSplitter\n", + "\n", "splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by=\"period\")\n", "\n", - "text = (\"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n", - " \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n", - " \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n", - " \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n", - " \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n", - " \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n", - " \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n", - " \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n", - " \"called Fremen, marked down on no census of the Imperial Regate.\")\n", + "text = (\n", + " \"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n", + " \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n", + " \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n", + " \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n", + " \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n", + " \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n", + " \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n", + " \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n", + " \"called Fremen, marked down on no census of the Imperial Regate.\"\n", + ")\n", "\n", "doc = Document(content=text)\n", "docs = splitter.run([doc])" @@ -144,7 +147,7 @@ "from haystack.document_stores.types import DuplicatePolicy\n", "\n", "doc_store = InMemoryDocumentStore()\n", - "doc_store.write_documents(docs['documents'], policy=DuplicatePolicy.OVERWRITE)" + "doc_store.write_documents(docs[\"documents\"], policy=DuplicatePolicy.OVERWRITE)" ] }, { @@ -167,7 +170,7 @@ "from haystack.components.retrievers import SentenceWindowRetriever\n", "\n", "retriever = SentenceWindowRetriever(document_store=doc_store, window_size=2)\n", - "result = retriever.run(retrieved_documents=[docs['documents'][4]])" + "result = retriever.run(retrieved_documents=[docs[\"documents\"][4]])" ] }, { @@ -199,7 +202,7 @@ } ], "source": [ - "result['context_windows']" + "result[\"context_windows\"]" ] }, { @@ -224,7 +227,7 @@ } ], "source": [ - "result['context_documents']" + "result[\"context_documents\"]" ] }, { @@ -259,6 +262,7 @@ "import csv\n", "from haystack import Document\n", "\n", + "\n", "def read_documents(file: str) -> List[Document]:\n", " with open(file, \"r\") as file:\n", " reader = csv.reader(file, delimiter=\"\\t\")\n", @@ -283,11 +287,11 @@ "from pathlib import Path\n", "import requests\n", "\n", - "doc = requests.get('https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv')\n", + "doc = requests.get(\"https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv\")\n", "\n", - "datafolder = Path('data')\n", + "datafolder = Path(\"data\")\n", "datafolder.mkdir(exist_ok=True)\n", - "with open(datafolder/'bbc-news-data.csv', 'wb') as f:\n", + "with open(datafolder / \"bbc-news-data.csv\", \"wb\") as f:\n", " for chunk in doc.iter_content(512):\n", " f.write(chunk)" ] @@ -356,7 +360,7 @@ "\n", "indexing_pipeline.connect(\"splitter\", \"writer\")\n", "\n", - "indexing_pipeline.run({\"documents\":docs})" + "indexing_pipeline.run({\"documents\": docs})" ] }, { @@ -421,7 +425,9 @@ "metadata": {}, "outputs": [], "source": [ - "result = sentence_window_pipeline.run(data={'bm25_retriever': {'query': \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={'bm25_retriever'})" + "result = sentence_window_pipeline.run(\n", + " data={\"bm25_retriever\": {\"query\": \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={\"bm25_retriever\"}\n", + ")" ] }, { @@ -450,7 +456,7 @@ } ], "source": [ - "result['bm25_retriever']['documents']" + "result[\"bm25_retriever\"][\"documents\"]" ] }, { @@ -479,7 +485,7 @@ } ], "source": [ - "result['sentence_window__retriever']['context_windows']" + "result[\"sentence_window__retriever\"][\"context_windows\"]" ] }, { @@ -512,7 +518,7 @@ } ], "source": [ - "result['sentence_window__retriever']['context_documents']" + "result[\"sentence_window__retriever\"][\"context_documents\"]" ] }, {