feat: integrate nano-graphrag (#433)

cin-klein · taprosoft · web-flow · commit 66e565649e59 · 2024-10-30T15:32:30.000+07:00
* add nano graph-rag

* ignore entities for relevant context reference

* refactor and add local model as default nano-graphrag

* feat: add kotaemon llm & embedding integration with nanographrag

* fix: add env var for nano GraphRAG

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
diff --git a/.env.example b/.env.example
@@ -19,6 +19,8 @@ COHERE_API_KEY=<COHERE_API_KEY>
 # settings for local models
 LOCAL_MODEL=llama3.1:8b
 LOCAL_MODEL_EMBEDDINGS=nomic-embed-text
+LOCAL_EMBEDDING_MODEL_DIM=768
+LOCAL_EMBEDDING_MODEL_MAX_TOKENS=8192
 
 # settings for GraphRAG
 GRAPHRAG_API_KEY=<YOUR_OPENAI_KEY>
diff --git a/README.md b/README.md
@@ -170,7 +170,22 @@ documents and developers who want to build their own RAG pipeline.
 ### Setup GraphRAG
 
 > [!NOTE]
-> Currently GraphRAG feature only works with OpenAI or Ollama API.
+> Official MS GraphRAG indexing only works with OpenAI or Ollama API.
+> We recommend that most users use the NanoGraphRAG implementation for straightforward integration with Kotaemon.
+
+<details>
+
+<summary>Setup Nano GRAPHRAG</summary>
+
+- Install nano-GraphRAG: `pip install nano-graphrag`
+- Launch Kotaemon with `USE_NANO_GRAPHRAG=true` environment variable.
+- Set your default LLM and embedding models in the Resources settings; NanoGraphRAG will pick them up automatically.
+
+</details>
+
+<details>
+
+<summary>Setup MS GRAPHRAG</summary>
 
 - **Non-Docker Installation**: If you are not using Docker, install GraphRAG with the following command:
 
@@ -181,6 +196,8 @@ documents and developers who want to build their own RAG pipeline.
 - **Setting Up API KEY**: To use the GraphRAG retriever feature, ensure you set the `GRAPHRAG_API_KEY` environment variable. You can do this directly in your environment or by adding it to a `.env` file.
 - **Using Local Models and Custom Settings**: If you want to use GraphRAG with local models (like `Ollama`) or customize the default LLM and other configurations, set the `USE_CUSTOMIZED_GRAPHRAG_SETTING` environment variable to true. Then, adjust your settings in the `settings.yaml.example` file.
 
+</details>
+
 ### Setup Local Models (for local/private RAG)
 
 See [Local model setup](docs/local_model.md).
diff --git a/flowsettings.py b/flowsettings.py
@@ -284,32 +284,54 @@
     },
 }
 
-
+# Boolean switch read through config(); defaults to False when unset.
+USE_NANO_GRAPHRAG = config("USE_NANO_GRAPHRAG", default=False, cast=bool)
+# Dotted path of the graph index class listed in KH_INDEX_TYPES:
+# NanoGraphRAGIndex when USE_NANO_GRAPHRAG is true, GraphRAGIndex otherwise.
+GRAPHRAG_INDEX_TYPE = (
+    "ktem.index.file.graph.GraphRAGIndex"
+    if not USE_NANO_GRAPHRAG
+    else "ktem.index.file.graph.NanoGraphRAGIndex"
+)
 KH_INDEX_TYPES = [
     "ktem.index.file.FileIndex",
-    "ktem.index.file.graph.GraphRAGIndex",
+    GRAPHRAG_INDEX_TYPE,
 ]
-KH_INDICES = [
+
+GRAPHRAG_INDEX = (
     {
-        "name": "File",
+        "name": "GraphRAG",
         "config": {
             "supported_file_types": (
                 ".png, .jpeg, .jpg, .tiff, .tif, .pdf, .xls, .xlsx, .doc, .docx, "
                 ".pptx, .csv, .html, .mhtml, .txt, .md, .zip"
             ),
             "private": False,
         },
-        "index_type": "ktem.index.file.FileIndex",
-    },
+        "index_type": "ktem.index.file.graph.GraphRAGIndex",
+    }
+    if not USE_NANO_GRAPHRAG
+    else {
+        "name": "NanoGraphRAG",
+        "config": {
+            "supported_file_types": (
+                ".png, .jpeg, .jpg, .tiff, .tif, .pdf, .xls, .xlsx, .doc, .docx, "
+                ".pptx, .csv, .html, .mhtml, .txt, .md, .zip"
+            ),
+            "private": False,
+        },
+        "index_type": "ktem.index.file.graph.NanoGraphRAGIndex",
+    }
+)
+
+KH_INDICES = [
     {
-        "name": "GraphRAG",
+        "name": "File",
         "config": {
             "supported_file_types": (
                 ".png, .jpeg, .jpg, .tiff, .tif, .pdf, .xls, .xlsx, .doc, .docx, "
                 ".pptx, .csv, .html, .mhtml, .txt, .md, .zip"
             ),
             "private": False,
         },
-        "index_type": "ktem.index.file.graph.GraphRAGIndex",
+        "index_type": "ktem.index.file.FileIndex",
     },
+    GRAPHRAG_INDEX,
 ]
diff --git a/libs/ktem/ktem/index/file/graph/__init__.py b/libs/ktem/ktem/index/file/graph/__init__.py
@@ -1,3 +1,4 @@
 from .graph_index import GraphRAGIndex
+from .nano_graph_index import NanoGraphRAGIndex
 
-__all__ = ["GraphRAGIndex"]
+__all__ = ["GraphRAGIndex", "NanoGraphRAGIndex"]
diff --git a/libs/ktem/ktem/index/file/graph/nano_graph_index.py b/libs/ktem/ktem/index/file/graph/nano_graph_index.py
@@ -0,0 +1,26 @@
+from typing import Any
+
+from ..base import BaseFileIndexRetriever
+from .graph_index import GraphRAGIndex
+from .nano_pipelines import NanoGraphRAGIndexingPipeline, NanoGraphRAGRetrieverPipeline
+
+
+class NanoGraphRAGIndex(GraphRAGIndex):
+    """GraphRAGIndex subclass that uses the nano-graphrag pipelines.
+
+    Indexing is delegated to ``NanoGraphRAGIndexingPipeline`` and retrieval
+    to ``NanoGraphRAGRetrieverPipeline``.
+    """
+
+    def _setup_indexing_cls(self):
+        """Set the indexing pipeline class to the nano-graphrag one."""
+        self._indexing_pipeline_cls = NanoGraphRAGIndexingPipeline
+
+    def _setup_retriever_cls(self):
+        """Set the retriever pipeline classes to a one-element list."""
+        self._retriever_pipeline_cls = [NanoGraphRAGRetrieverPipeline]
+
+    def get_retriever_pipelines(
+        self, settings: dict, user_id: int, selected: Any = None
+    ) -> list["BaseFileIndexRetriever"]:
+        """Build the retriever pipelines for the selected files.
+
+        Args:
+            settings: Not used by this implementation.
+            user_id: Not used by this implementation.
+            selected: Three-item sequence; only the middle item is read and
+                passed on as ``file_ids``.
+
+        Returns:
+            A list holding a single ``NanoGraphRAGRetrieverPipeline``.
+        """
+        # NOTE(review): `selected` defaults to None, but unpacking None raises
+        # TypeError -- presumably callers always pass a 3-tuple; confirm.
+        _, file_ids, _ = selected
+        retrievers = [
+            NanoGraphRAGRetrieverPipeline(
+                file_ids=file_ids,
+                Index=self._resources["Index"],
+            )
+        ]
+
+        return retrievers
diff --git a/libs/ktem/ktem/index/file/graph/nano_pipelines.py b/libs/ktem/ktem/index/file/graph/nano_pipelines.py
diff --git a/libs/ktem/ktem/index/file/graph/pipelines.py b/libs/ktem/ktem/index/file/graph/pipelines.py