
Commit 5045faa

init
1 parent 7b78c64 commit 5045faa

24 files changed: +2022 -1 lines changed

.gitignore

+8
@@ -0,0 +1,8 @@
data/
results/
test/

Milvus/

__pycache__/
.vscode/

README.md

+93 -1
@@ -1 +1,93 @@
-test
# SimGRAG

This is the repository for the paper "SimGraphRAG: Leveraging Similar Subgraphs for Knowledge Graphs Driven Retrieval-Augmented Generation".
SimGRAG is a KG-driven RAG approach that supports various KG-based tasks, such as question answering and fact verification.

## Prerequisites

It supports plug-and-play usability with the following three components:
- Large language model: for generation.
- Embedding model: for node and relation embedding.
- Vector database: stores the embeddings of the nodes and relations in the knowledge graph, supporting efficient similarity search.

This repository is built on open-source solutions for these components:
- Ollama for running the Llama 3 70B large language model
- the Nomic embedding model for node and relation embedding
- Milvus as the vector database

You can replace these components with your preferred alternatives; all you need to do is prepare the corresponding APIs (a sketch of the assumed interfaces follows).
Next, we provide the preparation steps for the components we used.
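
A hypothetical sketch of those interfaces (not code from this repository; only `LLM.chat` actually appears in the pipeline scripts, the other names are illustrative):
```
# hypothetical interface sketch: a replacement component would need to
# offer roughly these operations; method names besides chat() are assumptions
from typing import Protocol, Sequence

class LLMAPI(Protocol):
    def chat(self, prompt: str) -> str: ...  # text generation

class EmbeddingAPI(Protocol):
    def encode(self, texts: Sequence[str]) -> list[list[float]]: ...  # embeddings

class VectorStoreAPI(Protocol):
    def insert(self, collection: str, vectors: list[list[float]]) -> None: ...
    def search(self, collection: str, query: list[float], topk: int) -> list[int]: ...
```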

### Ollama

Please visit the [Ollama](https://ollama.com/) website to install Ollama in your local environment.
After installation, you can use the following command to run the Llama 3 70B model:
```
ollama run llama3:70b
```
Then, you can use the following command to start the service needed by SimGRAG:
```
bash ollama_server.sh
```
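
The script binds the server to port 11451, matching the `base_url` in the config files. As a quick sanity check (a minimal sketch, assuming the `openai` Python package is installed; Ollama exposes an OpenAI-compatible endpoint):
```
# query the locally served model through Ollama's OpenAI-compatible API
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11451/v1", api_key="ollama")
reply = client.chat.completions.create(
    model="llama3:70b",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(reply.choices[0].message.content)
```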

### Nomic Embedding Model

You can clone the model from [here](https://huggingface.co/nomic-ai/nomic-embed-text-v1) with the following commands:
```
mkdir -p data/raw
cd data/raw
git clone https://huggingface.co/nomic-ai/nomic-embed-text-v1
```
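
To verify the model loads, here is a minimal sketch assuming the `sentence-transformers` package is installed (loading nomic-embed-text-v1 requires `trust_remote_code=True`, and the model expects task prefixes such as `search_query:`):
```
# load the cloned embedding model and embed a sample query
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("data/raw/nomic-embed-text-v1", trust_remote_code=True)
emb = model.encode(["search_query: who directed the movie Inception"])
print(emb.shape)  # expected: (1, 768)
```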

### Milvus

Please visit the [Milvus](https://milvus.io/) website to install Milvus in your local environment.
After installation, you can follow its documentation to start the service needed by SimGRAG.
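
Once the service is up, a minimal connectivity check with `pymilvus` (a sketch assuming a default standalone deployment listening on port 19530):
```
# connect to Milvus and list existing collections
from pymilvus import connections, utility

connections.connect(alias="default", host="localhost", port="19530")
print(utility.list_collections())  # after indexing, e.g. ['FactKG_node', 'FactKG_relation', 'FactKG_type']
```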

## Data preparation

### MetaQA
Please download the MetaQA dataset following the URL in its [repository](https://github.com/yuyuz/MetaQA) and put it in the `data/raw` folder.

### FactKG
Please download the FactKG dataset following the URL in its [repository](https://github.com/jiho283/FactKG) and put it in the `data/raw` folder.

### Directory structure
After preparation, the directories should be organized as follows:
```
SimGraphRAG
├── data
│   └── raw
│       ├── nomic-embed-text-v1
│       ├── MetaQA
│       └── FactKG
├── configs
├── pipeline
├── prompts
└── src
```

## Configuration

You can find the configuration files in the `configs` folder and modify them to fit your needs.
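
For example, to switch the embedding model to a different GPU programmatically (a sketch using only keys that appear in `configs/FactKG.json`, run from the repository root):
```
# load a config, tweak a field, and write it back
import json

with open("configs/FactKG.json") as f:
    configs = json.load(f)
configs["embedding_model"]["device"] = "cuda:1"  # move embedding to another GPU
with open("configs/FactKG.json", "w") as f:
    json.dump(configs, f, indent=4)
```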

## Running the pipeline

For MetaQA, you can run the following commands:
```
cd pipeline
python metaQA_index.py
python metaQA_query1hop.py
python metaQA_query2hop.py
python metaQA_query3hop.py
```

For FactKG, you can run the following commands:
```
cd pipeline
python FactKG_index.py
python FactKG_query.py
```

The results can be found in the file assigned to "output_filename" in the configuration file, for example "results/FactKG_query.txt".
Each line of the result file is a dictionary, in which the key "correct" indicates the correctness of the final answer.
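
Because the result file is one JSON object per line, overall accuracy can be computed with a short script (a sketch; queries that raised an error carry an "error_message" instead of "correct" and are counted as incorrect here):
```
# compute accuracy over a result file
import json

with open("results/FactKG_query.txt", encoding="utf-8") as f:
    results = [json.loads(line) for line in f]
correct = sum(1 for r in results if r.get("correct"))
print(f"accuracy: {correct / len(results):.4f} ({correct}/{len(results)})")
```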

configs/FactKG.json

+31
@@ -0,0 +1,31 @@
{
    "raw_data_dir": "../data/raw/FactKG",
    "processed_data_dir": "../data/FactKG",
    "embedding_model": {
        "model_path": "../data/raw/nomic-embed-text-v1",
        "device": "cuda:0"
    },
    "vector_store_names": {
        "node": "FactKG_node",
        "relation": "FactKG_relation",
        "type": "FactKG_type"
    },
    "retriever": {
        "node_sim_topk": 16384,
        "relation_sim_topk": 512,
        "type_sim_topk": 16,
        "final_topk": 3,
        "timeout": 1800
    },
    "llm": {
        "model": "llama3:70b",
        "base_url": "http://localhost:11451/v1",
        "api_key": "ollama",
        "temperature": 0.2,
        "top_p": 0.1,
        "max_tokens": 1024
    },
    "rewrite_shot": 12,
    "answer_shot": 12,
    "output_filename": "../results/FactKG_query.txt"
}

configs/metaQA_1hop.json

+30
@@ -0,0 +1,30 @@
{
    "raw_data_dir": "../data/raw/metaQA",
    "processed_data_dir": "../data/metaQA",
    "hop": 1,
    "embedding_model": {
        "model_path": "../data/raw/nomic-embed-text-v1",
        "device": "cuda:0"
    },
    "vector_store_names": {
        "node": "metaQA_node",
        "relation": "metaQA_relation"
    },
    "retriever": {
        "node_sim_topk": 16,
        "relation_sim_topk": 16,
        "final_topk": 3,
        "timeout": 600
    },
    "llm": {
        "model": "llama3:70b",
        "base_url": "http://localhost:11451/v1",
        "api_key": "ollama",
        "temperature": 0.2,
        "top_p": 0.1,
        "max_tokens": 1024
    },
    "rewrite_shot": 12,
    "answer_shot": 12,
    "output_filename": "../results/metaQA_1hop_query.txt"
}

configs/metaQA_2hop.json

+30
@@ -0,0 +1,30 @@
{
    "raw_data_dir": "../data/raw/metaQA",
    "processed_data_dir": "../data/metaQA",
    "hop": 2,
    "embedding_model": {
        "model_path": "../data/raw/nomic-embed-text-v1",
        "device": "cuda:0"
    },
    "vector_store_names": {
        "node": "metaQA_node",
        "relation": "metaQA_relation"
    },
    "retriever": {
        "node_sim_topk": 16,
        "relation_sim_topk": 16,
        "final_topk": 3,
        "timeout": 600
    },
    "llm": {
        "model": "llama3:70b",
        "base_url": "http://localhost:11451/v1",
        "api_key": "ollama",
        "temperature": 0.2,
        "top_p": 0.1,
        "max_tokens": 1024
    },
    "rewrite_shot": 12,
    "answer_shot": 12,
    "output_filename": "../results/metaQA_2hop_query.txt"
}

configs/metaQA_3hop.json

+30
@@ -0,0 +1,30 @@
{
    "raw_data_dir": "../data/raw/metaQA",
    "processed_data_dir": "../data/metaQA",
    "hop": 3,
    "embedding_model": {
        "model_path": "../data/raw/nomic-embed-text-v1",
        "device": "cuda:0"
    },
    "vector_store_names": {
        "node": "metaQA_node",
        "relation": "metaQA_relation"
    },
    "retriever": {
        "node_sim_topk": 16,
        "relation_sim_topk": 16,
        "final_topk": 3,
        "timeout": 600
    },
    "llm": {
        "model": "llama3:70b",
        "base_url": "http://localhost:11451/v1",
        "api_key": "ollama",
        "temperature": 0.2,
        "top_p": 0.1,
        "max_tokens": 1024
    },
    "rewrite_shot": 12,
    "answer_shot": 12,
    "output_filename": "../results/metaQA_3hop_query.txt"
}

ollama_server.sh

+7
@@ -0,0 +1,7 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=0
export OLLAMA_MODELS=/usr/share/ollama/.ollama/models
# serve on port 11451 to match the "base_url" in the config files
export OLLAMA_HOST=http://127.0.0.1:11451

ollama serve

pipeline/FactKG_index.py

+18
@@ -0,0 +1,18 @@
import sys
sys.path.append('..')

import json
from src.dataset import FactKG
from src.indexer import Indexer

# load configs
configs = json.load(open('../configs/FactKG.json'))

# load dataset
dataset = FactKG(configs)
KG = dataset.get_KG()
type_to_nodes = dataset.get_type_to_nodes()

# build index
indexer = Indexer(configs)
indexer.build_index(KG, type_to_nodes)

pipeline/FactKG_query.py

+78
@@ -0,0 +1,78 @@
import sys
sys.path.append('..')

import time
import json
from tqdm import tqdm

from src.llm import LLM
import prompts.answer_FactKG
import prompts.rewrite_FactKG
from src.dataset import FactKG
from src.retriever import Retriever
from src.utils import check_answer
from src.utils import extract_graph

# load configs
configs = json.load(open('../configs/FactKG.json'))

# load dataset
dataset = FactKG(configs)
KG = dataset.get_KG()
type_to_nodes = dataset.get_type_to_nodes()
all_queries = dataset.get_queries()
all_groundtruths = dataset.get_groundtruths()

# load LLM
llm = LLM(configs)

# load retriever
retriever = Retriever(configs, KG, type_to_nodes)

# run for each query
def run(query, groundtruths):
    res = {
        'query': query,
        'groundtruths': groundtruths,
        'retriever_configs': configs['retriever'],
        'llm_configs': configs['llm'],
        'rewrite_shot': configs['rewrite_shot'],
        'answer_shot': configs['answer_shot'],
    }

    try:
        # rewrite
        start = time.time()
        res['rewrite_prompt'] = prompts.rewrite_FactKG.get(query, shot=res['rewrite_shot'])
        res['rewrite_llm_output'] = llm.chat(res['rewrite_prompt'])
        res['rewrite_time'] = time.time() - start

        # extract graph
        res['query_graph'] = extract_graph(res['rewrite_llm_output'])

        # subgraph matching
        start = time.time()
        res['retrieval_details'] = retriever.retrieve(res['query_graph'], mode='greedy')
        res['evidences'] = [each[1] for each in res['retrieval_details']['results']]
        res['retrieval_time'] = time.time() - start

        # answer
        start = time.time()
        res['answer_prompt'] = prompts.answer_FactKG.get(res['query'], res['evidences'], shot=res['answer_shot'])
        res['answer_llm_output'] = llm.chat(res['answer_prompt'])
        res['answer_time'] = time.time() - start

        # check answer
        res['correct'] = check_answer(res['answer_llm_output'], groundtruths)

    except Exception as e:
        res['error_message'] = str(e)

    return res

# run for all queries
result_file = configs["output_filename"]
for query, groundtruths in tqdm(zip(all_queries, all_groundtruths), total=len(all_queries)):
    res = run(query, groundtruths)
    with open(result_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(res, ensure_ascii=False) + '\n')

pipeline/metaQA_index.py

+17
@@ -0,0 +1,17 @@
import sys
sys.path.append('..')

import json
from src.dataset import MetaQA
from src.indexer import Indexer

# load configs
# (all metaQA hop configs share the same vector store names, so indexing once suffices)
configs = json.load(open('../configs/metaQA_3hop.json'))

# load dataset
dataset = MetaQA(configs)
KG = dataset.get_KG()

# build index
indexer = Indexer(configs)
indexer.build_index(KG)
