Skip to content

Commit cdcec9e

Browse files
committed
pinecone deployment
1 parent b7e705f commit cdcec9e

File tree

9 files changed

+3064
-7
lines changed

9 files changed

+3064
-7
lines changed

.vscode/launch.json

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
// Use IntelliSense to learn about possible attributes.
3+
// Hover to view descriptions of existing attributes.
4+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": "Python: Current File",
9+
"type": "python",
10+
"request": "launch",
11+
"program": "${file}",
12+
"console": "integratedTerminal",
13+
"justMyCode": true,
14+
"envFile": "${workspaceFolder}/.env"
15+
}
16+
]
17+
}

.vscode/settings.json

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"files.exclude": {
3+
"**/.git": true,
4+
"**/.svn": true,
5+
"**/.hg": true,
6+
"**/CVS": true,
7+
"**/.DS_Store": true,
8+
"**/*.pyc": true,
9+
"**/__pycache__": true
10+
},
11+
"python.pythonPath": " /home/teknetik/.local/share/virtualenvs/documentation-helper-TKW46dO5/bin/python",
12+
"python.linting.pylintArgs": [
13+
"--load-plugins",
14+
"pylint_flask"
15+
]
16+
}

Pipfile

-1
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,3 @@ tqdm = "*"
2323

2424
[requires]
2525
python_version = "3.11"
26-
python_full_version = "3.11.0"

Pipfile.lock

+2,081
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/core.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
environment=os.environ["PINECONE_ENVIRONMENT_REGION"],
1414
)
1515

16-
INDEX_NAME = "langchain-doc-index"
16+
INDEX_NAME = "langchain-doc"
1717

1818

1919
def run_llm(query: str, chat_history: List[Dict[str, Any]] = []):

dirs_ingested.txt

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/home/teknetik/websites/docs.kong/docs.konghq.com/gateway/3.3.x/
2+
/home/teknetik/websites/docs.kong/docs.konghq.com/konnect/
3+
/home/teknetik/websites/docs.kong/docs.konghq.com/api/
4+
/home/teknetik/websites/docs.kong/docs.konghq.com/getting-started-guide/
5+
/home/teknetik/websites/docs.kong/docs.konghq.com/mesh/latest/
6+
7+
8+
#Not done
9+
10+
/home/teknetik/websites/docs.kong/docs.konghq.com/mesh/changelog/
11+
/home/teknetik/websites/docs.kong/docs.konghq.com/plugins/
12+
/home/teknetik/websites/docs.kong/docs.konghq.com/studio/
13+
/home/teknetik/websites/docs.kong/docs.konghq.com/kubernetes-ingress-controller/
14+
/home/teknetik/websites/docs.kong/docs.konghq.com/install/
15+
/home/teknetik/websites/docs.kong/docs.konghq.com/konnect-platform/
16+
/home/teknetik/websites/docs.kong/docs.konghq.com/deck/

ingestion.py

+34-5
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,37 @@
44
from langchain.embeddings import OpenAIEmbeddings
55
from langchain.text_splitter import RecursiveCharacterTextSplitter
66
from langchain.vectorstores import Pinecone
7+
from langchain.document_loaders import UnstructuredHTMLLoader
78
import pinecone
89

910
pinecone.init(
1011
api_key=os.environ["PINECONE_API_KEY"],
1112
environment=os.environ["PINECONE_ENVIRONMENT_REGION"],
1213
)
13-
INDEX_NAME = "langchain-doc-index"
14+
INDEX_NAME = "langchain-doc"
15+
16+
import os
17+
18+
def get_files_in_dir(directory, extensions=(".html",)):
    """Recursively collect files under *directory* whose names end with a given suffix.

    Args:
        directory: Root directory to traverse with ``os.walk``.
        extensions: A filename suffix, or an iterable of suffixes, to keep.
            Defaults to ``(".html",)``, the only suffix previously accepted.

    Returns:
        A list of paths, each formed by joining the containing directory path
        with the file name. Directories and files are visited in sorted
        order, so repeated runs over the same tree return the same sequence.
    """
    # Accept a single suffix string as well as an iterable of suffixes;
    # str.endswith needs a str or a tuple.
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)

    matches = []
    for dirpath, dirnames, filenames in os.walk(directory):
        # Sorting dirnames in place makes os.walk descend in a stable order,
        # which keeps progress output reproducible between runs.
        dirnames.sort()
        matches.extend(
            os.path.join(dirpath, name)
            for name in sorted(filenames)
            if name.endswith(suffixes)
        )
    return matches
1429

1530

16-
def ingest_docs():
17-
loader = ReadTheDocsLoader("python.langchain.com/en/latest/index.html")
31+
def ingest_docs(file):
32+
#loader = ReadTheDocsLoader(path="/home/teknetik/websites/docs.kong/docs.konghq.com/index.html")
33+
loader = UnstructuredHTMLLoader(file)
1834
raw_documents = loader.load()
1935
print(f"loaded {len(raw_documents)} documents")
2036
text_splitter = RecursiveCharacterTextSplitter(
21-
chunk_size=400, chunk_overlap=50, separators=["\n\n", "\n", " ", ""]
37+
chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", " ", ""]
2238
)
2339
documents = text_splitter.split_documents(raw_documents)
2440
for doc in documents:
@@ -33,4 +49,17 @@ def ingest_docs():
3349

3450

3551
if __name__ == "__main__":
    # Script entry point: ingest every file found under one docs directory.
    #
    # Change the directory below before each run; only upload one version
    # at a time.
    #
    # Make sure to update the metadata with the product and version (or
    # other applicable metadata) before running.
    directory_to_scan = "/home/teknetik/websites/docs.kong/docs.konghq.com/mesh/latest/"  # Change this to your target directory
    file_list = get_files_in_dir(directory_to_scan)
    file_num = len(file_list)
    # enumerate(start=1) replaces the hand-maintained counter; the printed
    # progress line is unchanged.
    for i, file in enumerate(file_list, start=1):
        print(f"{file} {i} of {file_num}")
        ingest_docs(file)

0 commit comments

Comments
 (0)