Skip to content

Commit 4d8e4ba

Browse files
authored
Fix issue where only the first page of PDFs was indexed (#30)
1 parent 1324dd1 commit 4d8e4ba

File tree

1 file changed

+12
-7
lines changed

1 file changed

+12
-7
lines changed

flask_react/index_server.py

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -57,17 +57,22 @@ def query_index(query_text):
5757
def insert_into_index(doc_file_path, doc_id=None):
5858
"""Insert new document into global index."""
5959
global index, stored_docs
60-
document = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()[0]
61-
if doc_id is not None:
62-
document.id_ = doc_id
60+
documents = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()
6361

6462
with lock:
65-
# Keep track of stored docs
66-
stored_docs[document.id_] = document.text[0:200] # only take the first 200 chars
63+
for document in documents:
64+
if doc_id is not None:
65+
document.id_ = doc_id
66+
index.insert(document)
67+
68+
stored_docs[document.id_] = document.text[0:200] # only take the first 200 chars
6769

68-
index.insert(document)
6970
index.storage_context.persist(persist_dir=index_name)
70-
71+
72+
first_document = documents[0]
73+
# Keep track of stored docs -- llama_index doesn't make this easy
74+
stored_docs[first_document.doc_id] = first_document.text[0:200] # only take the first 200 chars
75+
7176
with open(pkl_name, "wb") as f:
7277
pickle.dump(stored_docs, f)
7378

0 commit comments

Comments
 (0)