From 70403a90d2625e3a519a5a3adbfd9f9dc5d19e50 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 15 Oct 2024 11:42:11 -0400 Subject: [PATCH 1/6] localdocs: do not count transaction start time against the batch timer Signed-off-by: Jared Van Bortel --- gpt4all-chat/src/database.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index 65ce828a5796..d79b73f25c37 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -1635,10 +1635,10 @@ bool Database::scanQueueInterrupted() const void Database::scanQueueBatch() { - m_scanDurationTimer.start(); - transaction(); + m_scanDurationTimer.start(); + // scan for up to 100ms or until we run out of documents while (!m_docsToScan.empty() && !scanQueueInterrupted()) scanQueue(); From 0ca3317776b451321039f068958d5b7370c9013d Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 15 Oct 2024 11:45:18 -0400 Subject: [PATCH 2/6] localdocs: explicitly process at least one document per batch Signed-off-by: Jared Van Bortel --- gpt4all-chat/src/database.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index d79b73f25c37..7f7dbd865bd2 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -1640,8 +1640,11 @@ void Database::scanQueueBatch() m_scanDurationTimer.start(); // scan for up to 100ms or until we run out of documents - while (!m_docsToScan.empty() && !scanQueueInterrupted()) + while (!m_docsToScan.empty()) { scanQueue(); + if (scanQueueInterrupted()) + break; + } commit(); From 47e23f439f69afbb58acee052d59ebf7d80de05e Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 15 Oct 2024 11:46:13 -0400 Subject: [PATCH 3/6] localdocs: explicitly process at least one word Signed-off-by: Jared Van Bortel --- gpt4all-chat/src/database.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index 7f7dbd865bd2..e72316fd5409 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -1376,10 +1376,6 @@ ChunkStreamer::Status ChunkStreamer::step() m_docKey.reset(); // done processing return *error; } - if (m_database->scanQueueInterrupted()) { - retval = Status::INTERRUPTED; - break; - } // get a word, if needed std::optional word = QString(); // empty string to disable EOF logic @@ -1472,6 +1468,11 @@ ChunkStreamer::Status ChunkStreamer::step() break; } } + + if (m_database->scanQueueInterrupted()) { + retval = Status::INTERRUPTED; + break; + } } if (nChunks) { From 296b29d996322d1fe1d9a5281a3d21f07e261d63 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 15 Oct 2024 15:02:29 -0400 Subject: [PATCH 4/6] localdocs: handle metadata in document reader Recording this metadata once avoids the need to open the PDF document every time we enter scanQueue. Signed-off-by: Jared Van Bortel --- gpt4all-chat/src/database.cpp | 52 ++++++++++++++++------------------- gpt4all-chat/src/database.h | 3 +- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index e72316fd5409..320f33b29229 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -1129,9 +1129,12 @@ static void handleDocumentError(const QString &errorMessage, int document_id, co class DocumentReader { public: + struct Metadata { QString title, author, subject, keywords; }; + static std::unique_ptr fromDocument(const DocumentInfo &info); const DocumentInfo &doc () const { return *m_info; } + const Metadata &metadata() const { return m_metadata; } const std::optional &word () const { return m_word; } const std::optional &nextWord() { m_word = advance(); return m_word; } virtual std::optional getError() const { return std::nullopt; } @@ -1143,11 +1146,16 @@ class DocumentReader { explicit DocumentReader(const DocumentInfo &info) : m_info(&info) {} - void postInit() { m_word = advance(); } + void postInit(Metadata &&metadata = {}) + { + m_metadata = std::move(metadata); + m_word = advance(); + } virtual std::optional advance() = 0; const DocumentInfo *m_info; + Metadata m_metadata; std::optional m_word; }; @@ -1161,7 +1169,13 @@ class PdfDocumentReader final : public DocumentReader { QString path = info.file.canonicalFilePath(); if (m_doc.load(path) != QPdfDocument::Error::None) throw std::runtime_error(fmt::format("Failed to load PDF: {}", path)); - postInit(); + Metadata metadata { + .title = m_doc.metaData(QPdfDocument::MetaDataField::Title ).toString(), + .author = m_doc.metaData(QPdfDocument::MetaDataField::Author ).toString(), + .subject = m_doc.metaData(QPdfDocument::MetaDataField::Subject ).toString(), + .keywords = m_doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(), + }; + postInit(std::move(metadata)); } int page() const override { return m_currentPage; } @@ -1200,6 +1214,7 @@ class WordDocumentReader final : public DocumentReader { m_paragraph = &m_doc.paragraphs(); m_run = &m_paragraph->runs(); + // TODO(jared): metadata for Word documents? postInit(); } @@ -1324,9 +1339,7 @@ ChunkStreamer::ChunkStreamer(Database *database) ChunkStreamer::~ChunkStreamer() = default; -void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, - const QString &title, const QString &author, const QString &subject, - const QString &keywords) +void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel) { auto docKey = doc.key(); if (!m_docKey || *m_docKey != docKey) { @@ -1334,10 +1347,6 @@ void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const Q m_reader = DocumentReader::fromDocument(doc); m_documentId = documentId; m_embeddingModel = embeddingModel; - m_title = title; - m_author = author; - m_subject = subject; - m_keywords = keywords; m_chunk.clear(); m_page = 0; @@ -1434,14 +1443,15 @@ ChunkStreamer::Status ChunkStreamer::step() QSqlQuery q(m_database->m_db); int chunkId = 0; + auto &metadata = m_reader->metadata(); if (!m_database->addChunk(q, m_documentId, chunk, m_reader->doc().file.fileName(), // basename - m_title, - m_author, - m_subject, - m_keywords, + metadata.title, + metadata.author, + metadata.subject, + metadata.keywords, m_page, line_from, line_to, @@ -1731,22 +1741,8 @@ void Database::scanQueue() Q_ASSERT(document_id != -1); { - QString title, author, subject, keywords; - if (info.isPdf()) { - QPdfDocument doc; - if (doc.load(document_path) != QPdfDocument::Error::None) { - qWarning() << "ERROR: Could not load pdf" << document_id << document_path; - return updateFolderToIndex(folder_id, countForFolder); - } - title = doc.metaData(QPdfDocument::MetaDataField::Title).toString(); - author = doc.metaData(QPdfDocument::MetaDataField::Author).toString(); - subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString(); - keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(); - // TODO(jared): metadata for Word documents? - } - try { - m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords); + m_chunkStreamer.setDocument(info, document_id, embedding_model); } catch (const std::runtime_error &e) { qWarning() << "LocalDocs ERROR:" << e.what(); goto dequeue; diff --git a/gpt4all-chat/src/database.h b/gpt4all-chat/src/database.h index faf0686f053d..30c139421cb3 100644 --- a/gpt4all-chat/src/database.h +++ b/gpt4all-chat/src/database.h @@ -171,8 +171,7 @@ class ChunkStreamer { explicit ChunkStreamer(Database *database); ~ChunkStreamer(); - void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title, - const QString &author, const QString &subject, const QString &keywords); + void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel); std::optional currentDocKey() const; void reset(); From 61dc351f39404fc5253a296e225ab1594d58b571 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 15 Oct 2024 15:33:07 -0400 Subject: [PATCH 5/6] changelog: add this PR Signed-off-by: Jared Van Bortel --- gpt4all-chat/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/gpt4all-chat/CHANGELOG.md b/gpt4all-chat/CHANGELOG.md index df80f9388d07..e40f7e28c1a6 100644 --- a/gpt4all-chat/CHANGELOG.md +++ b/gpt4all-chat/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Fix bug removing documents because of a wrong case sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083)) - Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083)) - Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083)) +- Prevent LocalDocs from not making progress in certain cases ([#3094](https://github.com/nomic-ai/gpt4all/pull/3094)) ## [3.4.1] - 2024-10-11 From 519a915d4cdcaf44880a1c7d4bf769545a1a66b7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 16 Oct 2024 10:59:36 -0400 Subject: [PATCH 6/6] localdocs: remove magic number from comment Signed-off-by: Jared Van Bortel --- gpt4all-chat/src/database.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index 320f33b29229..bec2e54810aa 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -1650,7 +1650,7 @@ void Database::scanQueueBatch() m_scanDurationTimer.start(); - // scan for up to 100ms or until we run out of documents + // scan for up to the maximum scan duration or until we run out of documents while (!m_docsToScan.empty()) { scanQueue(); if (scanQueueInterrupted())