Skip to content

Commit

Permalink
localdocs: avoid cases where batch can make no progress (#3094)
Browse files Browse the repository at this point in the history
Signed-off-by: Jared Van Bortel <[email protected]>
  • Loading branch information
cebtenzzre committed Oct 16, 2024
1 parent f8dde82 commit 36a3826
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 38 deletions.
1 change: 1 addition & 0 deletions gpt4all-chat/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Fix bug removing documents because of a wrong case sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix LocalDocs getting stuck without making progress in certain cases ([#3094](https://github.com/nomic-ai/gpt4all/pull/3094))

## [3.4.1] - 2024-10-11

Expand Down
72 changes: 36 additions & 36 deletions gpt4all-chat/src/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,9 +1129,12 @@ static void handleDocumentError(const QString &errorMessage, int document_id, co

class DocumentReader {
public:
struct Metadata { QString title, author, subject, keywords; };

static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);

const DocumentInfo &doc () const { return *m_info; }
const Metadata &metadata() const { return m_metadata; }
const std::optional<QString> &word () const { return m_word; }
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
Expand All @@ -1143,11 +1146,16 @@ class DocumentReader {
explicit DocumentReader(const DocumentInfo &info)
: m_info(&info) {}

void postInit() { m_word = advance(); }
void postInit(Metadata &&metadata = {})
{
m_metadata = std::move(metadata);
m_word = advance();
}

virtual std::optional<QString> advance() = 0;

const DocumentInfo *m_info;
Metadata m_metadata;
std::optional<QString> m_word;
};

Expand All @@ -1161,7 +1169,13 @@ class PdfDocumentReader final : public DocumentReader {
QString path = info.file.canonicalFilePath();
if (m_doc.load(path) != QPdfDocument::Error::None)
throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
postInit();
Metadata metadata {
.title = m_doc.metaData(QPdfDocument::MetaDataField::Title ).toString(),
.author = m_doc.metaData(QPdfDocument::MetaDataField::Author ).toString(),
.subject = m_doc.metaData(QPdfDocument::MetaDataField::Subject ).toString(),
.keywords = m_doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
};
postInit(std::move(metadata));
}

int page() const override { return m_currentPage; }
Expand Down Expand Up @@ -1200,6 +1214,7 @@ class WordDocumentReader final : public DocumentReader {

m_paragraph = &m_doc.paragraphs();
m_run = &m_paragraph->runs();
// TODO(jared): metadata for Word documents?
postInit();
}

Expand Down Expand Up @@ -1324,20 +1339,14 @@ ChunkStreamer::ChunkStreamer(Database *database)

ChunkStreamer::~ChunkStreamer() = default;

void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel,
const QString &title, const QString &author, const QString &subject,
const QString &keywords)
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel)
{
auto docKey = doc.key();
if (!m_docKey || *m_docKey != docKey) {
m_docKey = docKey;
m_reader = DocumentReader::fromDocument(doc);
m_documentId = documentId;
m_embeddingModel = embeddingModel;
m_title = title;
m_author = author;
m_subject = subject;
m_keywords = keywords;
m_chunk.clear();
m_page = 0;

Expand Down Expand Up @@ -1376,10 +1385,6 @@ ChunkStreamer::Status ChunkStreamer::step()
m_docKey.reset(); // done processing
return *error;
}
if (m_database->scanQueueInterrupted()) {
retval = Status::INTERRUPTED;
break;
}

// get a word, if needed
std::optional<QString> word = QString(); // empty string to disable EOF logic
Expand Down Expand Up @@ -1438,14 +1443,15 @@ ChunkStreamer::Status ChunkStreamer::step()

QSqlQuery q(m_database->m_db);
int chunkId = 0;
auto &metadata = m_reader->metadata();
if (!m_database->addChunk(q,
m_documentId,
chunk,
m_reader->doc().file.fileName(), // basename
m_title,
m_author,
m_subject,
m_keywords,
metadata.title,
metadata.author,
metadata.subject,
metadata.keywords,
m_page,
line_from,
line_to,
Expand All @@ -1472,6 +1478,11 @@ ChunkStreamer::Status ChunkStreamer::step()
break;
}
}

if (m_database->scanQueueInterrupted()) {
retval = Status::INTERRUPTED;
break;
}
}

if (nChunks) {
Expand Down Expand Up @@ -1635,13 +1646,16 @@ bool Database::scanQueueInterrupted() const

void Database::scanQueueBatch()
{
m_scanDurationTimer.start();

transaction();

// scan for up to 100ms or until we run out of documents
while (!m_docsToScan.empty() && !scanQueueInterrupted())
m_scanDurationTimer.start();

// scan for up to the maximum scan duration or until we run out of documents
while (!m_docsToScan.empty()) {
scanQueue();
if (scanQueueInterrupted())
break;
}

commit();

Expand Down Expand Up @@ -1727,22 +1741,8 @@ void Database::scanQueue()
Q_ASSERT(document_id != -1);

{
QString title, author, subject, keywords;
if (info.isPdf()) {
QPdfDocument doc;
if (doc.load(document_path) != QPdfDocument::Error::None) {
qWarning() << "ERROR: Could not load pdf" << document_id << document_path;
return updateFolderToIndex(folder_id, countForFolder);
}
title = doc.metaData(QPdfDocument::MetaDataField::Title).toString();
author = doc.metaData(QPdfDocument::MetaDataField::Author).toString();
subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString();
keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString();
// TODO(jared): metadata for Word documents?
}

try {
m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords);
m_chunkStreamer.setDocument(info, document_id, embedding_model);
} catch (const std::runtime_error &e) {
qWarning() << "LocalDocs ERROR:" << e.what();
goto dequeue;
Expand Down
3 changes: 1 addition & 2 deletions gpt4all-chat/src/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,7 @@ class ChunkStreamer {
explicit ChunkStreamer(Database *database);
~ChunkStreamer();

void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
const QString &author, const QString &subject, const QString &keywords);
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel);
std::optional<DocumentInfo::key_type> currentDocKey() const;
void reset();

Expand Down

0 comments on commit 36a3826

Please sign in to comment.