Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

localdocs: avoid cases where batch can make no progress #3094

Merged
merged 6 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions gpt4all-chat/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Fix bug removing documents because of a wrong case sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix LocalDocs getting stuck without making progress in certain cases ([#3094](https://github.com/nomic-ai/gpt4all/pull/3094))

## [3.4.1] - 2024-10-11

Expand Down
72 changes: 36 additions & 36 deletions gpt4all-chat/src/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,9 +1129,12 @@ static void handleDocumentError(const QString &errorMessage, int document_id, co

class DocumentReader {
public:
struct Metadata { QString title, author, subject, keywords; };

static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);

const DocumentInfo &doc () const { return *m_info; }
const Metadata &metadata() const { return m_metadata; }
const std::optional<QString> &word () const { return m_word; }
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
Expand All @@ -1143,11 +1146,16 @@ class DocumentReader {
explicit DocumentReader(const DocumentInfo &info)
: m_info(&info) {}

void postInit() { m_word = advance(); }
void postInit(Metadata &&metadata = {})
{
m_metadata = std::move(metadata);
m_word = advance();
}

virtual std::optional<QString> advance() = 0;

const DocumentInfo *m_info;
Metadata m_metadata;
std::optional<QString> m_word;
};

Expand All @@ -1161,7 +1169,13 @@ class PdfDocumentReader final : public DocumentReader {
QString path = info.file.canonicalFilePath();
if (m_doc.load(path) != QPdfDocument::Error::None)
throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
postInit();
Metadata metadata {
.title = m_doc.metaData(QPdfDocument::MetaDataField::Title ).toString(),
.author = m_doc.metaData(QPdfDocument::MetaDataField::Author ).toString(),
.subject = m_doc.metaData(QPdfDocument::MetaDataField::Subject ).toString(),
.keywords = m_doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
};
postInit(std::move(metadata));
}

int page() const override { return m_currentPage; }
Expand Down Expand Up @@ -1200,6 +1214,7 @@ class WordDocumentReader final : public DocumentReader {

m_paragraph = &m_doc.paragraphs();
m_run = &m_paragraph->runs();
// TODO(jared): metadata for Word documents?
postInit();
}

Expand Down Expand Up @@ -1324,20 +1339,14 @@ ChunkStreamer::ChunkStreamer(Database *database)

ChunkStreamer::~ChunkStreamer() = default;

void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel,
const QString &title, const QString &author, const QString &subject,
const QString &keywords)
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel)
{
auto docKey = doc.key();
if (!m_docKey || *m_docKey != docKey) {
m_docKey = docKey;
m_reader = DocumentReader::fromDocument(doc);
m_documentId = documentId;
m_embeddingModel = embeddingModel;
m_title = title;
m_author = author;
m_subject = subject;
m_keywords = keywords;
m_chunk.clear();
m_page = 0;

Expand Down Expand Up @@ -1376,10 +1385,6 @@ ChunkStreamer::Status ChunkStreamer::step()
m_docKey.reset(); // done processing
return *error;
}
if (m_database->scanQueueInterrupted()) {
retval = Status::INTERRUPTED;
break;
}

// get a word, if needed
std::optional<QString> word = QString(); // empty string to disable EOF logic
Expand Down Expand Up @@ -1438,14 +1443,15 @@ ChunkStreamer::Status ChunkStreamer::step()

QSqlQuery q(m_database->m_db);
int chunkId = 0;
auto &metadata = m_reader->metadata();
if (!m_database->addChunk(q,
m_documentId,
chunk,
m_reader->doc().file.fileName(), // basename
m_title,
m_author,
m_subject,
m_keywords,
metadata.title,
metadata.author,
metadata.subject,
metadata.keywords,
m_page,
line_from,
line_to,
Expand All @@ -1472,6 +1478,11 @@ ChunkStreamer::Status ChunkStreamer::step()
break;
}
}

if (m_database->scanQueueInterrupted()) {
retval = Status::INTERRUPTED;
break;
}
}

if (nChunks) {
Expand Down Expand Up @@ -1635,13 +1646,16 @@ bool Database::scanQueueInterrupted() const

void Database::scanQueueBatch()
{
m_scanDurationTimer.start();

transaction();

// scan for up to 100ms or until we run out of documents
while (!m_docsToScan.empty() && !scanQueueInterrupted())
m_scanDurationTimer.start();

// scan for up to the maximum scan duration or until we run out of documents
while (!m_docsToScan.empty()) {
scanQueue();
if (scanQueueInterrupted())
break;
}

commit();

Expand Down Expand Up @@ -1727,22 +1741,8 @@ void Database::scanQueue()
Q_ASSERT(document_id != -1);

{
QString title, author, subject, keywords;
if (info.isPdf()) {
QPdfDocument doc;
if (doc.load(document_path) != QPdfDocument::Error::None) {
qWarning() << "ERROR: Could not load pdf" << document_id << document_path;
return updateFolderToIndex(folder_id, countForFolder);
}
title = doc.metaData(QPdfDocument::MetaDataField::Title).toString();
author = doc.metaData(QPdfDocument::MetaDataField::Author).toString();
subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString();
keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString();
// TODO(jared): metadata for Word documents?
}

try {
m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords);
m_chunkStreamer.setDocument(info, document_id, embedding_model);
} catch (const std::runtime_error &e) {
qWarning() << "LocalDocs ERROR:" << e.what();
goto dequeue;
Expand Down
3 changes: 1 addition & 2 deletions gpt4all-chat/src/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,7 @@ class ChunkStreamer {
explicit ChunkStreamer(Database *database);
~ChunkStreamer();

void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
const QString &author, const QString &subject, const QString &keywords);
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel);
std::optional<DocumentInfo::key_type> currentDocKey() const;
void reset();

Expand Down