diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index 3b4e58294dd9..1a25836ee5be 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -1116,9 +1116,12 @@ static void handleDocumentError(const QString &errorMessage, int document_id, co class DocumentReader { public: + struct Metadata { QString title, author, subject, keywords; }; + static std::unique_ptr fromDocument(const DocumentInfo &info); const DocumentInfo &doc () const { return *m_info; } + const Metadata &metadata() const { return m_metadata; } const std::optional &word () const { return m_word; } const std::optional &nextWord() { m_word = advance(); return m_word; } virtual std::optional getError() const { return std::nullopt; } @@ -1130,11 +1133,16 @@ class DocumentReader { explicit DocumentReader(const DocumentInfo &info) : m_info(&info) {} - void postInit() { m_word = advance(); } + void postInit(Metadata &&metadata = {}) + { + m_metadata = std::move(metadata); + m_word = advance(); + } virtual std::optional advance() = 0; const DocumentInfo *m_info; + Metadata m_metadata; std::optional m_word; }; @@ -1148,7 +1156,13 @@ class PdfDocumentReader final : public DocumentReader { QString path = info.file.canonicalFilePath(); if (m_doc.load(path) != QPdfDocument::Error::None) throw std::runtime_error(fmt::format("Failed to load PDF: {}", path)); - postInit(); + Metadata metadata { + .title = m_doc.metaData(QPdfDocument::MetaDataField::Title ).toString(), + .author = m_doc.metaData(QPdfDocument::MetaDataField::Author ).toString(), + .subject = m_doc.metaData(QPdfDocument::MetaDataField::Subject ).toString(), + .keywords = m_doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(), + }; + postInit(std::move(metadata)); } int page() const override { return m_currentPage; } @@ -1187,6 +1201,7 @@ class WordDocumentReader final : public DocumentReader { m_paragraph = &m_doc.paragraphs(); m_run = &m_paragraph->runs(); + // TODO(jared): metadata for Word documents? postInit(); } @@ -1311,9 +1326,7 @@ ChunkStreamer::ChunkStreamer(Database *database) ChunkStreamer::~ChunkStreamer() = default; -void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, - const QString &title, const QString &author, const QString &subject, - const QString &keywords) +void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel) { auto docKey = doc.key(); if (!m_docKey || *m_docKey != docKey) { @@ -1321,10 +1334,6 @@ void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const Q m_reader = DocumentReader::fromDocument(doc); m_documentId = documentId; m_embeddingModel = embeddingModel; - m_title = title; - m_author = author; - m_subject = subject; - m_keywords = keywords; m_chunk.clear(); m_page = 0; @@ -1421,14 +1430,15 @@ ChunkStreamer::Status ChunkStreamer::step() QSqlQuery q(m_database->m_db); int chunkId = 0; + auto &metadata = m_reader->metadata(); if (!m_database->addChunk(q, m_documentId, chunk, m_reader->doc().file.fileName(), // basename - m_title, - m_author, - m_subject, - m_keywords, + metadata.title, + metadata.author, + metadata.subject, + metadata.keywords, m_page, line_from, line_to, @@ -1718,22 +1728,8 @@ void Database::scanQueue() Q_ASSERT(document_id != -1); { - QString title, author, subject, keywords; - if (info.isPdf()) { - QPdfDocument doc; - if (doc.load(document_path) != QPdfDocument::Error::None) { - qWarning() << "ERROR: Could not load pdf" << document_id << document_path; - return updateFolderToIndex(folder_id, countForFolder); - } - title = doc.metaData(QPdfDocument::MetaDataField::Title).toString(); - author = doc.metaData(QPdfDocument::MetaDataField::Author).toString(); - subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString(); - keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(); - // TODO(jared): metadata for Word documents? - } - try { - m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords); + m_chunkStreamer.setDocument(info, document_id, embedding_model); } catch (const std::runtime_error &e) { qWarning() << "LocalDocs ERROR:" << e.what(); goto dequeue; diff --git a/gpt4all-chat/src/database.h b/gpt4all-chat/src/database.h index 0e90c260057a..c042055a1b4c 100644 --- a/gpt4all-chat/src/database.h +++ b/gpt4all-chat/src/database.h @@ -161,8 +161,7 @@ class ChunkStreamer { explicit ChunkStreamer(Database *database); ~ChunkStreamer(); - void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title, - const QString &author, const QString &subject, const QString &keywords); + void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel); std::optional currentDocKey() const; void reset();