diff --git a/gpt4all-chat/CHANGELOG.md b/gpt4all-chat/CHANGELOG.md index e11e721200e2..df80f9388d07 100644 --- a/gpt4all-chat/CHANGELOG.md +++ b/gpt4all-chat/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [Unreleased] + +### Fixed +- Limit bm25 retrieval to only specified collections ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083)) +- Fix bug removing documents because of an incorrectly case-sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083)) +- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083)) +- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083)) + ## [3.4.1] - 2024-10-11 ### Fixed @@ -155,6 +163,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). 
- Fix several Vulkan resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694)) - Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701)) +[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/v3.4.1...HEAD [3.4.1]: https://github.com/nomic-ai/gpt4all/compare/v3.4.0...v3.4.1 [3.4.0]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.4.0 [3.3.1]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.3.1 diff --git a/gpt4all-chat/CMakeLists.txt b/gpt4all-chat/CMakeLists.txt index 9ea50b5a1d4d..5a39962bcdbc 100644 --- a/gpt4all-chat/CMakeLists.txt +++ b/gpt4all-chat/CMakeLists.txt @@ -4,9 +4,9 @@ include(../common/common.cmake) set(APP_VERSION_MAJOR 3) set(APP_VERSION_MINOR 4) -set(APP_VERSION_PATCH 1) +set(APP_VERSION_PATCH 2) set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}") -set(APP_VERSION "${APP_VERSION_BASE}") +set(APP_VERSION "${APP_VERSION_BASE}-dev0") project(gpt4all VERSION ${APP_VERSION_BASE} LANGUAGES CXX C) diff --git a/gpt4all-chat/qml/LocalDocsSettings.qml b/gpt4all-chat/qml/LocalDocsSettings.qml index db86481f5e0a..a7ea5b75eb41 100644 --- a/gpt4all-chat/qml/LocalDocsSettings.qml +++ b/gpt4all-chat/qml/LocalDocsSettings.qml @@ -176,6 +176,7 @@ MySettingsTab { ListElement { text: qsTr("Application default") } Component.onCompleted: { MySettings.embeddingsDeviceList.forEach(d => append({"text": d})); + deviceBox.updateModel(); } } Accessible.name: deviceLabel.text diff --git a/gpt4all-chat/src/database.cpp b/gpt4all-chat/src/database.cpp index 0f271410fb9d..65ce828a5796 100644 --- a/gpt4all-chat/src/database.cpp +++ b/gpt4all-chat/src/database.cpp @@ -233,12 +233,17 @@ static const QString SELECT_COUNT_CHUNKS_SQL = uR"( )"_s; static const QString SELECT_CHUNKS_FTS_SQL = uR"( - select id, bm25(chunks_fts) as score - from chunks_fts + select fts.id, bm25(chunks_fts) as score + from chunks_fts fts + join 
documents d on fts.document_id = d.id + join collection_items ci on d.folder_id = ci.folder_id + join collections co on ci.collection_id = co.id where chunks_fts match ? - order by score limit %1; + and co.name in ('%1') + order by score limit %2; )"_s; + #define NAMED_PAIR(name, typea, a, typeb, b) \ struct name { typea a; typeb b; }; \ static bool operator==(const name &x, const name &y) { return x.a == y.a && x.b == y.b; } \ @@ -349,6 +354,14 @@ static const QString UPDATE_LAST_UPDATE_TIME_SQL = uR"( update collections set last_update_time = ? where id = ?; )"_s; +static const QString FTS_INTEGRITY_SQL = uR"( + insert into chunks_fts(chunks_fts, rank) values('integrity-check', 1); +)"_s; + +static const QString FTS_REBUILD_SQL = uR"( + insert into chunks_fts(chunks_fts) values('rebuild'); +)"_s; + static bool addCollection(QSqlQuery &q, const QString &collection_name, const QDateTime &start_update, const QDateTime &last_update, const QString &embedding_model, CollectionItem &item) { @@ -1815,6 +1828,7 @@ void Database::start() m_databaseValid = false; } else { cleanDB(); + ftsIntegrityCheck(); QSqlQuery q(m_db); if (!refreshDocumentIdCache(q)) { m_databaseValid = false; @@ -2328,7 +2342,7 @@ QList Database::searchBM25(const QString &query, const QList &coll QList bm25Queries = queriesForFTS5(query); QSqlQuery sqlQuery(m_db); - sqlQuery.prepare(SELECT_CHUNKS_FTS_SQL.arg(k)); + sqlQuery.prepare(SELECT_CHUNKS_FTS_SQL.arg(collections.join("', '"), QString::number(k))); QList results; for (auto &bm25Query : std::as_const(bm25Queries)) { @@ -2346,11 +2360,13 @@ QList Database::searchBM25(const QString &query, const QList &coll } } - do { - const int chunkId = sqlQuery.value(0).toInt(); - const float score = sqlQuery.value(1).toFloat(); - results.append({chunkId, score}); - } while (sqlQuery.next()); + if (sqlQuery.at() != QSql::AfterLastRow) { + do { + const int chunkId = sqlQuery.value(0).toInt(); + const float score = sqlQuery.value(1).toFloat(); + 
results.append({chunkId, score}); + } while (sqlQuery.next()); + } k = qMin(k, results.size()); std::partial_sort( @@ -2524,6 +2540,26 @@ void Database::retrieveFromDB(const QList &collections, const QString & results->append(tempResults.value(id)); } +bool Database::ftsIntegrityCheck() +{ + QSqlQuery q(m_db); + + // Executing this sql returns an error if the integrity check fails + // See: https://www.sqlite.org/fts5.html#the_integrity_check_command + const bool success = q.exec(FTS_INTEGRITY_SQL); + if (!success && q.lastError().nativeErrorCode() != "267" /*SQLITE_CORRUPT_VTAB from sqlite header*/) { + qWarning() << "ERROR: Cannot prepare sql for fts integrity check" << q.lastError(); + return false; + } + + if (!success && !q.exec(FTS_REBUILD_SQL)) { + qWarning() << "ERROR: Cannot exec sql for fts rebuild" << q.lastError(); + return false; + } + + return true; +} + // FIXME This is very slow and non-interruptible and when we close the application and we're // cleaning a large table this can cause the app to take forever to shut down. 
This would ideally be // interruptible and we'd continue 'cleaning' when we restart @@ -2574,7 +2610,7 @@ bool Database::cleanDB() int document_id = q.value(0).toInt(); QString document_path = q.value(1).toString(); QFileInfo info(document_path); - if (info.exists() && info.isReadable() && m_scannedFileExtensions.contains(info.suffix())) + if (info.exists() && info.isReadable() && m_scannedFileExtensions.contains(info.suffix(), Qt::CaseInsensitive)) continue; #if defined(DEBUG) diff --git a/gpt4all-chat/src/database.h b/gpt4all-chat/src/database.h index 0e90c260057a..faf0686f053d 100644 --- a/gpt4all-chat/src/database.h +++ b/gpt4all-chat/src/database.h @@ -41,10 +41,20 @@ class QTimer; /* Version 0: GPT4All v2.4.3, full-text search * Version 1: GPT4All v2.5.3, embeddings in hsnwlib - * Version 2: GPT4All v3.0.0, embeddings in sqlite */ + * Version 2: GPT4All v3.0.0, embeddings in sqlite + * Version 3: GPT4All v3.4.0, hybrid search + */ // minimum supported version static const int LOCALDOCS_MIN_VER = 1; + +// FIXME: (Adam) The next time we bump the version we should add triggers to manage the fts external +// content table as recommended in the official documentation to keep the fts index in sync +// See: https://www.sqlite.org/fts5.html#external_content_tables + +// FIXME: (Adam) The fts virtual table should include the chunk_id explicitly instead of relying upon +// the id of the two tables to be in sync + // current version static const int LOCALDOCS_VERSION = 3; @@ -252,6 +262,7 @@ private Q_SLOTS: void enqueueDocumentInternal(DocumentInfo &&info, bool prepend = false); void enqueueDocuments(int folder_id, std::list &&infos); void scanQueue(); + bool ftsIntegrityCheck(); bool cleanDB(); void addFolderToWatch(const QString &path); void removeFolderFromWatch(const QString &path);