diff --git a/server/utils/TextSplitter/index.js b/server/utils/TextSplitter/index.js index 4162fa74e9..fe6fe95cc1 100644 --- a/server/utils/TextSplitter/index.js +++ b/server/utils/TextSplitter/index.js @@ -1,3 +1,18 @@ +/** + * @typedef {object} DocumentMetadata + * @property {string} id - eg; "123e4567-e89b-12d3-a456-426614174000" + * @property {string} url - eg; "file://example.com/index.html" + * @property {string} title - eg; "example.com/index.html" + * @property {string} docAuthor - eg; "no author found" + * @property {string} description - eg; "No description found." + * @property {string} docSource - eg; "URL link uploaded by the user." + * @property {string} chunkSource - eg; link://https://example.com + * @property {string} published - ISO 8601 date string + * @property {number} wordCount - Number of words in the document + * @property {string} pageContent - The raw text content of the document + * @property {number} token_count_estimate - Number of tokens in the document + */ + function isNullOrNaN(value) { if (value === null) return true; return isNaN(value); @@ -29,10 +44,12 @@ class TextSplitter { console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args); } - // Does a quick check to determine the text chunk length limit. - // Embedder models have hard-set limits that cannot be exceeded, just like an LLM context - // so here we want to allow override of the default 1000, but up to the models maximum, which is - // sometimes user defined. + /** + * Does a quick check to determine the text chunk length limit. + * Embedder models have hard-set limits that cannot be exceeded, just like an LLM context + * so here we want to allow override of the default 1000, but up to the models maximum, which is + * sometimes user defined. + */ static determineMaxChunkSize(preferred = null, embedderLimit = 1000) { const prefValue = isNullOrNaN(preferred) ? Number(embedderLimit) @@ -45,6 +62,70 @@ class TextSplitter { return prefValue > limit ? limit : prefValue; } + /** + * Creates a string of metadata to be prepended to each chunk. + * @param {DocumentMetadata} metadata - Metadata to be prepended to each chunk. + * @returns {{[key: ('title' | 'published' | 'source')]: string}} Object of metadata that will be prepended to each chunk. + */ + static buildHeaderMeta(metadata = {}) { + if (!metadata || Object.keys(metadata).length === 0) return null; + const PLUCK_MAP = { + title: { + as: "sourceDocument", + pluck: (metadata) => { + return metadata?.title || null; + }, + }, + published: { + as: "published", + pluck: (metadata) => { + return metadata?.published || null; + }, + }, + chunkSource: { + as: "source", + pluck: (metadata) => { + const validPrefixes = ["link://", "youtube://"]; + // If the chunkSource is a link or youtube link, we can add the URL + // as its source in the metadata so the LLM can use it for context. + // eg prompt: Where did you get this information? -> answer: "from https://example.com" + if ( + !metadata?.chunkSource || // Exists + !metadata?.chunkSource.length || // Is not empty + typeof metadata.chunkSource !== "string" || // Is a string + !validPrefixes.some( + (prefix) => metadata.chunkSource.startsWith(prefix) // Has a valid prefix we respect + ) + ) + return null; + + // We know a prefix is present, so we can split on it and return the rest. + // If nothing is found, return null and it will not be added to the metadata. + let source = null; + for (const prefix of validPrefixes) { + source = metadata.chunkSource.split(prefix)?.[1] || null; + if (source) break; + } + + return source; + }, + }, + }; + + const pluckedData = {}; + Object.entries(PLUCK_MAP).forEach(([key, value]) => { + if (!(key in metadata)) return; // Skip if the metadata key is not present. + const pluckedValue = value.pluck(metadata); + if (!pluckedValue) return; // Skip if the plucked value is null/empty. + pluckedData[value.as] = pluckedValue; + }); + + return pluckedData; + } + + /** + * Creates a string of metadata to be prepended to each chunk. + */ stringifyHeader() { if (!this.config.chunkHeaderMeta) return null; let content = ""; diff --git a/server/utils/vectorDbProviders/astra/index.js b/server/utils/vectorDbProviders/astra/index.js index efaaa135a7..d292973efa 100644 --- a/server/utils/vectorDbProviders/astra/index.js +++ b/server/utils/vectorDbProviders/astra/index.js @@ -160,10 +160,7 @@ const AstraDB = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js index a79d4fc420..262ecd2578 100644 --- a/server/utils/vectorDbProviders/chroma/index.js +++ b/server/utils/vectorDbProviders/chroma/index.js @@ -251,10 +251,7 @@ const Chroma = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js index f6e9206181..e4eb87e956 100644 --- a/server/utils/vectorDbProviders/lance/index.js +++ b/server/utils/vectorDbProviders/lance/index.js @@ -240,10 +240,7 @@ const LanceDb = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/milvus/index.js b/server/utils/vectorDbProviders/milvus/index.js index 7b4c778f77..d0f2866f80 100644 --- a/server/utils/vectorDbProviders/milvus/index.js +++ b/server/utils/vectorDbProviders/milvus/index.js @@ -203,10 +203,7 @@ const Milvus = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js index 040f41d32f..a86c37f676 100644 --- a/server/utils/vectorDbProviders/pinecone/index.js +++ b/server/utils/vectorDbProviders/pinecone/index.js @@ -146,10 +146,7 @@ const PineconeDB = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js index 36550f098f..0773563678 100644 --- a/server/utils/vectorDbProviders/qdrant/index.js +++ b/server/utils/vectorDbProviders/qdrant/index.js @@ -222,10 +222,7 @@ const QDrant = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js index 176c56d634..e98296a8f4 100644 --- a/server/utils/vectorDbProviders/weaviate/index.js +++ b/server/utils/vectorDbProviders/weaviate/index.js @@ -262,10 +262,7 @@ const Weaviate = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent); diff --git a/server/utils/vectorDbProviders/zilliz/index.js b/server/utils/vectorDbProviders/zilliz/index.js index cb60d2e3f0..be138318c6 100644 --- a/server/utils/vectorDbProviders/zilliz/index.js +++ b/server/utils/vectorDbProviders/zilliz/index.js @@ -196,10 +196,7 @@ const Zilliz = { { label: "text_splitter_chunk_overlap" }, 20 ), - chunkHeaderMeta: { - sourceDocument: metadata?.title, - published: metadata?.published || "unknown", - }, + chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata), }); const textChunks = await textSplitter.splitText(pageContent);