Add header static class for metadata assembly (#2567)
* Add header static class for metadata assembly

* update comments

* patch header parsing for links
timothycarambat authored Nov 4, 2024
1 parent 80565d7 commit 04e2920
Showing 9 changed files with 93 additions and 36 deletions.
89 changes: 85 additions & 4 deletions server/utils/TextSplitter/index.js
@@ -1,3 +1,18 @@
/**
* @typedef {object} DocumentMetadata
* @property {string} id - e.g. "123e4567-e89b-12d3-a456-426614174000"
* @property {string} url - e.g. "file://example.com/index.html"
* @property {string} title - e.g. "example.com/index.html"
* @property {string} docAuthor - e.g. "no author found"
* @property {string} description - e.g. "No description found."
* @property {string} docSource - e.g. "URL link uploaded by the user."
* @property {string} chunkSource - e.g. "link://https://example.com"
* @property {string} published - ISO 8601 date string
* @property {number} wordCount - Number of words in the document
* @property {string} pageContent - The raw text content of the document
* @property {number} token_count_estimate - Number of tokens in the document
*/

function isNullOrNaN(value) {
if (value === null) return true;
return isNaN(value);
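
For orientation, a sample object matching the DocumentMetadata typedef above. The string values reuse the typedef's own examples; the date and the numeric fields are purely illustrative:

/** @type {DocumentMetadata} */
const exampleMetadata = {
  id: "123e4567-e89b-12d3-a456-426614174000",
  url: "file://example.com/index.html",
  title: "example.com/index.html",
  docAuthor: "no author found",
  description: "No description found.",
  docSource: "URL link uploaded by the user.",
  chunkSource: "link://https://example.com",
  published: "2024-11-04T00:00:00Z", // illustrative ISO 8601 value
  wordCount: 42, // illustrative
  pageContent: "Example page text...",
  token_count_estimate: 64, // illustrative
};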
@@ -29,10 +44,12 @@ class TextSplitter {
console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
}

// Does a quick check to determine the text chunk length limit.
// Embedder models have hard-set limits that cannot be exceeded, just like an LLM context,
// so here we want to allow an override of the default 1000 up to the model's maximum, which is
// sometimes user defined.
/**
* Does a quick check to determine the text chunk length limit.
* Embedder models have hard-set limits that cannot be exceeded, just like an LLM context,
* so here we want to allow an override of the default 1000 up to the model's maximum, which is
* sometimes user defined.
*/
static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
const prefValue = isNullOrNaN(preferred)
? Number(embedderLimit)
@@ -45,6 +62,70 @@ class TextSplitter {
return prefValue > limit ? limit : prefValue;
}
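
A brief usage sketch of the clamp behavior, assuming the elided branch resolves `limit` to the embedder's hard cap (the middle of the function is truncated in this diff):

TextSplitter.determineMaxChunkSize(500, 8192); // 500 (preferred fits under the limit)
TextSplitter.determineMaxChunkSize(10000, 8192); // 8192 (clamped to the embedder limit)
TextSplitter.determineMaxChunkSize(null, 8192); // 8192 (null/NaN falls back to the limit)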

/**
* Builds an object of metadata to be prepended to each chunk.
* @param {DocumentMetadata} metadata - Metadata to pluck header fields from.
* @returns {{[key: ('sourceDocument' | 'published' | 'source')]: string} | null} Object of metadata that will be prepended to each chunk, or null if no metadata was provided.
*/
static buildHeaderMeta(metadata = {}) {
if (!metadata || Object.keys(metadata).length === 0) return null;
const PLUCK_MAP = {
title: {
as: "sourceDocument",
pluck: (metadata) => {
return metadata?.title || null;
},
},
published: {
as: "published",
pluck: (metadata) => {
return metadata?.published || null;
},
},
chunkSource: {
as: "source",
pluck: (metadata) => {
const validPrefixes = ["link://", "youtube://"];
// If the chunkSource is a link or youtube link, we can add the URL
// as its source in the metadata so the LLM can use it for context.
// eg prompt: Where did you get this information? -> answer: "from https://example.com"
if (
!metadata?.chunkSource || // Does not exist
!metadata?.chunkSource.length || // Is empty
typeof metadata.chunkSource !== "string" || // Is not a string
!validPrefixes.some(
(prefix) => metadata.chunkSource.startsWith(prefix) // Lacks a prefix we respect
)
)
return null;

// We know a prefix is present, so we can split on it and return the rest.
// If nothing is found, return null and it will not be added to the metadata.
let source = null;
for (const prefix of validPrefixes) {
source = metadata.chunkSource.split(prefix)?.[1] || null;
if (source) break;
}

return source;
},
},
};

const pluckedData = {};
Object.entries(PLUCK_MAP).forEach(([key, value]) => {
if (!(key in metadata)) return; // Skip if the metadata key is not present.
const pluckedValue = value.pluck(metadata);
if (!pluckedValue) return; // Skip if the plucked value is null/empty.
pluckedData[value.as] = pluckedValue;
});

return pluckedData;
}
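
To make the plucking concrete, a short usage sketch based only on the code above (all values are illustrative):

const header = TextSplitter.buildHeaderMeta({
  title: "example.com/index.html",
  published: "2024-11-04T00:00:00Z",
  chunkSource: "link://https://example.com",
});
// header -> {
//   sourceDocument: "example.com/index.html",
//   published: "2024-11-04T00:00:00Z",
//   source: "https://example.com",
// }

// A chunkSource without a link:// or youtube:// prefix contributes no "source" key:
TextSplitter.buildHeaderMeta({ chunkSource: "document://report.pdf" }); // -> {}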

/**
* Creates a string of metadata to be prepended to each chunk.
*/
stringifyHeader() {
if (!this.config.chunkHeaderMeta) return null;
let content = "";
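
The method body is truncated in this diff. A minimal sketch of a plausible completion, assuming the header simply joins each key/value pair onto its own line; the wrapper tag and exact format are assumptions, not taken from the diff:

// Hypothetical continuation; the format is assumed, not confirmed by the diff.
for (const [key, value] of Object.entries(this.config.chunkHeaderMeta)) {
  if (!key || !value) continue; // skip empty entries
  content += `${key}: ${value}\n`;
}
return content ? `<document_metadata>\n${content}</document_metadata>\n\n` : null;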
5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/astra/index.js
@@ -160,10 +160,7 @@ const AstraDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);
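
The same one-line swap is applied to all eight vector DB providers below. Its practical effect, sketched from the two versions visible in this hunk: absent fields are now omitted from the chunk header rather than padded with "unknown", and link/youtube sources gain a source URL. Values are illustrative:

// Before: keys were always emitted, even when metadata was missing.
// { sourceDocument: undefined, published: "unknown" }

// After: only populated keys survive, and link sources surface their URL.
TextSplitter.buildHeaderMeta({
  title: "example.com/index.html",
  chunkSource: "link://https://example.com",
});
// -> { sourceDocument: "example.com/index.html", source: "https://example.com" }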

5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/chroma/index.js
@@ -251,10 +251,7 @@ const Chroma = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/lance/index.js
@@ -240,10 +240,7 @@ const LanceDb = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/milvus/index.js
@@ -203,10 +203,7 @@ const Milvus = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/pinecone/index.js
@@ -146,10 +146,7 @@ const PineconeDB = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/qdrant/index.js
@@ -222,10 +222,7 @@ const QDrant = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/weaviate/index.js
@@ -262,10 +262,7 @@ const Weaviate = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

5 changes: 1 addition & 4 deletions server/utils/vectorDbProviders/zilliz/index.js
@@ -196,10 +196,7 @@ const Zilliz = {
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: {
sourceDocument: metadata?.title,
published: metadata?.published || "unknown",
},
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
});
const textChunks = await textSplitter.splitText(pageContent);

