Skip to content

Commit 04e2920

Browse files
Add header static class for metadata assembly (#2567)
* Add header static class for metadata assembly * update comments * patch header parsing for links
1 parent 80565d7 commit 04e2920

File tree

9 files changed

+93
-36
lines changed

9 files changed

+93
-36
lines changed

Diff for: server/utils/TextSplitter/index.js

+85-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
/**
2+
* @typedef {object} DocumentMetadata
3+
* @property {string} id - eg; "123e4567-e89b-12d3-a456-426614174000"
4+
* @property {string} url - eg; "file://example.com/index.html"
5+
* @property {string} title - eg; "example.com/index.html"
6+
* @property {string} docAuthor - eg; "no author found"
7+
* @property {string} description - eg; "No description found."
8+
* @property {string} docSource - eg; "URL link uploaded by the user."
9+
* @property {string} chunkSource - eg; "link://https://example.com"
10+
* @property {string} published - ISO 8601 date string
11+
* @property {number} wordCount - Number of words in the document
12+
* @property {string} pageContent - The raw text content of the document
13+
* @property {number} token_count_estimate - Number of tokens in the document
14+
*/
15+
116
function isNullOrNaN(value) {
217
if (value === null) return true;
318
return isNaN(value);
@@ -29,10 +44,12 @@ class TextSplitter {
2944
console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
3045
}
3146

32-
// Does a quick check to determine the text chunk length limit.
33-
// Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
34-
// so here we want to allow override of the default 1000, but up to the models maximum, which is
35-
// sometimes user defined.
47+
/**
48+
* Does a quick check to determine the text chunk length limit.
49+
* Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
50+
* so here we want to allow override of the default 1000, but up to the model's maximum, which is
51+
* sometimes user defined.
52+
*/
3653
static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
3754
const prefValue = isNullOrNaN(preferred)
3855
? Number(embedderLimit)
@@ -45,6 +62,70 @@ class TextSplitter {
4562
return prefValue > limit ? limit : prefValue;
4663
}
4764

65+
/**
66+
* Creates a string of metadata to be prepended to each chunk.
67+
* @param {DocumentMetadata} metadata - Metadata to be prepended to each chunk.
68+
* @returns {{[key: ('title' | 'published' | 'source')]: string}} Object of metadata that will be prepended to each chunk.
69+
*/
70+
static buildHeaderMeta(metadata = {}) {
71+
if (!metadata || Object.keys(metadata).length === 0) return null;
72+
const PLUCK_MAP = {
73+
title: {
74+
as: "sourceDocument",
75+
pluck: (metadata) => {
76+
return metadata?.title || null;
77+
},
78+
},
79+
published: {
80+
as: "published",
81+
pluck: (metadata) => {
82+
return metadata?.published || null;
83+
},
84+
},
85+
chunkSource: {
86+
as: "source",
87+
pluck: (metadata) => {
88+
const validPrefixes = ["link://", "youtube://"];
89+
// If the chunkSource is a link or youtube link, we can add the URL
90+
// as its source in the metadata so the LLM can use it for context.
91+
// eg prompt: Where did you get this information? -> answer: "from https://example.com"
92+
if (
93+
!metadata?.chunkSource || // Exists
94+
!metadata?.chunkSource.length || // Is not empty
95+
typeof metadata.chunkSource !== "string" || // Is a string
96+
!validPrefixes.some(
97+
(prefix) => metadata.chunkSource.startsWith(prefix) // Has a valid prefix we respect
98+
)
99+
)
100+
return null;
101+
102+
// We know a prefix is present, so we can split on it and return the rest.
103+
// If nothing is found, return null and it will not be added to the metadata.
104+
let source = null;
105+
for (const prefix of validPrefixes) {
106+
source = metadata.chunkSource.split(prefix)?.[1] || null;
107+
if (source) break;
108+
}
109+
110+
return source;
111+
},
112+
},
113+
};
114+
115+
const pluckedData = {};
116+
Object.entries(PLUCK_MAP).forEach(([key, value]) => {
117+
if (!(key in metadata)) return; // Skip if the metadata key is not present.
118+
const pluckedValue = value.pluck(metadata);
119+
if (!pluckedValue) return; // Skip if the plucked value is null/empty.
120+
pluckedData[value.as] = pluckedValue;
121+
});
122+
123+
return pluckedData;
124+
}
125+
126+
/**
127+
* Creates a string of metadata to be prepended to each chunk.
128+
*/
48129
stringifyHeader() {
49130
if (!this.config.chunkHeaderMeta) return null;
50131
let content = "";

Diff for: server/utils/vectorDbProviders/astra/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,7 @@ const AstraDB = {
160160
{ label: "text_splitter_chunk_overlap" },
161161
20
162162
),
163-
chunkHeaderMeta: {
164-
sourceDocument: metadata?.title,
165-
published: metadata?.published || "unknown",
166-
},
163+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
167164
});
168165
const textChunks = await textSplitter.splitText(pageContent);
169166

Diff for: server/utils/vectorDbProviders/chroma/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -251,10 +251,7 @@ const Chroma = {
251251
{ label: "text_splitter_chunk_overlap" },
252252
20
253253
),
254-
chunkHeaderMeta: {
255-
sourceDocument: metadata?.title,
256-
published: metadata?.published || "unknown",
257-
},
254+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
258255
});
259256
const textChunks = await textSplitter.splitText(pageContent);
260257

Diff for: server/utils/vectorDbProviders/lance/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -240,10 +240,7 @@ const LanceDb = {
240240
{ label: "text_splitter_chunk_overlap" },
241241
20
242242
),
243-
chunkHeaderMeta: {
244-
sourceDocument: metadata?.title,
245-
published: metadata?.published || "unknown",
246-
},
243+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
247244
});
248245
const textChunks = await textSplitter.splitText(pageContent);
249246

Diff for: server/utils/vectorDbProviders/milvus/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,7 @@ const Milvus = {
203203
{ label: "text_splitter_chunk_overlap" },
204204
20
205205
),
206-
chunkHeaderMeta: {
207-
sourceDocument: metadata?.title,
208-
published: metadata?.published || "unknown",
209-
},
206+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
210207
});
211208
const textChunks = await textSplitter.splitText(pageContent);
212209

Diff for: server/utils/vectorDbProviders/pinecone/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,7 @@ const PineconeDB = {
146146
{ label: "text_splitter_chunk_overlap" },
147147
20
148148
),
149-
chunkHeaderMeta: {
150-
sourceDocument: metadata?.title,
151-
published: metadata?.published || "unknown",
152-
},
149+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
153150
});
154151
const textChunks = await textSplitter.splitText(pageContent);
155152

Diff for: server/utils/vectorDbProviders/qdrant/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -222,10 +222,7 @@ const QDrant = {
222222
{ label: "text_splitter_chunk_overlap" },
223223
20
224224
),
225-
chunkHeaderMeta: {
226-
sourceDocument: metadata?.title,
227-
published: metadata?.published || "unknown",
228-
},
225+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
229226
});
230227
const textChunks = await textSplitter.splitText(pageContent);
231228

Diff for: server/utils/vectorDbProviders/weaviate/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -262,10 +262,7 @@ const Weaviate = {
262262
{ label: "text_splitter_chunk_overlap" },
263263
20
264264
),
265-
chunkHeaderMeta: {
266-
sourceDocument: metadata?.title,
267-
published: metadata?.published || "unknown",
268-
},
265+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
269266
});
270267
const textChunks = await textSplitter.splitText(pageContent);
271268

Diff for: server/utils/vectorDbProviders/zilliz/index.js

+1-4
Original file line numberDiff line numberDiff line change
@@ -196,10 +196,7 @@ const Zilliz = {
196196
{ label: "text_splitter_chunk_overlap" },
197197
20
198198
),
199-
chunkHeaderMeta: {
200-
sourceDocument: metadata?.title,
201-
published: metadata?.published || "unknown",
202-
},
199+
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
203200
});
204201
const textChunks = await textSplitter.splitText(pageContent);
205202

0 commit comments

Comments
 (0)