1
+ /**
2
+ * @typedef {object } DocumentMetadata
3
+ * @property {string } id - eg; "123e4567-e89b-12d3-a456-426614174000"
4
+ * @property {string } url - eg; "file://example.com/index.html"
5
+ * @property {string } title - eg; "example.com/index.html"
6
+ * @property {string } docAuthor - eg; "no author found"
7
+ * @property {string } description - eg; "No description found."
8
+ * @property {string } docSource - eg; "URL link uploaded by the user."
9
+ * @property {string } chunkSource - eg; "link://https://example.com"
10
+ * @property {string } published - ISO 8601 date string
11
+ * @property {number } wordCount - Number of words in the document
12
+ * @property {string } pageContent - The raw text content of the document
13
+ * @property {number } token_count_estimate - Number of tokens in the document
14
+ */
15
+
1
16
function isNullOrNaN ( value ) {
2
17
if ( value === null ) return true ;
3
18
return isNaN ( value ) ;
@@ -29,10 +44,12 @@ class TextSplitter {
29
44
console . log ( `\x1b[35m[TextSplitter]\x1b[0m ${ text } ` , ...args ) ;
30
45
}
31
46
32
- // Does a quick check to determine the text chunk length limit.
33
- // Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
34
- // so here we want to allow override of the default 1000, but up to the models maximum, which is
35
- // sometimes user defined.
47
+ /**
48
+ * Does a quick check to determine the text chunk length limit.
49
+ * Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
50
+ * so here we want to allow override of the default 1000, but up to the model's maximum, which is
51
+ * sometimes user defined.
52
+ */
36
53
static determineMaxChunkSize ( preferred = null , embedderLimit = 1000 ) {
37
54
const prefValue = isNullOrNaN ( preferred )
38
55
? Number ( embedderLimit )
@@ -45,6 +62,70 @@ class TextSplitter {
45
62
return prefValue > limit ? limit : prefValue ;
46
63
}
47
64
65
+ /**
66
+ * Creates a string of metadata to be prepended to each chunk.
67
+ * @param {DocumentMetadata } metadata - Metadata to be prepended to each chunk.
68
+ * @returns {{[key: ('title' | 'published' | 'source')]: string} } Object of metadata that will be prepended to each chunk.
69
+ */
70
+ static buildHeaderMeta ( metadata = { } ) {
71
+ if ( ! metadata || Object . keys ( metadata ) . length === 0 ) return null ;
72
+ const PLUCK_MAP = {
73
+ title : {
74
+ as : "sourceDocument" ,
75
+ pluck : ( metadata ) => {
76
+ return metadata ?. title || null ;
77
+ } ,
78
+ } ,
79
+ published : {
80
+ as : "published" ,
81
+ pluck : ( metadata ) => {
82
+ return metadata ?. published || null ;
83
+ } ,
84
+ } ,
85
+ chunkSource : {
86
+ as : "source" ,
87
+ pluck : ( metadata ) => {
88
+ const validPrefixes = [ "link://" , "youtube://" ] ;
89
+ // If the chunkSource is a link or youtube link, we can add the URL
90
+ // as its source in the metadata so the LLM can use it for context.
91
+ // eg prompt: Where did you get this information? -> answer: "from https://example.com"
92
+ if (
93
+ ! metadata ?. chunkSource || // Exists
94
+ ! metadata ?. chunkSource . length || // Is not empty
95
+ typeof metadata . chunkSource !== "string" || // Is a string
96
+ ! validPrefixes . some (
97
+ ( prefix ) => metadata . chunkSource . startsWith ( prefix ) // Has a valid prefix we respect
98
+ )
99
+ )
100
+ return null ;
101
+
102
+ // We know a prefix is present, so we can split on it and return the rest.
103
+ // If nothing is found, return null and it will not be added to the metadata.
104
+ let source = null ;
105
+ for ( const prefix of validPrefixes ) {
106
+ source = metadata . chunkSource . split ( prefix ) ?. [ 1 ] || null ;
107
+ if ( source ) break ;
108
+ }
109
+
110
+ return source ;
111
+ } ,
112
+ } ,
113
+ } ;
114
+
115
+ const pluckedData = { } ;
116
+ Object . entries ( PLUCK_MAP ) . forEach ( ( [ key , value ] ) => {
117
+ if ( ! ( key in metadata ) ) return ; // Skip if the metadata key is not present.
118
+ const pluckedValue = value . pluck ( metadata ) ;
119
+ if ( ! pluckedValue ) return ; // Skip if the plucked value is null/empty.
120
+ pluckedData [ value . as ] = pluckedValue ;
121
+ } ) ;
122
+
123
+ return pluckedData ;
124
+ }
125
+
126
+ /**
127
+ * Creates a string of metadata to be prepended to each chunk.
128
+ */
48
129
stringifyHeader ( ) {
49
130
if ( ! this . config . chunkHeaderMeta ) return null ;
50
131
let content = "" ;
0 commit comments