@@ -10,11 +10,11 @@ import { VectorStore } from '@cairo-coder/agents/db/postgresVectorStore';
1010import { logger } from '@cairo-coder/agents/utils/index' ;
1111import * as fs from 'fs/promises' ;
1212import * as path from 'path' ;
13+ import { calculateHash } from '../utils/contentUtils' ;
1314import {
14- addSectionWithSizeLimit ,
15- calculateHash ,
16- createAnchor ,
17- } from '../utils/contentUtils' ;
15+ RecursiveMarkdownSplitter ,
16+ SplitOptions ,
17+ } from '../utils/RecursiveMarkdownSplitter' ;
1818
1919/**
2020 * Ingester for the Cairo Book documentation
@@ -63,109 +63,50 @@ export class CairoBookIngester extends MarkdownIngester {
6363 }
6464
6565 /**
66- * Chunk the core library summary file by H1 headers
66+ * Chunk the core library summary file using RecursiveMarkdownSplitter
6767 *
68- * This function takes the markdown content and splits it into sections
69- * based on H1 headers (# Header). Each section becomes a separate chunk
70- * with its content hashed for uniqueness.
68+ * This function takes the markdown content and splits it using a recursive
69+ * strategy that respects headers, code blocks, and maintains overlap between chunks.
7170 *
7271 * @param text - The markdown content to chunk
73- * @returns Promise<Document<BookChunk>[]> - Array of document chunks, one per H1 section
72+ * @returns Promise<Document<BookChunk>[]> - Array of document chunks
7473 */
7574 async chunkSummaryFile ( text : string ) : Promise < Document < BookChunk > [ ] > {
76- const content = text ;
77- const sections : ParsedSection [ ] = [ ] ;
78-
79- // We can't use a simple global regex, as it will incorrectly match commented
80- // lines inside code blocks. Instead, we'll parse line-by-line to find
81- // "real" headers, while keeping track of whether we're inside a code block.
82-
83- const realHeaders : { title : string ; startIndex : number } [ ] = [ ] ;
84- const lines = content . split ( '\n' ) ;
85- let inCodeBlock = false ;
86- let charIndex = 0 ;
87-
88- for ( const line of lines ) {
89- // Toggle the state if we encounter a code block fence
90- if ( line . trim ( ) . startsWith ( '```' ) ) {
91- inCodeBlock = ! inCodeBlock ;
92- }
93-
94- // A real H1 header is a line that starts with '# ' and is NOT in a code block.
95- // We use a specific regex to ensure it's a proper H1.
96- const h1Match = line . match ( / ^ # { 1 , 2 } \s + ( .+ ) $ / ) ;
97- if ( ! inCodeBlock && h1Match ) {
98- realHeaders . push ( {
99- title : h1Match [ 1 ] . trim ( ) ,
100- startIndex : charIndex ,
101- } ) ;
102- }
103-
104- // Move the character index forward, accounting for the newline character
105- charIndex += line . length + 1 ;
106- }
75+ // Configure the splitter with appropriate settings
76+ const splitOptions : SplitOptions = {
77+ maxChars : 2048 ,
78+ minChars : 500 ,
79+ overlap : 256 ,
80+ headerLevels : [ 1 , 2 ] , // Split on H1 and H2 headers
81+ preserveCodeBlocks : true ,
82+ idPrefix : 'cairo-book' ,
83+ trim : true ,
84+ } ;
10785
108- // If no H1 headers were found, treat the entire content as one section.
109- if ( realHeaders . length === 0 ) {
110- logger . debug (
111- 'No H1 headers found, creating single section from entire content' ,
112- ) ;
113- addSectionWithSizeLimit (
114- sections ,
115- 'Core Library Documentation' ,
116- content . trim ( ) ,
117- 20000 ,
118- createAnchor ( 'Core Library Documentation' ) ,
119- ) ;
120- } else {
121- // Process each valid H1 header found
122- for ( let i = 0 ; i < realHeaders . length ; i ++ ) {
123- const header = realHeaders [ i ] ;
124- const headerTitle = header . title ;
125- const headerStartIndex = header . startIndex ;
126-
127- // Determine the end of this section (start of next header or end of content)
128- const nextHeaderIndex =
129- i < realHeaders . length - 1
130- ? realHeaders [ i + 1 ] . startIndex
131- : content . length ;
132-
133- // Extract section content from the start of the header line to before the next header
134- const sectionContent = content
135- . slice ( headerStartIndex , nextHeaderIndex )
136- . trim ( ) ;
137-
138- logger . debug ( `Adding section: ${ headerTitle } ` ) ;
139-
140- addSectionWithSizeLimit (
141- sections ,
142- headerTitle ,
143- sectionContent ,
144- 20000 ,
145- createAnchor ( headerTitle ) ,
146- ) ;
147- }
148- }
86+ // Create the splitter and split the content
87+ const splitter = new RecursiveMarkdownSplitter ( splitOptions ) ;
88+ const chunks = splitter . splitMarkdownToChunks ( text ) ;
14989
150- const localChunks : Document < BookChunk > [ ] = [ ] ;
151-
152- // Create a document for each section
153- sections . forEach ( ( section : ParsedSection , index : number ) => {
154- const hash : string = calculateHash ( section . content ) ;
155- localChunks . push (
156- new Document < BookChunk > ( {
157- pageContent : section . content ,
158- metadata : {
159- name : section . title ,
160- title : section . title ,
161- chunkNumber : index ,
162- contentHash : hash ,
163- uniqueId : `${ section . title } -${ index } ` ,
164- sourceLink : `` ,
165- source : this . source , // Using placeholder for 'this.source'
166- } ,
167- } ) ,
168- ) ;
90+ logger . info (
91+ `Created ${ chunks . length } chunks using RecursiveMarkdownSplitter` ,
92+ ) ;
93+
94+ // Convert chunks to Document<BookChunk> format
95+ const localChunks : Document < BookChunk > [ ] = chunks . map ( ( chunk ) => {
96+ const contentHash = calculateHash ( chunk . content ) ;
97+
98+ return new Document < BookChunk > ( {
99+ pageContent : chunk . content ,
100+ metadata : {
101+ name : chunk . meta . title ,
102+ title : chunk . meta . title ,
103+ chunkNumber : chunk . meta . chunkNumber , // Already 0-based
104+ contentHash : contentHash ,
105+ uniqueId : chunk . meta . uniqueId ,
106+ sourceLink : '' ,
107+ source : this . source ,
108+ } ,
109+ } ) ;
169110 } ) ;
170111
171112 return localChunks ;
0 commit comments