
Commit 696f312

feat(ingester): better markdown splitter (#33)
1 parent df21098 commit 696f312

File tree

8 files changed: +2121 -183 lines changed

packages/ingester/src/ingesters/CairoBookIngester.ts

Lines changed: 41 additions & 100 deletions

@@ -10,11 +10,11 @@ import { VectorStore } from '@cairo-coder/agents/db/postgresVectorStore';
 import { logger } from '@cairo-coder/agents/utils/index';
 import * as fs from 'fs/promises';
 import * as path from 'path';
+import { calculateHash } from '../utils/contentUtils';
 import {
-  addSectionWithSizeLimit,
-  calculateHash,
-  createAnchor,
-} from '../utils/contentUtils';
+  RecursiveMarkdownSplitter,
+  SplitOptions,
+} from '../utils/RecursiveMarkdownSplitter';

 /**
  * Ingester for the Cairo Book documentation
@@ -63,109 +63,50 @@ export class CairoBookIngester extends MarkdownIngester {
   }

   /**
-   * Chunk the core library summary file by H1 headers
+   * Chunk the core library summary file using RecursiveMarkdownSplitter
    *
-   * This function takes the markdown content and splits it into sections
-   * based on H1 headers (# Header). Each section becomes a separate chunk
-   * with its content hashed for uniqueness.
+   * This function takes the markdown content and splits it using a recursive
+   * strategy that respects headers, code blocks, and maintains overlap between chunks.
    *
    * @param text - The markdown content to chunk
-   * @returns Promise<Document<BookChunk>[]> - Array of document chunks, one per H1 section
+   * @returns Promise<Document<BookChunk>[]> - Array of document chunks
    */
   async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
-    const content = text;
-    const sections: ParsedSection[] = [];
-
-    // We can't use a simple global regex, as it will incorrectly match commented
-    // lines inside code blocks. Instead, we'll parse line-by-line to find
-    // "real" headers, while keeping track of whether we're inside a code block.
-
-    const realHeaders: { title: string; startIndex: number }[] = [];
-    const lines = content.split('\n');
-    let inCodeBlock = false;
-    let charIndex = 0;
-
-    for (const line of lines) {
-      // Toggle the state if we encounter a code block fence
-      if (line.trim().startsWith('```')) {
-        inCodeBlock = !inCodeBlock;
-      }
-
-      // A real H1 header is a line that starts with '# ' and is NOT in a code block.
-      // We use a specific regex to ensure it's a proper H1.
-      const h1Match = line.match(/^#{1,2}\s+(.+)$/);
-      if (!inCodeBlock && h1Match) {
-        realHeaders.push({
-          title: h1Match[1].trim(),
-          startIndex: charIndex,
-        });
-      }
-
-      // Move the character index forward, accounting for the newline character
-      charIndex += line.length + 1;
-    }
+    // Configure the splitter with appropriate settings
+    const splitOptions: SplitOptions = {
+      maxChars: 2048,
+      minChars: 500,
+      overlap: 256,
+      headerLevels: [1, 2], // Split on H1 and H2 headers
+      preserveCodeBlocks: true,
+      idPrefix: 'cairo-book',
+      trim: true,
+    };

-    // If no H1 headers were found, treat the entire content as one section.
-    if (realHeaders.length === 0) {
-      logger.debug(
-        'No H1 headers found, creating single section from entire content',
-      );
-      addSectionWithSizeLimit(
-        sections,
-        'Core Library Documentation',
-        content.trim(),
-        20000,
-        createAnchor('Core Library Documentation'),
-      );
-    } else {
-      // Process each valid H1 header found
-      for (let i = 0; i < realHeaders.length; i++) {
-        const header = realHeaders[i];
-        const headerTitle = header.title;
-        const headerStartIndex = header.startIndex;
-
-        // Determine the end of this section (start of next header or end of content)
-        const nextHeaderIndex =
-          i < realHeaders.length - 1
-            ? realHeaders[i + 1].startIndex
-            : content.length;
-
-        // Extract section content from the start of the header line to before the next header
-        const sectionContent = content
-          .slice(headerStartIndex, nextHeaderIndex)
-          .trim();
-
-        logger.debug(`Adding section: ${headerTitle}`);
-
-        addSectionWithSizeLimit(
-          sections,
-          headerTitle,
-          sectionContent,
-          20000,
-          createAnchor(headerTitle),
-        );
-      }
-    }
+    // Create the splitter and split the content
+    const splitter = new RecursiveMarkdownSplitter(splitOptions);
+    const chunks = splitter.splitMarkdownToChunks(text);

-    const localChunks: Document<BookChunk>[] = [];
-
-    // Create a document for each section
-    sections.forEach((section: ParsedSection, index: number) => {
-      const hash: string = calculateHash(section.content);
-      localChunks.push(
-        new Document<BookChunk>({
-          pageContent: section.content,
-          metadata: {
-            name: section.title,
-            title: section.title,
-            chunkNumber: index,
-            contentHash: hash,
-            uniqueId: `${section.title}-${index}`,
-            sourceLink: ``,
-            source: this.source, // Using placeholder for 'this.source'
-          },
-        }),
-      );
+    logger.info(
+      `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
+    );
+
+    // Convert chunks to Document<BookChunk> format
+    const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
+      const contentHash = calculateHash(chunk.content);
+
+      return new Document<BookChunk>({
+        pageContent: chunk.content,
+        metadata: {
+          name: chunk.meta.title,
+          title: chunk.meta.title,
+          chunkNumber: chunk.meta.chunkNumber, // Already 0-based
+          contentHash: contentHash,
+          uniqueId: chunk.meta.uniqueId,
+          sourceLink: '',
+          source: this.source,
+        },
+      });
     });

     return localChunks;
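
For reference, the splitter API both ingesters now call is defined in packages/ingester/src/utils/RecursiveMarkdownSplitter.ts, presumably one of the eight changed files, but not shown in this excerpt. The sketch below reconstructs its shape purely from the call sites in the diffs; the field comments are inferences from the option names and inline diff comments, not authoritative definitions.

// Inferred interface, assuming only what the call sites demonstrate.
export interface SplitOptions {
  maxChars: number; // target upper bound on a chunk's character count
  minChars: number; // presumably a lower bound before merging small sections
  overlap: number; // characters shared between consecutive chunks
  headerLevels: number[]; // e.g. [1, 2] to split on H1 and H2 headers
  preserveCodeBlocks: boolean; // presumably avoids splitting inside fenced code
  idPrefix: string; // prefix for generated uniqueId values, e.g. 'cairo-book'
  trim: boolean;
}

export interface Chunk {
  content: string;
  meta: {
    title: string; // title of the section the chunk came from
    chunkNumber: number; // 0-based, per the comment in the diff
    uniqueId: string;
  };
}

export declare class RecursiveMarkdownSplitter {
  constructor(options: SplitOptions);
  splitMarkdownToChunks(markdown: string): Chunk[];
}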

packages/ingester/src/ingesters/CoreLibDocsIngester.ts

Lines changed: 46 additions & 80 deletions

@@ -2,19 +2,15 @@ import * as fs from 'fs/promises';
 import * as path from 'path';
 import { BookConfig } from '../utils/types';
 import { MarkdownIngester } from './MarkdownIngester';
-import {
-  BookChunk,
-  DocumentSource,
-  ParsedSection,
-} from '@cairo-coder/agents/types/index';
+import { BookChunk, DocumentSource } from '@cairo-coder/agents/types/index';
 import { Document } from '@langchain/core/documents';
 import { VectorStore } from '@cairo-coder/agents/db/postgresVectorStore';
 import { logger } from '@cairo-coder/agents/utils/index';
+import { calculateHash } from '../utils/contentUtils';
 import {
-  addSectionWithSizeLimit,
-  calculateHash,
-  createAnchor,
-} from '../utils/contentUtils';
+  RecursiveMarkdownSplitter,
+  SplitOptions,
+} from '../utils/RecursiveMarkdownSplitter';

 /**
  * Ingester for the Cairo Core Library documentation
@@ -63,84 +59,54 @@ export class CoreLibDocsIngester extends MarkdownIngester {
   }

   /**
-   * Chunk the core library summary file by H1 headers
+   * Chunk the core library summary file using RecursiveMarkdownSplitter
    *
-   * This function takes the markdown content and splits it into sections
-   * based on H1 headers (# Header). Each section becomes a separate chunk
-   * with its content hashed for uniqueness.
+   * This function takes the markdown content and splits it using a recursive
+   * strategy that respects headers, code blocks, and maintains overlap between chunks.
    *
    * @param text - The markdown content to chunk
-   * @returns Promise<Document<BookChunk>[]> - Array of document chunks, one per H1 section
+   * @returns Promise<Document<BookChunk>[]> - Array of document chunks
    */
   async chunkCorelibSummaryFile(text: string): Promise<Document<BookChunk>[]> {
-    const content = text;
-    const sections: ParsedSection[] = [];
-
-    // Regex to match H1 headers (# Header)
-    const headerRegex = /^(#{1})\s+(.+)$/gm;
-    const matches = Array.from(content.matchAll(headerRegex));
-
-    let lastSectionEndIndex = 0;
-
-    // Process each H1 header found
-    for (let i = 0; i < matches.length; i++) {
-      const match = matches[i];
-      const headerTitle = match[2].trim();
-      const headerStartIndex = match.index!;
-
-      // Determine the end of this section (start of next header or end of content)
-      const nextHeaderIndex =
-        i < matches.length - 1 ? matches[i + 1].index! : content.length;
-
-      // Extract section content from after the header to before the next header
-      const sectionContent = content
-        .slice(headerStartIndex, nextHeaderIndex)
-        .trim();
-
-      logger.debug(`Adding section: ${headerTitle}`);
-
-      addSectionWithSizeLimit(
-        sections,
-        headerTitle,
-        sectionContent,
-        20000,
-        createAnchor(headerTitle),
-      );
-    }
+    logger.info(
+      'Using RecursiveMarkdownSplitter to chunk Core Library documentation',
+    );

-    // If no H1 headers found, treat the entire content as one section
-    if (sections.length === 0) {
-      logger.debug(
-        'No H1 headers found, creating single section from entire content',
-      );
-      addSectionWithSizeLimit(
-        sections,
-        'Core Library Documentation',
-        content,
-        20000,
-        createAnchor('Core Library Documentation'),
-      );
-    }
+    // Configure the splitter with appropriate settings
+    const splitOptions: SplitOptions = {
+      maxChars: 2048,
+      minChars: 500,
+      overlap: 256,
+      headerLevels: [1, 2], // Split on H1 and H2 headers
+      preserveCodeBlocks: true,
+      idPrefix: 'corelib',
+      trim: true,
+    };

-    const localChunks: Document<BookChunk>[] = [];
-
-    // Create a document for each section
-    sections.forEach((section: ParsedSection, index: number) => {
-      const hash: string = calculateHash(section.content);
-      localChunks.push(
-        new Document<BookChunk>({
-          pageContent: section.content,
-          metadata: {
-            name: section.title,
-            title: section.title,
-            chunkNumber: index,
-            contentHash: hash,
-            uniqueId: `${section.title}-${index}`,
-            sourceLink: ``,
-            source: this.source,
-          },
-        }),
-      );
+    // Create the splitter and split the content
+    const splitter = new RecursiveMarkdownSplitter(splitOptions);
+    const chunks = splitter.splitMarkdownToChunks(text);
+
+    logger.info(
+      `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
+    );
+
+    // Convert chunks to Document<BookChunk> format
+    const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
+      const contentHash = calculateHash(chunk.content);
+
+      return new Document<BookChunk>({
+        pageContent: chunk.content,
+        metadata: {
+          name: chunk.meta.title,
+          title: chunk.meta.title,
+          chunkNumber: chunk.meta.chunkNumber, // Already 0-based
+          contentHash: contentHash,
+          uniqueId: chunk.meta.uniqueId,
+          sourceLink: '',
+          source: this.source,
+        },
+      });
     });

     return localChunks;
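
As a usage sketch: both ingesters configure the splitter identically apart from idPrefix ('cairo-book' vs. 'corelib'), replacing two hand-rolled H1 parsers with one shared strategy. Below is a hypothetical standalone run under the corelib options; the sample document and logged fields are illustrative only, and actual chunk boundaries depend on the real implementation.

import {
  RecursiveMarkdownSplitter,
  SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';

const options: SplitOptions = {
  maxChars: 2048,
  minChars: 500,
  overlap: 256,
  headerLevels: [1, 2],
  preserveCodeBlocks: true,
  idPrefix: 'corelib',
  trim: true,
};

// A small illustrative document with a header hierarchy and a code fence.
const markdown = [
  '# Introduction',
  'Some prose about the core library...',
  '## Arrays',
  '```cairo',
  'let mut arr = ArrayTrait::new();',
  '```',
].join('\n');

const splitter = new RecursiveMarkdownSplitter(options);
const chunks = splitter.splitMarkdownToChunks(markdown);

for (const chunk of chunks) {
  // Each chunk carries the metadata the ingesters map into Document<BookChunk>.
  console.log(chunk.meta.uniqueId, chunk.meta.title, chunk.content.length);
}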

0 commit comments