Skip to content

Commit 623bcb9

Browse files
committed
docs: refill OZ & corelib docs (#74)
1 parent 445dc12 commit 623bcb9

20 files changed

+8209
-23859
lines changed
Lines changed: 399 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,399 @@
1+
import { Document } from '@langchain/core/documents';
2+
import { type BookChunk, DocumentSource } from '../src/types';
3+
import {
4+
AsciiDocIngester,
5+
type AsciiDocIngesterConfig,
6+
} from '../src/ingesters/AsciiDocIngester';
7+
import {
8+
type BookConfig,
9+
type BookPageDto,
10+
type ParsedSection,
11+
} from '../src/utils/types';
12+
13+
/**
 * Concrete `AsciiDocIngester` used only by this test suite.
 *
 * It implements the abstract file-processing hook with a stub and re-exports
 * the protected/private helpers under `exposed*` names so tests can call them.
 */
class TestAsciiDocIngester extends AsciiDocIngester {
  constructor(config: AsciiDocIngesterConfig) {
    super(config);
  }

  /**
   * Stub implementation of the abstract hook; the arguments are ignored.
   *
   * @returns always an empty list of pages
   */
  protected async processDocFilesCustom(
    _config: BookConfig,
    _directory: string,
  ): Promise<BookPageDto[]> {
    return [];
  }

  /**
   * Public wrapper around the protected `parsePage`.
   *
   * @param content - raw AsciiDoc text
   * @param split - forwarded unchanged to `parsePage` (defaults to `false`)
   * @returns whatever `parsePage` returns for the given arguments
   */
  public exposedParsePage(
    content: string,
    split: boolean = false,
  ): ParsedSection[] {
    return this.parsePage(content, split);
  }

  /**
   * Public wrapper around the protected `createChunks`.
   *
   * @param pages - pages forwarded unchanged to `createChunks`
   * @returns the promise returned by `createChunks`
   */
  public exposedCreateChunks(
    pages: BookPageDto[],
  ): Promise<Document<BookChunk>[]> {
    return this.createChunks(pages);
  }

  // The wrappers below reach private helpers of the base class. Bracket
  // notation is used instead of `@ts-ignore`: TypeScript permits element
  // access to `private` members, and unlike `@ts-ignore` it keeps argument
  // and return types checked.

  /**
   * Public wrapper around the private `splitAsciiDocIntoSections`.
   *
   * @param content - raw AsciiDoc text
   * @param split - forwarded unchanged (defaults to `true`)
   */
  public exposedSplitAsciiDocIntoSections(
    content: string,
    split: boolean = true,
  ): ParsedSection[] {
    return this['splitAsciiDocIntoSections'](content, split);
  }

  /**
   * Public wrapper around the private `convertCodeBlocks`.
   *
   * @param content - text forwarded unchanged to `convertCodeBlocks`
   */
  public exposedConvertCodeBlocks(content: string): string {
    return this['convertCodeBlocks'](content);
  }

  /**
   * Public wrapper around the private `isInsideCodeBlock`.
   *
   * @param content - text forwarded unchanged
   * @param index - character offset into `content`, forwarded unchanged
   */
  public exposedIsInsideCodeBlock(content: string, index: number): boolean {
    return this['isInsideCodeBlock'](content, index);
  }
}
60+
61+
describe('AsciiDocIngester', () => {
62+
let ingester: TestAsciiDocIngester;
63+
64+
// Runs before every test: builds a new TestAsciiDocIngester from a fixed
// configuration and stores it in the suite-level `ingester` variable.
beforeEach(() => {
  const bookConfig: AsciiDocIngesterConfig['bookConfig'] = {
    repoOwner: 'test-owner',
    repoName: 'test-repo',
    fileExtension: '.adoc',
    chunkSize: 1000,
    chunkOverlap: 200,
    baseUrl: 'https://example.com',
    urlSuffix: '',
    useUrlMapping: false,
  };

  ingester = new TestAsciiDocIngester({
    bookConfig,
    playbookPath: 'test-playbook.yml',
    outputDir: '/tmp/output',
    restructuredDir: '/tmp/restructured',
    source: DocumentSource.OPENZEPPELIN_DOCS,
  });
});
84+
85+
describe('parsePage', () => {
  // Fixture shared by both cases: a document title followed by two
  // second-level sections, each with one line of body text.
  const samplePage = [
    '= Title',
    '',
    'This is some content.',
    '',
    '== Section 1',
    '',
    'This is section 1 content.',
    '',
    '== Section 2',
    '',
    'This is section 2 content.',
  ].join('\n');

  it('should parse a page without splitting', () => {
    const sections = ingester.exposedParsePage(samplePage, false);

    expect(sections).toHaveLength(1);

    const [whole] = sections;
    expect(whole!.title).toBe('Title');
    // With split disabled the section headings stay inside the single body.
    for (const fragment of ['This is some content.', 'Section 1', 'Section 2']) {
      expect(whole!.content).toContain(fragment);
    }
  });

  it('should parse a page with splitting', () => {
    const sections = ingester.exposedParsePage(samplePage, true);

    const expected = [
      { title: 'Title', snippet: 'This is some content.' },
      { title: 'Section 1', snippet: 'This is section 1 content.' },
      { title: 'Section 2', snippet: 'This is section 2 content.' },
    ];

    expect(sections).toHaveLength(expected.length);
    expected.forEach(({ title, snippet }, position) => {
      expect(sections[position]!.title).toBe(title);
      expect(sections[position]!.content).toContain(snippet);
    });
  });
});
132+
133+
describe('convertCodeBlocks', () => {
  it('should convert AsciiDoc code blocks to markdown format', () => {
    // Two `[source,<lang>]` listings delimited by `----` lines.
    const asciiDoc = [
      'Here is some code:',
      '',
      '[source,cairo]',
      '----',
      'function hello() {',
      'return "world";',
      '}',
      '----',
      '',
      'And here is some more code:',
      '',
      '[source,typescript]',
      '----',
      'function hello(): string {',
      'return "world";',
      '}',
      '----',
    ].join('\n');

    const converted = ingester.exposedConvertCodeBlocks(asciiDoc);

    // Markdown fences carrying the language must be present...
    for (const fence of ['```cairo', '```typescript']) {
      expect(converted).toContain(fence);
    }
    // ...and none of the AsciiDoc listing syntax may remain.
    for (const leftover of ['[source,cairo]', '[source,typescript]', '----']) {
      expect(converted).not.toContain(leftover);
    }
  });

  it('should handle code blocks without language specification', () => {
    // A bare `----` listing with no `[source,…]` attribute line.
    const asciiDoc = [
      'Here is some code:',
      '',
      '----',
      'function hello() {',
      'return "world";',
      '}',
      '----',
    ].join('\n');

    const converted = ingester.exposedConvertCodeBlocks(asciiDoc);

    expect(converted).toContain('```');
    expect(converted).not.toContain('----');
  });
});
177+
178+
describe('isInsideCodeBlock', () => {
  it('should correctly identify positions inside code blocks', () => {
    // Plain text, then one delimited listing, then plain text again.
    const asciiDoc = [
      'Here is some text.',
      '',
      '[source,cairo]',
      '----',
      'function hello() {',
      'return "world";',
      '}',
      '----',
      '',
      'More text here.',
    ].join('\n');

    // Asks the ingester about the offset where `needle` first occurs.
    const isInsideAt = (needle: string): boolean =>
      ingester.exposedIsInsideCodeBlock(asciiDoc, asciiDoc.indexOf(needle));

    // Offset that falls between the `----` delimiters.
    expect(isInsideAt('function hello')).toBe(true);

    // Offset that falls after the closing delimiter.
    expect(isInsideAt('More text')).toBe(false);
  });
});
204+
205+
describe('URL sourcing and generation', () => {
206+
it('should generate correct sourceLinks for documentation pages', async () => {
  // No spy on `exposedParsePage` here: `exposedCreateChunks` delegates to the
  // inherited `createChunks`, which has no knowledge of that test-only
  // wrapper, so such a spy would never be invoked. The pages below are
  // therefore processed by the real parser.
  const pages: BookPageDto[] = [
    {
      name: 'page1',
      content: `= Title 1

This is page 1 content.

== Section 1.1

This is section 1.1 content.`,
    },
    {
      name: 'page2',
      content: `= Title 2

This is page 2 content.`,
    },
  ];

  const chunks = await ingester.exposedCreateChunks(pages);

  expect(chunks.length).toBeGreaterThan(0);
  expect(chunks[0]).toBeInstanceOf(Document);

  // Check metadata of the first chunk
  expect(chunks[0]!.metadata.name).toBe('page1');
  expect(chunks[0]!.metadata.source).toBe(DocumentSource.OPENZEPPELIN_DOCS);
  expect(chunks[0]!.metadata.sourceLink).toBe(
    'https://example.com/page1#title-1',
  );

  // How many chunks page1 yields is decided by the real parser, so the
  // second chunk is only checked when it exists and still belongs to page1.
  if (chunks.length > 1 && chunks[1]!.metadata.name === 'page1') {
    expect(chunks[1]!.metadata.sourceLink).toBe(
      'https://example.com/page1#section-1-1',
    );
  }
});
271+
272+
it('should handle nested paths in URLs', async () => {
  // No spy on `exposedParsePage` here: the inherited `createChunks` never
  // calls that test-only wrapper, so a spy on it would have no effect. The
  // page is processed by the real parser.
  const pages: BookPageDto[] = [
    {
      name: 'guides/advanced/custom-accounts',
      content: '= Custom Accounts\nContent here',
    },
  ];

  const chunks = await ingester.exposedCreateChunks(pages);

  expect(chunks).toHaveLength(1);
  // Slashes in the page name must be kept as path segments of the link.
  expect(chunks[0]!.metadata.sourceLink).toBe(
    'https://example.com/guides/advanced/custom-accounts#custom-accounts',
  );
});
297+
298+
// Title corrected: the assertion below expects a relative link
// ('/page1#title'), not an empty one.
it('should generate relative sourceLinks when baseUrl is empty', async () => {
  const configNoUrl: AsciiDocIngesterConfig = {
    bookConfig: {
      repoOwner: 'test-owner',
      repoName: 'test-repo',
      fileExtension: '.adoc',
      chunkSize: 1000,
      chunkOverlap: 200,
      baseUrl: '',
      urlSuffix: '',
      useUrlMapping: false,
    },
    playbookPath: 'test-playbook.yml',
    outputDir: '/tmp/output',
    restructuredDir: '/tmp/restructured',
    source: DocumentSource.OPENZEPPELIN_DOCS,
  };

  const ingesterNoUrl = new TestAsciiDocIngester(configNoUrl);

  // No spy on `exposedParsePage` here: the inherited `createChunks` never
  // calls that test-only wrapper, so a spy on it would have no effect. The
  // page is processed by the real parser.
  const pages: BookPageDto[] = [
    {
      name: 'page1',
      content: '= Title\nContent',
    },
  ];

  const chunks = await ingesterNoUrl.exposedCreateChunks(pages);

  expect(chunks).toHaveLength(1);
  // When baseUrl is an empty string, the URL is built without the base part
  expect(chunks[0]!.metadata.sourceLink).toBe('/page1#title');
});
341+
342+
it('should generate unique IDs correctly', async () => {
  const chunks = await ingester.exposedCreateChunks([
    {
      name: 'test-page',
      content: '= Section 1\nContent\n== Section 2\nMore content',
    },
  ]);

  // The number of chunks comes from the real parser, so only a lower bound
  // is asserted and the second ID is checked only when a second chunk exists.
  expect(chunks.length).toBeGreaterThan(0);

  const expectedIds = ['test-page-0', 'test-page-1'];
  expect(chunks[0]!.metadata.uniqueId).toBe(expectedIds[0]);
  if (chunks.length >= 2) {
    expect(chunks[1]!.metadata.uniqueId).toBe(expectedIds[1]);
  }
});
359+
360+
it('should calculate content hash for each chunk', async () => {
  // No spy on `exposedParsePage` here: the inherited `createChunks` never
  // calls that test-only wrapper, so a spy on it would have no effect. The
  // page is processed by the real parser.
  const pages: BookPageDto[] = [
    {
      name: 'test-page',
      content: '= Section 1\nContent',
    },
  ];

  const chunks = await ingester.exposedCreateChunks(pages);

  expect(chunks).toHaveLength(1);

  // The hash value itself is not pinned; it only has to be a non-empty string.
  const { contentHash } = chunks[0]!.metadata;
  expect(contentHash).toBeDefined();
  expect(typeof contentHash).toBe('string');
  expect(contentHash.length).toBeGreaterThan(0);
});
385+
386+
it('should preserve custom anchors from AsciiDoc', async () => {
  // An explicit `[#id]` block attribute placed directly above the title.
  const asciiDoc = ['[#custom-anchor-id]', '= Title', '', 'Content here.'].join(
    '\n',
  );

  const sections = ingester.exposedParsePage(asciiDoc, false);

  expect(sections).toHaveLength(1);

  const [section] = sections;
  expect(section!.anchor).toBe('custom-anchor-id');
  expect(section!.title).toBe('Title');
});
398+
});
399+
});

0 commit comments

Comments
 (0)