|
| 1 | +import { Document } from '@langchain/core/documents'; |
| 2 | +import { type BookChunk, DocumentSource } from '../src/types'; |
| 3 | +import { |
| 4 | + AsciiDocIngester, |
| 5 | + type AsciiDocIngesterConfig, |
| 6 | +} from '../src/ingesters/AsciiDocIngester'; |
| 7 | +import { |
| 8 | + type BookConfig, |
| 9 | + type BookPageDto, |
| 10 | + type ParsedSection, |
| 11 | +} from '../src/utils/types'; |
| 12 | + |
/**
 * Concrete, test-only subclass of AsciiDocIngester.
 *
 * It supplies a stub for the `processDocFilesCustom` hook and re-exposes the
 * non-public parsing helpers under `exposed*` names so the specs can call
 * them directly.
 */
class TestAsciiDocIngester extends AsciiDocIngester {
  // Plain forwarding constructor. NOTE(review): kept explicit because the
  // visibility of the base constructor is not visible from this file.
  constructor(config: AsciiDocIngesterConfig) {
    super(config);
  }

  /**
   * Stub implementation of the base-class hook: ignores both arguments and
   * always resolves to an empty page list.
   */
  protected async processDocFilesCustom(
    _config: BookConfig,
    _directory: string,
  ): Promise<BookPageDto[]> {
    return [];
  }

  // --- Wrappers around protected members ---------------------------------

  /**
   * Public pass-through to the protected `parsePage`.
   *
   * @param content - Raw AsciiDoc text to parse.
   * @param split - Forwarded unchanged to `parsePage`; defaults to `false`.
   * @returns Whatever `parsePage` returns for the given arguments.
   */
  public exposedParsePage(
    content: string,
    split: boolean = false,
  ): ParsedSection[] {
    return this.parsePage(content, split);
  }

  /**
   * Public pass-through to the protected `createChunks`.
   *
   * @param pages - Pages to turn into chunk documents.
   * @returns The promise produced by `createChunks`.
   */
  public exposedCreateChunks(
    pages: BookPageDto[],
  ): Promise<Document<BookChunk>[]> {
    return this.createChunks(pages);
  }

  // --- Wrappers around private members -----------------------------------
  // `@ts-expect-error` is used instead of `@ts-ignore` so that the build
  // fails if one of these members stops being private (or is renamed in a
  // way that no longer errors), rather than silently hiding other mistakes.

  /**
   * Public pass-through to the private `splitAsciiDocIntoSections`.
   *
   * @param content - Raw AsciiDoc text.
   * @param split - Forwarded unchanged; defaults to `true`.
   */
  public exposedSplitAsciiDocIntoSections(
    content: string,
    split: boolean = true,
  ): ParsedSection[] {
    // @ts-expect-error - accessing private method for testing
    return this.splitAsciiDocIntoSections(content, split);
  }

  /**
   * Public pass-through to the private `convertCodeBlocks`.
   *
   * @param content - Text that may contain AsciiDoc code blocks.
   */
  public exposedConvertCodeBlocks(content: string): string {
    // @ts-expect-error - accessing private method for testing
    return this.convertCodeBlocks(content);
  }

  /**
   * Public pass-through to the private `isInsideCodeBlock`.
   *
   * @param content - Full text being inspected.
   * @param index - Character offset within `content` to test.
   */
  public exposedIsInsideCodeBlock(content: string, index: number): boolean {
    // @ts-expect-error - accessing private method for testing
    return this.isInsideCodeBlock(content, index);
  }
}
| 60 | + |
/**
 * Specs for AsciiDocIngester, driven through the TestAsciiDocIngester
 * subclass: page parsing, code-block conversion, code-block position
 * detection, and the metadata (`sourceLink`, `uniqueId`, `contentHash`)
 * attached to generated chunks.
 */
describe('AsciiDocIngester', () => {
  /** Call shape of `parsePage` as it is used throughout this file. */
  type ParsePageFn = (content: string, split?: boolean) => ParsedSection[];

  /**
   * Builds the ingester configuration shared by every spec; only `baseUrl`
   * differs between scenarios.
   */
  const buildConfig = (baseUrl: string): AsciiDocIngesterConfig => ({
    bookConfig: {
      repoOwner: 'test-owner',
      repoName: 'test-repo',
      fileExtension: '.adoc',
      chunkSize: 1000,
      chunkOverlap: 200,
      baseUrl,
      urlSuffix: '',
      useUrlMapping: false,
    },
    playbookPath: 'test-playbook.yml',
    outputDir: '/tmp/output',
    restructuredDir: '/tmp/restructured',
    source: DocumentSource.OPENZEPPELIN_DOCS,
  });

  /**
   * Replaces `parsePage` on a single ingester instance.
   *
   * The spy has to target `parsePage` itself: spying on the
   * `exposedParsePage` wrapper has no effect on `createChunks`, because the
   * wrapper exists only on the test subclass and is never called from the
   * base class. `parsePage` is protected, hence the structural cast.
   * NOTE(review): assumes `createChunks` obtains its sections via
   * `this.parsePage` - confirm against the ingester implementation.
   */
  const stubParsePage = (target: TestAsciiDocIngester, impl: ParsePageFn) =>
    jest
      .spyOn(target as unknown as { parsePage: ParsePageFn }, 'parsePage')
      .mockImplementation(impl);

  let ingester: TestAsciiDocIngester;

  beforeEach(() => {
    ingester = new TestAsciiDocIngester(buildConfig('https://example.com'));
  });

  afterEach(() => {
    // Undo any spies so that one spec's stubs cannot leak into the next.
    jest.restoreAllMocks();
  });

  describe('parsePage', () => {
    it('should parse a page without splitting', () => {
      const content = `= Title

This is some content.

== Section 1

This is section 1 content.

== Section 2

This is section 2 content.`;

      const result = ingester.exposedParsePage(content, false);

      expect(result.length).toBe(1);
      expect(result[0]!.title).toBe('Title');
      expect(result[0]!.content).toContain('This is some content.');
      expect(result[0]!.content).toContain('Section 1');
      expect(result[0]!.content).toContain('Section 2');
    });

    it('should parse a page with splitting', () => {
      const content = `= Title

This is some content.

== Section 1

This is section 1 content.

== Section 2

This is section 2 content.`;

      const result = ingester.exposedParsePage(content, true);

      expect(result.length).toBe(3);
      expect(result[0]!.title).toBe('Title');
      expect(result[0]!.content).toContain('This is some content.');
      expect(result[1]!.title).toBe('Section 1');
      expect(result[1]!.content).toContain('This is section 1 content.');
      expect(result[2]!.title).toBe('Section 2');
      expect(result[2]!.content).toContain('This is section 2 content.');
    });
  });

  describe('convertCodeBlocks', () => {
    it('should convert AsciiDoc code blocks to markdown format', () => {
      const content = `Here is some code:

[source,cairo]
----
function hello() {
  return "world";
}
----

And here is some more code:

[source,typescript]
----
function hello(): string {
  return "world";
}
----`;

      const result = ingester.exposedConvertCodeBlocks(content);

      expect(result).toContain('```cairo');
      expect(result).toContain('```typescript');
      expect(result).not.toContain('[source,cairo]');
      expect(result).not.toContain('[source,typescript]');
      expect(result).not.toContain('----');
    });

    it('should handle code blocks without language specification', () => {
      const content = `Here is some code:

----
function hello() {
  return "world";
}
----`;

      const result = ingester.exposedConvertCodeBlocks(content);

      expect(result).toContain('```');
      expect(result).not.toContain('----');
    });
  });

  describe('isInsideCodeBlock', () => {
    it('should correctly identify positions inside code blocks', () => {
      const content = `Here is some text.

[source,cairo]
----
function hello() {
  return "world";
}
----

More text here.`;

      // Position inside the code block
      const insidePosition = content.indexOf('function hello');
      expect(ingester.exposedIsInsideCodeBlock(content, insidePosition)).toBe(
        true,
      );

      // Position outside the code block
      const outsidePosition = content.indexOf('More text');
      expect(ingester.exposedIsInsideCodeBlock(content, outsidePosition)).toBe(
        false,
      );
    });
  });

  describe('URL sourcing and generation', () => {
    it('should generate correct sourceLinks for documentation pages', async () => {
      // Stub parsePage so each page yields a fixed, known list of sections.
      stubParsePage(ingester, (content) => {
        if (content.includes('Title 1')) {
          return [
            {
              title: 'Title 1',
              content: 'This is page 1 content.',
              anchor: 'title-1',
            },
            {
              title: 'Section 1.1',
              content: 'This is section 1.1 content.',
              anchor: 'section-1-1',
            },
          ];
        }
        return [
          {
            title: 'Title 2',
            content: 'This is page 2 content.',
            anchor: 'title-2',
          },
        ];
      });

      const pages: BookPageDto[] = [
        {
          name: 'page1',
          content: `= Title 1

This is page 1 content.

== Section 1.1

This is section 1.1 content.`,
        },
        {
          name: 'page2',
          content: `= Title 2

This is page 2 content.`,
        },
      ];

      const chunks = await ingester.exposedCreateChunks(pages);

      expect(chunks.length).toBeGreaterThan(0);
      expect(chunks[0]).toBeInstanceOf(Document);

      // Check metadata
      expect(chunks[0]!.metadata.name).toBe('page1');
      expect(chunks[0]!.metadata.source).toBe(DocumentSource.OPENZEPPELIN_DOCS);
      expect(chunks[0]!.metadata.sourceLink).toBe(
        'https://example.com/page1#title-1',
      );

      // Only checked when a second chunk exists and belongs to the first page.
      if (chunks.length > 1 && chunks[1]!.metadata.name === 'page1') {
        expect(chunks[1]!.metadata.sourceLink).toBe(
          'https://example.com/page1#section-1-1',
        );
      }
    });

    it('should handle nested paths in URLs', async () => {
      stubParsePage(ingester, () => [
        {
          title: 'Custom Accounts',
          content: 'Content here',
          anchor: 'custom-accounts',
        },
      ]);

      const pages: BookPageDto[] = [
        {
          name: 'guides/advanced/custom-accounts',
          content: '= Custom Accounts\nContent here',
        },
      ];

      const chunks = await ingester.exposedCreateChunks(pages);

      expect(chunks).toHaveLength(1);
      expect(chunks[0]!.metadata.sourceLink).toBe(
        'https://example.com/guides/advanced/custom-accounts#custom-accounts',
      );
    });

    // Title corrected: the link asserted below is not empty, it merely lacks
    // the base part.
    it('should build sourceLinks without a base part when baseUrl is empty', async () => {
      const ingesterNoUrl = new TestAsciiDocIngester(buildConfig(''));

      stubParsePage(ingesterNoUrl, () => [
        {
          title: 'Title',
          content: 'Content',
          anchor: 'title',
        },
      ]);

      const pages: BookPageDto[] = [
        {
          name: 'page1',
          content: '= Title\nContent',
        },
      ];

      const chunks = await ingesterNoUrl.exposedCreateChunks(pages);

      expect(chunks).toHaveLength(1);
      // When baseUrl is empty string, it builds URL without the base part
      expect(chunks[0]!.metadata.sourceLink).toBe('/page1#title');
    });

    it('should generate unique IDs correctly', async () => {
      const pages: BookPageDto[] = [
        {
          name: 'test-page',
          content: '= Section 1\nContent\n== Section 2\nMore content',
        },
      ];

      const chunks = await ingester.exposedCreateChunks(pages);

      // No stub here: the chunk count is decided by the real parser.
      expect(chunks.length).toBeGreaterThan(0);
      expect(chunks[0]!.metadata.uniqueId).toBe('test-page-0');
      if (chunks.length > 1) {
        expect(chunks[1]!.metadata.uniqueId).toBe('test-page-1');
      }
    });

    it('should calculate content hash for each chunk', async () => {
      stubParsePage(ingester, () => [
        {
          title: 'Section 1',
          content: 'Content',
          anchor: 'section-1',
        },
      ]);

      const pages: BookPageDto[] = [
        {
          name: 'test-page',
          content: '= Section 1\nContent',
        },
      ];

      const chunks = await ingester.exposedCreateChunks(pages);

      expect(chunks).toHaveLength(1);
      expect(chunks[0]!.metadata.contentHash).toBeDefined();
      expect(typeof chunks[0]!.metadata.contentHash).toBe('string');
      expect(chunks[0]!.metadata.contentHash.length).toBeGreaterThan(0);
    });

    it('should preserve custom anchors from AsciiDoc', async () => {
      const content = `[#custom-anchor-id]
= Title

Content here.`;

      const result = ingester.exposedParsePage(content, false);

      expect(result).toHaveLength(1);
      expect(result[0]!.anchor).toBe('custom-anchor-id');
      expect(result[0]!.title).toBe('Title');
    });
  });
});
0 commit comments