import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '../db/postgresVectorStore';
import { logger } from '../utils/logger';
import * as fs from 'fs/promises';
import { calculateHash } from '../utils/contentUtils';
import {
  RecursiveMarkdownSplitter,
  type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath, getTempDir } from '../utils/paths';

/**
 * Ingester for Starknet blog posts
 *
 * This ingester processes pre-summarized Starknet blog posts from the generated
 * summary file, chunks them using the RecursiveMarkdownSplitter, and stores them
 * in the vector database for retrieval.
 */
export class StarknetBlogIngester extends MarkdownIngester {
  /**
   * Constructor for the Starknet Blog ingester
   */
  constructor() {
    // Define the configuration for the Starknet Blog
    const config: BookConfig = {
      repoOwner: 'starknet',
      repoName: 'starknet-blog',
      fileExtension: '.md',
      chunkSize: 4096,
      chunkOverlap: 512,
      baseUrl: 'https://www.starknet.io/blog',
      urlSuffix: '',
      useUrlMapping: false,
    };

    super(config, DocumentSource.STARKNET_BLOG);
  }
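
  // Note: chunkSize/chunkOverlap above belong to the BookConfig contract
  // inherited from MarkdownIngester; the overridden process() below chunks via
  // chunkSummaryFile(), which uses its own SplitOptions (maxChars: 2048,
  // overlap: 256), so these config values do not drive chunking in this ingester.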

  /**
   * Read the pre-summarized Starknet blog documentation file
   */
  async readSummaryFile(): Promise<string> {
    const summaryPath = getPythonPath(
      'src',
      'scripts',
      'summarizer',
      'generated',
      'blog_summary.md',
    );

    logger.info(`Reading Starknet blog summary from ${summaryPath}`);
    const text = await fs.readFile(summaryPath, 'utf-8');
    return text;
  }
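
  // For illustration only: the generated blog_summary.md is assumed to be
  // plain markdown with H1/H2/H3 headers delimiting posts and sections, since
  // chunkSummaryFile() splits on those levels. Hypothetical shape:
  //
  //   # Post title
  //   ## Section heading
  //   Summary text...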

  /**
   * Chunk the blog summary file using RecursiveMarkdownSplitter
   *
   * This function takes the markdown content and splits it using a recursive
   * strategy that respects headers and code blocks while maintaining overlap
   * between chunks.
   *
   * @param text - The markdown content to chunk
   * @returns Promise<Document<BookChunk>[]> - Array of document chunks
   */
  async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
    // Configure the splitter with appropriate settings
    const splitOptions: SplitOptions = {
      maxChars: 2048,
      minChars: 500,
      overlap: 256,
      headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
      preserveCodeBlocks: true,
      idPrefix: 'starknet-blog',
      trim: true,
    };
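
    // Rough sizing intuition (approximate; preserveCodeBlocks and header
    // boundaries can shift splits): with maxChars 2048 and overlap 256 the
    // effective stride is ~1792 chars, so a 5,000-char section would yield
    // about 3 chunks.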

    // Create the splitter and split the content
    const splitter = new RecursiveMarkdownSplitter(splitOptions);
    const chunks = splitter.splitMarkdownToChunks(text);

    logger.info(
      `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
    );

    // Convert chunks to Document<BookChunk> format
    const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
      const contentHash = calculateHash(chunk.content);

      return new Document<BookChunk>({
        pageContent: chunk.content,
        metadata: {
          name: chunk.meta.title,
          title: chunk.meta.title,
          chunkNumber: chunk.meta.chunkNumber, // Already 0-based
          contentHash: contentHash,
          uniqueId: chunk.meta.uniqueId,
          sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
          source: this.source,
        },
      });
    });

    return localChunks;
  }
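
  // Illustrative result shape (field values hypothetical; the uniqueId format
  // is assumed to derive from the 'starknet-blog' idPrefix):
  //
  //   Document {
  //     pageContent: '...chunk text...',
  //     metadata: {
  //       name: 'Post title', title: 'Post title', chunkNumber: 0,
  //       contentHash: '<hash>', uniqueId: 'starknet-blog-...',
  //       sourceLink: 'https://www.starknet.io/blog',
  //       source: DocumentSource.STARKNET_BLOG,
  //     },
  //   }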

  /**
   * Starknet-blog-specific processing based on the pre-summarized markdown file
   *
   * @param vectorStore - The vector store to update with the generated chunks
   */
  public override async process(vectorStore: VectorStore): Promise<void> {
    try {
      // 1. Read the pre-summarized documentation
      const text = await this.readSummaryFile();

      // 2. Create chunks from the documentation
      const chunks = await this.chunkSummaryFile(text);

      logger.info(
        `Created ${chunks.length} chunks from Starknet blog documentation`,
      );

      // 3. Update the vector store with the chunks
      await this.updateVectorStore(vectorStore, chunks);

      // 4. Clean up any temporary files (none are created in this case)
      await this.cleanupDownloadedFiles();
    } catch (error) {
      this.handleError(error);
    }
  }

  /**
   * Get the directory path for extracting files
   *
   * @returns string - Path to the extract directory
   */
  protected getExtractDir(): string {
    // Use the statically imported helper rather than an inline require()
    return getTempDir('starknet-blog');
  }

  /**
   * Override cleanupDownloadedFiles since we don't download anything
   */
  protected override async cleanupDownloadedFiles(): Promise<void> {
    // No cleanup needed as we're reading from a local file
    logger.info('No cleanup needed - using local summary file');
  }
}
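
// Minimal usage sketch (assumes a connected VectorStore instance is created
// elsewhere; its construction API is not shown in this file):
//
//   const ingester = new StarknetBlogIngester();
//   await ingester.process(vectorStore);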