Merged
1 change: 1 addition & 0 deletions ingesters/__tests__/IngesterFactory.test.ts
@@ -75,6 +75,7 @@ describe('IngesterFactory', () => {
DocumentSource.CORELIB_DOCS,
DocumentSource.SCARB_DOCS,
DocumentSource.STARKNET_JS,
DocumentSource.STARKNET_BLOG,
]);
});
});
4 changes: 4 additions & 0 deletions ingesters/src/IngesterFactory.ts
@@ -8,6 +8,7 @@ import { OpenZeppelinDocsIngester } from './ingesters/OpenZeppelinDocsIngester';
import { CoreLibDocsIngester } from './ingesters/CoreLibDocsIngester';
import { ScarbDocsIngester } from './ingesters/ScarbDocsIngester';
import { StarknetJSIngester } from './ingesters/StarknetJSIngester';
import { StarknetBlogIngester } from './ingesters/StarknetBlogIngester';

/**
* Factory class for creating ingesters
@@ -50,6 +51,9 @@ export class IngesterFactory {
case 'starknet_js':
return new StarknetJSIngester();

case 'starknet_blog':
return new StarknetBlogIngester();

default:
throw new Error(`Unsupported source: ${source}`);
}
153 changes: 153 additions & 0 deletions ingesters/src/ingesters/StarknetBlogIngester.ts
@@ -0,0 +1,153 @@
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '../db/postgresVectorStore';
import { logger } from '../utils/logger';
import * as fs from 'fs/promises';
import * as path from 'path';
import { calculateHash } from '../utils/contentUtils';
import {
RecursiveMarkdownSplitter,
type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath, getTempDir } from '../utils/paths';

/**
* Ingester for Starknet blog posts
*
* This ingester processes pre-summarized Starknet blog posts from the generated
* summary file, chunks them using the RecursiveMarkdownSplitter, and stores them
* in the vector database for retrieval.
*/
export class StarknetBlogIngester extends MarkdownIngester {
/**
* Constructor for the Starknet Blog ingester
*/
constructor() {
// Define the configuration for the Starknet Blog
const config: BookConfig = {
repoOwner: 'starknet',
repoName: 'starknet-blog',
fileExtension: '.md',
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://www.starknet.io/blog',
urlSuffix: '',
useUrlMapping: false,
};

super(config, DocumentSource.STARKNET_BLOG);
}

/**
* Read the pre-summarized Starknet blog documentation file
*/
async readSummaryFile(): Promise<string> {
const summaryPath = getPythonPath(
'src',
'scripts',
'summarizer',
'generated',
'blog_summary.md',
);

logger.info(`Reading Starknet blog summary from ${summaryPath}`);
const text = await fs.readFile(summaryPath, 'utf-8');
return text;
}

/**
* Chunk the blog summary file using RecursiveMarkdownSplitter
*
* This function takes the markdown content and splits it using a recursive
* strategy that respects headers and code blocks while maintaining overlap between chunks.
*
* @param text - The markdown content to chunk
* @returns Promise<Document<BookChunk>[]> - Array of document chunks
*/
async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
// Configure the splitter with appropriate settings
const splitOptions: SplitOptions = {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'starknet-blog',
trim: true,
};

// Create the splitter and split the content
const splitter = new RecursiveMarkdownSplitter(splitOptions);
const chunks = splitter.splitMarkdownToChunks(text);

logger.info(
`Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
);

// Convert chunks to Document<BookChunk> format
const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
const contentHash = calculateHash(chunk.content);

return new Document<BookChunk>({
pageContent: chunk.content,
metadata: {
name: chunk.meta.title,
title: chunk.meta.title,
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
source: this.source,
},
});
});

return localChunks;
}

/**
* Starknet Blog specific processing based on the pre-summarized markdown file
* @param vectorStore
*/
public override async process(vectorStore: VectorStore): Promise<void> {
try {
// 1. Read the pre-summarized documentation
const text = await this.readSummaryFile();

// 2. Create chunks from the documentation
const chunks = await this.chunkSummaryFile(text);

logger.info(
`Created ${chunks.length} chunks from Starknet blog documentation`,
);

// 3. Update the vector store with the chunks
await this.updateVectorStore(vectorStore, chunks);

// 4. Clean up any temporary files (no temp files in this case)
await this.cleanupDownloadedFiles();
} catch (error) {
this.handleError(error);
}
}

/**
* Get the directory path for extracting files
*
* @returns string - Path to the extract directory
*/
protected getExtractDir(): string {
return getTempDir('starknet-blog');
}

/**
* Override cleanupDownloadedFiles since we don't download anything
*/
protected override async cleanupDownloadedFiles(): Promise<void> {
// No cleanup needed as we're reading from a local file
logger.info('No cleanup needed - using local summary file');
}
}
1 change: 1 addition & 0 deletions ingesters/src/types/index.ts
@@ -16,6 +16,7 @@ export enum DocumentSource {
CORELIB_DOCS = 'corelib_docs',
SCARB_DOCS = 'scarb_docs',
STARKNET_JS = 'starknet_js',
STARKNET_BLOG = 'starknet_blog',
}

export type BookChunk = {
4 changes: 2 additions & 2 deletions python/optimizers/results/optimized_mcp_program.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions python/optimizers/results/optimized_rag.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions python/optimizers/results/optimized_retrieval_program.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions python/src/cairo_coder/core/types.py
@@ -38,6 +38,7 @@ class DocumentSource(str, Enum):
CORELIB_DOCS = "corelib_docs"
SCARB_DOCS = "scarb_docs"
STARKNET_JS = "starknet_js"
STARKNET_BLOG = "starknet_blog"


class DocumentMetadata(TypedDict, total=False):
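Because DocumentSource mixes in str, the new member compares equal to its literal value, which is what lets the same 'starknet_blog' string drive the TypeScript factory switch and the Python source selection alike. A minimal standalone sketch (the enum is re-declared here for illustration rather than imported from the package):

from enum import Enum

class DocumentSource(str, Enum):
    # Illustrative subset mirroring python/src/cairo_coder/core/types.py
    STARKNET_JS = "starknet_js"
    STARKNET_BLOG = "starknet_blog"

# str mixin: members compare equal to their raw string values
assert DocumentSource.STARKNET_BLOG == "starknet_blog"
# and can be looked up by value, e.g. when resolving a source name from config
assert DocumentSource("starknet_blog") is DocumentSource.STARKNET_BLOG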
1 change: 1 addition & 0 deletions python/src/cairo_coder/dspy/query_processor.py
@@ -26,6 +26,7 @@
DocumentSource.CORELIB_DOCS: "Cairo Core Library Documentation. For using the Cairo core library: basic types, stdlib functions, stdlib structs, macros, and other core concepts. Essential for Cairo programming questions.",
DocumentSource.SCARB_DOCS: "Scarb Documentation. For using the Scarb package manager: building, compiling, generating compilation artifacts, managing dependencies, configuration of Scarb.toml.",
DocumentSource.STARKNET_JS: "StarknetJS Documentation. For using the StarknetJS library: interacting with Starknet contracts (calls and transactions), deploying Starknet contracts, front-end APIs, JavaScript integration examples, guides, tutorials, and general JS/TS documentation for Starknet.",
DocumentSource.STARKNET_BLOG: "Starknet Blog Documentation. For the latest Starknet updates, announcements, feature releases, ecosystem developments, integration guides, and community updates. Useful for understanding recent Starknet innovations, new tools, partnerships, and protocol enhancements.",
}

# Ensure all DocumentSource variants are covered
14 changes: 5 additions & 9 deletions python/src/scripts/docs_crawler.py
@@ -282,10 +282,6 @@ def extract_content(self, html: str, url: str) -> tuple[str, str]:
if any(keyword in tag_id or keyword in tag_classes for keyword in boilerplate_keywords):
tags_to_remove.append(tag)

# Now decompose all collected tags
for tag in tags_to_remove:
tag.decompose()

# Try to find main content
main_content = None

@@ -347,8 +343,6 @@ def compile_markdown(self) -> str:
lines = [
f"# {self.domain} — Snapshot ({date_str})",
"",
"",
"---",
""
]

@@ -362,15 +356,17 @@
if not markdown or len(markdown.strip()) < 50:
markdown = "*No content extracted.*"

# Add individual Sources block for this page
lines.extend([
f"**Source URL:** {url}",
"---",
"Sources:",
f" - {url}",
"---",
"",
f"## {title}",
"",
markdown,
"",
"---",
""
])
else:
# Skip pages that failed to fetch or returned non-HTML content
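For reference, one page in the new doc_dump.md layout produced by compile_markdown looks roughly like the block below (URL, title, and date are invented for illustration). This is the shape that the new-format detection in filter_2025_blogs.py keys on:

import re

# Hypothetical example of a single page block in the new layout
sample_page = (
    "---\n"
    "Sources:\n"
    " - https://www.starknet.io/blog/example-post\n"
    "---\n"
    "\n"
    "## Example Post Title\n"
    "\n"
    "Home / Blog\n"
    "Mar 3, 2025 · 4 min read\n"
    "\n"
    "Extracted post body in markdown...\n"
)

# Same pattern used by filter_2025_blogs.py to detect the new format
page_pattern = r'(---\s*\nSources:\s*\n\s*-\s*[^\n]+\n---\s*\n+##[^#].*?)(?=\n---\s*\nSources:|\Z)'
assert re.search(page_pattern, sample_page, re.DOTALL) is not None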
147 changes: 147 additions & 0 deletions python/src/scripts/filter_2025_blogs.py
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
"""
Filter doc_dump.md to keep only blog entries published in 2025.
Reads the doc_dump.md file, identifies individual pages separated by "---",
and filters to keep only those containing blog entries with 2025 dates.
"""

import re
from pathlib import Path


def is_2025_blog_entry(content: str) -> bool:
"""
Check if content contains a blog entry from 2025.
Looks for patterns like:
Home / Blog
Feb 5, 2023 · 2 min read
Returns True if the date is from 2025.
"""
# Look for the blog pattern with date
# Pattern: Month Day, Year · time min read
blog_pattern = r'Home\s+/\s+Blog.*?(\w+\s+\d+,\s+(\d{4}))\s+·'

matches = re.findall(blog_pattern, content, re.DOTALL | re.IGNORECASE)

for match in matches:
year = match[1]
if year == '2025':
return True

return False


def filter_doc_dump(input_file: Path, output_file: Path):
"""
Read doc_dump.md and filter to keep only 2025 blog entries.
Supports both the old format (per-page **Source URL:** markers) and the new format (per-page Sources blocks).
"""
with open(input_file, encoding='utf-8') as f:
content = f.read()

filtered_pages = []
total_pages = 0
kept_pages = 0
document_header = ""

# Try new format first (individual Sources blocks)
page_pattern = r'(---\s*\nSources:\s*\n\s*-\s*[^\n]+\n---\s*\n+##[^#].*?)(?=\n---\s*\nSources:|\Z)'
matches = list(re.finditer(page_pattern, content, re.DOTALL))

if matches:
# New format detected
print("Detected new format (individual Sources blocks)")

# Keep document header if present
header_match = re.match(r'^(.*?)(?=\n---\s*\nSources:)', content, re.DOTALL)
document_header = header_match.group(1).strip() if header_match else ""

for match in matches:
page = match.group(1)
if not page.strip():
continue

total_pages += 1

# Check if this is a 2025 blog entry
if is_2025_blog_entry(page):
filtered_pages.append(page.strip())
kept_pages += 1

# Extract URL for logging (from Sources block)
url_match = re.search(r'Sources:\s*\n\s*-\s*(.+)', page)
if url_match:
print(f"Keeping: {url_match.group(1)}")
else:
# Fall back to old format (**Source URL:** markers)
print("Detected old format (**Source URL:** markers)")

pattern = re.compile(r'^\*\*Source URL:\*\*\s+(\S+)', re.MULTILINE)
page_matches = list(pattern.finditer(content))

for i, m in enumerate(page_matches):
url = m.group(1)
start = m.end()
end = page_matches[i + 1].start() if i + 1 < len(page_matches) else len(content)
page_content = content[start:end].strip()

# Remove surrounding '---' separators
lines = page_content.splitlines()
while lines and lines[0].strip() == '---':
lines.pop(0)
while lines and lines[-1].strip() == '---':
lines.pop()
page_content = "\n".join(lines).strip()

total_pages += 1

if is_2025_blog_entry(page_content):
# Convert to new format
new_format_page = f"---\nSources:\n - {url}\n---\n\n{page_content}"
filtered_pages.append(new_format_page)
kept_pages += 1
print(f"Keeping: {url}")

# Construct output with header and filtered pages
output_parts = []
if document_header:
output_parts.append(document_header)
output_parts.append("")
output_parts.append("")

output_parts.extend(filtered_pages)
output_content = '\n\n'.join(output_parts)

# Write to output file
with open(output_file, 'w', encoding='utf-8') as f:
f.write(output_content)

print(f"\n{'-'*60}")
print(f"Total pages processed: {total_pages}")
print(f"Pages kept (2025 blogs): {kept_pages}")
print(f"Pages removed: {total_pages - kept_pages}")
print(f"Output written to: {output_file}")


def main():
# Paths
script_dir = Path(__file__).parent
python_dir = script_dir.parent.parent
input_file = python_dir / "doc_dump.md"
output_file = python_dir / "doc_dump_2025_blogs.md"

if not input_file.exists():
print(f"Error: Input file not found: {input_file}")
return

print(f"Reading from: {input_file}")
print("Filtering for 2025 blog entries...\n")

filter_doc_dump(input_file, output_file)


if __name__ == "__main__":
main()
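A quick sanity check of the date filter, using invented page snippets (the import assumes the script's directory is on sys.path; adjust as needed):

from filter_2025_blogs import is_2025_blog_entry  # assumes python/src/scripts is importable

# Invented snippets in the new per-page layout
kept = (
    "---\nSources:\n - https://www.starknet.io/blog/new-feature\n---\n\n"
    "## New Feature\n\nHome / Blog\nJan 14, 2025 · 3 min read\n\nBody...\n"
)
dropped = (
    "---\nSources:\n - https://www.starknet.io/blog/old-post\n---\n\n"
    "## Old Post\n\nHome / Blog\nFeb 5, 2023 · 2 min read\n\nBody...\n"
)

assert is_2025_blog_entry(kept) is True      # 2025 date -> page is kept
assert is_2025_blog_entry(dropped) is False  # pre-2025 date -> page is dropped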