
Commit f51f54b

feat: embeddings for 2025 starknet blog (#78)
1 parent 21bcf7c commit f51f54b

File tree: 13 files changed (+4796, −18 lines)

ingesters/__tests__/IngesterFactory.test.ts

Lines changed: 1 addition & 0 deletions

```diff
@@ -75,6 +75,7 @@ describe('IngesterFactory', () => {
       DocumentSource.CORELIB_DOCS,
       DocumentSource.SCARB_DOCS,
       DocumentSource.STARKNET_JS,
+      DocumentSource.STARKNET_BLOG,
     ]);
   });
 });
```

ingesters/src/IngesterFactory.ts

Lines changed: 4 additions & 0 deletions

```diff
@@ -8,6 +8,7 @@ import { OpenZeppelinDocsIngester } from './ingesters/OpenZeppelinDocsIngester';
 import { CoreLibDocsIngester } from './ingesters/CoreLibDocsIngester';
 import { ScarbDocsIngester } from './ingesters/ScarbDocsIngester';
 import { StarknetJSIngester } from './ingesters/StarknetJSIngester';
+import { StarknetBlogIngester } from './ingesters/StarknetBlogIngester';

 /**
  * Factory class for creating ingesters
@@ -50,6 +51,9 @@ export class IngesterFactory {
       case 'starknet_js':
         return new StarknetJSIngester();

+      case 'starknet_blog':
+        return new StarknetBlogIngester();
+
       default:
         throw new Error(`Unsupported source: ${source}`);
     }
```
ingesters/src/ingesters/StarknetBlogIngester.ts

Lines changed: 153 additions & 0 deletions

New file (@@ -0,0 +1,153 @@):

```typescript
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '../db/postgresVectorStore';
import { logger } from '../utils/logger';
import * as fs from 'fs/promises';
import * as path from 'path';
import { calculateHash } from '../utils/contentUtils';
import {
  RecursiveMarkdownSplitter,
  type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath } from '../utils/paths';

/**
 * Ingester for Starknet blog posts documentation
 *
 * This ingester processes pre-summarized Starknet blog posts from the generated
 * summary file, chunks them using the RecursiveMarkdownSplitter, and stores them
 * in the vector database for retrieval.
 */
export class StarknetBlogIngester extends MarkdownIngester {
  /**
   * Constructor for the Starknet Blog ingester
   */
  constructor() {
    // Define the configuration for the Starknet Blog
    const config: BookConfig = {
      repoOwner: 'starknet',
      repoName: 'starknet-blog',
      fileExtension: '.md',
      chunkSize: 4096,
      chunkOverlap: 512,
      baseUrl: 'https://www.starknet.io/blog',
      urlSuffix: '',
      useUrlMapping: false,
    };

    super(config, DocumentSource.STARKNET_BLOG);
  }

  /**
   * Read the pre-summarized Starknet blog documentation file
   */
  async readSummaryFile(): Promise<string> {
    const summaryPath = getPythonPath(
      'src',
      'scripts',
      'summarizer',
      'generated',
      'blog_summary.md',
    );

    logger.info(`Reading Starknet blog summary from ${summaryPath}`);
    const text = await fs.readFile(summaryPath, 'utf-8');
    return text;
  }

  /**
   * Chunk the blog summary file using RecursiveMarkdownSplitter
   *
   * This function takes the markdown content and splits it using a recursive
   * strategy that respects headers, code blocks, and maintains overlap between chunks.
   *
   * @param text - The markdown content to chunk
   * @returns Promise<Document<BookChunk>[]> - Array of document chunks
   */
  async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
    // Configure the splitter with appropriate settings
    const splitOptions: SplitOptions = {
      maxChars: 2048,
      minChars: 500,
      overlap: 256,
      headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
      preserveCodeBlocks: true,
      idPrefix: 'starknet-blog',
      trim: true,
    };

    // Create the splitter and split the content
    const splitter = new RecursiveMarkdownSplitter(splitOptions);
    const chunks = splitter.splitMarkdownToChunks(text);

    logger.info(
      `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
    );

    // Convert chunks to Document<BookChunk> format
    const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
      const contentHash = calculateHash(chunk.content);

      return new Document<BookChunk>({
        pageContent: chunk.content,
        metadata: {
          name: chunk.meta.title,
          title: chunk.meta.title,
          chunkNumber: chunk.meta.chunkNumber, // Already 0-based
          contentHash: contentHash,
          uniqueId: chunk.meta.uniqueId,
          sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
          source: this.source,
        },
      });
    });

    return localChunks;
  }

  /**
   * Starknet Blog specific processing based on the pre-summarized markdown file
   * @param vectorStore
   */
  public override async process(vectorStore: VectorStore): Promise<void> {
    try {
      // 1. Read the pre-summarized documentation
      const text = await this.readSummaryFile();

      // 2. Create chunks from the documentation
      const chunks = await this.chunkSummaryFile(text);

      logger.info(
        `Created ${chunks.length} chunks from Starknet blog documentation`,
      );

      // 3. Update the vector store with the chunks
      await this.updateVectorStore(vectorStore, chunks);

      // 4. Clean up any temporary files (no temp files in this case)
      await this.cleanupDownloadedFiles();
    } catch (error) {
      this.handleError(error);
    }
  }

  /**
   * Get the directory path for extracting files
   *
   * @returns string - Path to the extract directory
   */
  protected getExtractDir(): string {
    const { getTempDir } = require('../utils/paths');
    return getTempDir('starknet-blog');
  }

  /**
   * Override cleanupDownloadedFiles since we don't download anything
   */
  protected override async cleanupDownloadedFiles(): Promise<void> {
    // No cleanup needed as we're reading from a local file
    logger.info('No cleanup needed - using local summary file');
  }
}
```
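For context, here is a minimal sketch of how this ingester would be exercised end to end. The factory method name `createIngester` and the way a `VectorStore` instance is obtained are assumptions for illustration; neither is shown in this commit.

```typescript
import { IngesterFactory } from './IngesterFactory';
import { DocumentSource } from './types';
import { VectorStore } from './db/postgresVectorStore';

// Hypothetical driver: `createIngester` and the VectorStore setup are
// assumptions, not part of this diff.
async function ingestStarknetBlog(vectorStore: VectorStore): Promise<void> {
  // The switch case added above maps 'starknet_blog' to StarknetBlogIngester
  const ingester = IngesterFactory.createIngester(DocumentSource.STARKNET_BLOG);

  // process() reads blog_summary.md, chunks it with RecursiveMarkdownSplitter,
  // and upserts the resulting Document<BookChunk> records
  await ingester.process(vectorStore);
}
```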

ingesters/src/types/index.ts

Lines changed: 1 addition & 0 deletions

```diff
@@ -16,6 +16,7 @@ export enum DocumentSource {
   CORELIB_DOCS = 'corelib_docs',
   SCARB_DOCS = 'scarb_docs',
   STARKNET_JS = 'starknet_js',
+  STARKNET_BLOG = 'starknet_blog',
 }

 export type BookChunk = {
```

python/optimizers/results/optimized_mcp_program.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

python/optimizers/results/optimized_rag.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

python/optimizers/results/optimized_retrieval_program.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

python/src/cairo_coder/core/types.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -38,6 +38,7 @@ class DocumentSource(str, Enum):
     CORELIB_DOCS = "corelib_docs"
     SCARB_DOCS = "scarb_docs"
     STARKNET_JS = "starknet_js"
+    STARKNET_BLOG = "starknet_blog"


 class DocumentMetadata(TypedDict, total=False):
```

python/src/cairo_coder/dspy/query_processor.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -26,6 +26,7 @@
     DocumentSource.CORELIB_DOCS: "Cairo Core Library Documentation. For using the Cairo core library: basic types, stdlib functions, stdlib structs, macros, and other core concepts. Essential for Cairo programming questions.",
     DocumentSource.SCARB_DOCS: "Scarb Documentation. For using the Scarb package manager: building, compiling, generating compilation artifacts, managing dependencies, configuration of Scarb.toml.",
     DocumentSource.STARKNET_JS: "StarknetJS Documentation. For using the StarknetJS library: interacting with Starknet contracts, (calls and transactions), deploying Starknet contracts, front-end APIs, javascript integration examples, guides, tutorials and general JS/TS documentation for starknet.",
+    DocumentSource.STARKNET_BLOG: "Starknet Blog Documentation. For latest Starknet updates, announcements, feature releases, ecosystem developments, integration guides, and community updates. Useful for understanding recent Starknet innovations, new tools, partnerships, and protocol enhancements.",
 }

 # Ensure all DocumentSource variants are covered
```

python/src/scripts/docs_crawler.py

Lines changed: 5 additions & 9 deletions

```diff
@@ -282,10 +282,6 @@ def extract_content(self, html: str, url: str) -> tuple[str, str]:
             if any(keyword in tag_id or keyword in tag_classes for keyword in boilerplate_keywords):
                 tags_to_remove.append(tag)

-        # Now decompose all collected tags
-        for tag in tags_to_remove:
-            tag.decompose()
-
         # Try to find main content
         main_content = None

@@ -347,8 +343,6 @@ def compile_markdown(self) -> str:
         lines = [
             f"# {self.domain} — Snapshot ({date_str})",
             "",
-            "",
-            "---",
             ""
         ]
@@ -362,15 +356,17 @@ def compile_markdown(self) -> str:
             if not markdown or len(markdown.strip()) < 50:
                 markdown = "*No content extracted.*"

+            # Add individual Sources block for this page
             lines.extend([
-                f"**Source URL:** {url}",
+                "---",
+                "Sources:",
+                f" - {url}",
+                "---",
                 "",
                 f"## {title}",
                 "",
                 markdown,
                 "",
-                "---",
-                ""
             ])
         else:
             # Skip pages that failed to fetch or returned non-HTML content
```
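Reading off the `lines.extend` call above, each successfully fetched page in the compiled snapshot should now be emitted roughly as follows (the URL and title here are placeholders, not taken from the commit):

```markdown
---
Sources:
 - https://www.starknet.io/blog/example-post
---

## Example Post Title

...extracted page content...
```

The per-page `Sources:` block replaces both the old `**Source URL:**` line and the trailing `---` separator, so each page now carries its own delimited provenance header.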
