
Commit f7aea2f

feat: embeddings for 2025 starknet blog
1 parent 21bcf7c commit f7aea2f

File tree

16 files changed (+4844, -26 lines)

ingesters/README.md

Lines changed: 24 additions & 0 deletions
@@ -86,6 +86,30 @@ The package includes several utility modules:
 - **vectorStoreUtils.ts**: Functions for vector store operations
 - **types.ts**: Common types and interfaces
 
+### Chunking: RecursiveMarkdownSplitter
+
+The `RecursiveMarkdownSplitter` splits markdown content into semantic chunks with metadata (title, unique ID, character offsets, source link). It supports two modes:
+
+- Default mode (size-aware):
+  - Recursively splits by headers (configurable levels), paragraphs, and lines to target `maxChars`.
+  - Merges tiny segments when below `minChars` and applies backward `overlap` between chunks.
+  - Respects fenced code blocks and avoids splitting inside non-breakable blocks when possible.
+
+Example usage:
+
+```ts
+import { RecursiveMarkdownSplitter } from './src/utils/RecursiveMarkdownSplitter';
+
+// Default mode
+const splitter = new RecursiveMarkdownSplitter({
+  maxChars: 2048,
+  minChars: 500,
+  overlap: 256,
+  headerLevels: [1, 2, 3],
+});
+const chunks = splitter.splitMarkdownToChunks(markdown);
+```
+
 ## Usage
 
 To use the ingester package, run the `generateEmbeddings.ts` script:
ingesters/__tests__/IngesterFactory.test.ts

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ describe('IngesterFactory', () => {
       DocumentSource.CORELIB_DOCS,
       DocumentSource.SCARB_DOCS,
       DocumentSource.STARKNET_JS,
+      DocumentSource.STARKNET_BLOG,
     ]);
   });
 });

ingesters/src/IngesterFactory.ts

Lines changed: 4 additions & 0 deletions
@@ -8,6 +8,7 @@ import { OpenZeppelinDocsIngester } from './ingesters/OpenZeppelinDocsIngester';
 import { CoreLibDocsIngester } from './ingesters/CoreLibDocsIngester';
 import { ScarbDocsIngester } from './ingesters/ScarbDocsIngester';
 import { StarknetJSIngester } from './ingesters/StarknetJSIngester';
+import { StarknetBlogIngester } from './ingesters/StarknetBlogIngester';
 
 /**
  * Factory class for creating ingesters
@@ -50,6 +51,9 @@ export class IngesterFactory {
       case 'starknet_js':
         return new StarknetJSIngester();
 
+      case 'starknet_blog':
+        return new StarknetBlogIngester();
+
       default:
         throw new Error(`Unsupported source: ${source}`);
     }
ingesters/src/ingesters/StarknetBlogIngester.ts

Lines changed: 153 additions & 0 deletions

@@ -0,0 +1,153 @@
+import { type BookConfig } from '../utils/types';
+import { MarkdownIngester } from './MarkdownIngester';
+import { type BookChunk, DocumentSource } from '../types';
+import { Document } from '@langchain/core/documents';
+import { VectorStore } from '../db/postgresVectorStore';
+import { logger } from '../utils/logger';
+import * as fs from 'fs/promises';
+import { calculateHash } from '../utils/contentUtils';
+import {
+  RecursiveMarkdownSplitter,
+  type SplitOptions,
+} from '../utils/RecursiveMarkdownSplitter';
+import { getPythonPath, getTempDir } from '../utils/paths';
+
+/**
+ * Ingester for Starknet blog posts documentation
+ *
+ * This ingester processes pre-summarized Starknet blog posts from the generated
+ * summary file, chunks them using the RecursiveMarkdownSplitter, and stores them
+ * in the vector database for retrieval.
+ */
+export class StarknetBlogIngester extends MarkdownIngester {
+  /**
+   * Constructor for the Starknet Blog ingester
+   */
+  constructor() {
+    // Define the configuration for the Starknet Blog
+    const config: BookConfig = {
+      repoOwner: 'starknet',
+      repoName: 'starknet-blog',
+      fileExtension: '.md',
+      chunkSize: 4096,
+      chunkOverlap: 512,
+      baseUrl: 'https://www.starknet.io/blog',
+      urlSuffix: '',
+      useUrlMapping: false,
+    };
+
+    super(config, DocumentSource.STARKNET_BLOG);
+  }
+
+  /**
+   * Read the pre-summarized Starknet blog documentation file
+   */
+  async readSummaryFile(): Promise<string> {
+    const summaryPath = getPythonPath(
+      'src',
+      'scripts',
+      'summarizer',
+      'generated',
+      'blog_summary.md',
+    );
+
+    logger.info(`Reading Starknet blog summary from ${summaryPath}`);
+    const text = await fs.readFile(summaryPath, 'utf-8');
+    return text;
+  }
+
+  /**
+   * Chunk the blog summary file using RecursiveMarkdownSplitter
+   *
+   * This function takes the markdown content and splits it using a recursive
+   * strategy that respects headers and code blocks and maintains overlap
+   * between chunks.
+   *
+   * @param text - The markdown content to chunk
+   * @returns Promise<Document<BookChunk>[]> - Array of document chunks
+   */
+  async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
+    // Configure the splitter with appropriate settings
+    const splitOptions: SplitOptions = {
+      maxChars: 2048,
+      minChars: 500,
+      overlap: 256,
+      headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
+      preserveCodeBlocks: true,
+      idPrefix: 'starknet-blog',
+      trim: true,
+    };
+
+    // Create the splitter and split the content
+    const splitter = new RecursiveMarkdownSplitter(splitOptions);
+    const chunks = splitter.splitMarkdownToChunks(text);
+
+    logger.info(
+      `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
+    );
+
+    // Convert chunks to Document<BookChunk> format
+    const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
+      const contentHash = calculateHash(chunk.content);
+
+      return new Document<BookChunk>({
+        pageContent: chunk.content,
+        metadata: {
+          name: chunk.meta.title,
+          title: chunk.meta.title,
+          chunkNumber: chunk.meta.chunkNumber, // Already 0-based
+          contentHash: contentHash,
+          uniqueId: chunk.meta.uniqueId,
+          sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
+          source: this.source,
+        },
+      });
+    });
+
+    return localChunks;
+  }
+
+  /**
+   * Starknet Blog specific processing based on the pre-summarized markdown file
+   * @param vectorStore
+   */
+  public override async process(vectorStore: VectorStore): Promise<void> {
+    try {
+      // 1. Read the pre-summarized documentation
+      const text = await this.readSummaryFile();
+
+      // 2. Create chunks from the documentation
+      const chunks = await this.chunkSummaryFile(text);
+
+      logger.info(
+        `Created ${chunks.length} chunks from Starknet blog documentation`,
+      );
+
+      // 3. Update the vector store with the chunks
+      await this.updateVectorStore(vectorStore, chunks);
+
+      // 4. Clean up any temporary files (none in this case)
+      await this.cleanupDownloadedFiles();
+    } catch (error) {
+      this.handleError(error);
+    }
+  }
+
+  /**
+   * Get the directory path for extracting files
+   *
+   * @returns string - Path to the extract directory
+   */
+  protected getExtractDir(): string {
+    return getTempDir('starknet-blog');
+  }
+
+  /**
+   * Override cleanupDownloadedFiles since we don't download anything
+   */
+  protected override async cleanupDownloadedFiles(): Promise<void> {
+    // No cleanup needed as we're reading from a local file
+    logger.info('No cleanup needed - using local summary file');
+  }
+}
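Each chunk's metadata above carries a `contentHash` computed by `calculateHash`, which is what lets `updateVectorStore` detect unchanged chunks on re-ingestion. A minimal sketch of such a fingerprint, assuming an MD5 hex digest (the actual algorithm behind `calculateHash` is not shown in this diff):

```ts
import { createHash } from 'crypto';

// Sketch (assumption): a stable fingerprint of a chunk's text so an
// updated vector store can skip unchanged content. MD5 is chosen here
// only for illustration; any stable digest works for change detection.
function contentHash(content: string): string {
  return createHash('md5').update(content, 'utf8').digest('hex');
}
```

Because the hash depends only on `chunk.content`, re-running the ingester over an unchanged summary file yields identical hashes, so only chunks whose text actually changed need re-embedding.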

ingesters/src/types/index.ts

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ export enum DocumentSource {
   CORELIB_DOCS = 'corelib_docs',
   SCARB_DOCS = 'scarb_docs',
   STARKNET_JS = 'starknet_js',
+  STARKNET_BLOG = 'starknet_blog',
 }
 
 export type BookChunk = {

ingesters/src/utils/RecursiveMarkdownSplitter.ts

Lines changed: 24 additions & 5 deletions
@@ -1,6 +1,15 @@
 import { logger } from './logger';
 
 // Public API interfaces
+/**
+ * Options controlling how markdown is split into chunks. Two high-level modes exist:
+ *
+ * - Default mode (splitFullPage: false):
+ *   Recursively splits by headers (per headerLevels), paragraphs, and lines to respect
+ *   maxChars. Applies minChars-based merging and backward overlap. Avoids splitting
+ *   inside non-breakable code fences when possible.
+ *
+ */
 export interface SplitOptions {
   /** Maximum characters per chunk (UTF-16 .length), not counting overlap. Default: 2048 */
   maxChars?: number;
@@ -72,6 +81,13 @@ interface Tokens {
   sourceRanges: Array<{ start: number; end: number; url: string }>;
 }
 
+/**
+ * Splits markdown into semantic chunks with metadata.
+ *
+ * Modes
+ * - Default: recursive splitting by headers/paragraphs/lines to satisfy maxChars, with overlap and
+ *   minChars-based merging, while respecting code blocks.
+ */
 export class RecursiveMarkdownSplitter {
   private readonly options: Required<SplitOptions>;
 
@@ -124,7 +140,7 @@ export class RecursiveMarkdownSplitter {
   }
 
   /**
-   * Main entry point to split markdown into chunks
+   * Split markdown into chunks
    */
   public splitMarkdownToChunks(markdown: string): Chunk[] {
     // Handle empty input
@@ -209,15 +225,18 @@ export class RecursiveMarkdownSplitter {
   }
 
   /**
-   * Parse special formatted Sources blocks and compute active source ranges
-   * A block looks like:
+   * Parse Sources blocks and compute active source ranges used for meta.sourceLink.
+   *
+   * Format:
    *   ---\n
    *   Sources:\n
    *   - https://example.com/a\n
    *   - https://example.com/b\n
    *   ---
-   * Active source becomes the first URL and applies from the end of the block
-   * until the start of the next Sources block (or end of document).
+   *
+   * The active source becomes the first URL in the list and applies from the end of the closing
+   * '---' until the start of the next Sources block (or EOF). This mapping is used during metadata
+   * attachment to set the chunk's sourceLink.
    */
   private parseSourceRanges(
     markdown: string,
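The Sources-block format documented above can be exercised with a small standalone parser. This regex-based sketch is illustrative only; the class's actual `parseSourceRanges` additionally tracks character ranges so each chunk can be mapped to the source active at its offset:

```ts
// Illustrative parser (assumption, not the repo's code) for the
// documented Sources block format:
//   ---
//   Sources:
//   - https://example.com/a
//   - https://example.com/b
//   ---
// Returns the first URL of each block, i.e. the "active source" that
// applies until the next block. Range tracking is omitted for brevity.
function firstSourceUrls(markdown: string): string[] {
  const blockRe = /^---\r?\nSources:\r?\n((?:- \S+\r?\n)+)---$/gm;
  const urls: string[] = [];
  for (const m of markdown.matchAll(blockRe)) {
    const firstLine = m[1].split(/\r?\n/)[0]; // e.g. "- https://example.com/a"
    urls.push(firstLine.replace(/^- /, ''));
  }
  return urls;
}
```
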

python/optimizers/datasets/user_queries.json

Lines changed: 0 additions & 3 deletions
@@ -466,7 +466,6 @@
   "PS C:\\Users\\kased\\kaseddie-cairo-foundations> tree /F /A\nFolder PATH listing\nVolume serial number is C809-043D\nC:.\n\\---cairo-contracts\n | Scarb.lock\n | Scarb.toml\n |\n +---src\n | lib.cairo\n |\n +---target\n | | CACHEDIR.TAG\n | |\n | \\---dev\n | | kaseddie_balance_contract.sierra.json\n | | kaseddie_balance_contract_integrationtest.test.json\n | | kaseddie_balance_contract_integrationtest.test.sierra.json\n | | kaseddie_balance_contract_integrationtest.test.starknet_artifacts.json\n | | kaseddie_balance_contract_integrationtest_UserVault.test.contract_class.json\n | | kaseddie_balance_contract_unittest.test.json\n | | kaseddie_balance_contract_unittest.test.sierra.json\n | | kaseddie_balance_contract_unittest.test.starknet_artifacts.json\n | | kaseddie_balance_contract_unittest_UserVault.test.contract_class.json\n | | kaseddie_cairo_foundations_unittest.test.json\n | | kaseddie_cairo_foundations_unittest.test.sierra.json\n | | kaseddie_cairo_foundations_unittest.test.starknet_artifacts.json\n | | kaseddie_cairo_foundations_unittest_UserVault.test.contract_class.json\n | |\n | +---.fingerprint\n | | +---core-o8ctti9fe3p52\n | | | core\n | | |\n | | +---core-sc59she7p1k9k\n | | | core\n | | |\n | | +---kaseddie_balance_contract-g7l5vl2d6tbts\n | | | kaseddie_balance_contract\n | | |\n | | +---kaseddie_balance_contract-sfovo0kjo4j24\n | | | kaseddie_balance_contract\n | | |\n | | +---kaseddie_balance_contract_integrationtest-ston3v8tncj0c\n | | | kaseddie_balance_contract_integrationtest\n | | |\n | | +---kaseddie_balance_contract_unittest-95sc4uqcckhdo\n | | | kaseddie_balance_contract_unittest\n | | |\n | | +---kaseddie_balance_contract_unittest-ir7jeflt0lpls\n | | | kaseddie_balance_contract_unittest\n | | |\n | | \\---kaseddie_cairo_foundations_unittest-tvrbv3hnqi4ui\n | | kaseddie_cairo_foundations_unittest\n | |\n | \\---incremental\n | core-o8ctti9fe3p52.bin\n | core-sc59she7p1k9k.bin\n | kaseddie_balance_contract-g7l5vl2d6tbts.bin\n | kaseddie_balance_contract-sfovo0kjo4j24.bin\n | kaseddie_balance_contract_integrationtest-ston3v8tncj0c.bin\n | kaseddie_balance_contract_unittest-95sc4uqcckhdo.bin\n | kaseddie_balance_contract_unittest-ir7jeflt0lpls.bin\n | kaseddie_cairo_foundations_unittest-tvrbv3hnqi4ui.bin\n |\n \\---tests\n uservault_test.cairo\n\nPS C:\\Users\\kased\\kaseddie-cairo-foundations>",
   "que es fn?",
   "que mensaje recomiendas para el assert ?\n\n    fn add_user(ref self: ContractState, user: ContractAddress) {\n        let caller = get_caller_address();\n\n        let mut is_dao: bool = false;\n        let mut i: u16 = 0;\n\n        while i != self.dao_counter.read() {\n            if self.daos.read(i).dao_address == caller {\n                is_dao = true;\n                return;\n            }\n            i += 1;\n        }\n\n        assert!(is_dao, \"User is not a DAO\");\n        _add_user(ref self, user);\n    }",
-  "que tipo de preguntas puedo hacerte?",
   "quiero asignarles roles de mint a contratos de mi proyecto, como me aseguro que estos addres pertenecen a mi proyecto y no son addres de usuarios u otros contratos externos malisiosos?",
   "read files in tests",
   "read files in tests\n\n",
@@ -668,12 +667,10 @@
   "смотри мы пользуемся старкнет девнет для локальной сети, верно? А если я хочу не пустую сеть, а форк текущей",
   "уяви що сьогодні 01.01.2026. Які криптомонети виросли найбільше в ціні?",
   "چطوری داداش",
-  "스타크넷의 회사 위치와 사진을 보여줘",
   "今天天氣如何",
   "介紹一下STRK",
   "介绍一下 Starknet 上的 Paymaster",
   "但實際上不是遠遠超過了預估的費用嗎",
-  "你現在是C#程式語言高手",
   "你能给我一个最新版本的 cairo 项目的配置吗?",
   "你能给我一个正确的 starknet",
   "关于 Cairo 编写的 Starknet 合约中的 Storage 与 State,下列说法哪些是正确的?(多选)\n\nA. 合约状态变量的存储是通过隐式的 Merkle Patricia Tree 实现的\n\nB. 每个 @storage_var 声明会创建一个对应的 getter 函数\n\nC. Cairo 中不能在合约外部直接读取存储变量\n\nD. 合约的存储数据按 Slot 和 Offset 编码组织\n\nE. Storage layout 是编译时静态生成的,不能动态调整",
