
Commit f51f54b

feat: embeddings for 2025 starknet blog (#78)
1 parent 21bcf7c commit f51f54b

File tree: 13 files changed (+4796, −18 lines)

ingesters/__tests__/IngesterFactory.test.ts

Lines changed: 1 addition & 0 deletions

```diff
@@ -75,6 +75,7 @@ describe('IngesterFactory', () => {
       DocumentSource.CORELIB_DOCS,
       DocumentSource.SCARB_DOCS,
       DocumentSource.STARKNET_JS,
+      DocumentSource.STARKNET_BLOG,
     ]);
   });
 });
```

ingesters/src/IngesterFactory.ts

Lines changed: 4 additions & 0 deletions

```diff
@@ -8,6 +8,7 @@ import { OpenZeppelinDocsIngester } from './ingesters/OpenZeppelinDocsIngester';
 import { CoreLibDocsIngester } from './ingesters/CoreLibDocsIngester';
 import { ScarbDocsIngester } from './ingesters/ScarbDocsIngester';
 import { StarknetJSIngester } from './ingesters/StarknetJSIngester';
+import { StarknetBlogIngester } from './ingesters/StarknetBlogIngester';

 /**
  * Factory class for creating ingesters
@@ -50,6 +51,9 @@ export class IngesterFactory {
       case 'starknet_js':
         return new StarknetJSIngester();

+      case 'starknet_blog':
+        return new StarknetBlogIngester();
+
       default:
         throw new Error(`Unsupported source: ${source}`);
     }
```
ingesters/src/ingesters/StarknetBlogIngester.ts

Lines changed: 153 additions & 0 deletions

New file (@@ -0,0 +1,153 @@):

```typescript
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '../db/postgresVectorStore';
import { logger } from '../utils/logger';
import * as fs from 'fs/promises';
import * as path from 'path';
import { calculateHash } from '../utils/contentUtils';
import {
  RecursiveMarkdownSplitter,
  type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath } from '../utils/paths';

/**
 * Ingester for Starknet blog posts documentation
 *
 * This ingester processes pre-summarized Starknet blog posts from the generated
 * summary file, chunks them using the RecursiveMarkdownSplitter, and stores them
 * in the vector database for retrieval.
 */
export class StarknetBlogIngester extends MarkdownIngester {
  /**
   * Constructor for the Starknet Blog ingester
   */
  constructor() {
    // Define the configuration for the Starknet Blog
    const config: BookConfig = {
      repoOwner: 'starknet',
      repoName: 'starknet-blog',
      fileExtension: '.md',
      chunkSize: 4096,
      chunkOverlap: 512,
      baseUrl: 'https://www.starknet.io/blog',
      urlSuffix: '',
      useUrlMapping: false,
    };

    super(config, DocumentSource.STARKNET_BLOG);
  }

  /**
   * Read the pre-summarized Starknet blog documentation file
   */
  async readSummaryFile(): Promise<string> {
    const summaryPath = getPythonPath(
      'src',
      'scripts',
      'summarizer',
      'generated',
      'blog_summary.md',
    );

    logger.info(`Reading Starknet blog summary from ${summaryPath}`);
    const text = await fs.readFile(summaryPath, 'utf-8');
    return text;
  }

  /**
   * Chunk the blog summary file using RecursiveMarkdownSplitter
   *
   * This function takes the markdown content and splits it using a recursive
   * strategy that respects headers, code blocks, and maintains overlap between chunks.
   *
   * @param text - The markdown content to chunk
   * @returns Promise<Document<BookChunk>[]> - Array of document chunks
   */
  async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
    // Configure the splitter with appropriate settings
    const splitOptions: SplitOptions = {
      maxChars: 2048,
      minChars: 500,
      overlap: 256,
      headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
      preserveCodeBlocks: true,
      idPrefix: 'starknet-blog',
      trim: true,
    };

    // Create the splitter and split the content
    const splitter = new RecursiveMarkdownSplitter(splitOptions);
    const chunks = splitter.splitMarkdownToChunks(text);

    logger.info(
      `Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
    );

    // Convert chunks to Document<BookChunk> format
    const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
      const contentHash = calculateHash(chunk.content);

      return new Document<BookChunk>({
        pageContent: chunk.content,
        metadata: {
          name: chunk.meta.title,
          title: chunk.meta.title,
          chunkNumber: chunk.meta.chunkNumber, // Already 0-based
          contentHash: contentHash,
          uniqueId: chunk.meta.uniqueId,
          sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
          source: this.source,
        },
      });
    });

    return localChunks;
  }

  /**
   * Starknet Blog specific processing based on the pre-summarized markdown file
   * @param vectorStore
   */
  public override async process(vectorStore: VectorStore): Promise<void> {
    try {
      // 1. Read the pre-summarized documentation
      const text = await this.readSummaryFile();

      // 2. Create chunks from the documentation
      const chunks = await this.chunkSummaryFile(text);

      logger.info(
        `Created ${chunks.length} chunks from Starknet blog documentation`,
      );

      // 3. Update the vector store with the chunks
      await this.updateVectorStore(vectorStore, chunks);

      // 4. Clean up any temporary files (no temp files in this case)
      await this.cleanupDownloadedFiles();
    } catch (error) {
      this.handleError(error);
    }
  }

  /**
   * Get the directory path for extracting files
   *
   * @returns string - Path to the extract directory
   */
  protected getExtractDir(): string {
    const { getTempDir } = require('../utils/paths');
    return getTempDir('starknet-blog');
  }

  /**
   * Override cleanupDownloadedFiles since we don't download anything
   */
  protected override async cleanupDownloadedFiles(): Promise<void> {
    // No cleanup needed as we're reading from a local file
    logger.info('No cleanup needed - using local summary file');
  }
}
```
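For context, here is a minimal sketch of how this ingester would be exercised end to end. The factory method name `createIngester` and the way a `VectorStore` instance is obtained are assumptions for illustration; neither is shown in this commit.

```typescript
import { IngesterFactory } from './IngesterFactory';
import { DocumentSource } from './types';
import { VectorStore } from './db/postgresVectorStore';

// Hypothetical driver: `createIngester` and the VectorStore setup are
// assumptions, not part of this diff.
async function ingestStarknetBlog(vectorStore: VectorStore): Promise<void> {
  // The switch case added above maps 'starknet_blog' to StarknetBlogIngester
  const ingester = IngesterFactory.createIngester(DocumentSource.STARKNET_BLOG);

  // process() reads blog_summary.md, chunks it with RecursiveMarkdownSplitter,
  // and upserts the resulting Document<BookChunk> records
  await ingester.process(vectorStore);
}
```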

ingesters/src/types/index.ts

Lines changed: 1 addition & 0 deletions

```diff
@@ -16,6 +16,7 @@ export enum DocumentSource {
   CORELIB_DOCS = 'corelib_docs',
   SCARB_DOCS = 'scarb_docs',
   STARKNET_JS = 'starknet_js',
+  STARKNET_BLOG = 'starknet_blog',
 }

 export type BookChunk = {
```

python/optimizers/results/optimized_mcp_program.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

python/optimizers/results/optimized_rag.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

python/optimizers/results/optimized_retrieval_program.json

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

python/src/cairo_coder/core/types.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -38,6 +38,7 @@ class DocumentSource(str, Enum):
     CORELIB_DOCS = "corelib_docs"
     SCARB_DOCS = "scarb_docs"
     STARKNET_JS = "starknet_js"
+    STARKNET_BLOG = "starknet_blog"


 class DocumentMetadata(TypedDict, total=False):
```

python/src/cairo_coder/dspy/query_processor.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -26,6 +26,7 @@
     DocumentSource.CORELIB_DOCS: "Cairo Core Library Documentation. For using the Cairo core library: basic types, stdlib functions, stdlib structs, macros, and other core concepts. Essential for Cairo programming questions.",
     DocumentSource.SCARB_DOCS: "Scarb Documentation. For using the Scarb package manager: building, compiling, generating compilation artifacts, managing dependencies, configuration of Scarb.toml.",
     DocumentSource.STARKNET_JS: "StarknetJS Documentation. For using the StarknetJS library: interacting with Starknet contracts, (calls and transactions), deploying Starknet contracts, front-end APIs, javascript integration examples, guides, tutorials and general JS/TS documentation for starknet.",
+    DocumentSource.STARKNET_BLOG: "Starknet Blog Documentation. For latest Starknet updates, announcements, feature releases, ecosystem developments, integration guides, and community updates. Useful for understanding recent Starknet innovations, new tools, partnerships, and protocol enhancements.",
 }

 # Ensure all DocumentSource variants are covered
```

python/src/scripts/docs_crawler.py

Lines changed: 5 additions & 9 deletions

```diff
@@ -282,10 +282,6 @@ def extract_content(self, html: str, url: str) -> tuple[str, str]:
             if any(keyword in tag_id or keyword in tag_classes for keyword in boilerplate_keywords):
                 tags_to_remove.append(tag)

-        # Now decompose all collected tags
-        for tag in tags_to_remove:
-            tag.decompose()
-
         # Try to find main content
         main_content = None

@@ -347,8 +343,6 @@ def compile_markdown(self) -> str:
         lines = [
             f"# {self.domain} — Snapshot ({date_str})",
             "",
-            "",
-            "---",
             ""
         ]
@@ -362,15 +356,17 @@ def compile_markdown(self) -> str:
             if not markdown or len(markdown.strip()) < 50:
                 markdown = "*No content extracted.*"

+            # Add individual Sources block for this page
             lines.extend([
-                f"**Source URL:** {url}",
+                "---",
+                "Sources:",
+                f" - {url}",
+                "---",
                 "",
                 f"## {title}",
                 "",
                 markdown,
                 "",
-                "---",
-                ""
             ])
         else:
             # Skip pages that failed to fetch or returned non-HTML content
```
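Reading off the `lines.extend` call above, each successfully fetched page in the compiled snapshot should now be emitted roughly as follows (the URL and title here are placeholders, not taken from the commit):

```markdown
---
Sources:
 - https://www.starknet.io/blog/example-post
---

## Example Post Title

...extracted page content...
```

The per-page `Sources:` block replaces both the old `**Source URL:**` line and the trailing `---` separator, so each page now carries its own delimited provenance header.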
