Merged
1 change: 1 addition & 0 deletions ingesters/__tests__/IngesterFactory.test.ts
@@ -75,6 +75,7 @@ describe('IngesterFactory', () => {
DocumentSource.CORELIB_DOCS,
DocumentSource.SCARB_DOCS,
DocumentSource.STARKNET_JS,
DocumentSource.STARKNET_BLOG,
]);
});
});
4 changes: 4 additions & 0 deletions ingesters/src/IngesterFactory.ts
@@ -8,6 +8,7 @@ import { OpenZeppelinDocsIngester } from './ingesters/OpenZeppelinDocsIngester';
import { CoreLibDocsIngester } from './ingesters/CoreLibDocsIngester';
import { ScarbDocsIngester } from './ingesters/ScarbDocsIngester';
import { StarknetJSIngester } from './ingesters/StarknetJSIngester';
import { StarknetBlogIngester } from './ingesters/StarknetBlogIngester';

/**
* Factory class for creating ingesters
@@ -50,6 +51,9 @@ export class IngesterFactory {
case 'starknet_js':
return new StarknetJSIngester();

case 'starknet_blog':
return new StarknetBlogIngester();

default:
throw new Error(`Unsupported source: ${source}`);
}
153 changes: 153 additions & 0 deletions ingesters/src/ingesters/StarknetBlogIngester.ts
@@ -0,0 +1,153 @@
import { type BookConfig } from '../utils/types';
import { MarkdownIngester } from './MarkdownIngester';
import { type BookChunk, DocumentSource } from '../types';
import { Document } from '@langchain/core/documents';
import { VectorStore } from '../db/postgresVectorStore';
import { logger } from '../utils/logger';
import * as fs from 'fs/promises';
import * as path from 'path';
import { calculateHash } from '../utils/contentUtils';
import {
RecursiveMarkdownSplitter,
type SplitOptions,
} from '../utils/RecursiveMarkdownSplitter';
import { getPythonPath, getTempDir } from '../utils/paths';

/**
* Ingester for Starknet blog posts
*
* This ingester processes pre-summarized Starknet blog posts from the generated
* summary file, chunks them using the RecursiveMarkdownSplitter, and stores them
* in the vector database for retrieval.
*/
export class StarknetBlogIngester extends MarkdownIngester {
/**
* Constructor for the Starknet Blog ingester
*/
constructor() {
// Define the configuration for the Starknet Blog
const config: BookConfig = {
repoOwner: 'starknet',
repoName: 'starknet-blog',
fileExtension: '.md',
chunkSize: 4096,
chunkOverlap: 512,
baseUrl: 'https://www.starknet.io/blog',
urlSuffix: '',
useUrlMapping: false,
};

super(config, DocumentSource.STARKNET_BLOG);
}

/**
* Read the pre-summarized Starknet blog documentation file
*/
async readSummaryFile(): Promise<string> {
const summaryPath = getPythonPath(
'src',
'scripts',
'summarizer',
'generated',
'blog_summary.md',
);

logger.info(`Reading Starknet blog summary from ${summaryPath}`);
const text = await fs.readFile(summaryPath, 'utf-8');
return text;
}

/**
* Chunk the blog summary file using RecursiveMarkdownSplitter
*
* This function takes the markdown content and splits it using a recursive
* strategy that respects headers and code blocks while maintaining overlap between chunks.
*
* @param text - The markdown content to chunk
* @returns Promise<Document<BookChunk>[]> - Array of document chunks
*/
async chunkSummaryFile(text: string): Promise<Document<BookChunk>[]> {
// Configure the splitter with appropriate settings
const splitOptions: SplitOptions = {
maxChars: 2048,
minChars: 500,
overlap: 256,
headerLevels: [1, 2, 3], // Split on H1/H2/H3 (title uses deepest)
preserveCodeBlocks: true,
idPrefix: 'starknet-blog',
trim: true,
};

// Create the splitter and split the content
const splitter = new RecursiveMarkdownSplitter(splitOptions);
const chunks = splitter.splitMarkdownToChunks(text);

logger.info(
`Created ${chunks.length} chunks using RecursiveMarkdownSplitter`,
);

// Convert chunks to Document<BookChunk> format
const localChunks: Document<BookChunk>[] = chunks.map((chunk) => {
const contentHash = calculateHash(chunk.content);

return new Document<BookChunk>({
pageContent: chunk.content,
metadata: {
name: chunk.meta.title,
title: chunk.meta.title,
chunkNumber: chunk.meta.chunkNumber, // Already 0-based
contentHash: contentHash,
uniqueId: chunk.meta.uniqueId,
sourceLink: chunk.meta.sourceLink || this.config.baseUrl,
source: this.source,
},
});
});

return localChunks;
}

/**
* Starknet Blog specific processing based on the pre-summarized markdown file
* @param vectorStore
*/
public override async process(vectorStore: VectorStore): Promise<void> {
try {
// 1. Read the pre-summarized documentation
const text = await this.readSummaryFile();

// 2. Create chunks from the documentation
const chunks = await this.chunkSummaryFile(text);

logger.info(
`Created ${chunks.length} chunks from Starknet blog documentation`,
);

// 3. Update the vector store with the chunks
await this.updateVectorStore(vectorStore, chunks);

// 4. Clean up any temporary files (no temp files in this case)
await this.cleanupDownloadedFiles();
} catch (error) {
this.handleError(error);
}
}

/**
* Get the directory path for extracting files
*
* @returns string - Path to the extract directory
*/
protected getExtractDir(): string {
return getTempDir('starknet-blog');
}

/**
* Override cleanupDownloadedFiles since we don't download anything
*/
protected override async cleanupDownloadedFiles(): Promise<void> {
// No cleanup needed as we're reading from a local file
logger.info('No cleanup needed - using local summary file');
}
}
1 change: 1 addition & 0 deletions ingesters/src/types/index.ts
@@ -16,6 +16,7 @@ export enum DocumentSource {
CORELIB_DOCS = 'corelib_docs',
SCARB_DOCS = 'scarb_docs',
STARKNET_JS = 'starknet_js',
STARKNET_BLOG = 'starknet_blog',
}

export type BookChunk = {
4 changes: 2 additions & 2 deletions python/optimizers/results/optimized_mcp_program.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions python/optimizers/results/optimized_rag.json

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions python/optimizers/results/optimized_retrieval_program.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions python/src/cairo_coder/core/types.py
@@ -38,6 +38,7 @@ class DocumentSource(str, Enum):
CORELIB_DOCS = "corelib_docs"
SCARB_DOCS = "scarb_docs"
STARKNET_JS = "starknet_js"
STARKNET_BLOG = "starknet_blog"


class DocumentMetadata(TypedDict, total=False):
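Because DocumentSource mixes in str, the new member compares equal to its literal value, which is what lets the same 'starknet_blog' string drive the TypeScript factory switch and the Python source selection alike. A minimal standalone sketch (the enum is re-declared here for illustration rather than imported from the package):

from enum import Enum

class DocumentSource(str, Enum):
    # Illustrative subset mirroring python/src/cairo_coder/core/types.py
    STARKNET_JS = "starknet_js"
    STARKNET_BLOG = "starknet_blog"

# str mixin: members compare equal to their raw string values
assert DocumentSource.STARKNET_BLOG == "starknet_blog"
# and can be looked up by value, e.g. when resolving a source name from config
assert DocumentSource("starknet_blog") is DocumentSource.STARKNET_BLOG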
1 change: 1 addition & 0 deletions python/src/cairo_coder/dspy/query_processor.py
@@ -26,6 +26,7 @@
DocumentSource.CORELIB_DOCS: "Cairo Core Library Documentation. For using the Cairo core library: basic types, stdlib functions, stdlib structs, macros, and other core concepts. Essential for Cairo programming questions.",
DocumentSource.SCARB_DOCS: "Scarb Documentation. For using the Scarb package manager: building, compiling, generating compilation artifacts, managing dependencies, configuration of Scarb.toml.",
DocumentSource.STARKNET_JS: "StarknetJS Documentation. For using the StarknetJS library: interacting with Starknet contracts (calls and transactions), deploying Starknet contracts, front-end APIs, JavaScript integration examples, guides, tutorials, and general JS/TS documentation for Starknet.",
DocumentSource.STARKNET_BLOG: "Starknet Blog Documentation. For the latest Starknet updates, announcements, feature releases, ecosystem developments, integration guides, and community updates. Useful for understanding recent Starknet innovations, new tools, partnerships, and protocol enhancements.",
}

# Ensure all DocumentSource variants are covered
14 changes: 5 additions & 9 deletions python/src/scripts/docs_crawler.py
@@ -282,10 +282,6 @@ def extract_content(self, html: str, url: str) -> tuple[str, str]:
if any(keyword in tag_id or keyword in tag_classes for keyword in boilerplate_keywords):
tags_to_remove.append(tag)

# Now decompose all collected tags
for tag in tags_to_remove:
tag.decompose()

# Try to find main content
main_content = None

@@ -347,8 +343,6 @@ def compile_markdown(self) -> str:
lines = [
f"# {self.domain} — Snapshot ({date_str})",
"",
"",
"---",
""
]

@@ -362,15 +356,17 @@
if not markdown or len(markdown.strip()) < 50:
markdown = "*No content extracted.*"

# Add individual Sources block for this page
lines.extend([
f"**Source URL:** {url}",
"---",
"Sources:",
f" - {url}",
"---",
"",
f"## {title}",
"",
markdown,
"",
"---",
""
])
else:
# Skip pages that failed to fetch or returned non-HTML content
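For reference, one page in the new doc_dump.md layout produced by compile_markdown looks roughly like the block below (URL, title, and date are invented for illustration). This is the shape that the new-format detection in filter_2025_blogs.py keys on:

import re

# Hypothetical example of a single page block in the new layout
sample_page = (
    "---\n"
    "Sources:\n"
    " - https://www.starknet.io/blog/example-post\n"
    "---\n"
    "\n"
    "## Example Post Title\n"
    "\n"
    "Home / Blog\n"
    "Mar 3, 2025 · 4 min read\n"
    "\n"
    "Extracted post body in markdown...\n"
)

# Same pattern used by filter_2025_blogs.py to detect the new format
page_pattern = r'(---\s*\nSources:\s*\n\s*-\s*[^\n]+\n---\s*\n+##[^#].*?)(?=\n---\s*\nSources:|\Z)'
assert re.search(page_pattern, sample_page, re.DOTALL) is not None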
147 changes: 147 additions & 0 deletions python/src/scripts/filter_2025_blogs.py
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
"""
Filter doc_dump.md to keep only blog entries published in 2025.
Reads the doc_dump.md file, identifies individual pages separated by "---",
and filters to keep only those containing blog entries with 2025 dates.
"""

import re
from pathlib import Path


def is_2025_blog_entry(content: str) -> bool:
"""
Check if content contains a blog entry from 2025.
Looks for patterns like:
Home / Blog
Feb 5, 2023 · 2 min read
Returns True if the date is from 2025.
"""
# Look for the blog pattern with date
# Pattern: Month Day, Year · time min read
blog_pattern = r'Home\s+/\s+Blog.*?(\w+\s+\d+,\s+(\d{4}))\s+·'

matches = re.findall(blog_pattern, content, re.DOTALL | re.IGNORECASE)

for match in matches:
year = match[1]
if year == '2025':
return True

return False


def filter_doc_dump(input_file: Path, output_file: Path):
"""
Read doc_dump.md and filter to keep only 2025 blog entries.
Supports both the old format (per-page **Source URL:** markers) and the new format (per-page Sources blocks).
"""
with open(input_file, encoding='utf-8') as f:
content = f.read()

filtered_pages = []
total_pages = 0
kept_pages = 0
document_header = ""

# Try new format first (individual Sources blocks)
page_pattern = r'(---\s*\nSources:\s*\n\s*-\s*[^\n]+\n---\s*\n+##[^#].*?)(?=\n---\s*\nSources:|\Z)'
matches = list(re.finditer(page_pattern, content, re.DOTALL))

if matches:
# New format detected
print("Detected new format (individual Sources blocks)")

# Keep document header if present
header_match = re.match(r'^(.*?)(?=\n---\s*\nSources:)', content, re.DOTALL)
document_header = header_match.group(1).strip() if header_match else ""

for match in matches:
page = match.group(1)
if not page.strip():
continue

total_pages += 1

# Check if this is a 2025 blog entry
if is_2025_blog_entry(page):
filtered_pages.append(page.strip())
kept_pages += 1

# Extract URL for logging (from Sources block)
url_match = re.search(r'Sources:\s*\n\s*-\s*(.+)', page)
if url_match:
print(f"Keeping: {url_match.group(1)}")
else:
# Fall back to old format (**Source URL:** markers)
print("Detected old format (**Source URL:** markers)")

pattern = re.compile(r'^\*\*Source URL:\*\*\s+(\S+)', re.MULTILINE)
page_matches = list(pattern.finditer(content))

for i, m in enumerate(page_matches):
url = m.group(1)
start = m.end()
end = page_matches[i + 1].start() if i + 1 < len(page_matches) else len(content)
page_content = content[start:end].strip()

# Remove surrounding '---' separators
lines = page_content.splitlines()
while lines and lines[0].strip() == '---':
lines.pop(0)
while lines and lines[-1].strip() == '---':
lines.pop()
page_content = "\n".join(lines).strip()

total_pages += 1

if is_2025_blog_entry(page_content):
# Convert to new format
new_format_page = f"---\nSources:\n - {url}\n---\n\n{page_content}"
filtered_pages.append(new_format_page)
kept_pages += 1
print(f"Keeping: {url}")

# Construct output with header and filtered pages
output_parts = []
if document_header:
output_parts.append(document_header)
output_parts.append("")
output_parts.append("")

output_parts.extend(filtered_pages)
output_content = '\n\n'.join(output_parts)

# Write to output file
with open(output_file, 'w', encoding='utf-8') as f:
f.write(output_content)

print(f"\n{'-'*60}")
print(f"Total pages processed: {total_pages}")
print(f"Pages kept (2025 blogs): {kept_pages}")
print(f"Pages removed: {total_pages - kept_pages}")
print(f"Output written to: {output_file}")


def main():
# Paths
script_dir = Path(__file__).parent
python_dir = script_dir.parent.parent
input_file = python_dir / "doc_dump.md"
output_file = python_dir / "doc_dump_2025_blogs.md"

if not input_file.exists():
print(f"Error: Input file not found: {input_file}")
return

print(f"Reading from: {input_file}")
print("Filtering for 2025 blog entries...\n")

filter_doc_dump(input_file, output_file)


if __name__ == "__main__":
main()
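A quick sanity check of the date filter, using invented page snippets (the import assumes the script's directory is on sys.path; adjust as needed):

from filter_2025_blogs import is_2025_blog_entry  # assumes python/src/scripts is importable

# Invented snippets in the new per-page layout
kept = (
    "---\nSources:\n - https://www.starknet.io/blog/new-feature\n---\n\n"
    "## New Feature\n\nHome / Blog\nJan 14, 2025 · 3 min read\n\nBody...\n"
)
dropped = (
    "---\nSources:\n - https://www.starknet.io/blog/old-post\n---\n\n"
    "## Old Post\n\nHome / Blog\nFeb 5, 2023 · 2 min read\n\nBody...\n"
)

assert is_2025_blog_entry(kept) is True      # 2025 date -> page is kept
assert is_2025_blog_entry(dropped) is False  # pre-2025 date -> page is dropped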