
Commit 9114edc

feat: add scarb docs (#16)
Co-authored-by: alvinouille <[email protected]>
1 parent 4abfbbf commit 9114edc


9 files changed (+235, −22 lines)


CLAUDE.md

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+Cairo Coder is an open-source Cairo language code generation service using Retrieval-Augmented Generation (RAG) to transform natural language requests into functional Cairo smart contracts and programs. It was adapted from the Starknet Agent project.
+
+## Essential Commands
+
+### Development
+
+- `pnpm install` - Install dependencies (requires Node.js 20+ and pnpm 9+)
+- `pnpm dev` - Start all services in development mode with hot reload
+- `pnpm build` - Build all packages for production
+- `pnpm clean` - Clean package build files
+- `pnpm clean:all` - Clean all build files and node_modules
+
+### Testing
+
+- `pnpm test` - Run all tests across packages
+- `pnpm --filter @cairo-coder/agents test` - Run tests for a specific package
+- `pnpm --filter @cairo-coder/agents test -- -t "test name"` - Run a single test by name
+- `pnpm --filter @cairo-coder/backend check-types` - Type-check a specific package
+
+### Documentation Ingestion
+
+- `pnpm generate-embeddings` - Interactive ingestion of documentation sources
+- `pnpm generate-embeddings:yes` - Non-interactive ingestion (for CI/CD)
+
+### Docker Operations
+
+- `docker compose up postgres backend` - Start the main services
+- `docker compose up ingester` - Run documentation ingestion
+
+## High-Level Architecture
+
+### Monorepo Structure
+
+- **packages/agents**: Core RAG pipeline orchestrating query processing, document retrieval, and code generation
+- **packages/backend**: Express API server providing OpenAI-compatible endpoints
+- **packages/ingester**: Documentation processing system using the template method pattern
+- **packages/typescript-config**: Shared TypeScript configuration
+
+### Key Design Patterns
+
+1. **RAG Pipeline** (packages/agents/src/core/pipeline/):
+
+   - `QueryProcessor`: Reformulates user queries for better retrieval
+   - `DocumentRetriever`: Searches the pgvector database using similarity measures
+   - `AnswerGenerator`: Generates Cairo code from retrieved documents
+   - `McpPipeline`: Special mode returning raw documents without generation
+
+2. **Ingester System** (packages/ingester/src/ingesters/):
+
+   - `BaseIngester`: Abstract class implementing the template method pattern
+   - Source-specific ingesters extend the base class for each documentation source
+   - A factory (`IngesterFactory`) creates the appropriate ingester instances
+
+3. **Multi-Provider LLM Support**:
+   - Configurable providers: OpenAI, Anthropic, Google Gemini
+   - Provider abstraction in the agents package handles model differences
+   - Streaming and non-streaming response modes
+
+### Configuration
+
+- Copy `packages/agents/sample.config.toml` to `config.toml`
+- Required configuration:
+  - LLM provider API keys (OPENAI, GEMINI, ANTHROPIC)
+  - Database connection in the [VECTOR_DB] section
+  - Model selection in the [PROVIDERS] section
+- Environment variables:
+  - Root `.env`: PostgreSQL initialization (POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB)
+  - `packages/backend/.env`: Optional LangSmith tracing configuration
+
+### Database Architecture
+
+- PostgreSQL with the pgvector extension for vector similarity search
+- Embedding storage for documentation chunks
+- Configurable similarity measures (cosine, dot product, Euclidean)
+
+## Development Guidelines
+
+### Code Organization
+
+- Follow existing patterns in neighboring files
+- Use dependency injection for testability
+- Mock external dependencies (LLMs, databases) in tests
+- Prefer editing existing files over creating new ones
+- Follow the template method pattern for new ingesters
+
+### Testing Approach
+
+- Jest for all testing
+- Test files live in `__tests__/` directories
+- Mock LLM calls and database operations
+- Test each ingester implementation separately
+- Use descriptive test names that explain the expected behavior
+
+### Adding New Documentation Sources
+
+1. Create a new ingester extending `BaseIngester` in packages/ingester/src/ingesters/
+2. Implement the required abstract methods
+3. Register it in `IngesterFactory`
+4. Update the configuration if needed
+
+### MCP (Model Context Protocol) Mode
+
+- Special mode activated via the `x-mcp-mode: true` header
+- Returns raw documentation chunks without LLM generation
+- Useful for integrating with other tools that need Cairo documentation
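MCP mode is driven entirely by a request header. As an illustrative sketch only (the endpoint path, port, and body shape below are assumptions; the repository documents just the `x-mcp-mode: true` header on its OpenAI-compatible API), a request could be assembled like this:

```typescript
// Sketch of an MCP-mode request to the backend's OpenAI-compatible API.
// ASSUMPTIONS: URL and body shape are hypothetical; only the
// 'x-mcp-mode': 'true' header comes from the repository docs.
interface McpRequest {
  url: string;
  method: string;
  headers: Record<string, string>;
  body: string;
}

function buildMcpRequest(query: string): McpRequest {
  return {
    url: 'http://localhost:3001/v1/chat/completions', // assumed local backend address
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-mcp-mode': 'true', // return raw documentation chunks, skip LLM generation
    },
    body: JSON.stringify({ messages: [{ role: 'user', content: query }] }),
  };
}

// Usage (Node 18+):
//   const { url, ...init } = buildMcpRequest('How do I emit an event in Cairo?');
//   const res = await fetch(url, init); // body is JSON-serialized document chunks
```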

packages/agents/src/config/agent.ts

Lines changed: 4 additions & 3 deletions
@@ -4,9 +4,7 @@ import { basicTestTemplate } from './templates/testTemplate';
 import { VectorStore } from '../db/postgresVectorStore';
 import { DocumentSource, RagSearchConfig } from '../types';
 
-export const getAgentConfig = (
-  vectorStore: VectorStore,
-): RagSearchConfig => {
+export const getAgentConfig = (vectorStore: VectorStore): RagSearchConfig => {
   return {
     name: 'Cairo Coder',
     prompts: cairoCoderPrompts,
@@ -19,6 +17,9 @@ export const getAgentConfig = (
       DocumentSource.CAIRO_BOOK,
      DocumentSource.CAIRO_BY_EXAMPLE,
      DocumentSource.STARKNET_FOUNDRY,
+      DocumentSource.CORELIB_DOCS,
+      DocumentSource.OPENZEPPELIN_DOCS,
+      DocumentSource.SCARB_DOCS,
     ],
   };
 };

packages/agents/src/config/prompts/cairoCoderPrompts.ts

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ You will be given a conversation history and a follow-up question. Your primary
 * **cairo_by_example:** Cairo by Example Documentation. Provides practical Cairo code snippets for specific language features or common patterns. Useful for "how-to" syntax questions.
 * **openzeppelin_docs:** OpenZeppelin Cairo Contracts Documentation. For using the OZ library: standard implementations (ERC20, ERC721), access control, security patterns, contract upgradeability. Crucial for building standard-compliant contracts.
 * **corelib_docs:** Cairo Core Library Documentation. For using the Cairo core library: basic types, stdlib functions, stdlib structs, macros, and other core concepts. Essential for Cairo programming questions.
+* **scarb_docs:** Scarb Documentation. For using the Scarb package manager: building, compiling, generating compilation artifacts, managing dependencies, and configuring Scarb.toml.
 
 **Examples:**
 

packages/agents/src/core/pipeline/documentRetriever.ts

Lines changed: 5 additions & 1 deletion
@@ -54,7 +54,11 @@ export class DocumentRetriever {
     ].map(
       (content) => results.flat().find((doc) => doc.pageContent === content)!,
     );
-    logger.debug('Retrieved documents:', { count: uniqueDocs.length });
+    const sourceSet = new Set(uniqueDocs.map((doc) => doc.metadata.source));
+    logger.debug('Retrieved documents:', {
+      count: uniqueDocs.length,
+      sources: Array.from(sourceSet),
+    });
     return uniqueDocs;
   }
 

packages/agents/src/core/pipeline/mcpPipeline.ts

Lines changed: 35 additions & 14 deletions
@@ -1,5 +1,5 @@
 import { RagPipeline } from './ragPipeline';
-import { RagInput, StreamHandler } from '../../types';
+import { RagInput, RetrievedDocuments, StreamHandler } from '../../types';
 import { logger, TokenTracker } from '../../utils';
 
 /**
@@ -14,7 +14,7 @@ export class McpPipeline extends RagPipeline {
     try {
       // Reset token counters at the start of each pipeline run
       TokenTracker.resetSessionCounters();
-
+
       logger.info('Starting MCP pipeline', { query: input.query });

       // Step 1: Process the query
@@ -30,33 +30,54 @@
 
       // Step 3: Return raw documents without answer generation
       logger.info('MCP mode - returning raw documents');
-
-      const rawDocuments = retrieved.documents.map(doc => ({
-        pageContent: doc.pageContent,
-        metadata: doc.metadata
-      }));
+
+      const context = this.assembleDocuments(retrieved);
 
       handler.emitResponse({
-        content: JSON.stringify(rawDocuments, null, 2),
+        content: JSON.stringify(context, null, 2),
       } as any);
 
       logger.debug('MCP pipeline ended');
-
+
       // Log final token usage
       const tokenUsage = TokenTracker.getSessionTokenUsage();
-      logger.info('MCP Pipeline completed', {
+      logger.info('MCP Pipeline completed', {
         query: input.query,
         tokenUsage: {
           promptTokens: tokenUsage.promptTokens,
           responseTokens: tokenUsage.responseTokens,
-          totalTokens: tokenUsage.totalTokens
-        }
+          totalTokens: tokenUsage.totalTokens,
+        },
       });
-
+
       handler.emitEnd();
     } catch (error) {
       logger.error('MCP Pipeline error:', error);
       handler.emitError('An error occurred while processing your request');
     }
   }
-}
+
+  public assembleDocuments(retrieved: RetrievedDocuments): string {
+    const docs = retrieved.documents;
+    if (!docs.length) {
+      return (
+        this.config.prompts.noSourceFoundPrompt ||
+        'No relevant information found.'
+      );
+    }
+
+    // Concatenate all document content into a single string
+    let context = docs.map((doc) => doc.pageContent).join('\n\n');
+
+    // Add contract and test templates at the end if applicable
+    const { isContractRelated, isTestRelated } = retrieved.processedQuery;
+    if (isContractRelated && this.config.contractTemplate) {
+      context += '\n\n' + this.config.contractTemplate;
+    }
+    if (isTestRelated && this.config.testTemplate) {
+      context += '\n\n' + this.config.testTemplate;
+    }
+
+    return context;
+  }
+}

packages/agents/src/types/index.ts

Lines changed: 1 addition & 0 deletions
@@ -103,6 +103,7 @@ export enum DocumentSource {
   CAIRO_BY_EXAMPLE = 'cairo_by_example',
   OPENZEPPELIN_DOCS = 'openzeppelin_docs',
   CORELIB_DOCS = 'corelib_docs',
+  SCARB_DOCS = 'scarb_docs',
 }
 
 export type BookChunk = {

packages/ingester/src/IngesterFactory.ts

Lines changed: 5 additions & 0 deletions
@@ -54,6 +54,10 @@ export class IngesterFactory {
         } = require('./ingesters/CoreLibDocsIngester');
         return new CoreLibDocsIngester();
 
+      case 'scarb_docs':
+        const { ScarbDocsIngester } = require('./ingesters/ScarbDocsIngester');
+        return new ScarbDocsIngester();
+
       default:
         throw new Error(`Unsupported source: ${source}`);
     }
@@ -72,6 +76,7 @@
       DocumentSource.CAIRO_BY_EXAMPLE,
       DocumentSource.OPENZEPPELIN_DOCS,
       DocumentSource.CORELIB_DOCS,
+      DocumentSource.SCARB_DOCS,
     ];
   }
 }
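The factory switch and source registration above follow the template-method-plus-factory structure described in CLAUDE.md. A minimal, self-contained sketch of that structure (class names, method names, and the chunking step here are simplified stand-ins, not the repository's actual `BaseIngester` API):

```typescript
// Simplified stand-in for the ingester pattern: an abstract base fixes the
// pipeline, subclasses supply only the download step, and a factory maps
// source ids to instances. All names here are illustrative.
interface Page {
  name: string;
  content: string;
}

abstract class SketchIngester {
  constructor(protected readonly source: string) {}

  // Template method: the overall pipeline is fixed here...
  async ingest(): Promise<Page[]> {
    const pages = await this.downloadAndExtractDocs();
    // Stand-in for chunking/embedding: drop empty pages
    return pages.filter((p) => p.content.length > 0);
  }

  // ...while each source-specific subclass implements only this step.
  protected abstract downloadAndExtractDocs(): Promise<Page[]>;
}

class FakeDocsIngester extends SketchIngester {
  constructor() {
    super('fake_docs');
  }
  protected async downloadAndExtractDocs(): Promise<Page[]> {
    return [
      { name: 'intro', content: '# Intro' },
      { name: 'empty', content: '' },
    ];
  }
}

// Factory mirroring IngesterFactory's switch on the source id
function createIngester(source: string): SketchIngester {
  switch (source) {
    case 'fake_docs':
      return new FakeDocsIngester();
    default:
      throw new Error(`Unsupported source: ${source}`);
  }
}
```

The design keeps the shared pipeline (download, process, chunk, store) in one place, so adding a source like `scarb_docs` touches only a new subclass and one factory case.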

packages/ingester/src/generateEmbeddings.ts

Lines changed: 1 addition & 4 deletions
@@ -6,7 +6,6 @@ import { loadOpenAIEmbeddingsModels } from '@cairo-coder/backend/config/provider
 import { DocumentSource } from '@cairo-coder/agents/types/index';
 import { IngesterFactory } from './IngesterFactory';
 
-
 /**
  * Global vector store instance
  */
@@ -138,9 +137,7 @@ async function main() {
   if (target === 'Everything') {
     // Ingest all sources
     const sources = IngesterFactory.getAvailableSources();
-    for (const source of sources) {
-      await ingestSource(source);
-    }
+    await Promise.all(sources.map((source) => ingestSource(source)));
   } else {
     // Ingest specific source
     await ingestSource(target);
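This change swaps the sequential `for...of`/`await` loop for `Promise.all`, so all sources are ingested concurrently and total wall time is roughly that of the slowest source rather than the sum. A small self-contained sketch of the behavior (the delayed `work` helper is a stand-in for `ingestSource`):

```typescript
// Illustrative only: `work` stands in for ingestSource. With Promise.all,
// every promise is created (and starts running) up front, and results come
// back in input order regardless of completion order.
const work = (label: string, ms: number): Promise<string> =>
  new Promise((resolve) => setTimeout(() => resolve(label), ms));

async function runConcurrently(): Promise<string[]> {
  // Total wall time ≈ max(30, 10, 20) ms, not the 60 ms a sequential loop takes
  return Promise.all([work('a', 30), work('b', 10), work('c', 20)]);
}
```

One trade-off the diff does not address: `Promise.all` rejects as soon as any single ingestion fails, abandoning the batch, whereas the old loop stopped at the first failing source in order; `Promise.allSettled` is the usual alternative when partial progress should survive a failure.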
packages/ingester/src/ingesters/ScarbDocsIngester.ts

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+import * as path from 'path';
+import { DocumentSource } from '@cairo-coder/agents/types/index';
+import { BookConfig, BookPageDto } from '../utils/types';
+import { processDocFiles } from '../utils/fileUtils';
+import { logger } from '@cairo-coder/agents/utils/index';
+import { exec as execCallback } from 'child_process';
+import { promisify } from 'util';
+import { MarkdownIngester } from './MarkdownIngester';
+
+/**
+ * Ingester for the Scarb documentation
+ *
+ * This ingester downloads the Scarb documentation from the GitHub repository,
+ * processes the markdown files from the website/docs directory, and creates chunks for the vector store.
+ */
+export class ScarbDocsIngester extends MarkdownIngester {
+  /**
+   * Constructor for the Scarb docs ingester
+   */
+  constructor() {
+    // Define the configuration for the Scarb documentation
+    const config: BookConfig = {
+      repoOwner: 'software-mansion',
+      repoName: 'scarb',
+      fileExtension: '.md',
+      chunkSize: 4096,
+      chunkOverlap: 512,
+    };
+
+    super(config, DocumentSource.SCARB_DOCS);
+  }
+
+  /**
+   * Get the directory path for extracting files
+   *
+   * @returns string - Path to the extract directory
+   */
+  protected getExtractDir(): string {
+    return path.join(__dirname, '..', '..', 'temp', 'scarb-docs');
+  }
+
+  /**
+   * Download and extract the repository
+   *
+   * @returns Promise<BookPageDto[]> - Array of book pages
+   */
+  protected async downloadAndExtractDocs(): Promise<BookPageDto[]> {
+    const extractDir = this.getExtractDir();
+    const repoUrl = `https://github.com/${this.config.repoOwner}/${this.config.repoName}.git`;
+
+    logger.info(`Cloning repository from ${repoUrl}`);
+
+    // Clone the repository
+    const exec = promisify(execCallback);
+    try {
+      await exec(`git clone ${repoUrl} ${extractDir}`);
+    } catch (error) {
+      logger.error('Error cloning repository:', error);
+      throw new Error('Failed to clone repository');
+    }
+
+    logger.info('Repository cloned successfully.');
+
+    // Process the markdown files from website/docs directory
+    const docsDir = path.join(extractDir, 'website', 'docs');
+    const pages = await processDocFiles(this.config, docsDir);
+
+    logger.info(`Processed ${pages.length} documentation pages from Scarb`);
+
+    return pages;
+  }
+}
