Skip to content

Commit

Permalink
feat: implement ChunkingRuleParser for file type and service mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
fzlzjerry committed Feb 13, 2025
1 parent d6301bb commit c132ff3
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 7 deletions.
1 change: 1 addition & 0 deletions src/config/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export const knowledgeEnv = createEnv({
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
USE_UNSTRUCTURED_FOR_PDF: process.env.USE_UNSTRUCTURED_FOR_PDF,
FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES || 'pdf=unstructured',
},
server: {
DEFAULT_FILES_CONFIG: z.string().optional(),
Expand Down
56 changes: 49 additions & 7 deletions src/server/modules/ContentChunk/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { ChunkingLoader } from 'src/libs/langchain';
import { Strategy } from 'unstructured-client/sdk/models/shared';
import { ChunkingRuleParser } from './rules';

import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';
Expand All @@ -20,10 +21,58 @@ interface ChunkResult {
export class ContentChunk {
private unstructuredClient: Unstructured;
private langchainClient: ChunkingLoader;
private chunkingRules: Record<string, ChunkingService[]>;

/**
 * Build the two chunking clients and parse the per-file-type chunking rules
 * from `knowledgeEnv.FILE_TYPE_CHUNKING_RULES`.
 */
constructor() {
  this.unstructuredClient = new Unstructured();
  this.langchainClient = new ChunkingLoader();

  // Read the raw rule string last so the statement order matches the clients-first setup.
  const rawRules = knowledgeEnv.FILE_TYPE_CHUNKING_RULES;
  this.chunkingRules = ChunkingRuleParser.parse(rawRules);
}

/**
 * Resolve the ordered list of chunking services configured for a file type.
 *
 * The lookup key is the lower-cased text after the last `/` in `fileType`
 * (e.g. `application/pdf` -> `pdf`); a value without `/` is used as a whole.
 *
 * @param fileType - File type string to look up.
 * @returns The services registered for the key, or `['default']` when no rule
 *   is registered for it.
 */
private getChunkingServices(fileType: string): ChunkingService[] {
  const ext = fileType.split('/').pop()?.toLowerCase() ?? '';

  // Only honour keys that were actually parsed into the rules object. A bare
  // `rules[ext]` would also resolve inherited members such as `constructor`,
  // handing the caller a non-iterable value instead of a service list.
  if (!Object.prototype.hasOwnProperty.call(this.chunkingRules, ext)) {
    return ['default'];
  }

  return this.chunkingRules[ext] || ['default'];
}

/**
 * Chunk a file by trying the services configured for its file type in order.
 *
 * - `unstructured` is used only when `canUseUnstructured()` is true; otherwise
 *   it is skipped.
 * - `doc2x` is not implemented yet and is always skipped.
 * - `default` (and any unrecognised value) chunks with LangChain.
 *
 * If a service throws and it is not the last one in the list, the error is
 * logged and the next service is tried. If every configured service was
 * skipped or failed without being last, LangChain is used as a final fallback.
 *
 * @param params - File to chunk; `fileType`, `filename` and `content` are read.
 * @returns The chunk result from the first service that succeeds.
 * @throws Whatever the last configured service throws when it fails.
 */
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
  const services = this.getChunkingServices(params.fileType);
  const lastIndex = services.length - 1;

  for (const [index, service] of services.entries()) {
    try {
      switch (service) {
        case 'unstructured': {
          if (this.canUseUnstructured()) {
            return await this.chunkByUnstructured(params.filename, params.content);
          }
          break;
        }

        case 'doc2x': {
          // Future implementation
          break;
        }

        default: {
          // Covers 'default' as well as any unexpected value.
          return await this.chunkByLangChain(params.filename, params.content);
        }
      }
    } catch (error) {
      // Compare by position, not by value: with a duplicated service name a
      // value comparison would treat an earlier attempt as the last one and
      // rethrow before the remaining services get a chance.
      if (index === lastIndex) throw error;
      console.error(`Chunking failed with service ${service}:`, error);
    }
  }

  // Fallback to langchain if no service succeeded
  return await this.chunkByLangChain(params.filename, params.content);
}

/**
 * Report whether the Unstructured service is configured.
 *
 * @returns `true` only when `USE_UNSTRUCTURED_FOR_PDF`, `UNSTRUCTURED_API_KEY`
 *   and `UNSTRUCTURED_SERVER_URL` on `knowledgeEnv` are all truthy.
 */
private canUseUnstructured(): boolean {
  // Guards are checked in the same order as the original && chain.
  if (!knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF) return false;
  if (!knowledgeEnv.UNSTRUCTURED_API_KEY) return false;
  if (!knowledgeEnv.UNSTRUCTURED_SERVER_URL) return false;
  return true;
}

isUsingUnstructured(params: ChunkContentParams) {
Expand All @@ -33,13 +82,6 @@ export class ContentChunk {
!!knowledgeEnv.UNSTRUCTURED_SERVER_URL;
}

/**
 * Chunk a file with Unstructured when `isUsingUnstructured(params)` is true,
 * otherwise with LangChain.
 *
 * @param params - File to chunk; `filename` and `content` are forwarded to the
 *   selected chunker.
 * @returns The chunk result produced by the selected chunker.
 */
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
  return this.isUsingUnstructured(params)
    ? await this.chunkByUnstructured(params.filename, params.content)
    : await this.chunkByLangChain(params.filename, params.content);
}

private chunkByUnstructured = async (
filename: string,
content: Uint8Array,
Expand Down
29 changes: 29 additions & 0 deletions src/server/modules/ContentChunk/rules.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/** Chunking backends that a file-type rule may reference. */
export type ChunkingService = 'unstructured' | 'doc2x' | 'default';

/** One parsed `fileType=service[,service...]` rule. */
interface FileTypeRule {
  fileType: string;
  services: ChunkingService[];
}

/**
 * Parses the file-type chunking rule string, e.g.
 * `pdf=unstructured,default;docx=doc2x`.
 *
 * Rules are separated by `;`, the file type and its services by `=`, and the
 * services by `,`. Matching is case-insensitive and whitespace around file
 * types and service names is ignored.
 */
export class ChunkingRuleParser {
  /** Service names accepted in a rule; anything else is dropped. */
  private static readonly knownServices: readonly string[] = ['unstructured', 'doc2x', 'default'];

  /**
   * Parse a rule string into a map from lower-cased file type to its ordered
   * list of services.
   *
   * Malformed rules (missing file type, missing `=`, or no recognised service)
   * are skipped. When the same file type appears more than once, the last rule
   * wins.
   *
   * @param rulesStr - Raw rule string; empty, `null` or `undefined` yields `{}`.
   * @returns Map of file type to services, in the order they were written.
   */
  static parse(rulesStr?: string | null): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};
    if (!rulesStr) return rules;

    // Split by semicolon for different file types
    for (const rawRule of rulesStr.split(';')) {
      const rule = ChunkingRuleParser.parseRule(rawRule);
      if (rule) rules[rule.fileType] = rule.services;
    }

    return rules;
  }

  /** Parse a single `fileType=services` segment; returns `undefined` when it is unusable. */
  private static parseRule(rawRule: string): FileTypeRule | undefined {
    const [rawType, rawServices] = rawRule.split('=');

    // Trim the key too, so "pdf=a; docx=b" registers "docx" rather than " docx".
    const fileType = rawType?.trim().toLowerCase();
    if (!fileType || !rawServices) return undefined;

    // Split services by comma, normalise, drop duplicates (keeping first
    // occurrence order) and keep only recognised service names.
    const normalized = rawServices.split(',').map((s) => s.trim().toLowerCase());
    const services = [...new Set(normalized)].filter((s): s is ChunkingService =>
      ChunkingRuleParser.knownServices.includes(s),
    );

    // A rule with no valid service carries no information; skip it.
    return services.length > 0 ? { fileType, services } : undefined;
  }
}

0 comments on commit c132ff3

Please sign in to comment.