Skip to content

Commit c132ff3

Browse files
committed
feat: implement ChunkingRuleParser for file type and service mapping
1 parent d6301bb commit c132ff3

File tree

3 files changed

+79
-7
lines changed

3 files changed

+79
-7
lines changed

src/config/knowledge.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ export const knowledgeEnv = createEnv({
77
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
88
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
99
USE_UNSTRUCTURED_FOR_PDF: process.env.USE_UNSTRUCTURED_FOR_PDF,
10+
FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES || 'pdf=unstructured',
1011
},
1112
server: {
1213
DEFAULT_FILES_CONFIG: z.string().optional(),

src/server/modules/ContentChunk/index.ts

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { ChunkingLoader } from 'src/libs/langchain';
22
import { Strategy } from 'unstructured-client/sdk/models/shared';
3+
import { ChunkingRuleParser } from './rules';
34

45
import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
56
import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';
@@ -20,10 +21,58 @@ interface ChunkResult {
2021
export class ContentChunk {
2122
private unstructuredClient: Unstructured;
2223
private langchainClient: ChunkingLoader;
24+
private chunkingRules: Record<string, ChunkingService[]>;
2325

2426
/**
 * Create the two chunking clients, then build the file-type -> services
 * lookup table from the FILE_TYPE_CHUNKING_RULES value on `knowledgeEnv`.
 */
constructor() {
  this.unstructuredClient = new Unstructured();
  this.langchainClient = new ChunkingLoader();

  // Parsed once here; getChunkingServices() reads the resulting table.
  const rawRules = knowledgeEnv.FILE_TYPE_CHUNKING_RULES;
  this.chunkingRules = ChunkingRuleParser.parse(rawRules);
}
31+
32+
/**
 * Resolve the ordered list of chunking services configured for a file type.
 *
 * The lookup key is the text after the last `/` in `fileType`, lower-cased,
 * with any `;`-separated parameter suffix removed and surrounding whitespace
 * trimmed. NOTE(review): this presumes `fileType` is MIME-like
 * (e.g. `application/pdf`) - confirm against callers.
 *
 * @param fileType - File type string supplied by the caller.
 * @returns The configured services for that key, or `['default']` when no
 *   rule exists or the rule holds no services. Never an empty list.
 */
private getChunkingServices(fileType: string): ChunkingService[] {
  const subtype = fileType.split('/').pop() ?? '';
  // Rule keys can never contain ';' (the rule string is split on it), so
  // stripping a parameter suffix can only turn a miss into a hit.
  const ext = (subtype.split(';')[0] ?? '').trim().toLowerCase();

  // Own-property check: a bare `rules[ext]` would also return inherited
  // members such as `constructor`, which are truthy but not service lists.
  const configured = Object.prototype.hasOwnProperty.call(this.chunkingRules, ext)
    ? this.chunkingRules[ext]
    : undefined;

  // An empty list is truthy, so test its length explicitly before falling back.
  return configured && configured.length > 0 ? configured : ['default'];
}
36+
37+
/**
 * Chunk a file by trying each service configured for its file type, in order.
 *
 * - `unstructured` is used only when `canUseUnstructured()` is true;
 *   otherwise it is skipped.
 * - `doc2x` is not implemented yet and is always skipped.
 * - `default` (or any other value) uses the LangChain loader.
 *
 * An error from a service that is not the last one in the list is logged and
 * the next service is tried. An error from the last service is rethrown.
 * If every service was skipped without returning, LangChain is used.
 *
 * @param params - Chunking request; `fileType`, `filename` and `content` are read.
 * @returns The chunk result of the first service that completes.
 * @throws Whatever the last configured service (or the final LangChain
 *   fallback) throws.
 */
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
  const services = this.getChunkingServices(params.fileType);
  const lastIndex = services.length - 1;

  // Iterate with the index so "is this the last attempt?" is decided by
  // position. Comparing by value misfires when a service name is repeated.
  for (const [index, service] of services.entries()) {
    try {
      switch (service) {
        case 'unstructured': {
          if (this.canUseUnstructured()) {
            return await this.chunkByUnstructured(params.filename, params.content);
          }
          // Not configured: move on to the next service.
          break;
        }

        case 'doc2x': {
          // Future implementation
          break;
        }

        case 'default':
        default: {
          return await this.chunkByLangChain(params.filename, params.content);
        }
      }
    } catch (error) {
      // Nothing left to try: surface the failure to the caller.
      if (index === lastIndex) throw error;

      console.error(`Chunking failed with service ${service}:`, error);
    }
  }

  // Fallback to langchain if no service produced a result
  return await this.chunkByLangChain(params.filename, params.content);
}
69+
70+
/**
 * Report whether the Unstructured service may be used.
 *
 * @returns `true` only when USE_UNSTRUCTURED_FOR_PDF, UNSTRUCTURED_API_KEY
 *   and UNSTRUCTURED_SERVER_URL on `knowledgeEnv` are all truthy.
 */
private canUseUnstructured(): boolean {
  // Checked in the same order as a chained `&&`, stopping at the first falsy value.
  if (!knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF) return false;
  if (!knowledgeEnv.UNSTRUCTURED_API_KEY) return false;

  return Boolean(knowledgeEnv.UNSTRUCTURED_SERVER_URL);
}
2877

2978
isUsingUnstructured(params: ChunkContentParams) {
@@ -33,13 +82,6 @@ export class ContentChunk {
3382
!!knowledgeEnv.UNSTRUCTURED_SERVER_URL;
3483
}
3584

36-
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
37-
if (this.isUsingUnstructured(params))
38-
return await this.chunkByUnstructured(params.filename, params.content);
39-
40-
return await this.chunkByLangChain(params.filename, params.content);
41-
}
42-
4385
private chunkByUnstructured = async (
4486
filename: string,
4587
content: Uint8Array,
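src/server/modules/ContentChunk/rules.ts (new file; path inferred from the './rules' import in index.ts, header missing from the page extract)

Lines changed: 29 additions & 0 deletions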
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/** Every chunking service name that a rule may reference. */
const CHUNKING_SERVICES = ['unstructured', 'doc2x', 'default'] as const;

/**
 * Name of a chunking service. Derived from CHUNKING_SERVICES so the type and
 * the runtime validation list cannot drift apart.
 */
export type ChunkingService = (typeof CHUNKING_SERVICES)[number];

/** Type guard: true when `value` is one of the known service names. */
const isChunkingService = (value: string): value is ChunkingService =>
  (CHUNKING_SERVICES as readonly string[]).includes(value);

export class ChunkingRuleParser {
  /**
   * Parse a chunking rule string into a lookup table.
   *
   * Format: `fileType=service[,service...]` entries separated by `;`,
   * e.g. `pdf=unstructured,default;docx=doc2x`. File types and service names
   * are trimmed and lower-cased. Unknown service names are dropped. Entries
   * with no `=`, an empty file type, or an empty service part are ignored.
   * When a file type appears more than once, the last entry wins.
   *
   * @param rulesStr - Raw rule string; `undefined`, `null` and `''` yield `{}`.
   * @returns Map from file type to its services, in the order they were written.
   */
  static parse(rulesStr?: string | null): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};

    // Missing configuration means "no rules", not a crash on `.split`.
    if (!rulesStr) return rules;

    // Split by semicolon for different file types
    for (const rule of rulesStr.split(';')) {
      const [rawFileType = '', rawServices] = rule.split('=');

      // Trim the key too, so "pdf = unstructured" maps to "pdf", not "pdf ".
      const fileType = rawFileType.trim().toLowerCase();
      if (!fileType || !rawServices) continue;

      // Split services by comma and keep only the recognised names
      rules[fileType] = rawServices
        .split(',')
        .map((name) => name.trim().toLowerCase())
        .filter(isChunkingService);
    }

    return rules;
  }
}

0 commit comments

Comments
 (0)