1
1
import { ChunkingLoader } from 'src/libs/langchain' ;
2
2
import { Strategy } from 'unstructured-client/sdk/models/shared' ;
3
+ import { ChunkingRuleParser } from './rules' ;
3
4
4
5
import { NewChunkItem , NewUnstructuredChunkItem } from '@/database/schemas' ;
5
6
import { ChunkingStrategy , Unstructured } from '@/libs/unstructured' ;
@@ -20,10 +21,58 @@ interface ChunkResult {
20
21
export class ContentChunk {
21
22
private unstructuredClient : Unstructured ;
22
23
private langchainClient : ChunkingLoader ;
24
+ private chunkingRules : Record < string , ChunkingService [ ] > ;
23
25
24
26
/**
 * Builds the two chunking clients and loads the per-file-type chunking
 * rules by parsing `knowledgeEnv.FILE_TYPE_CHUNKING_RULES`.
 */
constructor() {
  // Client instances used by the chunkBy* methods.
  this.unstructuredClient = new Unstructured();
  this.langchainClient = new ChunkingLoader();

  // Rule table consulted by getChunkingServices (key -> ordered service list).
  const rawRules = knowledgeEnv.FILE_TYPE_CHUNKING_RULES;
  this.chunkingRules = ChunkingRuleParser.parse(rawRules);
}
31
+
32
/**
 * Resolve the ordered list of chunking services configured for a file type.
 *
 * The lookup key is the text after the last `/` in `fileType`, lower-cased
 * (presumably `fileType` is a MIME type such as `application/pdf` — confirm
 * against callers).
 *
 * @param fileType - file type string used to derive the rule key
 * @returns the configured services for that key, or `['default']` when no
 *   rule is configured
 */
private getChunkingServices(fileType: string): ChunkingService[] {
  const ext = fileType.split('/').pop()?.toLowerCase() ?? '';

  // Only honour keys that were actually configured. A bare `rules[ext]` lookup
  // also resolves inherited members such as `constructor`, which would hand
  // the caller a non-array value and break its iteration.
  const hasRule = Object.prototype.hasOwnProperty.call(this.chunkingRules, ext);
  const services = hasRule ? this.chunkingRules[ext] : undefined;

  return services ?? ['default'];
}
36
+
37
/**
 * Chunk a file by trying each service configured for its file type, in order.
 *
 * - `unstructured` is attempted only when `canUseUnstructured()` is true;
 *   otherwise it is skipped.
 * - `doc2x` is not implemented yet and is always skipped.
 * - `default` (and any unrecognised service name) chunks via LangChain.
 *
 * If a service throws and it is not the last configured entry, the error is
 * logged and the next service is tried. If every configured service was
 * skipped or failed without being last, LangChain is used as the fallback.
 *
 * @param params - file name, content and file type of the file to chunk
 * @returns the chunking result of the first service that succeeds
 * @throws the error raised by the last configured service, if it fails
 */
async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
  const services = this.getChunkingServices(params.fileType);

  for (let index = 0; index < services.length; index++) {
    const service = services[index];
    // Compare by position, not by value: when the same service name appears
    // more than once, a value comparison would treat the first occurrence as
    // "last" and rethrow before the remaining services are tried.
    const isLastService = index === services.length - 1;

    try {
      switch (service) {
        case 'unstructured': {
          if (this.canUseUnstructured()) {
            return await this.chunkByUnstructured(params.filename, params.content);
          }
          break;
        }

        case 'doc2x': {
          // Future implementation
          break;
        }

        case 'default':
        default: {
          return await this.chunkByLangChain(params.filename, params.content);
        }
      }
    } catch (error) {
      // Nothing left to try after the last entry, so surface the failure.
      if (isLastService) throw error;
      // Otherwise record the failure and move on to the next service.
      console.error(`Chunking failed with service ${service}:`, error);
    }
  }

  // Fallback to langchain if no service succeeded
  return await this.chunkByLangChain(params.filename, params.content);
}
69
+
70
/**
 * Reports whether the Unstructured service may be used.
 *
 * @returns `true` only when `USE_UNSTRUCTURED_FOR_PDF`, `UNSTRUCTURED_API_KEY`
 *   and `UNSTRUCTURED_SERVER_URL` on `knowledgeEnv` are all truthy
 */
private canUseUnstructured(): boolean {
  // Checked in the same order as a chained `&&`, stopping at the first falsy value.
  if (!knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF) return false;
  if (!knowledgeEnv.UNSTRUCTURED_API_KEY) return false;
  if (!knowledgeEnv.UNSTRUCTURED_SERVER_URL) return false;
  return true;
}
28
77
29
78
isUsingUnstructured ( params : ChunkContentParams ) {
@@ -33,13 +82,6 @@ export class ContentChunk {
33
82
! ! knowledgeEnv . UNSTRUCTURED_SERVER_URL ;
34
83
}
35
84
36
- async chunkContent ( params : ChunkContentParams ) : Promise < ChunkResult > {
37
- if ( this . isUsingUnstructured ( params ) )
38
- return await this . chunkByUnstructured ( params . filename , params . content ) ;
39
-
40
- return await this . chunkByLangChain ( params . filename , params . content ) ;
41
- }
42
-
43
85
private chunkByUnstructured = async (
44
86
filename : string ,
45
87
content : Uint8Array ,
0 commit comments