From 8745ca570ef9563b30ea3fc20afe242e9676f415 Mon Sep 17 00:00:00 2001 From: Morax Date: Sun, 9 Feb 2025 12:42:53 +0800 Subject: [PATCH 01/11] =?UTF-8?q?=E2=9C=A8=20feat:=20Add=20configurable=20?= =?UTF-8?q?PDF=20processing=20method=20with=20Unstructured?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/server/modules/ContentChunk/index.ts | 10 +++++++++- src/server/utils/env.ts | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 src/server/utils/env.ts diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index ce61616968cb8..14684ae571f65 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -3,6 +3,11 @@ import { Strategy } from 'unstructured-client/sdk/models/shared'; import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; +import { getEnvironment } from '@/utils/env'; +import { knowledgeEnv } from '@/config/knowledge'; + +// Get environment variable to control PDF processing method +const USE_UNSTRUCTURED_FOR_PDF = getEnvironment('USE_UNSTRUCTURED_FOR_PDF') === 'true'; export interface ChunkContentParams { content: Uint8Array; @@ -26,7 +31,10 @@ export class ContentChunk { } isUsingUnstructured(params: ChunkContentParams) { - return params.fileType === 'application/pdf' && params.mode === 'hi-res'; + return params.fileType === 'application/pdf' && + USE_UNSTRUCTURED_FOR_PDF && + !!knowledgeEnv.UNSTRUCTURED_API_KEY && + !!knowledgeEnv.UNSTRUCTURED_SERVER_URL; } async chunkContent(params: ChunkContentParams): Promise { diff --git a/src/server/utils/env.ts b/src/server/utils/env.ts new file mode 100644 index 0000000000000..465b8e25028cc --- /dev/null +++ b/src/server/utils/env.ts @@ -0,0 +1,12 @@ +export const isDev = process.env.NODE_ENV === 'development'; + +export const isOnServerSide = typeof window === 'undefined'; +/** + * Get environment variable value + * @param key - Environment variable key + * @returns Environment variable value or empty string if not found + */ +export const getEnvironment = (key: string): string => { + if (typeof process === 'undefined') return ''; + return process.env[key] || ''; +}; From 06cbc842d2d1a46582c2851f45350e708031c07a Mon Sep 17 00:00:00 2001 From: Morax Date: Sun, 9 Feb 2025 13:13:43 +0800 Subject: [PATCH 02/11] =?UTF-8?q?=F0=9F=94=A7=20fix:=20Update=20import=20p?= =?UTF-8?q?ath=20for=20env=20utility=20in=20ContentChunk=20module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/server/modules/ContentChunk/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index 14684ae571f65..4972883cdfcc5 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -3,7 +3,7 @@ import { Strategy } from 'unstructured-client/sdk/models/shared'; import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; -import { getEnvironment } from '@/utils/env'; +import { getEnvironment } from '../../utils/env'; import { knowledgeEnv } from '@/config/knowledge'; // Get environment variable to control PDF processing method From 4de5511c3f427a1f3c4533095e1583caf170a10a Mon Sep 17 00:00:00 2001 From: Morax <100508620+fzlzjerry@users.noreply.github.com> Date: Thu, 13 Feb 2025 07:32:27 +0000 Subject: [PATCH 03/11] feat: add USE_UNSTRUCTURED_FOR_PDF environment variable to knowledge config --- src/config/knowledge.ts | 30 +++++++++++------------- src/server/modules/ContentChunk/index.ts | 6 +---- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src/config/knowledge.ts b/src/config/knowledge.ts index 02f79a5af19ec..71ea7011f3088 100644 --- a/src/config/knowledge.ts +++ b/src/config/knowledge.ts @@ -1,19 +1,17 @@ import { createEnv } from '@t3-oss/env-nextjs'; import { z } from 'zod'; -export const getKnowledgeConfig = () => { - return createEnv({ - runtimeEnv: { - DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG, - UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY, - UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL, - }, - server: { - DEFAULT_FILES_CONFIG: z.string().optional(), - UNSTRUCTURED_API_KEY: z.string().optional(), - UNSTRUCTURED_SERVER_URL: z.string().optional(), - }, - }); -}; - -export const knowledgeEnv = getKnowledgeConfig(); +export const knowledgeEnv = createEnv({ + runtimeEnv: { + DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG, + UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY, + UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL, + USE_UNSTRUCTURED_FOR_PDF: process.env.USE_UNSTRUCTURED_FOR_PDF, + }, + server: { + DEFAULT_FILES_CONFIG: z.string().optional(), + UNSTRUCTURED_API_KEY: z.string().optional(), + UNSTRUCTURED_SERVER_URL: z.string().optional(), + USE_UNSTRUCTURED_FOR_PDF: z.string().optional(), + }, +}); diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index 4972883cdfcc5..1c8d99a171ee5 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -3,12 +3,8 @@ import { Strategy } from 'unstructured-client/sdk/models/shared'; import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; -import { getEnvironment } from '../../utils/env'; import { knowledgeEnv } from '@/config/knowledge'; -// Get environment variable to control PDF processing method -const USE_UNSTRUCTURED_FOR_PDF = getEnvironment('USE_UNSTRUCTURED_FOR_PDF') === 'true'; - export interface ChunkContentParams { content: Uint8Array; fileType: string; @@ -32,7 +28,7 @@ export class ContentChunk { isUsingUnstructured(params: ChunkContentParams) { return params.fileType === 'application/pdf' && - USE_UNSTRUCTURED_FOR_PDF && + !!knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF && !!knowledgeEnv.UNSTRUCTURED_API_KEY && !!knowledgeEnv.UNSTRUCTURED_SERVER_URL; } From d6301bbe3831ac7648033080a047ccd70235ec29 Mon Sep 17 00:00:00 2001 From: Morax <100508620+fzlzjerry@users.noreply.github.com> Date: Thu, 13 Feb 2025 19:02:31 +0800 Subject: [PATCH 04/11] Delete src/server/utils/env.ts --- src/server/utils/env.ts | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 src/server/utils/env.ts diff --git a/src/server/utils/env.ts b/src/server/utils/env.ts deleted file mode 100644 index 465b8e25028cc..0000000000000 --- a/src/server/utils/env.ts +++ /dev/null @@ -1,12 +0,0 @@ -export const isDev = process.env.NODE_ENV === 'development'; - -export const isOnServerSide = typeof window === 'undefined'; -/** - * Get environment variable value - * @param key - Environment variable key - * @returns Environment variable value or empty string if not found - */ -export const getEnvironment = (key: string): string => { - if (typeof process === 'undefined') return ''; - return process.env[key] || ''; -}; From c132ff37cd6221fef3ab79359b3639f13dc9eec2 Mon Sep 17 00:00:00 2001 From: Morax <100508620+fzlzjerry@users.noreply.github.com> Date: Thu, 13 Feb 2025 13:24:47 +0000 Subject: [PATCH 05/11] feat: implement ChunkingRuleParser for file type and service mapping --- src/config/knowledge.ts | 1 + src/server/modules/ContentChunk/index.ts | 56 +++++++++++++++++++++--- src/server/modules/ContentChunk/rules.ts | 29 ++++++++++++ 3 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 src/server/modules/ContentChunk/rules.ts diff --git a/src/config/knowledge.ts b/src/config/knowledge.ts index 71ea7011f3088..be6a322ef4580 100644 --- a/src/config/knowledge.ts +++ b/src/config/knowledge.ts @@ -7,6 +7,7 @@ export const knowledgeEnv = createEnv({ UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY, UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL, USE_UNSTRUCTURED_FOR_PDF: process.env.USE_UNSTRUCTURED_FOR_PDF, + FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES || 'pdf=unstructured', }, server: { DEFAULT_FILES_CONFIG: z.string().optional(), diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index 1c8d99a171ee5..c66ffacf40a4f 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -1,5 +1,6 @@ import { ChunkingLoader } from 'src/libs/langchain'; import { Strategy } from 'unstructured-client/sdk/models/shared'; +import { ChunkingRuleParser } from './rules'; import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; @@ -20,10 +21,58 @@ interface ChunkResult { export class ContentChunk { private unstructuredClient: Unstructured; private langchainClient: ChunkingLoader; + private chunkingRules: Record; constructor() { this.unstructuredClient = new Unstructured(); this.langchainClient = new ChunkingLoader(); + this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES); + } + + private getChunkingServices(fileType: string): ChunkingService[] { + const ext = fileType.split('/').pop()?.toLowerCase() || ''; + return this.chunkingRules[ext] || ['default']; + } + + async chunkContent(params: ChunkContentParams): Promise { + const services = this.getChunkingServices(params.fileType); + + for (const service of services) { + try { + switch (service) { + case 'unstructured': + if (this.canUseUnstructured()) { + return await this.chunkByUnstructured(params.filename, params.content); + } + break; + + case 'doc2x': + // Future implementation + break; + + case 'default': + default: + return await this.chunkByLangChain(params.filename, params.content); + } + } catch (error) { + // If this is the last service, throw the error + if (service === services[services.length - 1]) throw error; + // Otherwise continue to next service + console.error(`Chunking failed with service ${service}:`, error); + continue; + } + } + + // Fallback to langchain if no service succeeded + return await this.chunkByLangChain(params.filename, params.content); + } + + private canUseUnstructured(): boolean { + return !!( + knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF && + knowledgeEnv.UNSTRUCTURED_API_KEY && + knowledgeEnv.UNSTRUCTURED_SERVER_URL + ); } isUsingUnstructured(params: ChunkContentParams) { @@ -33,13 +82,6 @@ export class ContentChunk { !!knowledgeEnv.UNSTRUCTURED_SERVER_URL; } - async chunkContent(params: ChunkContentParams): Promise { - if (this.isUsingUnstructured(params)) - return await this.chunkByUnstructured(params.filename, params.content); - - return await this.chunkByLangChain(params.filename, params.content); - } - private chunkByUnstructured = async ( filename: string, content: Uint8Array, diff --git a/src/server/modules/ContentChunk/rules.ts b/src/server/modules/ContentChunk/rules.ts new file mode 100644 index 0000000000000..d8faf8cb05ebe --- /dev/null +++ b/src/server/modules/ContentChunk/rules.ts @@ -0,0 +1,29 @@ +type ChunkingService = 'unstructured' | 'doc2x' | 'default'; + +interface FileTypeRule { + fileType: string; + services: ChunkingService[]; +} + +export class ChunkingRuleParser { + static parse(rulesStr: string): Record { + const rules: Record = {}; + + // Split by semicolon for different file types + const fileTypeRules = rulesStr.split(';'); + + for (const rule of fileTypeRules) { + const [fileType, services] = rule.split('='); + if (!fileType || !services) continue; + + // Split services by comma and validate each service + rules[fileType.toLowerCase()] = services + .split(',') + .map(s => s.trim().toLowerCase()) + .filter((s): s is ChunkingService => + ['unstructured', 'doc2x', 'default'].includes(s)); + } + + return rules; + } +} From b82f9fe878efddd9035e4362eff9de91e9ab1609 Mon Sep 17 00:00:00 2001 From: Morax <100508620+fzlzjerry@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:23:54 +0000 Subject: [PATCH 06/11] refactor: remove USE_UNSTRUCTURED_FOR_PDF from knowledge environment configuration --- src/config/knowledge.ts | 5 ++--- src/server/modules/ContentChunk/index.ts | 5 +---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/config/knowledge.ts b/src/config/knowledge.ts index be6a322ef4580..6245ed25c80e1 100644 --- a/src/config/knowledge.ts +++ b/src/config/knowledge.ts @@ -6,13 +6,12 @@ export const knowledgeEnv = createEnv({ DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG, UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY, UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL, - USE_UNSTRUCTURED_FOR_PDF: process.env.USE_UNSTRUCTURED_FOR_PDF, - FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES || 'pdf=unstructured', + FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES, }, server: { DEFAULT_FILES_CONFIG: z.string().optional(), UNSTRUCTURED_API_KEY: z.string().optional(), UNSTRUCTURED_SERVER_URL: z.string().optional(), - USE_UNSTRUCTURED_FOR_PDF: z.string().optional(), + FILE_TYPE_CHUNKING_RULES: z.string().optional(), }, }); diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index c66ffacf40a4f..c2243c55ab2f4 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -69,7 +69,6 @@ export class ContentChunk { private canUseUnstructured(): boolean { return !!( - knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF && knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL ); @@ -77,9 +76,7 @@ export class ContentChunk { isUsingUnstructured(params: ChunkContentParams) { return params.fileType === 'application/pdf' && - !!knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF && - !!knowledgeEnv.UNSTRUCTURED_API_KEY && - !!knowledgeEnv.UNSTRUCTURED_SERVER_URL; + this.canUseUnstructured(); } private chunkByUnstructured = async ( From 32771f580e0fbf358e21f957b12ad870ccfe214f Mon Sep 17 00:00:00 2001 From: Morax <100508620+fzlzjerry@users.noreply.github.com> Date: Sat, 15 Feb 2025 08:07:35 +0000 Subject: [PATCH 07/11] test: add unit tests for ChunkingRuleParser functionality --- src/server/modules/ContentChunk/rules.test.ts | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 src/server/modules/ContentChunk/rules.test.ts diff --git a/src/server/modules/ContentChunk/rules.test.ts b/src/server/modules/ContentChunk/rules.test.ts new file mode 100644 index 0000000000000..19ec59c793b1b --- /dev/null +++ b/src/server/modules/ContentChunk/rules.test.ts @@ -0,0 +1,81 @@ +import { describe, expect, it } from 'vitest'; +import { ChunkingRuleParser } from './rules'; + +describe('ChunkingRuleParser', () => { + describe('parse', () => { + it('should parse a single file type rule correctly', () => { + const input = 'pdf=unstructured,default'; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({ + pdf: ['unstructured', 'default'], + }); + }); + + it('should parse multiple file type rules correctly', () => { + const input = 'pdf=unstructured,default;doc=doc2x,default;txt=default'; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({ + pdf: ['unstructured', 'default'], + doc: ['doc2x', 'default'], + txt: ['default'], + }); + }); + + it('should convert file types to lowercase', () => { + const input = 'PDF=unstructured;DOC=doc2x'; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({ + pdf: ['unstructured'], + doc: ['doc2x'], + }); + }); + + it('should filter out invalid service names', () => { + const input = 'pdf=unstructured,invalid,default,wrongservice'; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({ + pdf: ['unstructured', 'default'], + }); + }); + + it('should handle empty string input', () => { + const input = ''; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({}); + }); + + it('should skip invalid rule formats', () => { + const input = 'pdf=unstructured;invalid;doc=doc2x;=default;txt'; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({ + pdf: ['unstructured'], + doc: ['doc2x'], + }); + }); + + it('should handle whitespace in service names', () => { + const input = 'pdf= unstructured , default ;doc=doc2x'; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({ + pdf: ['unstructured', 'default'], + doc: ['doc2x'], + }); + }); + + it('should handle duplicate services for same file type', () => { + const input = 'pdf=unstructured,default,unstructured'; + const result = ChunkingRuleParser.parse(input); + + expect(result).toEqual({ + pdf: ['unstructured', 'default', 'unstructured'], + }); + }); + }); +}); From e878342892616475a7a9955aaf2eecc5f993185b Mon Sep 17 00:00:00 2001 From: Morax <100508620+fzlzjerry@users.noreply.github.com> Date: Sat, 15 Feb 2025 12:42:39 +0000 Subject: [PATCH 08/11] refactor: remove isUsingUnstructured method from ContentChunk class --- src/server/modules/ContentChunk/index.ts | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index c2243c55ab2f4..228ca77305074 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -74,11 +74,6 @@ export class ContentChunk { ); } - isUsingUnstructured(params: ChunkContentParams) { - return params.fileType === 'application/pdf' && - this.canUseUnstructured(); - } - private chunkByUnstructured = async ( filename: string, content: Uint8Array, From 044c33d3e55a1f0e4409ce3f46ba8e626f50857d Mon Sep 17 00:00:00 2001 From: Morax <100508620+fzlzjerry@users.noreply.github.com> Date: Sat, 15 Feb 2025 13:49:32 +0000 Subject: [PATCH 09/11] refactor: update ChunkingService type and clean up ContentChunk rules --- src/server/modules/ContentChunk/index.ts | 5 ++--- src/server/modules/ContentChunk/rules.ts | 7 +------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index 228ca77305074..3a52403ffa045 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -1,6 +1,6 @@ import { ChunkingLoader } from 'src/libs/langchain'; import { Strategy } from 'unstructured-client/sdk/models/shared'; -import { ChunkingRuleParser } from './rules'; +import { ChunkingRuleParser, ChunkingService } from './rules'; import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; @@ -26,7 +26,7 @@ export class ContentChunk { constructor() { this.unstructuredClient = new Unstructured(); this.langchainClient = new ChunkingLoader(); - this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES); + this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES || ''); } private getChunkingServices(fileType: string): ChunkingService[] { @@ -50,7 +50,6 @@ export class ContentChunk { // Future implementation break; - case 'default': default: return await this.chunkByLangChain(params.filename, params.content); } diff --git a/src/server/modules/ContentChunk/rules.ts b/src/server/modules/ContentChunk/rules.ts index d8faf8cb05ebe..7054fd8f436e6 100644 --- a/src/server/modules/ContentChunk/rules.ts +++ b/src/server/modules/ContentChunk/rules.ts @@ -1,9 +1,4 @@ -type ChunkingService = 'unstructured' | 'doc2x' | 'default'; - -interface FileTypeRule { - fileType: string; - services: ChunkingService[]; -} +export type ChunkingService = 'unstructured' | 'doc2x' | 'default'; export class ChunkingRuleParser { static parse(rulesStr: string): Record { From 4df73f7af0ab908b6cdb44177e58a2864d5270bc Mon Sep 17 00:00:00 2001 From: Morax Date: Sat, 15 Feb 2025 23:11:04 +0800 Subject: [PATCH 10/11] refactor: simplify ChunkingRuleParser and update ContentChunk module --- src/config/knowledge.ts | 4 ++-- src/server/modules/ContentChunk/index.ts | 24 ++++++++++++------------ src/server/modules/ContentChunk/rules.ts | 13 ++++++------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/config/knowledge.ts b/src/config/knowledge.ts index 6245ed25c80e1..50e5004d28efb 100644 --- a/src/config/knowledge.ts +++ b/src/config/knowledge.ts @@ -4,14 +4,14 @@ import { z } from 'zod'; export const knowledgeEnv = createEnv({ runtimeEnv: { DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG, + FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES, UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY, UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL, - FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES, }, server: { DEFAULT_FILES_CONFIG: z.string().optional(), + FILE_TYPE_CHUNKING_RULES: z.string().optional(), UNSTRUCTURED_API_KEY: z.string().optional(), UNSTRUCTURED_SERVER_URL: z.string().optional(), - FILE_TYPE_CHUNKING_RULES: z.string().optional(), }, }); diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index 3a52403ffa045..6a7ca0f6e866a 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -1,10 +1,11 @@ import { ChunkingLoader } from 'src/libs/langchain'; import { Strategy } from 'unstructured-client/sdk/models/shared'; -import { ChunkingRuleParser, ChunkingService } from './rules'; -import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; -import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; import { knowledgeEnv } from '@/config/knowledge'; +import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; +import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; + +import { ChunkingRuleParser, ChunkingService } from './rules'; export interface ChunkContentParams { content: Uint8Array; @@ -40,25 +41,27 @@ export class ContentChunk { for (const service of services) { try { switch (service) { - case 'unstructured': + case 'unstructured': { if (this.canUseUnstructured()) { return await this.chunkByUnstructured(params.filename, params.content); } break; + } - case 'doc2x': + case 'doc2x': { // Future implementation break; + } - default: + default: { return await this.chunkByLangChain(params.filename, params.content); + } } } catch (error) { // If this is the last service, throw the error - if (service === services[services.length - 1]) throw error; + if (service === services.at(-1)) throw error; // Otherwise continue to next service console.error(`Chunking failed with service ${service}:`, error); - continue; } } @@ -67,10 +70,7 @@ export class ContentChunk { } private canUseUnstructured(): boolean { - return !!( - knowledgeEnv.UNSTRUCTURED_API_KEY && - knowledgeEnv.UNSTRUCTURED_SERVER_URL - ); + return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL); } private chunkByUnstructured = async ( diff --git a/src/server/modules/ContentChunk/rules.ts b/src/server/modules/ContentChunk/rules.ts index 7054fd8f436e6..aa0c0a704b4f6 100644 --- a/src/server/modules/ContentChunk/rules.ts +++ b/src/server/modules/ContentChunk/rules.ts @@ -1,7 +1,7 @@ export type ChunkingService = 'unstructured' | 'doc2x' | 'default'; -export class ChunkingRuleParser { - static parse(rulesStr: string): Record { +export const ChunkingRuleParser = { + parse(rulesStr: string): Record { const rules: Record = {}; // Split by semicolon for different file types @@ -14,11 +14,10 @@ export class ChunkingRuleParser { // Split services by comma and validate each service rules[fileType.toLowerCase()] = services .split(',') - .map(s => s.trim().toLowerCase()) - .filter((s): s is ChunkingService => - ['unstructured', 'doc2x', 'default'].includes(s)); + .map((s) => s.trim().toLowerCase()) + .filter((s): s is ChunkingService => ['unstructured', 'doc2x', 'default'].includes(s)); } return rules; - } -} + }, +} as const; From 9b4038a8724d2237d0ef787e03942b551debe511 Mon Sep 17 00:00:00 2001 From: Morax Date: Sat, 15 Feb 2025 23:11:39 +0800 Subject: [PATCH 11/11] refactor: update ContentChunk module import for ChunkingService --- src/server/modules/ContentChunk/index.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index 6a7ca0f6e866a..79ec0bdd14dfe 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -5,7 +5,8 @@ import { knowledgeEnv } from '@/config/knowledge'; import type { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; -import { ChunkingRuleParser, ChunkingService } from './rules'; +import { ChunkingRuleParser } from './rules'; +import type { ChunkingService } from './rules'; export interface ChunkContentParams { content: Uint8Array;