From 8745ca570ef9563b30ea3fc20afe242e9676f415 Mon Sep 17 00:00:00 2001 From: Morax Date: Sun, 9 Feb 2025 12:42:53 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9C=A8=20feat:=20Add=20configurable=20PD?= =?UTF-8?q?F=20processing=20method=20with=20Unstructured?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/server/modules/ContentChunk/index.ts | 10 +++++++++- src/server/utils/env.ts | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 src/server/utils/env.ts diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index ce61616968cb8..14684ae571f65 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -3,6 +3,11 @@ import { Strategy } from 'unstructured-client/sdk/models/shared'; import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; +import { getEnvironment } from '@/utils/env'; +import { knowledgeEnv } from '@/config/knowledge'; + +// Get environment variable to control PDF processing method +const USE_UNSTRUCTURED_FOR_PDF = getEnvironment('USE_UNSTRUCTURED_FOR_PDF') === 'true'; export interface ChunkContentParams { content: Uint8Array; @@ -26,7 +31,10 @@ export class ContentChunk { } isUsingUnstructured(params: ChunkContentParams) { - return params.fileType === 'application/pdf' && params.mode === 'hi-res'; + return params.fileType === 'application/pdf' && + USE_UNSTRUCTURED_FOR_PDF && + !!knowledgeEnv.UNSTRUCTURED_API_KEY && + !!knowledgeEnv.UNSTRUCTURED_SERVER_URL; } async chunkContent(params: ChunkContentParams): Promise { diff --git a/src/server/utils/env.ts b/src/server/utils/env.ts new file mode 100644 index 0000000000000..465b8e25028cc --- /dev/null +++ b/src/server/utils/env.ts @@ -0,0 +1,12 @@ +export const isDev = process.env.NODE_ENV === 'development'; + +export const isOnServerSide = typeof window === 'undefined'; +/** + * Get environment variable value + * @param key - Environment variable key + * @returns Environment variable value or empty string if not found + */ +export const getEnvironment = (key: string): string => { + if (typeof process === 'undefined') return ''; + return process.env[key] || ''; +}; From 06cbc842d2d1a46582c2851f45350e708031c07a Mon Sep 17 00:00:00 2001 From: Morax Date: Sun, 9 Feb 2025 13:13:43 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=94=A7=20fix:=20Update=20import=20pat?= =?UTF-8?q?h=20for=20env=20utility=20in=20ContentChunk=20module?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/server/modules/ContentChunk/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/modules/ContentChunk/index.ts b/src/server/modules/ContentChunk/index.ts index 14684ae571f65..4972883cdfcc5 100644 --- a/src/server/modules/ContentChunk/index.ts +++ b/src/server/modules/ContentChunk/index.ts @@ -3,7 +3,7 @@ import { Strategy } from 'unstructured-client/sdk/models/shared'; import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas'; import { ChunkingStrategy, Unstructured } from '@/libs/unstructured'; -import { getEnvironment } from '@/utils/env'; +import { getEnvironment } from '../../utils/env'; import { knowledgeEnv } from '@/config/knowledge'; // Get environment variable to control PDF processing method From a206191586e5f446da80070a3e895183bddad401 Mon Sep 17 00:00:00 2001 From: "gru-agent[bot]" <185149714+gru-agent[bot]@users.noreply.github.com> Date: Sun, 9 Feb 2025 06:18:02 +0000 Subject: [PATCH 3/3] Add unit tests for environment utility functions in env.test.ts --- src/server/utils/env.test.ts | 82 ++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/server/utils/env.test.ts diff --git a/src/server/utils/env.test.ts b/src/server/utils/env.test.ts new file mode 100644 index 0000000000000..a58fccf56aae7 --- /dev/null +++ b/src/server/utils/env.test.ts @@ -0,0 +1,82 @@ +import { describe, expect, it, vi } from 'vitest'; + +import { getEnvironment, isDev, isOnServerSide } from './env'; + +describe('env utils', () => { + describe('getEnvironment', () => { + it('should return empty string if process is undefined', () => { + const originalProcess = global.process; + // @ts-ignore + global.process = undefined; + + expect(getEnvironment('TEST_KEY')).toBe(''); + + global.process = originalProcess; + }); + + it('should return empty string if env var not found', () => { + expect(getEnvironment('NON_EXISTENT_KEY')).toBe(''); + }); + + it('should return env var value if exists', () => { + const originalEnv = process.env.TEST_KEY; + process.env.TEST_KEY = 'test-value'; + expect(getEnvironment('TEST_KEY')).toBe('test-value'); + process.env.TEST_KEY = originalEnv; + }); + }); + + describe('isDev', () => { + const originalNodeEnv = process.env.NODE_ENV; + + beforeEach(() => { + vi.resetModules(); + }); + + afterEach(() => { + // @ts-ignore + process.env.NODE_ENV = originalNodeEnv; + }); + + it('should be true in development environment', async () => { + // @ts-ignore + process.env.NODE_ENV = 'development'; + const { isDev } = await import('./env'); + expect(isDev).toBe(true); + }); + + it('should be false in production environment', async () => { + // @ts-ignore + process.env.NODE_ENV = 'production'; + const { isDev } = await import('./env'); + expect(isDev).toBe(false); + }); + }); + + describe('isOnServerSide', () => { + const originalWindow = global.window; + + beforeEach(() => { + vi.resetModules(); + }); + + afterEach(() => { + // @ts-ignore + global.window = originalWindow; + }); + + it('should be true when window is undefined', async () => { + // @ts-ignore + delete global.window; + const { isOnServerSide } = await import('./env'); + expect(isOnServerSide).toBe(true); + }); + + it('should be false when window is defined', async () => { + // @ts-ignore + global.window = {}; + const { isOnServerSide } = await import('./env'); + expect(isOnServerSide).toBe(false); + }); + }); +});