Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ feat: Add configurable PDF processing method with Unstructured #5927

Merged
merged 20 commits into from
Feb 15, 2025
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
8745ca5
✨ feat: Add configurable PDF processing method with Unstructured
fzlzjerry Feb 9, 2025
06cbc84
🔧 fix: Update import path for env utility in ContentChunk module
fzlzjerry Feb 9, 2025
76bdec6
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 9, 2025
d868e7b
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 9, 2025
f9b7751
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 10, 2025
c67fb33
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 10, 2025
e8f0b03
Merge branch 'lobehub:main' into fix/unstructured_io
fzlzjerry Feb 13, 2025
4de5511
feat: add USE_UNSTRUCTURED_FOR_PDF environment variable to knowledge …
fzlzjerry Feb 13, 2025
79a6da5
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 13, 2025
d6301bb
Delete src/server/utils/env.ts
fzlzjerry Feb 13, 2025
c132ff3
feat: implement ChunkingRuleParser for file type and service mapping
fzlzjerry Feb 13, 2025
6c3f3f4
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 14, 2025
b82f9fe
refactor: remove USE_UNSTRUCTURED_FOR_PDF from knowledge environment …
fzlzjerry Feb 14, 2025
32771f5
test: add unit tests for ChunkingRuleParser functionality
fzlzjerry Feb 15, 2025
ef63c8a
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 15, 2025
e878342
refactor: remove isUsingUnstructured method from ContentChunk class
fzlzjerry Feb 15, 2025
bd7d700
Merge branch 'main' into fix/unstructured_io
fzlzjerry Feb 15, 2025
044c33d
refactor: update ChunkingService type and clean up ContentChunk rules
fzlzjerry Feb 15, 2025
4df73f7
refactor: simplify ChunkingRuleParser and update ContentChunk module
fzlzjerry Feb 15, 2025
9b4038a
refactor: update ContentChunk module import for ChunkingService
fzlzjerry Feb 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 15 additions & 16 deletions src/config/knowledge.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
import { createEnv } from '@t3-oss/env-nextjs';
import { z } from 'zod';

export const getKnowledgeConfig = () => {
return createEnv({
runtimeEnv: {
DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
},
server: {
DEFAULT_FILES_CONFIG: z.string().optional(),
UNSTRUCTURED_API_KEY: z.string().optional(),
UNSTRUCTURED_SERVER_URL: z.string().optional(),
},
});
};

export const knowledgeEnv = getKnowledgeConfig();
export const knowledgeEnv = createEnv({
runtimeEnv: {
DEFAULT_FILES_CONFIG: process.env.DEFAULT_FILES_CONFIG,
UNSTRUCTURED_API_KEY: process.env.UNSTRUCTURED_API_KEY,
UNSTRUCTURED_SERVER_URL: process.env.UNSTRUCTURED_SERVER_URL,
USE_UNSTRUCTURED_FOR_PDF: process.env.USE_UNSTRUCTURED_FOR_PDF,
FILE_TYPE_CHUNKING_RULES: process.env.FILE_TYPE_CHUNKING_RULES || 'pdf=unstructured',
arvinxx marked this conversation as resolved.
Show resolved Hide resolved
},
server: {
DEFAULT_FILES_CONFIG: z.string().optional(),
UNSTRUCTURED_API_KEY: z.string().optional(),
UNSTRUCTURED_SERVER_URL: z.string().optional(),
USE_UNSTRUCTURED_FOR_PDF: z.string().optional(),
},
});
56 changes: 51 additions & 5 deletions src/server/modules/ContentChunk/index.ts
arvinxx marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import { ChunkingLoader } from 'src/libs/langchain';
import { Strategy } from 'unstructured-client/sdk/models/shared';
import { ChunkingRuleParser } from './rules';

import { NewChunkItem, NewUnstructuredChunkItem } from '@/database/schemas';
import { ChunkingStrategy, Unstructured } from '@/libs/unstructured';
import { knowledgeEnv } from '@/config/knowledge';

export interface ChunkContentParams {
content: Uint8Array;
Expand All @@ -19,23 +21,67 @@ interface ChunkResult {
export class ContentChunk {
private unstructuredClient: Unstructured;
private langchainClient: ChunkingLoader;
private chunkingRules: Record<string, ChunkingService[]>;

constructor() {
this.unstructuredClient = new Unstructured();
this.langchainClient = new ChunkingLoader();
this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES);
}

isUsingUnstructured(params: ChunkContentParams) {
return params.fileType === 'application/pdf' && params.mode === 'hi-res';
private getChunkingServices(fileType: string): ChunkingService[] {
const ext = fileType.split('/').pop()?.toLowerCase() || '';
return this.chunkingRules[ext] || ['default'];
}

async chunkContent(params: ChunkContentParams): Promise<ChunkResult> {
if (this.isUsingUnstructured(params))
return await this.chunkByUnstructured(params.filename, params.content);

const services = this.getChunkingServices(params.fileType);

for (const service of services) {
try {
switch (service) {
case 'unstructured':
if (this.canUseUnstructured()) {
return await this.chunkByUnstructured(params.filename, params.content);
}
break;

case 'doc2x':
// Future implementation
break;

case 'default':
default:
return await this.chunkByLangChain(params.filename, params.content);
}
} catch (error) {
// If this is the last service, throw the error
if (service === services[services.length - 1]) throw error;
// Otherwise continue to next service
console.error(`Chunking failed with service ${service}:`, error);
continue;
}
}

// Fallback to langchain if no service succeeded
return await this.chunkByLangChain(params.filename, params.content);
}

private canUseUnstructured(): boolean {
return !!(
knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF &&
arvinxx marked this conversation as resolved.
Show resolved Hide resolved
knowledgeEnv.UNSTRUCTURED_API_KEY &&
knowledgeEnv.UNSTRUCTURED_SERVER_URL
);
}

isUsingUnstructured(params: ChunkContentParams) {
arvinxx marked this conversation as resolved.
Show resolved Hide resolved
return params.fileType === 'application/pdf' &&
!!knowledgeEnv.USE_UNSTRUCTURED_FOR_PDF &&
!!knowledgeEnv.UNSTRUCTURED_API_KEY &&
!!knowledgeEnv.UNSTRUCTURED_SERVER_URL;
}

private chunkByUnstructured = async (
filename: string,
content: Uint8Array,
Expand Down
29 changes: 29 additions & 0 deletions src/server/modules/ContentChunk/rules.ts
arvinxx marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/** Name of a chunking backend that a rule may reference. */
type ChunkingService = 'unstructured' | 'doc2x' | 'default';

/** Shape of a single rule: one file type and the services listed for it. */
interface FileTypeRule {
  fileType: string;
  services: ChunkingService[];
}

/**
 * Parses a chunking-rule string into a lookup table keyed by file type.
 *
 * Rule syntax: `fileType=service[,service...]`, with several rules separated
 * by `;` — for example `pdf=unstructured,default;docx=doc2x`.
 */
export class ChunkingRuleParser {
  /** Every service name accepted in a rule; anything else is dropped. */
  private static readonly KNOWN_SERVICES: readonly string[] = ['unstructured', 'doc2x', 'default'];

  /** Type guard: true when `value` is one of the known service names. */
  private static isChunkingService(value: string): value is ChunkingService {
    return ChunkingRuleParser.KNOWN_SERVICES.includes(value);
  }

  /**
   * Converts a rule string into a map of file type -> list of services.
   *
   * File types and service names are trimmed and lowercased, so
   * `" PDF = Unstructured "` is equivalent to `"pdf=unstructured"`.
   * Services keep the order in which they were written. Unknown service
   * names are removed; a rule whose services are all unknown still yields an
   * entry with an empty array. Rules with no `=`, an empty file type, or an
   * empty service part are ignored. When the same file type appears more than
   * once, the last rule wins.
   *
   * @param rulesStr - Raw rule string; empty, `undefined` or `null` produce
   *   an empty map instead of throwing.
   * @returns A record mapping each normalized file type to its services.
   */
  static parse(rulesStr?: string | null): Record<string, ChunkingService[]> {
    const rules: Record<string, ChunkingService[]> = {};

    // Nothing configured (e.g. the environment variable is unset).
    if (!rulesStr) return rules;

    // Each `;`-separated segment describes one file type.
    for (const rule of rulesStr.split(';')) {
      const [rawFileType, rawServices] = rule.split('=');

      // Trim before lowercasing so whitespace around `;` or `=` cannot end up
      // inside the key and make the rule unreachable by lookup.
      const fileType = rawFileType?.trim().toLowerCase();
      if (!fileType || !rawServices) continue;

      rules[fileType] = rawServices
        .split(',')
        .map((service) => service.trim().toLowerCase())
        .filter(ChunkingRuleParser.isChunkingService);
    }

    return rules;
  }
}