Skip to content

Commit eed122c

Browse files
committed
feature: expose custom html based splitter
1 parent 05e4099 commit eed122c

File tree

8 files changed

+341
-5
lines changed

8 files changed

+341
-5
lines changed

clients/ts-sdk/openapi.json

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,6 +1420,49 @@
14201420
]
14211421
}
14221422
},
1423+
"/api/chunk/split": {
1424+
"post": {
1425+
"tags": [
1426+
"Chunk"
1427+
],
1428+
"summary": "Split HTML Content into Chunks",
1429+
"description": "This endpoint receives a single html string and splits it into chunks based on the headings and\nbody content. The headings are split based on heading html tags. chunk_html has a maximum size\nof 256Kb.",
1430+
"operationId": "split_html_content",
1431+
"requestBody": {
1432+
"description": "JSON request payload containing the HTML content to split into chunks",
1433+
"content": {
1434+
"application/json": {
1435+
"schema": {
1436+
"$ref": "#/components/schemas/ChunkHtmlContentReqPayload"
1437+
}
1438+
}
1439+
},
1440+
"required": true
1441+
},
1442+
"responses": {
1443+
"200": {
1444+
"description": "This will be a JSON response of the chunks split from the HTML content with the headings and body",
1445+
"content": {
1446+
"application/json": {
1447+
"schema": {
1448+
"$ref": "#/components/schemas/SplitHtmlResponse"
1449+
}
1450+
}
1451+
}
1452+
},
1453+
"413": {
1454+
"description": "Payload too large, if the HTML content is greater than 256Kb",
1455+
"content": {
1456+
"application/json": {
1457+
"schema": {
1458+
"$ref": "#/components/schemas/ErrorResponseBody"
1459+
}
1460+
}
1461+
}
1462+
}
1463+
}
1464+
}
1465+
},
14231466
"/api/chunk/suggestions": {
14241467
"post": {
14251468
"tags": [
@@ -6994,6 +7037,46 @@
69947037
}
69957038
]
69967039
},
7040+
"ChunkHtmlContentReqPayload": {
7041+
"type": "object",
7042+
"required": [
7043+
"chunk_html"
7044+
],
7045+
"properties": {
7046+
"body_remove_strings": {
7047+
"type": "array",
7048+
"items": {
7049+
"type": "string"
7050+
},
7051+
"description": "Text strings to remove from body when creating chunks for each page",
7052+
"nullable": true
7053+
},
7054+
"chunk_html": {
7055+
"type": "string",
7056+
"description": "The HTML content to be split into chunks"
7057+
},
7058+
"heading_remove_strings": {
7059+
"type": "array",
7060+
"items": {
7061+
"type": "string"
7062+
},
7063+
"description": "Text strings to remove from headings when creating chunks for each page",
7064+
"nullable": true
7065+
}
7066+
},
7067+
"example": {
7068+
"body_remove_strings": [
7069+
"Warning:",
7070+
"Note:"
7071+
],
7072+
"chunk_html": "",
7073+
"heading_remove_strings": [
7074+
"###",
7075+
"##",
7076+
"#"
7077+
]
7078+
}
7079+
},
69977080
"ChunkMetadata": {
69987081
"type": "object",
69997082
"title": "V2",
@@ -7487,6 +7570,34 @@
74877570
}
74887571
}
74897572
},
7573+
"ChunkedContent": {
7574+
"type": "object",
7575+
"required": [
7576+
"headings",
7577+
"body"
7578+
],
7579+
"properties": {
7580+
"body": {
7581+
"type": "string",
7582+
"description": "The body of the content"
7583+
},
7584+
"headings": {
7585+
"type": "array",
7586+
"items": {
7587+
"type": "string"
7588+
},
7589+
"description": "The headings of the content in order of when they appear"
7590+
}
7591+
},
7592+
"example": {
7593+
"body": "This is the body of the content",
7594+
"headings": [
7595+
"Title Heading",
7596+
"Sub Heading 1",
7597+
"Last SubHeading"
7598+
]
7599+
}
7600+
},
74907601
"ClickhouseRagTypes": {
74917602
"type": "string",
74927603
"enum": [
@@ -14388,6 +14499,40 @@
1438814499
"asc"
1438914500
]
1439014501
},
14502+
"SplitHtmlResponse": {
14503+
"type": "object",
14504+
"required": [
14505+
"chunks"
14506+
],
14507+
"properties": {
14508+
"chunks": {
14509+
"type": "array",
14510+
"items": {
14511+
"$ref": "#/components/schemas/ChunkedContent"
14512+
}
14513+
}
14514+
},
14515+
"example": {
14516+
"chunks": [
14517+
{
14518+
"body": "This is the body of the content",
14519+
"headings": [
14520+
"Title Heading",
14521+
"Sub Heading 1",
14522+
"Sub Sub Heading 1"
14523+
]
14524+
},
14525+
{
14526+
"body": "This is the body of the content",
14527+
"headings": [
14528+
"Title Heading",
14529+
"Sub Heading 1",
14530+
"Sub Sub Heading 2"
14531+
]
14532+
}
14533+
]
14534+
}
14535+
},
1439114536
"StripeInvoice": {
1439214537
"type": "object",
1439314538
"required": [

clients/ts-sdk/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"files": [
77
"dist"
88
],
9-
"version": "0.0.26",
9+
"version": "0.0.27",
1010
"license": "MIT",
1111
"scripts": {
1212
"lint": "eslint 'src/**/*.ts'",

clients/ts-sdk/src/functions/chunks/index.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import {
88
AutocompleteReqPayload,
9+
ChunkHtmlContentReqPayload,
910
CountChunksReqPayload,
1011
CreateChunkReqPayloadEnum,
1112
DeleteChunkByTrackingIdData,
@@ -584,3 +585,30 @@ export async function getChunksByTrackingIds(
584585
signal
585586
);
586587
}
588+
589+
/**
590+
* Function that splits an html string into chunks.
591+
* The html string will be split into chunks based on the number of characters in the string and header tags.
592+
*
593+
* Example:
594+
* ```js
595+
*const data = await trieve.splitChunkHtml({
596+
* chunk_html: "<p>Some HTML content</p>",
597+
*});
598+
* ```
599+
*/
600+
export async function splitChunkHtml(
601+
/** @hidden */
602+
this: TrieveSDK,
603+
props: ChunkHtmlContentReqPayload,
604+
signal?: AbortSignal
605+
) {
606+
return this.trieve.fetch(
607+
"/api/chunk/split",
608+
"post",
609+
{
610+
data: props,
611+
},
612+
signal
613+
);
614+
}

clients/ts-sdk/src/types.gen.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,21 @@ export type ChunkGroupAndFileId = {
220220

221221
export type ChunkGroups = Array<ChunkGroup>;
222222

223+
export type ChunkHtmlContentReqPayload = {
224+
/**
225+
* Text strings to remove from body when creating chunks for each page
226+
*/
227+
body_remove_strings?: Array<(string)> | null;
228+
/**
229+
* The HTML content to be split into chunks
230+
*/
231+
chunk_html: string;
232+
/**
233+
* Text strings to remove from headings when creating chunks for each page
234+
*/
235+
heading_remove_strings?: Array<(string)> | null;
236+
};
237+
223238
export type ChunkMetadata = {
224239
/**
225240
* HTML content of the chunk, can also be an arbitrary string which is not HTML
@@ -391,6 +406,17 @@ export type ChunkWithPosition = {
391406
position: number;
392407
};
393408

409+
export type ChunkedContent = {
410+
/**
411+
* The body of the content
412+
*/
413+
body: string;
414+
/**
415+
* The headings of the content in order of when they appear
416+
*/
417+
headings: Array<(string)>;
418+
};
419+
394420
export type ClickhouseRagTypes = 'chosen_chunks' | 'all_chunks';
395421

396422
export type ClickhouseRecommendationTypes = 'Chunk' | 'Group';
@@ -2616,6 +2642,10 @@ export type SortOptions = {
26162642

26172643
export type SortOrder = 'desc' | 'asc';
26182644

2645+
export type SplitHtmlResponse = {
2646+
chunks: Array<ChunkedContent>;
2647+
};
2648+
26192649
export type StripeInvoice = {
26202650
created_at: string;
26212651
hosted_invoice_url: string;
@@ -3310,6 +3340,15 @@ export type SearchChunksData = {
33103340

33113341
export type SearchChunksResponse = (SearchResponseTypes);
33123342

3343+
export type SplitHtmlContentData = {
3344+
/**
3345+
* JSON request payload containing the HTML content to split into chunks
3346+
*/
3347+
requestBody: ChunkHtmlContentReqPayload;
3348+
};
3349+
3350+
export type SplitHtmlContentResponse = (SplitHtmlResponse);
3351+
33133352
export type GetSuggestedQueriesData = {
33143353
/**
33153354
* JSON request payload to get alternative suggested queries
@@ -4653,6 +4692,21 @@ export type $OpenApiTs = {
46534692
};
46544693
};
46554694
};
4695+
'/api/chunk/split': {
4696+
post: {
4697+
req: SplitHtmlContentData;
4698+
res: {
4699+
/**
4700+
* This will be a JSON response of the chunks split from the HTML content with the headings and body
4701+
*/
4702+
200: SplitHtmlResponse;
4703+
/**
4704+
* Payload too large, if the HTML content is greater than 256Kb
4705+
*/
4706+
413: ErrorResponseBody;
4707+
};
4708+
};
4709+
};
46564710
'/api/chunk/suggestions': {
46574711
post: {
46584712
req: GetSuggestedQueriesData;

server/src/bin/crawl-worker.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl(
452452
}
453453
}
454454

455-
let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options);
455+
let chunked_html = chunk_html(
456+
&page_html.clone(),
457+
scrape_request.crawl_options.heading_remove_strings.clone(),
458+
scrape_request.crawl_options.body_remove_strings.clone(),
459+
);
456460

457461
for chunk in chunked_html {
458462
let heading = chunk.0.last().unwrap_or(&String::new()).clone();

0 commit comments

Comments
 (0)