Skip to content

Commit b9a3cf3

Browse files
committed
feature: expose custom html based splitter
1 parent c94e26d commit b9a3cf3

File tree

7 files changed

+339
-4
lines changed

7 files changed

+339
-4
lines changed

clients/ts-sdk/openapi.json

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,6 +1420,49 @@
14201420
]
14211421
}
14221422
},
1423+
"/api/chunk/split": {
1424+
"post": {
1425+
"tags": [
1426+
"Chunk"
1427+
],
1428+
"summary": "Split HTML Content into Chunks",
1429+
"description": "This endpoint receives a single html string and splits it into chunks based on the headings and\nbody content. The headings are split based on heading html tags. chunk_html has a maximum size\nof 256Kb.",
1430+
"operationId": "split_html_content",
1431+
"requestBody": {
1432+
"description": "JSON request payload to split HTML content into chunks",
1433+
"content": {
1434+
"application/json": {
1435+
"schema": {
1436+
"$ref": "#/components/schemas/ChunkHtmlContentReqPayload"
1437+
}
1438+
}
1439+
},
1440+
"required": true
1441+
},
1442+
"responses": {
1443+
"200": {
1444+
"description": "This will be a JSON response of the chunks split from the HTML content with the headings and body",
1445+
"content": {
1446+
"application/json": {
1447+
"schema": {
1448+
"$ref": "#/components/schemas/SplitHtmlResponse"
1449+
}
1450+
}
1451+
}
1452+
},
1453+
"413": {
1454+
"description": "Payload too large, if the HTML content is greater than 256Kb",
1455+
"content": {
1456+
"application/json": {
1457+
"schema": {
1458+
"$ref": "#/components/schemas/ErrorResponseBody"
1459+
}
1460+
}
1461+
}
1462+
}
1463+
}
1464+
}
1465+
},
14231466
"/api/chunk/suggestions": {
14241467
"post": {
14251468
"tags": [
@@ -6994,6 +7037,46 @@
69947037
}
69957038
]
69967039
},
7040+
"ChunkHtmlContentReqPayload": {
7041+
"type": "object",
7042+
"required": [
7043+
"chunk_html"
7044+
],
7045+
"properties": {
7046+
"body_remove_strings": {
7047+
"type": "array",
7048+
"items": {
7049+
"type": "string"
7050+
},
7051+
"description": "Text strings to remove from body when creating chunks for each page",
7052+
"nullable": true
7053+
},
7054+
"chunk_html": {
7055+
"type": "string",
7056+
"description": "The HTML content to be split into chunks"
7057+
},
7058+
"heading_remove_strings": {
7059+
"type": "array",
7060+
"items": {
7061+
"type": "string"
7062+
},
7063+
"description": "Text strings to remove from headings when creating chunks for each page",
7064+
"nullable": true
7065+
}
7066+
},
7067+
"example": {
7068+
"body_remove_strings": [
7069+
"Warning:",
7070+
"Note:"
7071+
],
7072+
"chunk_html": "",
7073+
"heading_remove_strings": [
7074+
"###",
7075+
"##",
7076+
"#"
7077+
]
7078+
}
7079+
},
69977080
"ChunkMetadata": {
69987081
"type": "object",
69997082
"title": "V2",
@@ -7487,6 +7570,34 @@
74877570
}
74887571
}
74897572
},
7573+
"ChunkedContent": {
7574+
"type": "object",
7575+
"required": [
7576+
"headings",
7577+
"body"
7578+
],
7579+
"properties": {
7580+
"body": {
7581+
"type": "string",
7582+
"description": "The body of the content"
7583+
},
7584+
"headings": {
7585+
"type": "array",
7586+
"items": {
7587+
"type": "string"
7588+
},
7589+
"description": "The headings of the content in order of when they appear"
7590+
}
7591+
},
7592+
"example": {
7593+
"body": "This is the body of the content",
7594+
"headings": [
7595+
"Title Heading",
7596+
"Sub Heading 1",
7597+
"Last SubHeading"
7598+
]
7599+
}
7600+
},
74907601
"ClickhouseRagTypes": {
74917602
"type": "string",
74927603
"enum": [
@@ -14388,6 +14499,40 @@
1438814499
"asc"
1438914500
]
1439014501
},
14502+
"SplitHtmlResponse": {
14503+
"type": "object",
14504+
"required": [
14505+
"chunks"
14506+
],
14507+
"properties": {
14508+
"chunks": {
14509+
"type": "array",
14510+
"items": {
14511+
"$ref": "#/components/schemas/ChunkedContent"
14512+
}
14513+
}
14514+
},
14515+
"example": {
14516+
"chunks": [
14517+
{
14518+
"body": "This is the body of the content",
14519+
"headings": [
14520+
"Title Heading",
14521+
"Sub Heading 1",
14522+
"Sub Sub Heading 1"
14523+
]
14524+
},
14525+
{
14526+
"body": "This is the body of the content",
14527+
"headings": [
14528+
"Title Heading",
14529+
"Sub Heading 1",
14530+
"Sub Sub Heading 2"
14531+
]
14532+
}
14533+
]
14534+
}
14535+
},
1439114536
"StripeInvoice": {
1439214537
"type": "object",
1439314538
"required": [

clients/ts-sdk/src/functions/chunks/index.ts

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import {
88
AutocompleteReqPayload,
9+
ChunkHtmlContentReqPayload,
910
CountChunksReqPayload,
1011
CreateChunkReqPayloadEnum,
1112
DeleteChunkByTrackingIdData,
@@ -584,3 +585,29 @@ export async function getChunksByTrackingIds(
584585
signal
585586
);
586587
}
588+
589+
/**
590+
* Function that splits a single HTML string into chunks based on its headings and body content. Headings are split on heading HTML tags. `chunk_html` has a maximum size of 256Kb; larger payloads are rejected with a 413 response.
591+
*
592+
* Example:
593+
* ```js
594+
*const data = await trieve.splitChunks({
595+
* chunk_html: "<p>Some HTML content</p>",
596+
*});
597+
* ```
598+
*/
599+
export async function splitChunks(
600+
/** @hidden */
601+
this: TrieveSDK,
602+
props: ChunkHtmlContentReqPayload,
603+
signal?: AbortSignal
604+
) {
605+
return this.trieve.fetch(
606+
"/api/chunk/split",
607+
"post",
608+
{
609+
data: props,
610+
},
611+
signal
612+
);
613+
}

clients/ts-sdk/src/types.gen.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,21 @@ export type ChunkGroupAndFileId = {
220220

221221
export type ChunkGroups = Array<ChunkGroup>;
222222

223+
export type ChunkHtmlContentReqPayload = {
224+
/**
225+
* Text strings to remove from body when creating chunks for each page
226+
*/
227+
body_remove_strings?: Array<(string)> | null;
228+
/**
229+
* The HTML content to be split into chunks
230+
*/
231+
chunk_html: string;
232+
/**
233+
* Text strings to remove from headings when creating chunks for each page
234+
*/
235+
heading_remove_strings?: Array<(string)> | null;
236+
};
237+
223238
export type ChunkMetadata = {
224239
/**
225240
* HTML content of the chunk, can also be an arbitrary string which is not HTML
@@ -391,6 +406,17 @@ export type ChunkWithPosition = {
391406
position: number;
392407
};
393408

409+
export type ChunkedContent = {
410+
/**
411+
* The body of the content
412+
*/
413+
body: string;
414+
/**
415+
* The headings of the content in order of when they appear
416+
*/
417+
headings: Array<(string)>;
418+
};
419+
394420
export type ClickhouseRagTypes = 'chosen_chunks' | 'all_chunks';
395421

396422
export type ClickhouseRecommendationTypes = 'Chunk' | 'Group';
@@ -2616,6 +2642,10 @@ export type SortOptions = {
26162642

26172643
export type SortOrder = 'desc' | 'asc';
26182644

2645+
export type SplitHtmlResponse = {
2646+
chunks: Array<ChunkedContent>;
2647+
};
2648+
26192649
export type StripeInvoice = {
26202650
created_at: string;
26212651
hosted_invoice_url: string;
@@ -3310,6 +3340,15 @@ export type SearchChunksData = {
33103340

33113341
export type SearchChunksResponse = (SearchResponseTypes);
33123342

3343+
export type SplitHtmlContentData = {
3344+
/**
3345+
* JSON request payload to split HTML content into chunks
3346+
*/
3347+
requestBody: ChunkHtmlContentReqPayload;
3348+
};
3349+
3350+
export type SplitHtmlContentResponse = (SplitHtmlResponse);
3351+
33133352
export type GetSuggestedQueriesData = {
33143353
/**
33153354
* JSON request payload to get alternative suggested queries
@@ -4653,6 +4692,21 @@ export type $OpenApiTs = {
46534692
};
46544693
};
46554694
};
4695+
'/api/chunk/split': {
4696+
post: {
4697+
req: SplitHtmlContentData;
4698+
res: {
4699+
/**
4700+
* This will be a JSON response of the chunks split from the HTML content with the headings and body
4701+
*/
4702+
200: SplitHtmlResponse;
4703+
/**
4704+
* Payload too large, if the HTML content is greater than 256Kb
4705+
*/
4706+
413: ErrorResponseBody;
4707+
};
4708+
};
4709+
};
46564710
'/api/chunk/suggestions': {
46574711
post: {
46584712
req: GetSuggestedQueriesData;

server/src/bin/crawl-worker.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl(
452452
}
453453
}
454454

455-
let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options);
455+
let chunked_html = chunk_html(
456+
&page_html.clone(),
457+
scrape_request.crawl_options.heading_remove_strings.clone(),
458+
scrape_request.crawl_options.body_remove_strings.clone(),
459+
);
456460

457461
for chunk in chunked_html {
458462
let heading = chunk.0.last().unwrap_or(&String::new()).clone();

0 commit comments

Comments
 (0)