Skip to content

Commit

Permalink
feature: expose custom html based splitter
Browse files Browse the repository at this point in the history
  • Loading branch information
cdxker committed Nov 13, 2024
1 parent 05e4099 commit eed122c
Show file tree
Hide file tree
Showing 8 changed files with 341 additions and 5 deletions.
145 changes: 145 additions & 0 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1420,6 +1420,49 @@
]
}
},
"/api/chunk/split": {
"post": {
"tags": [
"Chunk"
],
"summary": "Split HTML Content into Chunks",
"description": "This endpoint receives a single html string and splits it into chunks based on the headings and\nbody content. The headings are split based on heading html tags. chunk_html has a maximum size\nof 256Kb.",
"operationId": "split_html_content",
"requestBody": {
"description": "JSON request payload containing the HTML content to split into chunks",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChunkHtmlContentReqPayload"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "This will be a JSON response of the chunks split from the HTML content with the headings and body",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SplitHtmlResponse"
}
}
}
},
"413": {
"description": "Payload too large, if the HTML content is greater than 256Kb",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponseBody"
}
}
}
}
}
}
},
"/api/chunk/suggestions": {
"post": {
"tags": [
Expand Down Expand Up @@ -6994,6 +7037,46 @@
}
]
},
"ChunkHtmlContentReqPayload": {
"type": "object",
"required": [
"chunk_html"
],
"properties": {
"body_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from body when creating chunks for each page",
"nullable": true
},
"chunk_html": {
"type": "string",
"description": "The HTML content to be split into chunks"
},
"heading_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from headings when creating chunks for each page",
"nullable": true
}
},
"example": {
"body_remove_strings": [
"Warning:",
"Note:"
],
"chunk_html": "",
"heading_remove_strings": [
"###",
"##",
"#"
]
}
},
"ChunkMetadata": {
"type": "object",
"title": "V2",
Expand Down Expand Up @@ -7487,6 +7570,34 @@
}
}
},
"ChunkedContent": {
"type": "object",
"required": [
"headings",
"body"
],
"properties": {
"body": {
"type": "string",
"description": "The body of the content"
},
"headings": {
"type": "array",
"items": {
"type": "string"
},
"description": "The headings of the content in order of when they appear"
}
},
"example": {
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Last SubHeading"
]
}
},
"ClickhouseRagTypes": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -14388,6 +14499,40 @@
"asc"
]
},
"SplitHtmlResponse": {
"type": "object",
"required": [
"chunks"
],
"properties": {
"chunks": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChunkedContent"
}
}
},
"example": {
"chunks": [
{
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Sub Sub Heading 1"
]
},
{
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Sub Sub Heading 2"
]
}
]
}
},
"StripeInvoice": {
"type": "object",
"required": [
Expand Down
2 changes: 1 addition & 1 deletion clients/ts-sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"files": [
"dist"
],
"version": "0.0.26",
"version": "0.0.27",
"license": "MIT",
"scripts": {
"lint": "eslint 'src/**/*.ts'",
Expand Down
28 changes: 28 additions & 0 deletions clients/ts-sdk/src/functions/chunks/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import {
AutocompleteReqPayload,
ChunkHtmlContentReqPayload,
CountChunksReqPayload,
CreateChunkReqPayloadEnum,
DeleteChunkByTrackingIdData,
Expand Down Expand Up @@ -584,3 +585,30 @@ export async function getChunksByTrackingIds(
signal
);
}

/**
 * Splits a single HTML string into chunks by POSTing it to the
 * `/api/chunk/split` route.
 *
 * According to the endpoint's OpenAPI description, the HTML is split based on
 * its heading tags and body content, and `chunk_html` is limited to 256Kb
 * (larger payloads are answered with a 413 error).
 *
 * Example:
 * ```js
 * const data = await trieve.splitChunkHtml({
 *   chunk_html: "<p>Some HTML content</p>",
 * });
 * ```
 *
 * @param props - Request payload; sent as the `data` of the request.
 * @param signal - Optional abort signal, forwarded unchanged to the
 *   underlying `fetch` call.
 * @returns The promise produced by `this.trieve.fetch` for this route
 *   (presumably resolving to the split chunks with their headings and body —
 *   confirm against the `fetch` typings).
 */
export async function splitChunkHtml(
  /** @hidden */
  this: TrieveSDK,
  props: ChunkHtmlContentReqPayload,
  signal?: AbortSignal
) {
  const client = this.trieve;
  return client.fetch("/api/chunk/split", "post", { data: props }, signal);
}
54 changes: 54 additions & 0 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,21 @@ export type ChunkGroupAndFileId = {

export type ChunkGroups = Array<ChunkGroup>;

/**
 * Request payload for splitting an HTML string into chunks.
 */
export type ChunkHtmlContentReqPayload = {
  /**
   * Text strings to strip from the body when creating chunks for each page.
   * May be omitted or null.
   */
  body_remove_strings?: string[] | null;
  /**
   * The HTML content that should be split into chunks.
   */
  chunk_html: string;
  /**
   * Text strings to strip from headings when creating chunks for each page.
   * May be omitted or null.
   */
  heading_remove_strings?: string[] | null;
};

export type ChunkMetadata = {
/**
* HTML content of the chunk, can also be an arbitrary string which is not HTML
Expand Down Expand Up @@ -391,6 +406,17 @@ export type ChunkWithPosition = {
position: number;
};

/**
 * One chunk of split content: its body text plus the headings that apply to it.
 */
export type ChunkedContent = {
  /**
   * The body of the content.
   */
  body: string;
  /**
   * The headings of the content, in the order in which they appear.
   */
  headings: string[];
};

export type ClickhouseRagTypes = 'chosen_chunks' | 'all_chunks';

export type ClickhouseRecommendationTypes = 'Chunk' | 'Group';
Expand Down Expand Up @@ -2616,6 +2642,10 @@ export type SortOptions = {

export type SortOrder = 'desc' | 'asc';

/**
 * Response body of the HTML split endpoint.
 */
export type SplitHtmlResponse = {
  /**
   * The chunks produced from the submitted HTML content.
   */
  chunks: ChunkedContent[];
};

export type StripeInvoice = {
created_at: string;
hosted_invoice_url: string;
Expand Down Expand Up @@ -3310,6 +3340,15 @@ export type SearchChunksData = {

export type SearchChunksResponse = (SearchResponseTypes);

export type SplitHtmlContentData = {
/**
* JSON request payload containing the HTML content to split into chunks
*/
requestBody: ChunkHtmlContentReqPayload;
};

/**
 * Successful response type of the `/api/chunk/split` operation.
 */
export type SplitHtmlContentResponse = SplitHtmlResponse;

export type GetSuggestedQueriesData = {
/**
* JSON request payload to get alternative suggested queries
Expand Down Expand Up @@ -4653,6 +4692,21 @@ export type $OpenApiTs = {
};
};
};
'/api/chunk/split': {
post: {
req: SplitHtmlContentData;
res: {
/**
* This will be a JSON response of the chunks split from the HTML content with the headings and body
*/
200: SplitHtmlResponse;
/**
* Payload too large, if the HTML content is greater than 256Kb
*/
413: ErrorResponseBody;
};
};
};
'/api/chunk/suggestions': {
post: {
req: GetSuggestedQueriesData;
Expand Down
6 changes: 5 additions & 1 deletion server/src/bin/crawl-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl(
}
}

let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options);
let chunked_html = chunk_html(
&page_html.clone(),
scrape_request.crawl_options.heading_remove_strings.clone(),
scrape_request.crawl_options.body_remove_strings.clone(),
);

for chunk in chunked_html {
let heading = chunk.0.last().unwrap_or(&String::new()).clone();
Expand Down
Loading

0 comments on commit eed122c

Please sign in to comment.