Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: expose custom html based splitter #2737

Merged
merged 1 commit into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1420,6 +1420,49 @@
]
}
},
"/api/chunk/split": {
"post": {
"tags": [
"Chunk"
],
"summary": "Split HTML Content into Chunks",
"description": "This endpoint receives a single html string and splits it into chunks based on the headings and\nbody content. The headings are split based on headding html tags. chunk_html has a maximum size\nof 256Kb.",
"operationId": "split_html_content",
"requestBody": {
"description": "JSON request payload to perform RAG on some chunks (chunks)",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChunkHtmlContentReqPayload"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "This will be a JSON response of the chunks split from the HTML content with the headings and body",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SplitHtmlResponse"
}
}
}
},
"413": {
"description": "Payload too large, if the HTML contnet is greater than 256Kb",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponseBody"
}
}
}
}
}
}
},
"/api/chunk/suggestions": {
"post": {
"tags": [
Expand Down Expand Up @@ -6994,6 +7037,46 @@
}
]
},
"ChunkHtmlContentReqPayload": {
"type": "object",
"required": [
"chunk_html"
],
"properties": {
"body_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from body when creating chunks for each page",
"nullable": true
},
"chunk_html": {
"type": "string",
"description": "The HTML content to be split into chunks"
},
"heading_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from headings when creating chunks for each page",
"nullable": true
}
},
"example": {
"body_remove_strings": [
"Warning:",
"Note:"
],
"chunk_html": "",
"heading_remove_strings": [
"###",
"##",
"#"
]
}
},
"ChunkMetadata": {
"type": "object",
"title": "V2",
Expand Down Expand Up @@ -7487,6 +7570,34 @@
}
}
},
"ChunkedContent": {
"type": "object",
"required": [
"headings",
"body"
],
"properties": {
"body": {
"type": "string",
"description": "The body of the content"
},
"headings": {
"type": "array",
"items": {
"type": "string"
},
"description": "The headings of the content in order of when they appear"
}
},
"example": {
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Last SubHeading"
]
}
},
"ClickhouseRagTypes": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -14388,6 +14499,40 @@
"asc"
]
},
"SplitHtmlResponse": {
"type": "object",
"required": [
"chunks"
],
"properties": {
"chunks": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChunkedContent"
}
}
},
"example": {
"chunks": [
{
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Sub Sub Heading 1"
]
},
{
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Sub Sub Heading 2"
]
}
]
}
},
"StripeInvoice": {
"type": "object",
"required": [
Expand Down
2 changes: 1 addition & 1 deletion clients/ts-sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"files": [
"dist"
],
"version": "0.0.26",
"version": "0.0.27",
"license": "MIT",
"scripts": {
"lint": "eslint 'src/**/*.ts'",
Expand Down
28 changes: 28 additions & 0 deletions clients/ts-sdk/src/functions/chunks/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import {
AutocompleteReqPayload,
ChunkHtmlContentReqPayload,
CountChunksReqPayload,
CreateChunkReqPayloadEnum,
DeleteChunkByTrackingIdData,
Expand Down Expand Up @@ -584,3 +585,30 @@ export async function getChunksByTrackingIds(
signal
);
}

/**
 * Splits a single HTML string into chunks by calling `POST /api/chunk/split`.
 * The content is divided according to its heading tags and body content; on success the
 * endpoint answers with a list of chunks, each carrying its `headings` and `body`.
 * `chunk_html` is limited to 256Kb, and larger payloads are answered with a 413 error.
 *
 * Example:
 * ```js
 * const data = await trieve.splitChunkHtml({
 *   chunk_html: "<p>Some HTML content</p>",
 * });
 * ```
 *
 * @param props Request payload: the HTML to split, plus optional strings to strip from headings and bodies.
 * @param signal Optional `AbortSignal` handed through to the underlying fetch call.
 * @returns Whatever `this.trieve.fetch` resolves with for this route.
 */
export async function splitChunkHtml(
  /** @hidden */
  this: TrieveSDK,
  props: ChunkHtmlContentReqPayload,
  signal?: AbortSignal
) {
  // The payload travels as the request body under the `data` key.
  const requestParams = { data: props };
  return this.trieve.fetch("/api/chunk/split", "post", requestParams, signal);
}
54 changes: 54 additions & 0 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,21 @@ export type ChunkGroupAndFileId = {

export type ChunkGroups = Array<ChunkGroup>;

/**
 * Request payload for splitting HTML content into chunks.
 */
export type ChunkHtmlContentReqPayload = {
  /** Text strings to strip from the body when creating chunks; may be omitted or `null`. */
  body_remove_strings?: string[] | null;
  /** The HTML content that will be split into chunks (required). */
  chunk_html: string;
  /** Text strings to strip from the headings when creating chunks; may be omitted or `null`. */
  heading_remove_strings?: string[] | null;
};

export type ChunkMetadata = {
/**
* HTML content of the chunk, can also be an arbitrary string which is not HTML
Expand Down Expand Up @@ -391,6 +406,17 @@ export type ChunkWithPosition = {
position: number;
};

/**
 * One chunk produced by splitting HTML content: its body text and the headings it belongs to.
 */
export type ChunkedContent = {
  /** The body of the content. */
  body: string;
  /** The headings of the content, in the order in which they appear. */
  headings: string[];
};

export type ClickhouseRagTypes = 'chosen_chunks' | 'all_chunks';

export type ClickhouseRecommendationTypes = 'Chunk' | 'Group';
Expand Down Expand Up @@ -2616,6 +2642,10 @@ export type SortOptions = {

export type SortOrder = 'desc' | 'asc';

/**
 * Response body of the HTML split endpoint: the chunks obtained from the HTML content.
 */
export type SplitHtmlResponse = {
  /** Chunks split from the HTML content, each with its headings and body. */
  chunks: ChunkedContent[];
};

export type StripeInvoice = {
created_at: string;
hosted_invoice_url: string;
Expand Down Expand Up @@ -3310,6 +3340,15 @@ export type SearchChunksData = {

export type SearchChunksResponse = (SearchResponseTypes);

export type SplitHtmlContentData = {
/**
* JSON request payload containing the HTML content to split into chunks
*/
requestBody: ChunkHtmlContentReqPayload;
};

/** Successful response type of the `/api/chunk/split` operation. */
export type SplitHtmlContentResponse = SplitHtmlResponse;

export type GetSuggestedQueriesData = {
/**
* JSON request payload to get alternative suggested queries
Expand Down Expand Up @@ -4653,6 +4692,21 @@ export type $OpenApiTs = {
};
};
};
'/api/chunk/split': {
post: {
req: SplitHtmlContentData;
res: {
/**
* This will be a JSON response of the chunks split from the HTML content with the headings and body
*/
200: SplitHtmlResponse;
/**
* Payload too large, if the HTML content is greater than 256Kb
*/
413: ErrorResponseBody;
};
};
};
'/api/chunk/suggestions': {
post: {
req: GetSuggestedQueriesData;
Expand Down
6 changes: 5 additions & 1 deletion server/src/bin/crawl-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl(
}
}

let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options);
let chunked_html = chunk_html(
&page_html.clone(),
scrape_request.crawl_options.heading_remove_strings.clone(),
scrape_request.crawl_options.body_remove_strings.clone(),
);

for chunk in chunked_html {
let heading = chunk.0.last().unwrap_or(&String::new()).clone();
Expand Down
Loading
Loading