diff --git a/clients/ts-sdk/openapi.json b/clients/ts-sdk/openapi.json index 4d7109154..038d76b68 100644 --- a/clients/ts-sdk/openapi.json +++ b/clients/ts-sdk/openapi.json @@ -1420,6 +1420,49 @@ ] } }, + "/api/chunk/split": { + "post": { + "tags": [ + "Chunk" + ], + "summary": "Split HTML Content into Chunks", + "description": "This endpoint receives a single html string and splits it into chunks based on the headings and\nbody content. The headings are split based on heading html tags. chunk_html has a maximum size\nof 256Kb.", + "operationId": "split_html_content", + "requestBody": { + "description": "JSON request payload to split HTML content into chunks", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChunkHtmlContentReqPayload" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "This will be a JSON response of the chunks split from the HTML content with the headings and body", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SplitHtmlResponse" + } + } + } + }, + "413": { + "description": "Payload too large, if the HTML content is greater than 256Kb", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponseBody" + } + } + } + } + } + } + }, "/api/chunk/suggestions": { "post": { "tags": [ "Chunk" ], @@ -6994,6 +7037,46 @@ } ] }, + "ChunkHtmlContentReqPayload": { + "type": "object", + "required": [ + "chunk_html" + ], + "properties": { + "body_remove_strings": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Text strings to remove from body when creating chunks for each page", + "nullable": true + }, + "chunk_html": { + "type": "string", + "description": "The HTML content to be split into chunks" + }, + "heading_remove_strings": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Text strings to remove from headings when creating chunks for each page", + "nullable": true + 
} + }, + "example": { + "body_remove_strings": [ + "Warning:", + "Note:" + ], + "chunk_html": "", + "heading_remove_strings": [ + "###", + "##", + "#" + ] + } + }, "ChunkMetadata": { "type": "object", "title": "V2", @@ -7487,6 +7570,34 @@ } } }, + "ChunkedContent": { + "type": "object", + "required": [ + "headings", + "body" + ], + "properties": { + "body": { + "type": "string", + "description": "The body of the content" + }, + "headings": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The headings of the content in order of when they appear" + } + }, + "example": { + "body": "This is the body of the content", + "headings": [ + "Title Heading", + "Sub Heading 1", + "Last SubHeading" + ] + } + }, "ClickhouseRagTypes": { "type": "string", "enum": [ @@ -14388,6 +14499,40 @@ "asc" ] }, + "SplitHtmlResponse": { + "type": "object", + "required": [ + "chunks" + ], + "properties": { + "chunks": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ChunkedContent" + } + } + }, + "example": { + "chunks": [ + { + "body": "This is the body of the content", + "headings": [ + "Title Heading", + "Sub Heading 1", + "Sub Sub Heading 1" + ] + }, + { + "body": "This is the body of the content", + "headings": [ + "Title Heading", + "Sub Heading 1", + "Sub Sub Heading 2" + ] + } + ] + } + }, "StripeInvoice": { "type": "object", "required": [ diff --git a/clients/ts-sdk/package.json b/clients/ts-sdk/package.json index b19e7bed9..8a97c1b7b 100644 --- a/clients/ts-sdk/package.json +++ b/clients/ts-sdk/package.json @@ -6,7 +6,7 @@ "files": [ "dist" ], - "version": "0.0.26", + "version": "0.0.27", "license": "MIT", "scripts": { "lint": "eslint 'src/**/*.ts'", diff --git a/clients/ts-sdk/src/functions/chunks/index.ts b/clients/ts-sdk/src/functions/chunks/index.ts index a05c31955..ab9f9f029 100644 --- a/clients/ts-sdk/src/functions/chunks/index.ts +++ b/clients/ts-sdk/src/functions/chunks/index.ts @@ -6,6 +6,7 @@ import { AutocompleteReqPayload, + 
ChunkHtmlContentReqPayload, CountChunksReqPayload, CreateChunkReqPayloadEnum, DeleteChunkByTrackingIdData, @@ -584,3 +585,30 @@ export async function getChunksByTrackingIds( signal ); } + +/** + * Function that splits an html string into chunks. + * The html string will be split into chunks based on the number of characters in the string and header tags. + * + * Example: + * ```js + *const data = await trieve.splitChunkHtml({ + * chunk_html: "

Some HTML content

", + *}); + * ``` + */ +export async function splitChunkHtml( + /** @hidden */ + this: TrieveSDK, + props: ChunkHtmlContentReqPayload, + signal?: AbortSignal +) { + return this.trieve.fetch( + "/api/chunk/split", + "post", + { + data: props, + }, + signal + ); +} diff --git a/clients/ts-sdk/src/types.gen.ts b/clients/ts-sdk/src/types.gen.ts index 4182a87a1..518922b76 100644 --- a/clients/ts-sdk/src/types.gen.ts +++ b/clients/ts-sdk/src/types.gen.ts @@ -220,6 +220,21 @@ export type ChunkGroupAndFileId = { export type ChunkGroups = Array; +export type ChunkHtmlContentReqPayload = { + /** + * Text strings to remove from body when creating chunks for each page + */ + body_remove_strings?: Array<(string)> | null; + /** + * The HTML content to be split into chunks + */ + chunk_html: string; + /** + * Text strings to remove from headings when creating chunks for each page + */ + heading_remove_strings?: Array<(string)> | null; +}; + export type ChunkMetadata = { /** * HTML content of the chunk, can also be an arbitrary string which is not HTML @@ -391,6 +406,17 @@ export type ChunkWithPosition = { position: number; }; +export type ChunkedContent = { + /** + * The body of the content + */ + body: string; + /** + * The headings of the content in order of when they appear + */ + headings: Array<(string)>; +}; + export type ClickhouseRagTypes = 'chosen_chunks' | 'all_chunks'; export type ClickhouseRecommendationTypes = 'Chunk' | 'Group'; @@ -2616,6 +2642,10 @@ export type SortOptions = { export type SortOrder = 'desc' | 'asc'; +export type SplitHtmlResponse = { + chunks: Array; +}; + export type StripeInvoice = { created_at: string; hosted_invoice_url: string; @@ -3310,6 +3340,15 @@ export type SearchChunksData = { export type SearchChunksResponse = (SearchResponseTypes); +export type SplitHtmlContentData = { + /** + * JSON request payload to perform RAG on some chunks (chunks) + */ + requestBody: ChunkHtmlContentReqPayload; +}; + +export type SplitHtmlContentResponse = 
(SplitHtmlResponse); + export type GetSuggestedQueriesData = { /** * JSON request payload to get alternative suggested queries @@ -4653,6 +4692,21 @@ export type $OpenApiTs = { }; }; }; + '/api/chunk/split': { + post: { + req: SplitHtmlContentData; + res: { + /** + * This will be a JSON response of the chunks split from the HTML content with the headings and body + */ + 200: SplitHtmlResponse; + /** + * Payload too large, if the HTML contnet is greater than 256Kb + */ + 413: ErrorResponseBody; + }; + }; + }; '/api/chunk/suggestions': { post: { req: GetSuggestedQueriesData; diff --git a/server/src/bin/crawl-worker.rs b/server/src/bin/crawl-worker.rs index 3d7005110..b54fc10fe 100644 --- a/server/src/bin/crawl-worker.rs +++ b/server/src/bin/crawl-worker.rs @@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl( } } - let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options); + let chunked_html = chunk_html( + &page_html.clone(), + scrape_request.crawl_options.heading_remove_strings.clone(), + scrape_request.crawl_options.body_remove_strings.clone(), + ); for chunk in chunked_html { let heading = chunk.0.last().unwrap_or(&String::new()).clone(); diff --git a/server/src/handlers/chunk_handler.rs b/server/src/handlers/chunk_handler.rs index 6a7c495b0..18be021b8 100644 --- a/server/src/handlers/chunk_handler.rs +++ b/server/src/handlers/chunk_handler.rs @@ -14,6 +14,7 @@ use crate::middleware::api_version::APIVersion; use crate::operators::chunk_operator::get_metadata_from_id_query; use crate::operators::chunk_operator::*; use crate::operators::clickhouse_operator::{get_latency_from_header, ClickHouseEvent, EventQueue}; +use crate::operators::crawl_operator; use crate::operators::dataset_operator::{ get_dataset_usage_query, ChunkDeleteMessage, DeleteMessage, }; @@ -2820,6 +2821,97 @@ pub async fn generate_off_chunks( .streaming(completion_stream)) } +#[derive(Debug, Serialize, Deserialize, ToSchema)] +#[schema(example = json!({ + "chunk_html": "", 
+ "heading_remove_strings": ["###", "##", "#"], + "body_remove_strings": ["Warning:", "Note:"] +}))] +pub struct ChunkHtmlContentReqPayload { + /// The HTML content to be split into chunks + pub chunk_html: String, + /// Text strings to remove from headings when creating chunks for each page + pub heading_remove_strings: Option>, + /// Text strings to remove from body when creating chunks for each page + pub body_remove_strings: Option>, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +#[schema(example = json!({ + "chunks": [ + { + "headings": ["Title Heading", "Sub Heading 1", "Sub Sub Heading 1"], + "body": "This is the body of the content" + }, + { + "headings": ["Title Heading", "Sub Heading 1", "Sub Sub Heading 2"], + "body": "This is the body of the content" + } + // ... + ] +}))] +pub struct SplitHtmlResponse { + pub chunks: Vec, +} + +#[derive(Debug, Serialize, Deserialize, ToSchema)] +#[schema(example = json!({ + "headings": ["Title Heading", "Sub Heading 1", "Last SubHeading"], + "body": "This is the body of the content" +}))] +pub struct ChunkedContent { + /// The headings of the content in order of when they appear + pub headings: Vec, + /// The body of the content + pub body: String, +} + +/// Split HTML Content into Chunks +/// +/// This endpoint receives a single html string and splits it into chunks based on the headings and +/// body content. The headings are split based on headding html tags. chunk_html has a maximum size +/// of 256Kb. 
+#[utoipa::path( + post, + path = "/chunk/split", + context_path = "/api", + tag = "Chunk", + request_body(content = ChunkHtmlContentReqPayload, description = "JSON request payload to perform RAG on some chunks (chunks)", content_type = "application/json"), + responses( + ( + status = 200, description = "This will be a JSON response of the chunks split from the HTML content with the headings and body", + body = SplitHtmlResponse, + ), + ( + status = 413, description = "Payload too large, if the HTML contnet is greater than 256Kb", + body = ErrorResponseBody, + ), + ), +)] +#[tracing::instrument] +pub async fn split_html_content( + body: web::Json, +) -> Result { + if body.chunk_html.bytes().len() >= 262_144 { + return Err(ServiceError::PayloadTooLarge( + "The HTML content is too large".to_string(), + )); + } + + let chunked_content = crawl_operator::chunk_html( + &body.chunk_html, + body.heading_remove_strings.clone(), + body.body_remove_strings.clone(), + ); + + Ok(HttpResponse::Ok().json(SplitHtmlResponse { + chunks: chunked_content + .into_iter() + .map(|(headings, body)| ChunkedContent { headings, body }) + .collect(), + })) +} + pub fn check_completion_param_validity( temperature: Option, frequency_penalty: Option, diff --git a/server/src/lib.rs b/server/src/lib.rs index f01e084ea..be61eb65f 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -175,6 +175,7 @@ impl Modify for SecurityAddon { handlers::chunk_handler::create_chunk, handlers::chunk_handler::update_chunk, handlers::chunk_handler::delete_chunk, + handlers::chunk_handler::split_html_content, handlers::chunk_handler::get_recommended_chunks, handlers::chunk_handler::update_chunk_by_tracking_id, handlers::chunk_handler::search_chunks, @@ -274,6 +275,9 @@ impl Modify for SecurityAddon { handlers::chunk_handler::SearchResponseTypes, handlers::chunk_handler::CreateBatchChunkReqPayload, handlers::chunk_handler::SingleQueuedChunkResponse, + handlers::chunk_handler::ChunkHtmlContentReqPayload, + 
handlers::chunk_handler::SplitHtmlResponse, + handlers::chunk_handler::ChunkedContent, handlers::chunk_handler::BatchQueuedChunkResponse, handlers::chunk_handler::ReturnQueuedChunk, handlers::chunk_handler::RecommendChunksResponseBody, @@ -929,6 +933,11 @@ pub fn main() -> std::io::Result<()> { .route(web::put().to(handlers::chunk_handler::update_chunk)) .route(web::delete().to(handlers::chunk_handler::bulk_delete_chunk)), ) + .service( + web::resource("split").route( + web::post().to(handlers::chunk_handler::split_html_content), + ), + ) .service(web::resource("/recommend").route( web::post().to(handlers::chunk_handler::get_recommended_chunks), ) diff --git a/server/src/operators/crawl_operator.rs b/server/src/operators/crawl_operator.rs index 753845f17..7415ec6c1 100644 --- a/server/src/operators/crawl_operator.rs +++ b/server/src/operators/crawl_operator.rs @@ -602,7 +602,11 @@ pub fn get_tags(url: String) -> Vec { Vec::new() } -pub fn chunk_html(html: &str, crawl_options: &CrawlOptions) -> Vec<(Vec, String)> { +pub fn chunk_html( + html: &str, + heading_remove_strings: Option>, + body_remove_strings: Option>, +) -> Vec<(Vec, String)> { let re = Regex::new(r"(?i)").unwrap(); let mut chunks = Vec::new(); let mut current_chunk = String::new(); @@ -656,14 +660,14 @@ pub fn chunk_html(html: &str, crawl_options: &CrawlOptions) -> Vec<(Vec, let mut headings_text = headings.last().unwrap_or(&String::new()).clone(); let mut content = content.clone(); - if let Some(heading_remove_strings) = &crawl_options.heading_remove_strings { + if let Some(heading_remove_strings) = &heading_remove_strings { heading_remove_strings.iter().for_each(|remove_string| { headings_text = headings_text.replace(remove_string, ""); }); headings.pop(); headings.push(headings_text); } - if let Some(body_remove_strings) = &crawl_options.body_remove_strings { + if let Some(body_remove_strings) = &body_remove_strings { body_remove_strings.iter().for_each(|remove_string| { content = 
content.replace(remove_string, ""); });