diff --git a/clients/ts-sdk/openapi.json b/clients/ts-sdk/openapi.json
index 4d7109154..038d76b68 100644
--- a/clients/ts-sdk/openapi.json
+++ b/clients/ts-sdk/openapi.json
@@ -1420,6 +1420,49 @@
]
}
},
+ "/api/chunk/split": {
+ "post": {
+ "tags": [
+ "Chunk"
+ ],
+ "summary": "Split HTML Content into Chunks",
+ "description": "This endpoint receives a single html string and splits it into chunks based on the headings and\nbody content. The headings are split based on headding html tags. chunk_html has a maximum size\nof 256Kb.",
+ "operationId": "split_html_content",
+ "requestBody": {
+ "description": "JSON request payload to perform RAG on some chunks (chunks)",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ChunkHtmlContentReqPayload"
+ }
+ }
+ },
+ "required": true
+ },
+ "responses": {
+ "200": {
+ "description": "This will be a JSON response of the chunks split from the HTML content with the headings and body",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/SplitHtmlResponse"
+ }
+ }
+ }
+ },
+ "413": {
+ "description": "Payload too large, if the HTML contnet is greater than 256Kb",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ErrorResponseBody"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
"/api/chunk/suggestions": {
"post": {
"tags": [
@@ -6994,6 +7037,46 @@
}
]
},
+ "ChunkHtmlContentReqPayload": {
+ "type": "object",
+ "required": [
+ "chunk_html"
+ ],
+ "properties": {
+ "body_remove_strings": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Text strings to remove from body when creating chunks for each page",
+ "nullable": true
+ },
+ "chunk_html": {
+ "type": "string",
+ "description": "The HTML content to be split into chunks"
+ },
+ "heading_remove_strings": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Text strings to remove from headings when creating chunks for each page",
+ "nullable": true
+ }
+ },
+ "example": {
+ "body_remove_strings": [
+ "Warning:",
+ "Note:"
+ ],
+ "chunk_html": "",
+ "heading_remove_strings": [
+ "###",
+ "##",
+ "#"
+ ]
+ }
+ },
"ChunkMetadata": {
"type": "object",
"title": "V2",
@@ -7487,6 +7570,34 @@
}
}
},
+ "ChunkedContent": {
+ "type": "object",
+ "required": [
+ "headings",
+ "body"
+ ],
+ "properties": {
+ "body": {
+ "type": "string",
+ "description": "The body of the content"
+ },
+ "headings": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "The headings of the content in order of when they appear"
+ }
+ },
+ "example": {
+ "body": "This is the body of the content",
+ "headings": [
+ "Title Heading",
+ "Sub Heading 1",
+ "Last SubHeading"
+ ]
+ }
+ },
"ClickhouseRagTypes": {
"type": "string",
"enum": [
@@ -14388,6 +14499,40 @@
"asc"
]
},
+ "SplitHtmlResponse": {
+ "type": "object",
+ "required": [
+ "chunks"
+ ],
+ "properties": {
+ "chunks": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ChunkedContent"
+ }
+ }
+ },
+ "example": {
+ "chunks": [
+ {
+ "body": "This is the body of the content",
+ "headings": [
+ "Title Heading",
+ "Sub Heading 1",
+ "Sub Sub Heading 1"
+ ]
+ },
+ {
+ "body": "This is the body of the content",
+ "headings": [
+ "Title Heading",
+ "Sub Heading 1",
+ "Sub Sub Heading 2"
+ ]
+ }
+ ]
+ }
+ },
"StripeInvoice": {
"type": "object",
"required": [
diff --git a/clients/ts-sdk/package.json b/clients/ts-sdk/package.json
index b19e7bed9..8a97c1b7b 100644
--- a/clients/ts-sdk/package.json
+++ b/clients/ts-sdk/package.json
@@ -6,7 +6,7 @@
"files": [
"dist"
],
- "version": "0.0.26",
+ "version": "0.0.27",
"license": "MIT",
"scripts": {
"lint": "eslint 'src/**/*.ts'",
diff --git a/clients/ts-sdk/src/functions/chunks/index.ts b/clients/ts-sdk/src/functions/chunks/index.ts
index a05c31955..ab9f9f029 100644
--- a/clients/ts-sdk/src/functions/chunks/index.ts
+++ b/clients/ts-sdk/src/functions/chunks/index.ts
@@ -6,6 +6,7 @@
import {
AutocompleteReqPayload,
+ ChunkHtmlContentReqPayload,
CountChunksReqPayload,
CreateChunkReqPayloadEnum,
DeleteChunkByTrackingIdData,
@@ -584,3 +585,30 @@ export async function getChunksByTrackingIds(
signal
);
}
+
+/**
+ * Function that splits an html string into chunks.
+ * The html string will be split into chunks based on the number of characters in the string and header tags.
+ *
+ * Example:
+ * ```js
+ *const data = await trieve.splitChunkHtml({
+ * chunk_html: "
Some HTML content
",
+ *});
+ * ```
+ */
+export async function splitChunkHtml(
+ /** @hidden */
+ this: TrieveSDK,
+ props: ChunkHtmlContentReqPayload,
+ signal?: AbortSignal
+) {
+ return this.trieve.fetch(
+ "/api/chunk/split",
+ "post",
+ {
+ data: props,
+ },
+ signal
+ );
+}
diff --git a/clients/ts-sdk/src/types.gen.ts b/clients/ts-sdk/src/types.gen.ts
index 4182a87a1..518922b76 100644
--- a/clients/ts-sdk/src/types.gen.ts
+++ b/clients/ts-sdk/src/types.gen.ts
@@ -220,6 +220,21 @@ export type ChunkGroupAndFileId = {
export type ChunkGroups = Array<ChunkGroupAndFileId>;
+export type ChunkHtmlContentReqPayload = {
+ /**
+ * Text strings to remove from body when creating chunks for each page
+ */
+ body_remove_strings?: Array<(string)> | null;
+ /**
+ * The HTML content to be split into chunks
+ */
+ chunk_html: string;
+ /**
+ * Text strings to remove from headings when creating chunks for each page
+ */
+ heading_remove_strings?: Array<(string)> | null;
+};
+
export type ChunkMetadata = {
/**
* HTML content of the chunk, can also be an arbitrary string which is not HTML
@@ -391,6 +406,17 @@ export type ChunkWithPosition = {
position: number;
};
+export type ChunkedContent = {
+ /**
+ * The body of the content
+ */
+ body: string;
+ /**
+ * The headings of the content in order of when they appear
+ */
+ headings: Array<(string)>;
+};
+
export type ClickhouseRagTypes = 'chosen_chunks' | 'all_chunks';
export type ClickhouseRecommendationTypes = 'Chunk' | 'Group';
@@ -2616,6 +2642,10 @@ export type SortOptions = {
export type SortOrder = 'desc' | 'asc';
+export type SplitHtmlResponse = {
+ chunks: Array<ChunkedContent>;
+};
+
export type StripeInvoice = {
created_at: string;
hosted_invoice_url: string;
@@ -3310,6 +3340,15 @@ export type SearchChunksData = {
export type SearchChunksResponse = (SearchResponseTypes);
+export type SplitHtmlContentData = {
+ /**
+ * JSON request payload to split HTML content into chunks
+ */
+ requestBody: ChunkHtmlContentReqPayload;
+};
+
+export type SplitHtmlContentResponse = (SplitHtmlResponse);
+
export type GetSuggestedQueriesData = {
/**
* JSON request payload to get alternative suggested queries
@@ -4653,6 +4692,21 @@ export type $OpenApiTs = {
};
};
};
+ '/api/chunk/split': {
+ post: {
+ req: SplitHtmlContentData;
+ res: {
+ /**
+ * This will be a JSON response of the chunks split from the HTML content with the headings and body
+ */
+ 200: SplitHtmlResponse;
+ /**
+ * Payload too large, if the HTML content is greater than 256Kb
+ */
+ 413: ErrorResponseBody;
+ };
+ };
+ };
'/api/chunk/suggestions': {
post: {
req: GetSuggestedQueriesData;
diff --git a/server/src/bin/crawl-worker.rs b/server/src/bin/crawl-worker.rs
index 3d7005110..b54fc10fe 100644
--- a/server/src/bin/crawl-worker.rs
+++ b/server/src/bin/crawl-worker.rs
@@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl(
}
}
- let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options);
+ let chunked_html = chunk_html(
+ &page_html.clone(),
+ scrape_request.crawl_options.heading_remove_strings.clone(),
+ scrape_request.crawl_options.body_remove_strings.clone(),
+ );
for chunk in chunked_html {
let heading = chunk.0.last().unwrap_or(&String::new()).clone();
diff --git a/server/src/handlers/chunk_handler.rs b/server/src/handlers/chunk_handler.rs
index 6a7c495b0..18be021b8 100644
--- a/server/src/handlers/chunk_handler.rs
+++ b/server/src/handlers/chunk_handler.rs
@@ -14,6 +14,7 @@ use crate::middleware::api_version::APIVersion;
use crate::operators::chunk_operator::get_metadata_from_id_query;
use crate::operators::chunk_operator::*;
use crate::operators::clickhouse_operator::{get_latency_from_header, ClickHouseEvent, EventQueue};
+use crate::operators::crawl_operator;
use crate::operators::dataset_operator::{
get_dataset_usage_query, ChunkDeleteMessage, DeleteMessage,
};
@@ -2820,6 +2821,97 @@ pub async fn generate_off_chunks(
.streaming(completion_stream))
}
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+#[schema(example = json!({
+ "chunk_html": "",
+ "heading_remove_strings": ["###", "##", "#"],
+ "body_remove_strings": ["Warning:", "Note:"]
+}))]
+pub struct ChunkHtmlContentReqPayload {
+ /// The HTML content to be split into chunks
+ pub chunk_html: String,
+ /// Text strings to remove from headings when creating chunks for each page
+ pub heading_remove_strings: Option<Vec<String>>,
+ /// Text strings to remove from body when creating chunks for each page
+ pub body_remove_strings: Option<Vec<String>>,
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+#[schema(example = json!({
+ "chunks": [
+ {
+ "headings": ["Title Heading", "Sub Heading 1", "Sub Sub Heading 1"],
+ "body": "This is the body of the content"
+ },
+ {
+ "headings": ["Title Heading", "Sub Heading 1", "Sub Sub Heading 2"],
+ "body": "This is the body of the content"
+ }
+ // ...
+ ]
+}))]
+pub struct SplitHtmlResponse {
+ pub chunks: Vec<ChunkedContent>,
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+#[schema(example = json!({
+ "headings": ["Title Heading", "Sub Heading 1", "Last SubHeading"],
+ "body": "This is the body of the content"
+}))]
+pub struct ChunkedContent {
+ /// The headings of the content in order of when they appear
+ pub headings: Vec<String>,
+ /// The body of the content
+ pub body: String,
+}
+
+/// Split HTML Content into Chunks
+///
+/// This endpoint receives a single html string and splits it into chunks based on the headings and
+/// body content. The headings are split based on heading html tags. chunk_html has a maximum size
+/// of 256Kb.
+#[utoipa::path(
+ post,
+ path = "/chunk/split",
+ context_path = "/api",
+ tag = "Chunk",
+ request_body(content = ChunkHtmlContentReqPayload, description = "JSON request payload to split HTML content into chunks", content_type = "application/json"),
+ responses(
+ (
+ status = 200, description = "This will be a JSON response of the chunks split from the HTML content with the headings and body",
+ body = SplitHtmlResponse,
+ ),
+ (
+ status = 413, description = "Payload too large, if the HTML content is greater than 256Kb",
+ body = ErrorResponseBody,
+ ),
+ ),
+)]
+#[tracing::instrument]
+pub async fn split_html_content(
+ body: web::Json<ChunkHtmlContentReqPayload>,
+) -> Result<HttpResponse, ServiceError> {
+ if body.chunk_html.bytes().len() >= 262_144 {
+ return Err(ServiceError::PayloadTooLarge(
+ "The HTML content is too large".to_string(),
+ ));
+ }
+
+ let chunked_content = crawl_operator::chunk_html(
+ &body.chunk_html,
+ body.heading_remove_strings.clone(),
+ body.body_remove_strings.clone(),
+ );
+
+ Ok(HttpResponse::Ok().json(SplitHtmlResponse {
+ chunks: chunked_content
+ .into_iter()
+ .map(|(headings, body)| ChunkedContent { headings, body })
+ .collect(),
+ }))
+}
+
pub fn check_completion_param_validity(
temperature: Option<f32>,
frequency_penalty: Option<f32>,
diff --git a/server/src/lib.rs b/server/src/lib.rs
index f01e084ea..be61eb65f 100644
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@@ -175,6 +175,7 @@ impl Modify for SecurityAddon {
handlers::chunk_handler::create_chunk,
handlers::chunk_handler::update_chunk,
handlers::chunk_handler::delete_chunk,
+ handlers::chunk_handler::split_html_content,
handlers::chunk_handler::get_recommended_chunks,
handlers::chunk_handler::update_chunk_by_tracking_id,
handlers::chunk_handler::search_chunks,
@@ -274,6 +275,9 @@ impl Modify for SecurityAddon {
handlers::chunk_handler::SearchResponseTypes,
handlers::chunk_handler::CreateBatchChunkReqPayload,
handlers::chunk_handler::SingleQueuedChunkResponse,
+ handlers::chunk_handler::ChunkHtmlContentReqPayload,
+ handlers::chunk_handler::SplitHtmlResponse,
+ handlers::chunk_handler::ChunkedContent,
handlers::chunk_handler::BatchQueuedChunkResponse,
handlers::chunk_handler::ReturnQueuedChunk,
handlers::chunk_handler::RecommendChunksResponseBody,
@@ -929,6 +933,11 @@ pub fn main() -> std::io::Result<()> {
.route(web::put().to(handlers::chunk_handler::update_chunk))
.route(web::delete().to(handlers::chunk_handler::bulk_delete_chunk)),
)
+ .service(
+ web::resource("split").route(
+ web::post().to(handlers::chunk_handler::split_html_content),
+ ),
+ )
.service(web::resource("/recommend").route(
web::post().to(handlers::chunk_handler::get_recommended_chunks),
)
diff --git a/server/src/operators/crawl_operator.rs b/server/src/operators/crawl_operator.rs
index 753845f17..7415ec6c1 100644
--- a/server/src/operators/crawl_operator.rs
+++ b/server/src/operators/crawl_operator.rs
@@ -602,7 +602,11 @@ pub fn get_tags(url: String) -> Vec<String> {
Vec::new()
}
-pub fn chunk_html(html: &str, crawl_options: &CrawlOptions) -> Vec<(Vec<String>, String)> {
+pub fn chunk_html(
+ html: &str,
heading_remove_strings: Option<Vec<String>>,
body_remove_strings: Option<Vec<String>>,
+) -> Vec<(Vec<String>, String)> {
let re = Regex::new(r"(?i)").unwrap();
let mut chunks = Vec::new();
let mut current_chunk = String::new();
@@ -656,14 +660,14 @@ pub fn chunk_html(html: &str, crawl_options: &CrawlOptions) -> Vec<(Vec<String>, String)> {
let mut headings_text = headings.last().unwrap_or(&String::new()).clone();
let mut content = content.clone();
- if let Some(heading_remove_strings) = &crawl_options.heading_remove_strings {
+ if let Some(heading_remove_strings) = &heading_remove_strings {
heading_remove_strings.iter().for_each(|remove_string| {
headings_text = headings_text.replace(remove_string, "");
});
headings.pop();
headings.push(headings_text);
}
- if let Some(body_remove_strings) = &crawl_options.body_remove_strings {
+ if let Some(body_remove_strings) = &body_remove_strings {
body_remove_strings.iter().for_each(|remove_string| {
content = content.replace(remove_string, "");
});