Skip to content

Commit d8ad642

Browse files
committed
feature: expose custom html based splitter
1 parent c94e26d commit d8ad642

File tree

4 files changed

+113
-4
lines changed

4 files changed

+113
-4
lines changed

server/src/bin/crawl-worker.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl(
452452
}
453453
}
454454

455-
let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options);
455+
let chunked_html = chunk_html(
456+
&page_html.clone(),
457+
scrape_request.crawl_options.heading_remove_strings.clone(),
458+
scrape_request.crawl_options.body_remove_strings.clone(),
459+
);
456460

457461
for chunk in chunked_html {
458462
let heading = chunk.0.last().unwrap_or(&String::new()).clone();

server/src/handlers/chunk_handler.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use crate::middleware::api_version::APIVersion;
1414
use crate::operators::chunk_operator::get_metadata_from_id_query;
1515
use crate::operators::chunk_operator::*;
1616
use crate::operators::clickhouse_operator::{get_latency_from_header, ClickHouseEvent, EventQueue};
17+
use crate::operators::crawl_operator;
1718
use crate::operators::dataset_operator::{
1819
get_dataset_usage_query, ChunkDeleteMessage, DeleteMessage,
1920
};
@@ -2820,6 +2821,97 @@ pub async fn generate_off_chunks(
28202821
.streaming(completion_stream))
28212822
}
28222823

2824+
/// Request payload for `POST /api/chunk/split`: a raw HTML string to be split
/// into heading/body chunks, plus optional strings to strip from the output.
#[derive(Debug, Serialize, Deserialize, ToSchema)]
#[schema(example = json!({
    "chunk_html": "",
    "heading_remove_strings": ["###", "##", "#"],
    "body_remove_strings": ["Warning:", "Note:"]
}))]
pub struct ChunkHtmlContentReqPayload {
    /// The HTML content to be split into chunks
    pub chunk_html: String,
    /// Text strings to remove from headings when creating chunks for each page
    pub heading_remove_strings: Option<Vec<String>>,
    /// Text strings to remove from body when creating chunks for each page
    pub body_remove_strings: Option<Vec<String>>,
}
2838+
2839+
/// Response body for `POST /api/chunk/split`: the ordered list of chunks
/// produced from the submitted HTML.
#[derive(Debug, Serialize, Deserialize, ToSchema)]
#[schema(example = json!({
    "chunks": [
        {
            "headings": ["Title Heading", "Sub Heading 1", "Sub Sub Heading 1"],
            "body": "This is the body of the content"
        },
        {
            "headings": ["Title Heading", "Sub Heading 1", "Sub Sub Heading 2"],
            "body": "This is the body of the content"
        }
        // ...
    ]
}))]
pub struct SplitHtmlResponse {
    // One entry per (headings, body) pair returned by crawl_operator::chunk_html.
    pub chunks: Vec<ChunkedContent>,
}
2856+
2857+
/// A single chunk of split HTML: the heading trail leading to the content,
/// plus the body text itself.
#[derive(Debug, Serialize, Deserialize, ToSchema)]
#[schema(example = json!({
    "headings": ["Title Heading", "Sub Heading 1", "Last SubHeading"],
    "body": "This is the body of the content"
}))]
pub struct ChunkedContent {
    /// The headings of the content in order of when they appear
    pub headings: Vec<String>,
    /// The body of the content
    pub body: String,
}
2868+
2869+
/// Split HTML Content into Chunks
2870+
///
2871+
/// This endpoint receives a single html string and splits it into chunks based on the headings and
2872+
/// body content. The headings are split based on headding html tags. chunk_html has a maximum size
2873+
/// of 256Kb.
2874+
#[utoipa::path(
2875+
post,
2876+
path = "/chunk/split",
2877+
context_path = "/api",
2878+
tag = "Chunk",
2879+
request_body(content = ChunkHtmlContentReqPayload, description = "JSON request payload to perform RAG on some chunks (chunks)", content_type = "application/json"),
2880+
responses(
2881+
(
2882+
status = 200, description = "This will be a JSON response of the chunks split from the HTML content with the headings and body",
2883+
body = SplitHtmlResponse,
2884+
),
2885+
(
2886+
status = 413, description = "Payload too large, if the HTML contnet is greater than 256Kb",
2887+
body = ErrorResponseBody,
2888+
),
2889+
),
2890+
)]
2891+
#[tracing::instrument]
2892+
pub async fn split_html_content(
2893+
body: web::Json<ChunkHtmlContentReqPayload>,
2894+
) -> Result<HttpResponse, ServiceError> {
2895+
if body.chunk_html.bytes().len() >= 262_144 {
2896+
return Err(ServiceError::PayloadTooLarge(
2897+
"The HTML content is too large".to_string(),
2898+
));
2899+
}
2900+
2901+
let chunked_content = crawl_operator::chunk_html(
2902+
&body.chunk_html,
2903+
body.heading_remove_strings.clone(),
2904+
body.body_remove_strings.clone(),
2905+
);
2906+
2907+
Ok(HttpResponse::Ok().json(SplitHtmlResponse {
2908+
chunks: chunked_content
2909+
.into_iter()
2910+
.map(|(headings, body)| ChunkedContent { headings, body })
2911+
.collect(),
2912+
}))
2913+
}
2914+
28232915
pub fn check_completion_param_validity(
28242916
temperature: Option<f32>,
28252917
frequency_penalty: Option<f32>,

server/src/lib.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ impl Modify for SecurityAddon {
175175
handlers::chunk_handler::create_chunk,
176176
handlers::chunk_handler::update_chunk,
177177
handlers::chunk_handler::delete_chunk,
178+
handlers::chunk_handler::split_html_content,
178179
handlers::chunk_handler::get_recommended_chunks,
179180
handlers::chunk_handler::update_chunk_by_tracking_id,
180181
handlers::chunk_handler::search_chunks,
@@ -274,6 +275,9 @@ impl Modify for SecurityAddon {
274275
handlers::chunk_handler::SearchResponseTypes,
275276
handlers::chunk_handler::CreateBatchChunkReqPayload,
276277
handlers::chunk_handler::SingleQueuedChunkResponse,
278+
handlers::chunk_handler::ChunkHtmlContentReqPayload,
279+
handlers::chunk_handler::SplitHtmlResponse,
280+
handlers::chunk_handler::ChunkedContent,
277281
handlers::chunk_handler::BatchQueuedChunkResponse,
278282
handlers::chunk_handler::ReturnQueuedChunk,
279283
handlers::chunk_handler::RecommendChunksResponseBody,
@@ -929,6 +933,11 @@ pub fn main() -> std::io::Result<()> {
929933
.route(web::put().to(handlers::chunk_handler::update_chunk))
930934
.route(web::delete().to(handlers::chunk_handler::bulk_delete_chunk)),
931935
)
936+
.service(
937+
web::resource("split").route(
938+
web::post().to(handlers::chunk_handler::split_html_content),
939+
),
940+
)
932941
.service(web::resource("/recommend").route(
933942
web::post().to(handlers::chunk_handler::get_recommended_chunks),
934943
)

server/src/operators/crawl_operator.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -602,7 +602,11 @@ pub fn get_tags(url: String) -> Vec<String> {
602602
Vec::new()
603603
}
604604

605-
pub fn chunk_html(html: &str, crawl_options: &CrawlOptions) -> Vec<(Vec<String>, String)> {
605+
pub fn chunk_html(
606+
html: &str,
607+
heading_remove_strings: Option<Vec<String>>,
608+
body_remove_strings: Option<Vec<String>>,
609+
) -> Vec<(Vec<String>, String)> {
606610
let re = Regex::new(r"(?i)<h[1-6].*?>").unwrap();
607611
let mut chunks = Vec::new();
608612
let mut current_chunk = String::new();
@@ -656,14 +660,14 @@ pub fn chunk_html(html: &str, crawl_options: &CrawlOptions) -> Vec<(Vec<String>,
656660
let mut headings_text = headings.last().unwrap_or(&String::new()).clone();
657661
let mut content = content.clone();
658662

659-
if let Some(heading_remove_strings) = &crawl_options.heading_remove_strings {
663+
if let Some(heading_remove_strings) = &heading_remove_strings {
660664
heading_remove_strings.iter().for_each(|remove_string| {
661665
headings_text = headings_text.replace(remove_string, "");
662666
});
663667
headings.pop();
664668
headings.push(headings_text);
665669
}
666-
if let Some(body_remove_strings) = &crawl_options.body_remove_strings {
670+
if let Some(body_remove_strings) = &body_remove_strings {
667671
body_remove_strings.iter().for_each(|remove_string| {
668672
content = content.replace(remove_string, "");
669673
});

0 commit comments

Comments
 (0)