@@ -14,6 +14,7 @@ use crate::middleware::api_version::APIVersion;
1414use crate :: operators:: chunk_operator:: get_metadata_from_id_query;
1515use crate :: operators:: chunk_operator:: * ;
1616use crate :: operators:: clickhouse_operator:: { get_latency_from_header, ClickHouseEvent , EventQueue } ;
17+ use crate :: operators:: crawl_operator;
1718use crate :: operators:: dataset_operator:: {
1819 get_dataset_usage_query, ChunkDeleteMessage , DeleteMessage ,
1920} ;
@@ -2820,6 +2821,97 @@ pub async fn generate_off_chunks(
28202821 . streaming ( completion_stream) )
28212822}
28222823
2824+ #[ derive( Debug , Serialize , Deserialize , ToSchema ) ]
2825+ #[ schema( example = json!( {
2826+ "chunk_html" : "" ,
2827+ "heading_remove_strings" : [ "###" , "##" , "#" ] ,
2828+ "body_remove_strings" : [ "Warning:" , "Note:" ]
2829+ } ) ) ]
2830+ pub struct ChunkHtmlContentReqPayload {
2831+ /// The HTML content to be split into chunks
2832+ pub chunk_html : String ,
2833+ /// Text strings to remove from headings when creating chunks for each page
2834+ pub heading_remove_strings : Option < Vec < String > > ,
2835+ /// Text strings to remove from body when creating chunks for each page
2836+ pub body_remove_strings : Option < Vec < String > > ,
2837+ }
2838+
2839+ #[ derive( Debug , Serialize , Deserialize , ToSchema ) ]
2840+ #[ schema( example = json!( {
2841+ "chunks" : [
2842+ {
2843+ "headings" : [ "Title Heading" , "Sub Heading 1" , "Sub Sub Heading 1" ] ,
2844+ "body" : "This is the body of the content"
2845+ } ,
2846+ {
2847+ "headings" : [ "Title Heading" , "Sub Heading 1" , "Sub Sub Heading 2" ] ,
2848+ "body" : "This is the body of the content"
2849+ }
2850+ // ...
2851+ ]
2852+ } ) ) ]
2853+ pub struct SplitHtmlResponse {
2854+ pub chunks : Vec < ChunkedContent > ,
2855+ }
2856+
2857+ #[ derive( Debug , Serialize , Deserialize , ToSchema ) ]
2858+ #[ schema( example = json!( {
2859+ "headings" : [ "Title Heading" , "Sub Heading 1" , "Last SubHeading" ] ,
2860+ "body" : "This is the body of the content"
2861+ } ) ) ]
2862+ pub struct ChunkedContent {
2863+ /// The headings of the content in order of when they appear
2864+ pub headings : Vec < String > ,
2865+ /// The body of the content
2866+ pub body : String ,
2867+ }
2868+
2869+ /// Split HTML Content into Chunks
2870+ ///
2871+ /// This endpoint receives a single html string and splits it into chunks based on the headings and
2872+ /// body content. The headings are split based on headding html tags. chunk_html has a maximum size
2873+ /// of 256Kb.
2874+ #[ utoipa:: path(
2875+ post,
2876+ path = "/chunk/split" ,
2877+ context_path = "/api" ,
2878+ tag = "Chunk" ,
2879+ request_body( content = ChunkHtmlContentReqPayload , description = "JSON request payload to perform RAG on some chunks (chunks)" , content_type = "application/json" ) ,
2880+ responses(
2881+ (
2882+ status = 200 , description = "This will be a JSON response of the chunks split from the HTML content with the headings and body" ,
2883+ body = SplitHtmlResponse ,
2884+ ) ,
2885+ (
2886+ status = 413 , description = "Payload too large, if the HTML contnet is greater than 256Kb" ,
2887+ body = ErrorResponseBody ,
2888+ ) ,
2889+ ) ,
2890+ ) ]
2891+ #[ tracing:: instrument]
2892+ pub async fn split_html_content (
2893+ body : web:: Json < ChunkHtmlContentReqPayload > ,
2894+ ) -> Result < HttpResponse , ServiceError > {
2895+ if body. chunk_html . bytes ( ) . len ( ) >= 262_144 {
2896+ return Err ( ServiceError :: PayloadTooLarge (
2897+ "The HTML content is too large" . to_string ( ) ,
2898+ ) ) ;
2899+ }
2900+
2901+ let chunked_content = crawl_operator:: chunk_html (
2902+ & body. chunk_html ,
2903+ body. heading_remove_strings . clone ( ) ,
2904+ body. body_remove_strings . clone ( ) ,
2905+ ) ;
2906+
2907+ Ok ( HttpResponse :: Ok ( ) . json ( SplitHtmlResponse {
2908+ chunks : chunked_content
2909+ . into_iter ( )
2910+ . map ( |( headings, body) | ChunkedContent { headings, body } )
2911+ . collect ( ) ,
2912+ } ) )
2913+ }
2914+
28232915pub fn check_completion_param_validity (
28242916 temperature : Option < f32 > ,
28252917 frequency_penalty : Option < f32 > ,
0 commit comments