Skip to content

Commit

Permalink
feature: expose custom html based splitter
Browse files Browse the repository at this point in the history
  • Loading branch information
cdxker committed Nov 13, 2024
1 parent c94e26d commit b9a3cf3
Show file tree
Hide file tree
Showing 7 changed files with 339 additions and 4 deletions.
145 changes: 145 additions & 0 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1420,6 +1420,49 @@
]
}
},
"/api/chunk/split": {
"post": {
"tags": [
"Chunk"
],
"summary": "Split HTML Content into Chunks",
"description": "This endpoint receives a single HTML string and splits it into chunks based on the headings and\nbody content. The headings are split based on heading HTML tags. chunk_html has a maximum size\nof 256Kb.",
"operationId": "split_html_content",
"requestBody": {
"description": "JSON request payload containing the HTML content to split into chunks",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChunkHtmlContentReqPayload"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "This will be a JSON response of the chunks split from the HTML content with the headings and body",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SplitHtmlResponse"
}
}
}
},
"413": {
"description": "Payload too large, if the HTML content is greater than 256Kb",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponseBody"
}
}
}
}
}
}
},
"/api/chunk/suggestions": {
"post": {
"tags": [
Expand Down Expand Up @@ -6994,6 +7037,46 @@
}
]
},
"ChunkHtmlContentReqPayload": {
"type": "object",
"required": [
"chunk_html"
],
"properties": {
"body_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from body when creating chunks for each page",
"nullable": true
},
"chunk_html": {
"type": "string",
"description": "The HTML content to be split into chunks"
},
"heading_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from headings when creating chunks for each page",
"nullable": true
}
},
"example": {
"body_remove_strings": [
"Warning:",
"Note:"
],
"chunk_html": "",
"heading_remove_strings": [
"###",
"##",
"#"
]
}
},
"ChunkMetadata": {
"type": "object",
"title": "V2",
Expand Down Expand Up @@ -7487,6 +7570,34 @@
}
}
},
"ChunkedContent": {
"type": "object",
"required": [
"headings",
"body"
],
"properties": {
"body": {
"type": "string",
"description": "The body of the content"
},
"headings": {
"type": "array",
"items": {
"type": "string"
},
"description": "The headings of the content in order of when they appear"
}
},
"example": {
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Last SubHeading"
]
}
},
"ClickhouseRagTypes": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -14388,6 +14499,40 @@
"asc"
]
},
"SplitHtmlResponse": {
"type": "object",
"required": [
"chunks"
],
"properties": {
"chunks": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChunkedContent"
}
}
},
"example": {
"chunks": [
{
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Sub Sub Heading 1"
]
},
{
"body": "This is the body of the content",
"headings": [
"Title Heading",
"Sub Heading 1",
"Sub Sub Heading 2"
]
}
]
}
},
"StripeInvoice": {
"type": "object",
"required": [
Expand Down
27 changes: 27 additions & 0 deletions clients/ts-sdk/src/functions/chunks/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import {
AutocompleteReqPayload,
ChunkHtmlContentReqPayload,
CountChunksReqPayload,
CreateChunkReqPayloadEnum,
DeleteChunkByTrackingIdData,
Expand Down Expand Up @@ -584,3 +585,29 @@ export async function getChunksByTrackingIds(
signal
);
}

/**
 * Function that splits a single HTML string into chunks based on its headings and body content. Headings are split on heading HTML tags; each returned chunk carries its `headings` (in the order they appear) and its `body`. Optionally pass `heading_remove_strings` and `body_remove_strings` to strip text from the headings and bodies of the resulting chunks. `chunk_html` has a maximum size of 256Kb; the API responds with a 413 error when the HTML content is larger than that.
 *
 * Example:
 * ```js
 *const data = await trieve.splitChunks({
 *  chunk_html: "<p>Some HTML content</p>",
 *});
 * ```
 */
export async function splitChunks(
  /** @hidden */
  this: TrieveSDK,
  props: ChunkHtmlContentReqPayload,
  signal?: AbortSignal
) {
  // Thin wrapper: forwards the payload unchanged to POST /api/chunk/split.
  return this.trieve.fetch(
    "/api/chunk/split",
    "post",
    {
      data: props,
    },
    signal
  );
}
54 changes: 54 additions & 0 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,21 @@ export type ChunkGroupAndFileId = {

export type ChunkGroups = Array<ChunkGroup>;

export type ChunkHtmlContentReqPayload = {
  /** Text strings to strip from the body when creating chunks for each page. */
  body_remove_strings?: string[] | null;
  /** HTML content that will be split into chunks. */
  chunk_html: string;
  /** Text strings to strip from the headings when creating chunks for each page. */
  heading_remove_strings?: string[] | null;
};

export type ChunkMetadata = {
/**
* HTML content of the chunk, can also be an arbitrary string which is not HTML
Expand Down Expand Up @@ -391,6 +406,17 @@ export type ChunkWithPosition = {
position: number;
};

export type ChunkedContent = {
  /** Body text of the content. */
  body: string;
  /** Headings of the content, in the order in which they appear. */
  headings: string[];
};

export type ClickhouseRagTypes = 'chosen_chunks' | 'all_chunks';

export type ClickhouseRecommendationTypes = 'Chunk' | 'Group';
Expand Down Expand Up @@ -2616,6 +2642,10 @@ export type SortOptions = {

export type SortOrder = 'desc' | 'asc';

/**
 * Chunks split from the HTML content, each with its headings and body.
 */
export type SplitHtmlResponse = {
  chunks: ChunkedContent[];
};

export type StripeInvoice = {
created_at: string;
hosted_invoice_url: string;
Expand Down Expand Up @@ -3310,6 +3340,15 @@ export type SearchChunksData = {

export type SearchChunksResponse = (SearchResponseTypes);

export type SplitHtmlContentData = {
  /**
   * JSON request payload containing the HTML content to split into chunks
   */
  requestBody: ChunkHtmlContentReqPayload;
};

/**
 * Successful response type for the split HTML content operation.
 */
export type SplitHtmlContentResponse = SplitHtmlResponse;

export type GetSuggestedQueriesData = {
/**
* JSON request payload to get alternative suggested queries
Expand Down Expand Up @@ -4653,6 +4692,21 @@ export type $OpenApiTs = {
};
};
};
'/api/chunk/split': {
post: {
req: SplitHtmlContentData;
res: {
/**
* This will be a JSON response of the chunks split from the HTML content with the headings and body
*/
200: SplitHtmlResponse;
/**
* Payload too large, if the HTML content is greater than 256Kb
*/
413: ErrorResponseBody;
};
};
};
'/api/chunk/suggestions': {
post: {
req: GetSuggestedQueriesData;
Expand Down
6 changes: 5 additions & 1 deletion server/src/bin/crawl-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,11 @@ async fn get_chunks_with_firecrawl(
}
}

let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options);
let chunked_html = chunk_html(
&page_html.clone(),
scrape_request.crawl_options.heading_remove_strings.clone(),
scrape_request.crawl_options.body_remove_strings.clone(),
);

for chunk in chunked_html {
let heading = chunk.0.last().unwrap_or(&String::new()).clone();
Expand Down
Loading

0 comments on commit b9a3cf3

Please sign in to comment.