diff --git a/clients/ts-sdk/openapi.json b/clients/ts-sdk/openapi.json index 453d39f48..b0ce87fdc 100644 --- a/clients/ts-sdk/openapi.json +++ b/clients/ts-sdk/openapi.json @@ -12,7 +12,7 @@ "name": "BSL", "url": "https://github.com/devflowinc/trieve/blob/main/LICENSE.txt" }, - "version": "0.11.13" + "version": "0.12.0" }, "servers": [ { @@ -86,8 +86,8 @@ "tags": [ "Analytics" ], - "summary": "Send Event Data", - "description": "This route allows you to send event data to the system.", + "summary": "Send User Event Data", + "description": "This route allows you to send user event data to the system.", "operationId": "send_event_data", "parameters": [ { @@ -141,8 +141,8 @@ "tags": [ "Analytics" ], - "summary": "Get All Events", - "description": "This route allows you to view all events.", + "summary": "Get All User Events", + "description": "This route allows you to view all user events.", "operationId": "get_all_events", "requestBody": { "description": "JSON request payload to filter the events", @@ -253,8 +253,8 @@ "tags": [ "Analytics" ], - "summary": "Get Event By ID", - "description": "This route allows you to view an event by its ID.", + "summary": "Get User Event By ID", + "description": "This route allows you to view a user event by its ID. You can pass in any type of event and get the details for that event.", "operationId": "get_event_by_id", "parameters": [ { @@ -6640,6 +6640,7 @@ "type": "string", "enum": [ "search", + "rag", "recommendation" ] }, @@ -7662,6 +7663,14 @@ "description": "Option for allowing the crawl to follow links to external websites.", "nullable": true }, + "body_remove_strings": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Text strings to remove from body when creating chunks for each page", + "nullable": true + }, "boost_titles": { "type": "boolean", "description": "Boost titles such that keyword matches in titles are prioritized in search results. Strongly recommended to leave this on. 
Defaults to true.", @@ -7683,6 +7692,14 @@ "description": "Specify the HTML tags, classes and ids to exclude from the response.", "nullable": true }, + "heading_remove_strings": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Text strings to remove from headings when creating chunks for each page", + "nullable": true + }, "ignore_sitemap": { "type": "boolean", "description": "Ignore the website sitemap when crawling, defaults to true.", @@ -7739,6 +7756,10 @@ } }, "example": { + "body_remove_strings": [ + "Edit on github" + ], + "boost_titles": true, "exclude_paths": [ "https://example.com/exclude*" ], @@ -7746,6 +7767,10 @@ "#ad", "#footer" ], + "heading_remove_strings": [ + "Advertisement", + "Sponsored" + ], "include_paths": [ "https://example.com/include*" ], @@ -8842,6 +8867,10 @@ "type": "string", "nullable": true }, + "request_type": { + "type": "string", + "nullable": true + }, "updated_at": { "type": "string" }, @@ -8941,9 +8970,12 @@ "description": "Any other metadata associated with the event", "nullable": true }, - "request_id": { - "type": "string", - "description": "The request id of the event to associate it with a request", + "request": { + "allOf": [ + { + "$ref": "#/components/schemas/RequestInfo" + } + ], "nullable": true }, "user_id": { @@ -8987,9 +9019,12 @@ "description": "Any other metadata associated with the event", "nullable": true }, - "request_id": { - "type": "string", - "description": "The request id of the event to associate it with a request", + "request": { + "allOf": [ + { + "$ref": "#/components/schemas/RequestInfo" + } + ], "nullable": true }, "user_id": { @@ -9025,9 +9060,12 @@ "description": "Whether the event is a conversion event", "nullable": true }, - "request_id": { - "type": "string", - "description": "The request id of the event to associate it with a request", + "request": { + "allOf": [ + { + "$ref": "#/components/schemas/RequestInfo" + } + ], "nullable": true }, "user_id": { @@ -9072,9 
+9110,12 @@ }, "description": "The items that were purchased" }, - "request_id": { - "type": "string", - "description": "The request id of the event to associate it with a request", + "request": { + "allOf": [ + { + "$ref": "#/components/schemas/RequestInfo" + } + ], "nullable": true }, "user_id": { @@ -9120,9 +9161,12 @@ "type": "string" } }, - "request_id": { - "type": "string", - "description": "The request id of the event to associate it with a request", + "request": { + "allOf": [ + { + "$ref": "#/components/schemas/RequestInfo" + } + ], "nullable": true }, "user_id": { @@ -11864,6 +11908,22 @@ } } }, + "RequestInfo": { + "type": "object", + "required": [ + "request_type", + "request_id" + ], + "properties": { + "request_id": { + "type": "string", + "format": "uuid" + }, + "request_type": { + "$ref": "#/components/schemas/CTRType" + } + } + }, "ReturnQueuedChunk": { "oneOf": [ { diff --git a/clients/ts-sdk/package.json b/clients/ts-sdk/package.json index e388e2f93..e8dcffa6e 100644 --- a/clients/ts-sdk/package.json +++ b/clients/ts-sdk/package.json @@ -6,7 +6,7 @@ "files": [ "dist" ], - "version": "0.0.12", + "version": "0.0.13", "license": "MIT", "scripts": { "lint": "eslint 'src/**/*.ts'", diff --git a/clients/ts-sdk/src/types.gen.ts b/clients/ts-sdk/src/types.gen.ts index 626a88994..00a31ab47 100644 --- a/clients/ts-sdk/src/types.gen.ts +++ b/clients/ts-sdk/src/types.gen.ts @@ -160,7 +160,7 @@ export type CTRSearchQueryWithoutClicksResponse = { queries: Array; }; -export type CTRType = 'search' | 'recommendation'; +export type CTRType = 'search' | 'rag' | 'recommendation'; export type ChatMessageProxy = { content: string; @@ -446,6 +446,10 @@ export type CrawlOptions = { * Option for allowing the crawl to follow links to external websites. 
*/ allow_external_links?: (boolean) | null; + /** + * Text strings to remove from body when creating chunks for each page + */ + body_remove_strings?: Array<(string)> | null; /** * Boost titles such that keyword matches in titles are prioritized in search results. Strongly recommended to leave this on. Defaults to true. */ @@ -458,6 +462,10 @@ export type CrawlOptions = { * Specify the HTML tags, classes and ids to exclude from the response. */ exclude_tags?: Array<(string)> | null; + /** + * Text strings to remove from headings when creating chunks for each page + */ + heading_remove_strings?: Array<(string)> | null; /** * Ignore the website sitemap when crawling, defaults to true. */ @@ -864,6 +872,7 @@ export type EventData = { items: Array<(string)>; metadata?: unknown; request_id?: (string) | null; + request_type?: (string) | null; updated_at: string; user_id?: (string) | null; }; @@ -890,10 +899,7 @@ export type EventTypes = { * Any other metadata associated with the event */ metadata?: unknown; - /** - * The request id of the event to associate it with a request - */ - request_id?: (string) | null; + request?: ((RequestInfo) | null); /** * The user id of the user who viewed the items */ @@ -916,10 +922,7 @@ export type EventTypes = { * Any other metadata associated with the event */ metadata?: unknown; - /** - * The request id of the event to associate it with a request - */ - request_id?: (string) | null; + request?: ((RequestInfo) | null); /** * The user id of the user who added the items to the cart */ @@ -935,10 +938,7 @@ export type EventTypes = { * Whether the event is a conversion event */ is_conversion?: (boolean) | null; - /** - * The request id of the event to associate it with a request - */ - request_id?: (string) | null; + request?: ((RequestInfo) | null); /** * The user id of the user who clicked the items */ @@ -961,10 +961,7 @@ export type EventTypes = { * The items that were purchased */ items: Array<(string)>; - /** - * The request id of 
the event to associate it with a request - */ - request_id?: (string) | null; + request?: ((RequestInfo) | null); /** * The user id of the user who purchased the items */ @@ -989,10 +986,7 @@ export type EventTypes = { items: { [key: string]: (string); }; - /** - * The request id of the event to associate it with a request - */ - request_id?: (string) | null; + request?: ((RequestInfo) | null); /** * The user id of the user who clicked the items */ @@ -1843,6 +1837,11 @@ export type RemoveChunkFromGroupReqPayload = { chunk_id: string; }; +export type RequestInfo = { + request_id: string; + request_type: CTRType; +}; + export type ReturnQueuedChunk = SingleQueuedChunkResponse | BatchQueuedChunkResponse; export type RoleProxy = 'system' | 'user' | 'assistant'; diff --git a/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx b/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx index 40386faea..f5ce0d25c 100644 --- a/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx +++ b/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx @@ -492,12 +492,12 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
-
Include Tags
+
Include Query Selectors
{ setOptions("include_tags", value); }} @@ -506,12 +506,12 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
-
Exclude Tags
+
Exclude Query Selectors
{ setOptions("exclude_tags", value); }} @@ -519,6 +519,32 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => { />
+
+
Heading Remove Strings
+ { + setOptions("heading_remove_strings", value); + }} + value={options.heading_remove_strings || []} + /> + +
+
+
Body Remove Strings
+ { + setOptions("body_remove_strings", value); + }} + value={options.body_remove_strings || []} + /> + +
diff --git a/server/src/bin/crawl-worker.rs b/server/src/bin/crawl-worker.rs index 490ce8b75..91f86b9ec 100644 --- a/server/src/bin/crawl-worker.rs +++ b/server/src/bin/crawl-worker.rs @@ -81,6 +81,22 @@ fn create_chunk_req_payload( ) -> Result { let image_urls: Vec = product.images.iter().map(|img| img.src.clone()).collect(); + let mut product_title = product.title.clone(); + let mut variant_title = variant.title.clone(); + let mut product_body_html = product.body_html.clone(); + + if let Some(heading_remove_strings) = &scrape_request.crawl_options.heading_remove_strings { + heading_remove_strings.iter().for_each(|remove_string| { + product_title = product_title.replace(remove_string, ""); + variant_title = variant_title.replace(remove_string, ""); + }); + } + if let Some(body_remove_strings) = &scrape_request.crawl_options.body_remove_strings { + body_remove_strings.iter().for_each(|remove_string| { + product_body_html = product_body_html.replace(remove_string, ""); + }); + } + let link = format!( "{}/products/{}?variant={}", base_url, product.handle, variant.id @@ -475,7 +491,7 @@ async fn get_chunks_with_firecrawl( } } - let chunked_html = chunk_html(&page_html.clone()); + let chunked_html = chunk_html(&page_html.clone(), &scrape_request.crawl_options); for chunk in chunked_html { let heading = chunk.0.clone(); diff --git a/server/src/data/models.rs b/server/src/data/models.rs index 69a6b4473..99645a943 100644 --- a/server/src/data/models.rs +++ b/server/src/data/models.rs @@ -6580,6 +6580,9 @@ impl From for CrawlRequestPG { "max_depth": 10, "include_tags": ["h1", "p", "a", ".main-content"], "exclude_tags": ["#ad", "#footer"], + "heading_remove_strings": ["Advertisement", "Sponsored"], + "body_remove_strings": ["Edit on github"], + "boost_titles": true, }))] pub struct CrawlOptions { /// The URL to crawl @@ -6604,6 +6607,10 @@ pub struct CrawlOptions { pub allow_external_links: Option, /// Ignore the website sitemap when crawling, defaults to true. 
pub ignore_sitemap: Option, + /// Text strings to remove from headings when creating chunks for each page + pub heading_remove_strings: Option>, + /// Text strings to remove from body when creating chunks for each page + pub body_remove_strings: Option>, /// Options for including an openapi spec in the crawl pub scrape_options: Option, } @@ -6643,6 +6650,14 @@ impl CrawlOptions { boost_titles: self.boost_titles.or(other.boost_titles), scrape_options: self.scrape_options.clone(), allow_external_links: self.allow_external_links.or(other.allow_external_links), + heading_remove_strings: self + .heading_remove_strings + .clone() + .or(other.heading_remove_strings.clone()), + body_remove_strings: self + .body_remove_strings + .clone() + .or(other.body_remove_strings.clone()), } } } diff --git a/server/src/operators/crawl_operator.rs b/server/src/operators/crawl_operator.rs index 3268efa84..0557f48e6 100644 --- a/server/src/operators/crawl_operator.rs +++ b/server/src/operators/crawl_operator.rs @@ -587,7 +587,7 @@ pub fn get_tags(url: String) -> Vec { Vec::new() } -pub fn chunk_html(html: &str) -> Vec<(String, String)> { +pub fn chunk_html(html: &str, crawl_options: &CrawlOptions) -> Vec<(String, String)> { let re = Regex::new(r"(?i)").unwrap(); let mut chunks = Vec::new(); let mut current_chunk = String::new(); @@ -651,6 +651,27 @@ pub fn chunk_html(html: &str) -> Vec<(String, String)> { chunks.push((headings_text, last_short_chunk)); } + chunks = chunks + .into_iter() + .map(|(headings_text, content)| { + let mut headings_text = headings_text.clone(); + let mut content = content.clone(); + + if let Some(heading_remove_strings) = &crawl_options.heading_remove_strings { + heading_remove_strings.iter().for_each(|remove_string| { + headings_text = headings_text.replace(remove_string, ""); + }); + } + if let Some(body_remove_strings) = &crawl_options.body_remove_strings { + body_remove_strings.iter().for_each(|remove_string| { + content = content.replace(remove_string, ""); 
+ }); + } + + (headings_text, content) + }) + .collect(); + chunks.retain(|(headings_text, content)| { !headings_text.trim().is_empty() && !content.trim().is_empty() });