Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: add the ability to remove strings from titles and bodies during crawl #2612

Merged
merged 1 commit into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 82 additions & 22 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"name": "BSL",
"url": "https://github.com/devflowinc/trieve/blob/main/LICENSE.txt"
},
"version": "0.11.13"
"version": "0.12.0"
},
"servers": [
{
Expand Down Expand Up @@ -86,8 +86,8 @@
"tags": [
"Analytics"
],
"summary": "Send Event Data",
"description": "This route allows you to send event data to the system.",
"summary": "Send User Event Data",
"description": "This route allows you to send user event data to the system.",
"operationId": "send_event_data",
"parameters": [
{
Expand Down Expand Up @@ -141,8 +141,8 @@
"tags": [
"Analytics"
],
"summary": "Get All Events",
"description": "This route allows you to view all events.",
"summary": "Get All User Events",
"description": "This route allows you to view all user events.",
"operationId": "get_all_events",
"requestBody": {
"description": "JSON request payload to filter the events",
Expand Down Expand Up @@ -253,8 +253,8 @@
"tags": [
"Analytics"
],
"summary": "Get Event By ID",
"description": "This route allows you to view an event by its ID.",
"summary": "Get User Event By ID",
"description": "This route allows you to view a user event by its ID. You can pass in any type of event and get the details for that event.",
"operationId": "get_event_by_id",
"parameters": [
{
Expand Down Expand Up @@ -6640,6 +6640,7 @@
"type": "string",
"enum": [
"search",
"rag",
"recommendation"
]
},
Expand Down Expand Up @@ -7662,6 +7663,14 @@
"description": "Option for allowing the crawl to follow links to external websites.",
"nullable": true
},
"body_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from body when creating chunks for each page",
"nullable": true
},
"boost_titles": {
"type": "boolean",
"description": "Boost titles such that keyword matches in titles are prioritized in search results. Strongly recommended to leave this on. Defaults to true.",
Expand All @@ -7683,6 +7692,14 @@
"description": "Specify the HTML tags, classes and ids to exclude from the response.",
"nullable": true
},
"heading_remove_strings": {
"type": "array",
"items": {
"type": "string"
},
"description": "Text strings to remove from headings when creating chunks for each page",
"nullable": true
},
"ignore_sitemap": {
"type": "boolean",
"description": "Ignore the website sitemap when crawling, defaults to true.",
Expand Down Expand Up @@ -7739,13 +7756,21 @@
}
},
"example": {
"body_remove_strings": [
"Edit on github"
],
"boost_titles": true,
"exclude_paths": [
"https://example.com/exclude*"
],
"exclude_tags": [
"#ad",
"#footer"
],
"heading_remove_strings": [
"Advertisement",
"Sponsored"
],
"include_paths": [
"https://example.com/include*"
],
Expand Down Expand Up @@ -8842,6 +8867,10 @@
"type": "string",
"nullable": true
},
"request_type": {
"type": "string",
"nullable": true
},
"updated_at": {
"type": "string"
},
Expand Down Expand Up @@ -8941,9 +8970,12 @@
"description": "Any other metadata associated with the event",
"nullable": true
},
"request_id": {
"type": "string",
"description": "The request id of the event to associate it with a request",
"request": {
"allOf": [
{
"$ref": "#/components/schemas/RequestInfo"
}
],
"nullable": true
},
"user_id": {
Expand Down Expand Up @@ -8987,9 +9019,12 @@
"description": "Any other metadata associated with the event",
"nullable": true
},
"request_id": {
"type": "string",
"description": "The request id of the event to associate it with a request",
"request": {
"allOf": [
{
"$ref": "#/components/schemas/RequestInfo"
}
],
"nullable": true
},
"user_id": {
Expand Down Expand Up @@ -9025,9 +9060,12 @@
"description": "Whether the event is a conversion event",
"nullable": true
},
"request_id": {
"type": "string",
"description": "The request id of the event to associate it with a request",
"request": {
"allOf": [
{
"$ref": "#/components/schemas/RequestInfo"
}
],
"nullable": true
},
"user_id": {
Expand Down Expand Up @@ -9072,9 +9110,12 @@
},
"description": "The items that were purchased"
},
"request_id": {
"type": "string",
"description": "The request id of the event to associate it with a request",
"request": {
"allOf": [
{
"$ref": "#/components/schemas/RequestInfo"
}
],
"nullable": true
},
"user_id": {
Expand Down Expand Up @@ -9120,9 +9161,12 @@
"type": "string"
}
},
"request_id": {
"type": "string",
"description": "The request id of the event to associate it with a request",
"request": {
"allOf": [
{
"$ref": "#/components/schemas/RequestInfo"
}
],
"nullable": true
},
"user_id": {
Expand Down Expand Up @@ -11864,6 +11908,22 @@
}
}
},
"RequestInfo": {
"type": "object",
"required": [
"request_type",
"request_id"
],
"properties": {
"request_id": {
"type": "string",
"format": "uuid"
},
"request_type": {
"$ref": "#/components/schemas/CTRType"
}
}
},
"ReturnQueuedChunk": {
"oneOf": [
{
Expand Down
2 changes: 1 addition & 1 deletion clients/ts-sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"files": [
"dist"
],
"version": "0.0.12",
"version": "0.0.13",
"license": "MIT",
"scripts": {
"lint": "eslint 'src/**/*.ts'",
Expand Down
41 changes: 20 additions & 21 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ export type CTRSearchQueryWithoutClicksResponse = {
queries: Array<SearchQueriesWithoutClicksCTRResponse>;
};

export type CTRType = 'search' | 'recommendation';
export type CTRType = 'search' | 'rag' | 'recommendation';

export type ChatMessageProxy = {
content: string;
Expand Down Expand Up @@ -446,6 +446,10 @@ export type CrawlOptions = {
* Option for allowing the crawl to follow links to external websites.
*/
allow_external_links?: (boolean) | null;
/**
* Text strings to remove from body when creating chunks for each page
*/
body_remove_strings?: Array<(string)> | null;
/**
* Boost titles such that keyword matches in titles are prioritized in search results. Strongly recommended to leave this on. Defaults to true.
*/
Expand All @@ -458,6 +462,10 @@ export type CrawlOptions = {
* Specify the HTML tags, classes and ids to exclude from the response.
*/
exclude_tags?: Array<(string)> | null;
/**
* Text strings to remove from headings when creating chunks for each page
*/
heading_remove_strings?: Array<(string)> | null;
/**
* Ignore the website sitemap when crawling, defaults to true.
*/
Expand Down Expand Up @@ -864,6 +872,7 @@ export type EventData = {
items: Array<(string)>;
metadata?: unknown;
request_id?: (string) | null;
request_type?: (string) | null;
updated_at: string;
user_id?: (string) | null;
};
Expand All @@ -890,10 +899,7 @@ export type EventTypes = {
* Any other metadata associated with the event
*/
metadata?: unknown;
/**
* The request id of the event to associate it with a request
*/
request_id?: (string) | null;
request?: ((RequestInfo) | null);
/**
* The user id of the user who viewed the items
*/
Expand All @@ -916,10 +922,7 @@ export type EventTypes = {
* Any other metadata associated with the event
*/
metadata?: unknown;
/**
* The request id of the event to associate it with a request
*/
request_id?: (string) | null;
request?: ((RequestInfo) | null);
/**
* The user id of the user who added the items to the cart
*/
Expand All @@ -935,10 +938,7 @@ export type EventTypes = {
* Whether the event is a conversion event
*/
is_conversion?: (boolean) | null;
/**
* The request id of the event to associate it with a request
*/
request_id?: (string) | null;
request?: ((RequestInfo) | null);
/**
* The user id of the user who clicked the items
*/
Expand All @@ -961,10 +961,7 @@ export type EventTypes = {
* The items that were purchased
*/
items: Array<(string)>;
/**
* The request id of the event to associate it with a request
*/
request_id?: (string) | null;
request?: ((RequestInfo) | null);
/**
* The user id of the user who purchased the items
*/
Expand All @@ -989,10 +986,7 @@ export type EventTypes = {
items: {
[key: string]: (string);
};
/**
* The request id of the event to associate it with a request
*/
request_id?: (string) | null;
request?: ((RequestInfo) | null);
/**
* The user id of the user who clicked the items
*/
Expand Down Expand Up @@ -1843,6 +1837,11 @@ export type RemoveChunkFromGroupReqPayload = {
chunk_id: string;
};

/**
 * Reference to the request that an event should be associated with.
 */
export type RequestInfo = {
/**
 * Id of the request to associate the event with (declared as a UUID-formatted string in the OpenAPI schema)
 */
request_id: string;
/**
 * Kind of request the id refers to: 'search', 'rag' or 'recommendation'
 */
request_type: CTRType;
};

export type ReturnQueuedChunk = SingleQueuedChunkResponse | BatchQueuedChunkResponse;

export type RoleProxy = 'system' | 'user' | 'assistant';
Expand Down
34 changes: 30 additions & 4 deletions frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -492,12 +492,12 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<Error error={errors.exclude_paths} />
</div>
<div class="">
<div>Include Tags</div>
<div>Include Query Selectors</div>
<MultiStringInput
disabled={isShopify()}
placeholder="h1..."
addClass="bg-magenta-100/40 text-sm px-2 rounded border border-magenta-300/40"
addLabel="Add Tag"
addLabel="Add Selector"
onChange={(value) => {
setOptions("include_tags", value);
}}
Expand All @@ -506,19 +506,45 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<Error error={errors.include_tags} />
</div>
<div class="">
<div>Exclude Tags</div>
<div>Exclude Query Selectors</div>
<MultiStringInput
disabled={isShopify()}
placeholder="button..."
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
addLabel="Add Tag"
addLabel="Add Selector"
onChange={(value) => {
setOptions("exclude_tags", value);
}}
value={options.exclude_tags || []}
/>
<Error error={errors.exclude_tags} />
</div>
<div class="">
<div>Heading Remove Strings</div>
<MultiStringInput
placeholder="#"
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
addLabel="Add Text"
onChange={(value) => {
setOptions("heading_remove_strings", value);
}}
value={options.heading_remove_strings || []}
/>
<Error error={errors.heading_remove_strings} />
</div>
<div class="">
<div>Body Remove Strings</div>
<MultiStringInput
placeholder="#"
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
addLabel="Add Text"
onChange={(value) => {
setOptions("body_remove_strings", value);
}}
value={options.body_remove_strings || []}
/>
<Error error={errors.body_remove_strings} />
</div>
</div>
<Spacer h={18} />
<div class="flex justify-end">
Expand Down
Loading
Loading