diff --git a/.actor/input_schema.json b/.actor/input_schema.json index 67dba0f..b529a26 100644 --- a/.actor/input_schema.json +++ b/.actor/input_schema.json @@ -145,6 +145,12 @@ "description": "If enabled, the Actor attempts to close or remove cookie consent dialogs to improve the quality of extracted text. Note that this setting increases the latency.", "default": true }, + "blockMedia": { + "title": "Block media resources", + "type": "boolean", + "description": "If enabled, the Actor will block loading of images, videos and CSS resources when using the Playwright browser. This can improve performance and reduce bandwidth usage.", + "default": true + }, "debugMode": { "title": "Enable debug mode", "type": "boolean", diff --git a/README.md b/README.md index 2c6fa1b..663aefd 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ The extracted text can then be injected into prompts and retrieval augmented gen - 📝 Output formats include **Markdown**, plain text, and HTML - 🔌 Supports **OpenAPI and MCP** for easy integration - 🪟 It's **open source**, so you can review and modify it +- 🖼️ **Media blocking** to skip images, videos, and CSS for faster scraping and lower bandwidth usage ## Example @@ -119,6 +120,7 @@ The `/search` GET HTTP endpoint accepts the following query parameters: | `maxRequestRetries` | number | `1` | The maximum number of times the Actor will retry loading the target web page on error. If the last attempt fails, the page will be skipped in the results. | | `dynamicContentWaitSecs` | number | `10` | The maximum time in seconds to wait for dynamic page content to load. The Actor considers the web page as fully loaded once this time elapses or when the network becomes idle. | | `removeCookieWarnings` | boolean | `true` | If enabled, removes cookie consent dialogs to improve text extraction accuracy. This might increase latency. 
| +| `blockMedia` | boolean | `true` | If enabled, blocks loading of images, videos, and CSS when using `browser-playwright`, improving speed and reducing bandwidth usage. | | `removeElementsCssSelector` | string | `see input` | A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline image. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`. | | `debugMode` | boolean | `false` | If enabled, the Actor will store debugging information in the dataset's debug field. | diff --git a/src/const.ts b/src/const.ts index 8df5514..64f1015 100644 --- a/src/const.ts +++ b/src/const.ts @@ -37,6 +37,7 @@ export const defaults = { query: undefined, // No default value in input_schema.json readableTextCharThreshold: 100, // Not in input_schema.json removeCookieWarnings: inputSchema.properties.removeCookieWarnings.default, + blockMedia: inputSchema.properties.blockMedia.default, removeElementsCssSelector: inputSchema.properties.removeElementsCssSelector.default, requestTimeoutSecs: inputSchema.properties.requestTimeoutSecs.default, requestTimeoutSecsMax: inputSchema.properties.requestTimeoutSecs.maximum, diff --git a/src/input.ts b/src/input.ts index d0f9822..a44a2ad 100644 --- a/src/input.ts +++ b/src/input.ts @@ -111,6 +111,27 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration maxConcurrency, minConcurrency, }, + preNavigationHooks: input.blockMedia ? 
[
+                async ({ page }) => {
+                    await page.route('**/*', async (route) => {
+                        const resourceType = route.request().resourceType();
+                        const url = route.request().url();
+                        // Abort image/video/media/stylesheet requests by resource type, or by a known file extension.
+                        // The extension may be followed by a query string or fragment (e.g. `logo.png?v=2`).
+                        if (
+                            resourceType === 'image'
+                            || resourceType === 'video'
+                            || resourceType === 'media'
+                            || resourceType === 'stylesheet'
+                            || /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)(?:[?#]|$)/i.test(url)
+                        ) {
+                            await route.abort();
+                        } else {
+                            await route.continue();
+                        }
+                    });
+                },
+            ] : [],
         },
     };
 }
diff --git a/src/types.ts b/src/types.ts
index 8cd14ab..21c5642 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -32,6 +32,7 @@ export type Input = {
     removeElementsCssSelector: string;
     removeCookieWarnings: boolean;
     scrapingTool: 'browser-playwright' | 'raw-http';
+    blockMedia: boolean;
 };
 
 export type StandbyInput = Input & {