Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions api/constants/pipeline_templates.json
Original file line number Diff line number Diff line change
Expand Up @@ -1266,6 +1266,14 @@
"main_content"
]
},
"crawl_entire_domain": {
"type": "variable",
"value": [
"rag",
"1756907397615",
"crawl_entire_domain"
]
},
"url": {
"type": "mixed",
"value": "{{#rag.1756907397615.firecrawl_url1#}}"
Expand Down Expand Up @@ -2243,6 +2251,14 @@
"main_content"
]
},
"crawl_entire_domain": {
"type": "variable",
"value": [
"rag",
"1756907397615",
"crawl_entire_domain"
]
},
"url": {
"type": "mixed",
"value": "{{#rag.1756907397615.firecrawl_url1#}}"
Expand Down Expand Up @@ -4786,6 +4802,14 @@
"main_content"
]
},
"crawl_entire_domain": {
"type": "variable",
"value": [
"rag",
"1756907397615",
"crawl_entire_domain"
]
},
"url": {
"type": "mixed",
"value": "{{#rag.1756907397615.firecrawl_url1#}}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ workflow:
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
crawl_entire_domain:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_entire_domain#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
Expand Down Expand Up @@ -598,6 +601,20 @@ workflow:
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: false
label: Crawl entire domain
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_entire_domain
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ workflow:
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
crawl_entire_domain:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_entire_domain#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
Expand Down Expand Up @@ -598,6 +601,20 @@ workflow:
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: false
label: Crawl entire domain
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_entire_domain
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
Expand Down
17 changes: 17 additions & 0 deletions api/services/rag_pipeline/transform/website-crawl-parentchild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,9 @@ workflow:
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
crawl_entire_domain:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_entire_domain#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
Expand Down Expand Up @@ -673,6 +676,20 @@ workflow:
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: false
label: Crawl entire domain
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_entire_domain
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
Expand Down
10 changes: 10 additions & 0 deletions api/services/website_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class CrawlOptions:
prompt: str | None = None
max_depth: int | None = None
use_sitemap: bool = True
crawl_entire_domain: bool = False

def get_include_paths(self) -> list[str]:
"""Get list of include paths from comma-separated string."""
Expand Down Expand Up @@ -74,6 +75,7 @@ def to_crawl_request(self) -> CrawlRequest:
prompt=self.options.get("prompt"),
max_depth=self.options.get("max_depth"),
use_sitemap=self.options.get("use_sitemap", True),
crawl_entire_domain=self.options.get("crawl_entire_domain", False),
)
return CrawlRequest(url=self.url, provider=self.provider, options=options)

Expand Down Expand Up @@ -196,6 +198,14 @@ def _crawl_with_firecrawl(cls, request: CrawlRequest, api_key: str, config: dict
if request.options.prompt:
params["prompt"] = request.options.prompt

# Add maxDiscoveryDepth if max_depth is provided
if request.options.max_depth is not None:
params["maxDiscoveryDepth"] = request.options.max_depth

# Add crawlEntireDomain if crawl_entire_domain is True
if request.options.crawl_entire_domain:
params["crawlEntireDomain"] = True

job_id = firecrawl_app.crawl_url(request.url, params)
website_crawl_time_cache_key = f"website_crawl_{job_id}"
time = str(datetime.datetime.now().timestamp())
Expand Down
1 change: 1 addition & 0 deletions web/app/components/datasets/create/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
limit: 10,
max_depth: '',
use_sitemap: true,
crawl_entire_domain: false,
}

const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ const Options: FC<Props> = ({
onChange={handleChange('only_main_content')}
labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
/>
<CheckboxWithLabel
label={t(`${I18N_PREFIX}.crawlEntireDomain`)}
isChecked={payload.crawl_entire_domain}
onChange={handleChange('crawl_entire_domain')}
labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
/>
</div>
)
}
Expand Down
1 change: 1 addition & 0 deletions web/i18n/en-US/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ const translation = {
excludePaths: 'Exclude paths',
includeOnlyPaths: 'Include only paths',
extractOnlyMainContent: 'Extract only main content (no headers, navs, footers, etc.)',
crawlEntireDomain: 'Crawl entire domain',
exceptionErrorTitle: 'An exception occurred while running crawling job:',
unknownError: 'Unknown error',
totalPageScraped: 'Total pages scraped:',
Expand Down
1 change: 1 addition & 0 deletions web/i18n/zh-Hans/dataset-creation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ const translation = {
excludePaths: '排除路径',
includeOnlyPaths: '仅包含路径',
extractOnlyMainContent: '仅提取主要内容(无标题、导航、页脚等)',
crawlEntireDomain: '爬取整个域名',
exceptionErrorTitle: '运行时发生异常:',
unknownError: '未知错误',
totalPageScraped: '抓取页面总数:',
Expand Down
1 change: 1 addition & 0 deletions web/models/datasets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ export type CrawlOptions = {
limit: number | string
max_depth: number | string
use_sitemap: boolean
crawl_entire_domain: boolean
}

export type CrawlResultItem = {
Expand Down
Loading