How to deep crawl a website that also contains PDF URLs? #1190
Unanswered
gauravmindzk asked this question in Forums - Q&A
Replies: 2 comments
-
can you send the website url?
-
@gauravmindzk Hi. The deep crawler discovers PDF URLs but doesn't process them by default, because PDFs require different handling than HTML pages. Here are two approaches.

Option 1: Collect the PDF URLs during the deep crawl, then process them separately.

```python
import asyncio
import os

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFContentScrapingStrategy


async def demo_deep_crawl():
    ...  # deep-crawl `config` setup goes here (elided in the original answer; see sketch below)
    os.makedirs("DirectoryName", exist_ok=True)  # output directory
    pdf_urls = []  # PDF links discovered during the crawl
    i = 0
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://website_to_crawl.org",
                                               config=config):
            i += 1
            # Save HTML page markdown
            if result.markdown:
                filename = os.path.join("DirectoryName", f"{i}.md")
                with open(filename, "w", encoding="utf-8") as f:
                    f.write(result.markdown.fit_markdown)
            # Collect PDF URLs from links
            if result.links:
                internal_links = (result.links.get("internal", [])
                                  if isinstance(result.links, dict)
                                  else result.links.internal)
                external_links = (result.links.get("external", [])
                                  if isinstance(result.links, dict)
                                  else result.links.external)
                for link in internal_links + external_links:
                    href = link.get("href", "") if isinstance(link, dict) else link.href
                    if href.lower().endswith('.pdf'):
                        pdf_urls.append(href)

    # Now process the PDF URLs
    print(f"\nFound {len(pdf_urls)} PDF URLs. Processing...")
    pdf_config = CrawlerRunConfig(
        scraping_strategy=PDFContentScrapingStrategy(extract_images=False),
        cache_mode="bypass"
    )
    async with AsyncWebCrawler() as crawler:
        for j, pdf_url in enumerate(set(pdf_urls)):  # dedupe
            try:
                result = await crawler.arun(pdf_url, config=pdf_config)
                if result.markdown:
                    filename = os.path.join("DirectoryName", f"pdf_{j+1}.md")
                    with open(filename, "w", encoding="utf-8") as f:
                        f.write(result.markdown.raw_markdown)
                    print(f"✓ Processed: {pdf_url}")
            except Exception as e:
                print(f"✗ Failed {pdf_url}: {e}")


asyncio.run(demo_deep_crawl())
```
Option 2: Use arun_many for batch PDF processing (more efficient).

```python
# After collecting pdf_urls from Option 1...
# (run this inside an async function, e.g. at the end of demo_deep_crawl above)
from crawl4ai import CrawlResult  # optional: type of the items in `results`

async with AsyncWebCrawler() as crawler:
    pdf_config = CrawlerRunConfig(
        scraping_strategy=PDFContentScrapingStrategy(),
    )
    results = await crawler.arun_many(
        urls=list(set(pdf_urls)),  # dedupe before batching
        config=pdf_config
    )
    for j, result in enumerate(results):
        if result.success and result.markdown:
            with open(f"DirectoryName/pdf_{j+1}.md", "w", encoding="utf-8") as f:
                f.write(result.markdown.raw_markdown)
```
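Note that, depending on the crawl4ai version, the PDF documentation pairs `PDFContentScrapingStrategy` with a `PDFCrawlerStrategy` passed to the crawler itself. A hedged sketch of that variant for a single PDF URL (treat the import path and the pairing as assumptions to check, not as part of the answer above):

```python
# Sketch: fetch one PDF with a dedicated PDF crawler strategy. Assumes
# crawl4ai.processors.pdf exposes PDFCrawlerStrategy and PDFContentScrapingStrategy;
# verify against your installed version.
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy


async def crawl_one_pdf(pdf_url: str) -> str:
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        result = await crawler.arun(
            pdf_url,
            config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy()),
        )
        return result.markdown.raw_markdown if result.success and result.markdown else ""
```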
-
Hello everyone,
I want to deep crawl a website. The code that I've come up with is below:
The webpage URLs are getting scraped properly and I have the fit markdown for each of them, but my current code is not able to scrape the PDF URLs on the website. I want to scrape both the webpages and the PDF URLs hosted by the website. How do I do it?