|
24 | 24 | BM25ContentFilter, |
25 | 25 | PruningContentFilter, |
26 | 26 | BrowserProfiler, |
| 27 | + DefaultMarkdownGenerator, |
27 | 28 | LLMConfig |
28 | 29 | ) |
29 | 30 | from litellm import completion |
@@ -614,17 +615,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: |
614 | 615 | crawler_cfg = crawler_cfg.clone(**crawler) |
615 | 616 |
|
616 | 617 | # Handle content filter config |
617 | | - if filter_config: |
618 | | - filter_conf = load_config_file(filter_config) |
| 618 | + if filter_config or output in ["markdown-fit", "md-fit"]: |
| 619 | + if filter_config: |
| 620 | + filter_conf = load_config_file(filter_config) |
| 621 | + elif not filter_config and output in ["markdown-fit", "md-fit"]: |
| 622 | + filter_conf = { |
| 623 | + "type": "pruning", |
| 624 | + "query": "", |
| 625 | + "threshold": 0.48 |
| 626 | + } |
619 | 627 | if filter_conf["type"] == "bm25": |
620 | | - crawler_cfg.content_filter = BM25ContentFilter( |
621 | | - user_query=filter_conf.get("query"), |
622 | | - bm25_threshold=filter_conf.get("threshold", 1.0) |
| 628 | + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( |
| 629 | + content_filter = BM25ContentFilter( |
| 630 | + user_query=filter_conf.get("query"), |
| 631 | + bm25_threshold=filter_conf.get("threshold", 1.0) |
| 632 | + ) |
623 | 633 | ) |
624 | 634 | elif filter_conf["type"] == "pruning": |
625 | | - crawler_cfg.content_filter = PruningContentFilter( |
626 | | - user_query=filter_conf.get("query"), |
627 | | - threshold=filter_conf.get("threshold", 0.48) |
| 635 | + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( |
| 636 | + content_filter = PruningContentFilter( |
| 637 | + user_query=filter_conf.get("query"), |
| 638 | + threshold=filter_conf.get("threshold", 0.48) |
| 639 | + ) |
628 | 640 | ) |
629 | 641 |
|
630 | 642 | # Handle extraction strategy |
|
0 commit comments