Commit d02f9a4
committed
fix: deep crawl duplicate url processing
Fix BFSDeepCrawlStrategy so that URLs which differ only in their base domain
or port are processed just once. The common case is www.example.com vs
example.com, but this also covers https://example.com vs
https://example.com:443.
Fixes #8431
parent fb7fe6a
commit d02f9a4
File tree
3 files changed
+142
-49
lines changed
- crawl4ai
- deep_crawling
- tests/20241401
3 files changed
+142
-49
lines changed

| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
10 | 10 | | |
11 | 11 | | |
12 | 12 | | |
13 | | - | |
| 13 | + | |
14 | 14 | | |
15 | 15 | | |
16 | 16 | | |
| |||
96 | 96 | | |
97 | 97 | | |
98 | 98 | | |
99 | | - | |
100 | | - | |
| 99 | + | |
| 100 | + | |
101 | 101 | | |
| 102 | + | |
102 | 103 | | |
103 | 104 | | |
104 | 105 | | |
105 | 106 | | |
106 | 107 | | |
107 | 108 | | |
108 | | - | |
109 | | - | |
110 | | - | |
| 109 | + | |
| 110 | + | |
| 111 | + | |
| 112 | + | |
| 113 | + | |
| 114 | + | |
| 115 | + | |
111 | 116 | | |
| 117 | + | |
| 118 | + | |
| 119 | + | |
| 120 | + | |
| 121 | + | |
| 122 | + | |
| 123 | + | |
112 | 124 | | |
113 | 125 | | |
114 | 126 | | |
115 | 127 | | |
116 | 128 | | |
117 | | - | |
118 | | - | |
| 129 | + | |
| 130 | + | |
119 | 131 | | |
120 | 132 | | |
121 | 133 | | |
122 | 134 | | |
123 | 135 | | |
124 | | - | |
125 | | - | |
126 | | - | |
| 136 | + | |
| 137 | + | |
| 138 | + | |
127 | 139 | | |
128 | 140 | | |
129 | 141 | | |
| |||
162 | 174 | | |
163 | 175 | | |
164 | 176 | | |
165 | | - | |
| 177 | + | |
| 178 | + | |
166 | 179 | | |
167 | 180 | | |
168 | 181 | | |
| |||
204 | 217 | | |
205 | 218 | | |
206 | 219 | | |
207 | | - | |
| 220 | + | |
208 | 221 | | |
209 | 222 | | |
210 | 223 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1974 | 1974 | | |
1975 | 1975 | | |
1976 | 1976 | | |
1977 | | - | |
1978 | | - | |
1979 | 1977 | | |
1980 | 1978 | | |
1981 | 1979 | | |
| |||
1988 | 1986 | | |
1989 | 1987 | | |
1990 | 1988 | | |
1991 | | - | |
1992 | 1989 | | |
1993 | 1990 | | |
1994 | 1991 | | |
1995 | 1992 | | |
1996 | 1993 | | |
1997 | | - | |
1998 | | - | |
1999 | | - | |
2000 | | - | |
2001 | | - | |
2002 | | - | |
2003 | | - | |
2004 | | - | |
2005 | | - | |
2006 | | - | |
2007 | | - | |
2008 | | - | |
2009 | | - | |
2010 | | - | |
2011 | | - | |
2012 | | - | |
2013 | | - | |
2014 | | - | |
2015 | | - | |
2016 | | - | |
2017 | | - | |
2018 | | - | |
2019 | | - | |
| 1994 | + | |
| 1995 | + | |
2020 | 1996 | | |
2021 | 1997 | | |
2022 | 1998 | | |
2023 | | - | |
| 1999 | + | |
2024 | 2000 | | |
2025 | 2001 | | |
2026 | | - | |
2027 | | - | |
| 2002 | + | |
| 2003 | + | |
2028 | 2004 | | |
2029 | | - | |
| 2005 | + | |
2030 | 2006 | | |
2031 | 2007 | | |
| 2008 | + | |
2032 | 2009 | | |
2033 | 2010 | | |
2034 | 2011 | | |
| |||
2152 | 2129 | | |
2153 | 2130 | | |
2154 | 2131 | | |
| 2132 | + | |
| 2133 | + | |
| 2134 | + | |
| 2135 | + | |
| 2136 | + | |
| 2137 | + | |
| 2138 | + | |
| 2139 | + | |
| 2140 | + | |
| 2141 | + | |
| 2142 | + | |
| 2143 | + | |
| 2144 | + | |
| 2145 | + | |
| 2146 | + | |
| 2147 | + | |
| 2148 | + | |
| 2149 | + | |
| 2150 | + | |
| 2151 | + | |
| 2152 | + | |
| 2153 | + | |
| 2154 | + | |
| 2155 | + | |
| 2156 | + | |
| 2157 | + | |
| 2158 | + | |
| 2159 | + | |
| 2160 | + | |
| 2161 | + | |
| 2162 | + | |
| 2163 | + | |
| 2164 | + | |
| 2165 | + | |
| 2166 | + | |
| 2167 | + | |
| 2168 | + | |
| 2169 | + | |
| 2170 | + | |
| 2171 | + | |
| 2172 | + | |
| 2173 | + | |
| 2174 | + | |
| 2175 | + | |
| 2176 | + | |
| 2177 | + | |
| 2178 | + | |
| 2179 | + | |
| 2180 | + | |
| 2181 | + | |
| 2182 | + | |
| 2183 | + | |
| 2184 | + | |
| 2185 | + | |
| 2186 | + | |
| 2187 | + | |
| 2188 | + | |
| 2189 | + | |
| 2190 | + | |
| 2191 | + | |
| 2192 | + | |
| 2193 | + | |
| 2194 | + | |
| 2195 | + | |
| 2196 | + | |
| 2197 | + | |
| 2198 | + | |
| 2199 | + | |
| 2200 | + | |
| 2201 | + | |
| 2202 | + | |
| 2203 | + | |
| 2204 | + | |
| 2205 | + | |
| 2206 | + | |
| 2207 | + | |
| 2208 | + | |
| 2209 | + | |
| 2210 | + | |
| 2211 | + | |
| 2212 | + | |
| 2213 | + | |
| 2214 | + | |
| 2215 | + | |
| 2216 | + | |
2155 | 2217 | | |
2156 | 2218 | | |
2157 | 2219 | | |
| |||
2753 | 2815 | | |
2754 | 2816 | | |
2755 | 2817 | | |
2756 | | - | |
2757 | | - | |
| 2818 | + | |
| 2819 | + | |
2758 | 2820 | | |
2759 | 2821 | | |
2760 | 2822 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | 2 | | |
| 3 | + | |
| 4 | + | |
3 | 5 | | |
4 | 6 | | |
5 | 7 | | |
| |||
9 | 11 | | |
10 | 12 | | |
11 | 13 | | |
| 14 | + | |
12 | 15 | | |
13 | 16 | | |
14 | 17 | | |
| |||
18 | 21 | | |
19 | 22 | | |
20 | 23 | | |
| 24 | + | |
| 25 | + | |
| 26 | + | |
| 27 | + | |
| 28 | + | |
| 29 | + | |
21 | 30 | | |
22 | 31 | | |
23 | 32 | | |
| |||
26 | 35 | | |
27 | 36 | | |
28 | 37 | | |
29 | | - | |
| 38 | + | |
30 | 39 | | |
31 | | - | |
| 40 | + | |
| 41 | + | |
| 42 | + | |
| 43 | + | |
32 | 44 | | |
33 | 45 | | |
34 | 46 | | |
| |||
42 | 54 | | |
43 | 55 | | |
44 | 56 | | |
| 57 | + | |
| 58 | + | |
45 | 59 | | |
46 | 60 | | |
47 | 61 | | |
48 | | - | |
| 62 | + | |
49 | 63 | | |
50 | 64 | | |
51 | 65 | | |
| |||
73 | 87 | | |
74 | 88 | | |
75 | 89 | | |
| 90 | + | |
| 91 | + | |
76 | 92 | | |
77 | | - | |
| 93 | + | |
78 | 94 | | |
79 | 95 | | |
80 | 96 | | |
| |||
106 | 122 | | |
107 | 123 | | |
108 | 124 | | |
| 125 | + | |
| 126 | + | |
109 | 127 | | |
110 | 128 | | |
111 | 129 | | |
| |||
0 commit comments