Commit 3a1e872
committed
fix: deep crawl duplicate url processing
Fix BFSDeepCrawlStrategy processing URLs that vary based on base domain
or port so they only process once. The common case for this is
www.example.com vs example.com but it also addresses https://example.com
vs https://example.com:443.
Fixes #8431 parent fb7fe6a commit 3a1e872
File tree
3 files changed
+128
-53
lines changed- crawl4ai
- deep_crawling
- tests/20241401
3 files changed
+128
-53
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
10 | 10 | | |
11 | 11 | | |
12 | 12 | | |
13 | | - | |
| 13 | + | |
14 | 14 | | |
15 | 15 | | |
16 | 16 | | |
| |||
96 | 96 | | |
97 | 97 | | |
98 | 98 | | |
99 | | - | |
100 | | - | |
| 99 | + | |
| 100 | + | |
101 | 101 | | |
| 102 | + | |
102 | 103 | | |
103 | 104 | | |
104 | 105 | | |
105 | 106 | | |
106 | 107 | | |
107 | 108 | | |
108 | | - | |
109 | 109 | | |
110 | | - | |
| 110 | + | |
| 111 | + | |
| 112 | + | |
| 113 | + | |
| 114 | + | |
| 115 | + | |
111 | 116 | | |
| 117 | + | |
| 118 | + | |
| 119 | + | |
| 120 | + | |
| 121 | + | |
| 122 | + | |
| 123 | + | |
112 | 124 | | |
113 | 125 | | |
114 | 126 | | |
| |||
161 | 173 | | |
162 | 174 | | |
163 | 175 | | |
164 | | - | |
| 176 | + | |
165 | 177 | | |
166 | 178 | | |
167 | 179 | | |
| |||
203 | 215 | | |
204 | 216 | | |
205 | 217 | | |
206 | | - | |
| 218 | + | |
207 | 219 | | |
208 | 220 | | |
209 | 221 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1974 | 1974 | | |
1975 | 1975 | | |
1976 | 1976 | | |
1977 | | - | |
1978 | | - | |
1979 | 1977 | | |
1980 | 1978 | | |
1981 | 1979 | | |
| |||
1988 | 1986 | | |
1989 | 1987 | | |
1990 | 1988 | | |
1991 | | - | |
| 1989 | + | |
1992 | 1990 | | |
1993 | 1991 | | |
1994 | 1992 | | |
| |||
2029 | 2027 | | |
2030 | 2028 | | |
2031 | 2029 | | |
| 2030 | + | |
| 2031 | + | |
| 2032 | + | |
| 2033 | + | |
| 2034 | + | |
| 2035 | + | |
| 2036 | + | |
| 2037 | + | |
| 2038 | + | |
| 2039 | + | |
| 2040 | + | |
| 2041 | + | |
| 2042 | + | |
| 2043 | + | |
| 2044 | + | |
| 2045 | + | |
| 2046 | + | |
| 2047 | + | |
| 2048 | + | |
| 2049 | + | |
| 2050 | + | |
| 2051 | + | |
| 2052 | + | |
| 2053 | + | |
| 2054 | + | |
| 2055 | + | |
| 2056 | + | |
| 2057 | + | |
| 2058 | + | |
| 2059 | + | |
| 2060 | + | |
2032 | 2061 | | |
2033 | 2062 | | |
2034 | 2063 | | |
| |||
2112 | 2141 | | |
2113 | 2142 | | |
2114 | 2143 | | |
2115 | | - | |
2116 | | - | |
2117 | | - | |
2118 | | - | |
2119 | | - | |
2120 | | - | |
2121 | | - | |
2122 | | - | |
2123 | | - | |
2124 | | - | |
2125 | | - | |
2126 | | - | |
2127 | | - | |
2128 | | - | |
2129 | | - | |
2130 | | - | |
2131 | | - | |
2132 | | - | |
2133 | | - | |
2134 | | - | |
2135 | | - | |
2136 | | - | |
2137 | | - | |
2138 | | - | |
2139 | | - | |
2140 | | - | |
2141 | | - | |
2142 | | - | |
2143 | | - | |
2144 | | - | |
2145 | | - | |
2146 | | - | |
2147 | | - | |
2148 | | - | |
2149 | | - | |
2150 | | - | |
2151 | | - | |
| 2144 | + | |
2152 | 2145 | | |
2153 | 2146 | | |
2154 | 2147 | | |
2155 | 2148 | | |
| 2149 | + | |
| 2150 | + | |
| 2151 | + | |
| 2152 | + | |
| 2153 | + | |
| 2154 | + | |
| 2155 | + | |
| 2156 | + | |
| 2157 | + | |
| 2158 | + | |
| 2159 | + | |
| 2160 | + | |
| 2161 | + | |
| 2162 | + | |
| 2163 | + | |
| 2164 | + | |
| 2165 | + | |
| 2166 | + | |
| 2167 | + | |
| 2168 | + | |
| 2169 | + | |
| 2170 | + | |
| 2171 | + | |
| 2172 | + | |
| 2173 | + | |
| 2174 | + | |
| 2175 | + | |
| 2176 | + | |
| 2177 | + | |
| 2178 | + | |
| 2179 | + | |
| 2180 | + | |
| 2181 | + | |
| 2182 | + | |
| 2183 | + | |
| 2184 | + | |
| 2185 | + | |
| 2186 | + | |
| 2187 | + | |
| 2188 | + | |
| 2189 | + | |
| 2190 | + | |
| 2191 | + | |
| 2192 | + | |
| 2193 | + | |
| 2194 | + | |
| 2195 | + | |
| 2196 | + | |
| 2197 | + | |
| 2198 | + | |
| 2199 | + | |
| 2200 | + | |
| 2201 | + | |
2156 | 2202 | | |
2157 | 2203 | | |
2158 | 2204 | | |
| |||
2753 | 2799 | | |
2754 | 2800 | | |
2755 | 2801 | | |
2756 | | - | |
2757 | | - | |
| 2802 | + | |
| 2803 | + | |
2758 | 2804 | | |
2759 | 2805 | | |
2760 | 2806 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1 | 1 | | |
2 | 2 | | |
| 3 | + | |
3 | 4 | | |
4 | 5 | | |
| 6 | + | |
5 | 7 | | |
6 | 8 | | |
7 | 9 | | |
8 | 10 | | |
9 | 11 | | |
10 | 12 | | |
11 | 13 | | |
| 14 | + | |
12 | 15 | | |
13 | 16 | | |
14 | 17 | | |
| |||
18 | 21 | | |
19 | 22 | | |
20 | 23 | | |
| 24 | + | |
| 25 | + | |
| 26 | + | |
| 27 | + | |
| 28 | + | |
| 29 | + | |
21 | 30 | | |
22 | 31 | | |
23 | 32 | | |
| |||
26 | 35 | | |
27 | 36 | | |
28 | 37 | | |
29 | | - | |
| 38 | + | |
30 | 39 | | |
31 | | - | |
| 40 | + | |
| 41 | + | |
| 42 | + | |
32 | 43 | | |
33 | 44 | | |
34 | 45 | | |
| |||
42 | 53 | | |
43 | 54 | | |
44 | 55 | | |
| 56 | + | |
| 57 | + | |
45 | 58 | | |
46 | 59 | | |
47 | 60 | | |
48 | | - | |
| 61 | + | |
49 | 62 | | |
50 | 63 | | |
51 | 64 | | |
| |||
73 | 86 | | |
74 | 87 | | |
75 | 88 | | |
| 89 | + | |
| 90 | + | |
76 | 91 | | |
77 | | - | |
| 92 | + | |
78 | 93 | | |
79 | 94 | | |
80 | 95 | | |
| |||
106 | 121 | | |
107 | 122 | | |
108 | 123 | | |
| 124 | + | |
| 125 | + | |
109 | 126 | | |
110 | 127 | | |
111 | 128 | | |
| |||
0 commit comments