6767 * production system.
6868 * </p>
6969 *
70+ * The class also allows specifying custom min. and max. re-fetch intervals per
71+ * hostname, in adaptive-host-specific-intervals.txt. If they are specified,
72+ * the calculated re-fetch interval for a URL matching the hostname will not be
73+ * allowed to fall outside of the corresponding range, instead of the default
74+ * range.
75+ *
7076 * @author Andrzej Bialecki
7177 */
7278public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
@@ -89,9 +95,9 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
8995
9096 private Configuration conf ;
9197
92- private Map <String ,Float > hostSpecificMaxInterval = new HashMap <>();
98+ private Map <String , Float > hostSpecificMaxInterval = new HashMap <>();
9399
94- private Map <String ,Float > hostSpecificMinInterval = new HashMap <>();
100+ private Map <String , Float > hostSpecificMinInterval = new HashMap <>();
95101
96102 @ Override
97103 public void setConf (Configuration conf ) {
@@ -109,18 +115,24 @@ public void setConf(Configuration conf) {
109115 "db.fetch.schedule.adaptive.sync_delta_rate" , 0.2f );
110116 try {
111117 setHostSpecificIntervals ("adaptive-host-specific-intervals.txt" ,
112- MIN_INTERVAL , MAX_INTERVAL );
113- } catch (IOException e ){
114- LOG .error ("Failed reading the configuration file. " , e );
118+ MIN_INTERVAL , MAX_INTERVAL );
119+ } catch (IOException e ) {
120+ LOG .error ("Failed reading the configuration file: " + e . toString () );
115121 }
116122 }
117123
118124 /**
119- * Load host-specific min_intervals and max_intervals
120- * from the configuration file into the HashMaps.
125+ * Load host-specific minimal and maximal refetch intervals from
126+ * the configuration file into the corresponding HashMaps.
127+ *
128+ * @param fileName the name of the configuration file containing
129+ * the specific intervals
130+ * @param defaultMin the value of the default min interval
131+ * @param defaultMax the value of the default max interval
121132 */
122133 private void setHostSpecificIntervals (String fileName ,
123- float defaultMin , float defaultMax ) throws IOException {
134+ float defaultMin , float defaultMax ) throws IOException {
135+ // Setup for reading the config file.
124136 Reader configReader = null ;
125137 configReader = conf .getConfResourceAsReader (fileName );
126138 if (configReader == null ) {
@@ -129,67 +141,105 @@ private void setHostSpecificIntervals(String fileName,
129141 BufferedReader reader = new BufferedReader (configReader );
130142 String line ;
131143 int lineNo = 0 ;
144+
145+ // Read the file line by line.
132146 while ((line = reader .readLine ()) != null ) {
133147 lineNo ++;
134- if (StringUtils .isNotBlank (line ) && !line .startsWith ("#" )) {
135- line = line .trim ();
136- String [] parts = line .split ("\\ s+" );
137- if (parts .length == 3 ) {
138- // TODO: Maybe add host validatio here?
139- // It might get computationally expensive for large files, though.
140- String host = parts [0 ].trim ().toLowerCase ();
141- String minInt = parts [1 ].trim ();
142- String maxInt = parts [2 ].trim ();
143- if (minInt .equalsIgnoreCase ("default" )){ minInt = "0" ; }
144- if (maxInt .equalsIgnoreCase ("default" )){ maxInt = "0" ; }
145- float m ,M ;
146- try {
147- m = Float .parseFloat (minInt );
148- M = Float .parseFloat (maxInt );
149-
150- //negative values and mismatched boundaries are ignored
151- //(default to global settings)
152- if (m < 0 || M < 0 || m > M ){
153- LOG .error ("Improper fetch intervals given on line " + String .valueOf (lineNo )
154- + " in the config. file: " + line );
155- } else {
156-
157- // min. interval should be positive and above the global minimum
158- if (m > 0 && m > defaultMin ){
159- hostSpecificMinInterval .put (host ,m );
160- LOG .debug ("Added custom min. interval " + m + " for host " + host + "." );
161- } else if (m > 0 ) {
162- LOG .error ("Min. interval out of bounds on line " + String .valueOf (lineNo )
163- + " in the config. file: " + line );
164- }
165-
166- // max. interval should be positive and below the global maximum
167- if (M > 0 && M < defaultMax ){
168- hostSpecificMaxInterval .put (host ,M );
169- LOG .debug ("Added custom max. interval " + M + " for host " + host + "." );
170- } else if (M > 0 ){
171- LOG .error ("Max. interval out of bounds on line " + String .valueOf (lineNo )
172- + " in the config. file: " + line );
173- }
174-
175- // zero values are ignored (default to global settings)
176- }
177- } catch (NumberFormatException e ){
178- LOG .error ("No proper fetch intervals given on line " + String .valueOf (lineNo )
179- + " in the config. file: " + line , e );
180- }
181- } else {
182- LOG .error ("Malformed (domain, min_interval, max_interval) triplet on line "
183- + String .valueOf (lineNo ) + " of the config. file: " + line );
184- }
148+
149+ // Skip blank lines and comments.
150+ if (StringUtils .isBlank (line ) || line .startsWith ("#" )) {
151+ continue ;
152+ }
153+
154+ // Trim and partition the line.
155+ line = line .trim ();
156+ String [] parts = line .split ("\\ s+" );
157+
158+ // There should be three parts.
159+ if (parts .length != 3 ) {
160+ LOG .error ("Malformed (domain, min_interval, max_interval) triplet on line "
161+ + String .valueOf (lineNo ) + " of config. file: `" + line + "`" );
162+ continue ;
163+ }
164+
165+ // Normalize the parts.
166+ String host = parts [0 ].trim ().toLowerCase ();
167+ String minInt = parts [1 ].trim ();
168+ String maxInt = parts [2 ].trim ();
169+
170+ // "0" and "default" both mean `use default interval`; normalize to "0".
171+ if (minInt .equalsIgnoreCase ("default" )) { minInt = "0" ; }
172+ if (maxInt .equalsIgnoreCase ("default" )) { maxInt = "0" ; }
173+
174+ // Convert intervals to float and ignore the line in case of failure.
175+ float m , M ;
176+ try {
177+ m = Float .parseFloat (minInt );
178+ M = Float .parseFloat (maxInt );
179+ } catch (NumberFormatException e ) {
180+ LOG .error ("Improper fetch intervals given on line " + String .valueOf (lineNo )
181+ + " of config. file: `" + line + "`: " + e .toString ());
182+ continue ;
183+ }
184+
185+ // If both intervals are set to default,
186+ // ignore the line and issue a warning.
187+ if (m == 0 && M == 0 ) {
188+ LOG .warn ("Ignoring default interval values on line " + String .valueOf (lineNo )
189+ + " of config. file: `" + line + "`" );
190+ continue ;
185191 }
192+
193+ // Replace the zero with the default value.
194+ if (m == 0 ) {
195+ m = defaultMin ;
196+ } else if (M == 0 ) {
197+ M = defaultMax ;
198+ }
199+
200+ // Intervals cannot be negative and the min cannot be above the max
201+ // (we assume here that the default values satisfy this).
202+ if (m < 0 || M < 0 ) {
203+ LOG .error ("Improper fetch intervals given on line " + String .valueOf (lineNo )
204+ + " of config. file: `" + line
205+ + "`: intervals cannot be negative" );
206+ continue ;
207+ }
208+
209+ if (m > M ) {
210+ LOG .error ("Improper fetch intervals given on line " + String .valueOf (lineNo )
211+ + " of config. file: `" + line
212+ + "`: min. interval cannot be above max. interval" );
213+ continue ;
214+ }
215+
216+ // The custom intervals should respect the boundaries of the default values.
217+ if (m < defaultMin ) {
218+ LOG .error ("Min. interval out of bounds on line " + String .valueOf (lineNo )
219+ + " of config. file: `" + line + "`" );
220+ continue ;
221+ }
222+
223+ if (M > defaultMax ) {
224+ LOG .error ("Max. interval out of bounds on line " + String .valueOf (lineNo )
225+ + " of config. file: `" + line + "`" );
226+ continue ;
227+ }
228+
229+ // If all is well, store the specific intervals.
230+ hostSpecificMinInterval .put (host , m );
231+ LOG .debug ("Added custom min. interval " + m + " for host " + host );
232+
233+ hostSpecificMaxInterval .put (host , M );
234+ LOG .debug ("Added custom max. interval " + M + " for host " + host );
235+
186236 }
187237 }
188238
189239 /**
190- * Strip a URL, leaving only the host name .
240+ * Strip a URL, leaving only the hostname .
191241 *
192- * @param url url to get hostname for
242+ * @param url the URL for which to get the hostname
193243 * @return hostname
194244 * @throws URISyntaxException if the given string violates RFC 2396
195245 */
@@ -200,49 +250,49 @@ public static String getHostName(String url) throws URISyntaxException {
200250 }
201251
202252 /**
203- * Returns the max_interval for this URL, which might depend on the host.
253+ * Returns the custom max. refetch interval for this URL,
254+ * if specified for the corresponding hostname.
204255 *
205256 * @param url the URL to be scheduled
206- * @param defaultMaxInterval the value to which to default if max_interval has not been configured for this host
207- * @return the configured maximum interval or the default interval
257+ * @return the configured max. interval or null
208258 */
209- public float getMaxInterval (Text url , float defaultMaxInterval ) {
259+ public Float getCustomMaxInterval (Text url ) {
210260 if (hostSpecificMaxInterval .isEmpty ()) {
211- return defaultMaxInterval ;
261+ return null ;
212262 }
213263 String host ;
214264 try {
215265 host = getHostName (url .toString ());
216266 } catch (URISyntaxException e ){
217- return defaultMaxInterval ;
267+ return null ;
218268 }
219- if (hostSpecificMaxInterval .containsKey (host )){
220- return hostSpecificMaxInterval . get ( host ) ;
269+ if (! hostSpecificMaxInterval .containsKey (host )) {
270+ return null ;
221271 }
222- return defaultMaxInterval ;
272+ return hostSpecificMaxInterval . get ( host ) ;
223273 }
224274
225275 /**
226- * Returns the min_interval for this URL, which might depend on the host.
276+ * Returns the custom min. refetch interval for this URL,
277+ * if specified for the corresponding hostname.
227278 *
228279 * @param url the URL to be scheduled
229- * @param defaultMinInterval the value to which to default if min_interval has not been configured for this host
230- * @return the configured minimum interval or the default interval
280+ * @return the configured min. interval or null
231281 */
232- public float getMinInterval (Text url , float defaultMinInterval ) {
282+ public Float getCustomMinInterval (Text url ) {
233283 if (hostSpecificMinInterval .isEmpty ()) {
234- return defaultMinInterval ;
284+ return null ;
235285 }
236286 String host ;
237287 try {
238288 host = getHostName (url .toString ());
239289 } catch (URISyntaxException e ){
240- return defaultMinInterval ;
290+ return null ;
241291 }
242- if (hostSpecificMinInterval .containsKey (host )){
243- return hostSpecificMinInterval . get ( host ) ;
292+ if (! hostSpecificMinInterval .containsKey (host )) {
293+ return null ;
244294 }
245- return defaultMinInterval ;
295+ return hostSpecificMinInterval . get ( host ) ;
246296 }
247297
248298 @ Override
@@ -285,14 +335,13 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
285335 refTime = fetchTime - Math .round (delta * SYNC_DELTA_RATE * 1000 );
286336 }
287337
288- // replace min_interval and max_interval with a domain-specific ones,
289- // if so configured.
290- float newMaxInterval = getMaxInterval (url , MAX_INTERVAL );
291- float newMinInterval = getMinInterval (url , MIN_INTERVAL );
292- if (interval < newMinInterval ) {
293- interval = newMinInterval ;
294- } else if (interval > newMaxInterval ) {
295- interval = newMaxInterval ;
338+ // Ensure the interval does not fall outside of bounds
339+ float minInterval = (getCustomMinInterval (url ) != null ) ? getCustomMinInterval (url ) : MIN_INTERVAL ;
340+ float maxInterval = (getCustomMaxInterval (url ) != null ) ? getCustomMaxInterval (url ) : MAX_INTERVAL ;
341+ if (interval < minInterval ) {
342+ interval = minInterval ;
343+ } else if (interval > maxInterval ) {
344+ interval = maxInterval ;
296345 }
297346 }
298347
0 commit comments