Skip to content

Commit 5335e6b

Browse files
Merge pull request #848 from martin-djukanovic/NUTCH-3103
[NUTCH-3103] Fixed custom max intervals for AdaptiveFetchSchedule
2 parents b61d11f + 931ba17 commit 5335e6b

File tree

2 files changed

+145
-96
lines changed

2 files changed

+145
-96
lines changed
Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
# This file defines a mapping that associates specific min. and max. refetching time intervals
2-
# to a host, that deviate from the default settings of the AdaptiveFetchSchedule class.
1+
# This file defines a mapping that associates specific min and max refetching intervals
2+
# with a host, overriding the default settings of the AdaptiveFetchSchedule class.
33
#
4-
# Format: <hostname> <min_interval> <max_interval>.
4+
# Format: <hostname> <min_interval> <max_interval>
55
#
6-
# The two values will be parsed as float and should be STRICTLY between
6+
# The two interval values will be parsed as float and should be between
77
# db.fetch.schedule.adaptive.min_interval and db.fetch.schedule.adaptive.max_interval.
88
#
9-
# To use default values, write "default" or "0".
10-
# The default min. is 60 (1 min) and default max. is 31536000 (1 year).
9+
# To use the default as a value, write either "default" or "0".
10+
# The default min is 60 (1 min), while the default max is 31536000 (1 year).
1111
#
12-
www.apache.org default 1728000
13-
www.example.org 1296000 0
14-
nutch.apache.org 864000 2160000
12+
www.example.com default 1728000
13+
www.apache.org 1296000 0
14+
nutch.apache.org 864000 2160000

src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java

Lines changed: 136 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@
6767
* production system.
6868
* </p>
6969
*
70+
* The class also allows specifying custom min. and max. re-fetch intervals per
71+
* hostname, in adaptive-host-specific-intervals.txt. If they are specified,
72+
* the calculated re-fetch interval for a URL matching the hostname will not be
73+
* allowed to fall outside of the host-specific range, which replaces the default
74+
* range.
75+
*
7076
* @author Andrzej Bialecki
7177
*/
7278
public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
@@ -89,9 +95,9 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
8995

9096
private Configuration conf;
9197

92-
private Map<String,Float> hostSpecificMaxInterval = new HashMap<>();
98+
private Map<String, Float> hostSpecificMaxInterval = new HashMap<>();
9399

94-
private Map<String,Float> hostSpecificMinInterval = new HashMap<>();
100+
private Map<String, Float> hostSpecificMinInterval = new HashMap<>();
95101

96102
@Override
97103
public void setConf(Configuration conf) {
@@ -109,18 +115,24 @@ public void setConf(Configuration conf) {
109115
"db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
110116
try {
111117
setHostSpecificIntervals("adaptive-host-specific-intervals.txt",
112-
MIN_INTERVAL, MAX_INTERVAL);
113-
} catch (IOException e){
114-
LOG.error("Failed reading the configuration file. ", e);
118+
MIN_INTERVAL, MAX_INTERVAL);
119+
} catch (IOException e) {
120+
LOG.error("Failed reading the configuration file: " + e.toString());
115121
}
116122
}
117123

118124
/**
119-
* Load host-specific min_intervals and max_intervals
120-
* from the configuration file into the HashMaps.
125+
* Load host-specific minimal and maximal refetch intervals from
126+
* the configuration file into the corresponding HashMaps.
127+
*
128+
* @param fileName the name of the configuration file containing
129+
* the specific intervals
130+
* @param defaultMin the value of the default min interval
131+
* @param defaultMax the value of the default max interval
121132
*/
122133
private void setHostSpecificIntervals(String fileName,
123-
float defaultMin, float defaultMax) throws IOException {
134+
float defaultMin, float defaultMax) throws IOException {
135+
// Setup for reading the config file.
124136
Reader configReader = null;
125137
configReader = conf.getConfResourceAsReader(fileName);
126138
if (configReader == null) {
@@ -129,67 +141,105 @@ private void setHostSpecificIntervals(String fileName,
129141
BufferedReader reader = new BufferedReader(configReader);
130142
String line;
131143
int lineNo = 0;
144+
145+
// Read the file line by line.
132146
while ((line = reader.readLine()) != null) {
133147
lineNo++;
134-
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
135-
line = line.trim();
136-
String[] parts = line.split("\\s+");
137-
if (parts.length == 3) {
138-
// TODO: Maybe add host validation here?
139-
// It might get computationally expensive for large files, though.
140-
String host = parts[0].trim().toLowerCase();
141-
String minInt = parts[1].trim();
142-
String maxInt = parts[2].trim();
143-
if (minInt.equalsIgnoreCase("default")){ minInt = "0"; }
144-
if (maxInt.equalsIgnoreCase("default")){ maxInt = "0"; }
145-
float m,M;
146-
try {
147-
m = Float.parseFloat(minInt);
148-
M = Float.parseFloat(maxInt);
149-
150-
//negative values and mismatched boundaries are ignored
151-
//(default to global settings)
152-
if (m < 0 || M < 0 || m > M){
153-
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
154-
+ " in the config. file: " + line);
155-
} else {
156-
157-
// min. interval should be positive and above the global minimum
158-
if (m > 0 && m > defaultMin){
159-
hostSpecificMinInterval.put(host,m);
160-
LOG.debug("Added custom min. interval " + m + " for host " + host + ".");
161-
} else if (m > 0) {
162-
LOG.error("Min. interval out of bounds on line " + String.valueOf(lineNo)
163-
+ " in the config. file: " + line);
164-
}
165-
166-
// max. interval should be positive and below the global maximum
167-
if (M > 0 && M < defaultMax){
168-
hostSpecificMaxInterval.put(host,M);
169-
LOG.debug("Added custom max. interval " + M + " for host " + host + ".");
170-
} else if (M > 0){
171-
LOG.error("Max. interval out of bounds on line " + String.valueOf(lineNo)
172-
+ " in the config. file: " + line);
173-
}
174-
175-
// zero values are ignored (default to global settings)
176-
}
177-
} catch (NumberFormatException e){
178-
LOG.error("No proper fetch intervals given on line " + String.valueOf(lineNo)
179-
+ " in the config. file: " + line, e);
180-
}
181-
} else {
182-
LOG.error("Malformed (domain, min_interval, max_interval) triplet on line "
183-
+ String.valueOf(lineNo) + " of the config. file: " + line);
184-
}
148+
149+
// Skip blank lines and comments.
150+
if (StringUtils.isBlank(line) || line.startsWith("#")) {
151+
continue;
152+
}
153+
154+
// Trim and partition the line.
155+
line = line.trim();
156+
String[] parts = line.split("\\s+");
157+
158+
// There should be three parts.
159+
if (parts.length != 3) {
160+
LOG.error("Malformed (domain, min_interval, max_interval) triplet on line "
161+
+ String.valueOf(lineNo) + " of config. file: `" + line + "`");
162+
continue;
163+
}
164+
165+
// Normalize the parts.
166+
String host = parts[0].trim().toLowerCase();
167+
String minInt = parts[1].trim();
168+
String maxInt = parts[2].trim();
169+
170+
// "0" and "default" both mean `use default interval`; normalize to "0".
171+
if (minInt.equalsIgnoreCase("default")) { minInt = "0"; }
172+
if (maxInt.equalsIgnoreCase("default")) { maxInt = "0"; }
173+
174+
// Convert intervals to float and ignore the line in case of failure.
175+
float m, M;
176+
try {
177+
m = Float.parseFloat(minInt);
178+
M = Float.parseFloat(maxInt);
179+
} catch (NumberFormatException e) {
180+
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
181+
+ " of config. file: `" + line + "`: " + e.toString());
182+
continue;
183+
}
184+
185+
// If both intervals are set to default,
186+
// ignore the line and issue a warning.
187+
if (m == 0 && M == 0) {
188+
LOG.warn("Ignoring default interval values on line " + String.valueOf(lineNo)
189+
+ " of config. file: `" + line + "`");
190+
continue;
185191
}
192+
193+
// Replace the zero with the default value.
194+
if (m == 0) {
195+
m = defaultMin;
196+
} else if (M == 0) {
197+
M = defaultMax;
198+
}
199+
200+
// Intervals cannot be negative and the min cannot be above the max
201+
// (we assume here that the default values satisfy this).
202+
if (m < 0 || M < 0) {
203+
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
204+
+ " of config. file: `" + line
205+
+ "`: intervals cannot be negative");
206+
continue;
207+
}
208+
209+
if (m > M) {
210+
LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo)
211+
+ " of config. file: `" + line
212+
+ "`: min. interval cannot be above max. interval");
213+
continue;
214+
}
215+
216+
// The custom intervals should respect the boundaries of the default values.
217+
if (m < defaultMin) {
218+
LOG.error("Min. interval out of bounds on line " + String.valueOf(lineNo)
219+
+ " of config. file: `" + line + "`");
220+
continue;
221+
}
222+
223+
if (M > defaultMax) {
224+
LOG.error("Max. interval out of bounds on line " + String.valueOf(lineNo)
225+
+ " of config. file: `" + line + "`");
226+
continue;
227+
}
228+
229+
// If all is well, store the specific intervals.
230+
hostSpecificMinInterval.put(host, m);
231+
LOG.debug("Added custom min. interval " + m + " for host " + host);
232+
233+
hostSpecificMaxInterval.put(host, M);
234+
LOG.debug("Added custom max. interval " + M + " for host " + host);
235+
186236
}
187237
}
188238

189239
/**
190-
* Strip a URL, leaving only the host name.
240+
* Strip a URL, leaving only the hostname.
191241
*
192-
* @param url url to get hostname for
242+
* @param url the URL for which to get the hostname
193243
* @return hostname
194244
* @throws URISyntaxException if the given string violates RFC 2396
195245
*/
@@ -200,49 +250,49 @@ public static String getHostName(String url) throws URISyntaxException {
200250
}
201251

202252
/**
203-
* Returns the max_interval for this URL, which might depend on the host.
253+
* Returns the custom max. refetch interval for this URL,
254+
* if specified for the corresponding hostname.
204255
*
205256
* @param url the URL to be scheduled
206-
* @param defaultMaxInterval the value to which to default if max_interval has not been configured for this host
207-
* @return the configured maximum interval or the default interval
257+
* @return the configured max. interval or null
208258
*/
209-
public float getMaxInterval(Text url, float defaultMaxInterval){
259+
public Float getCustomMaxInterval(Text url) {
210260
if (hostSpecificMaxInterval.isEmpty()) {
211-
return defaultMaxInterval;
261+
return null;
212262
}
213263
String host;
214264
try {
215265
host = getHostName(url.toString());
216266
} catch (URISyntaxException e){
217-
return defaultMaxInterval;
267+
return null;
218268
}
219-
if (hostSpecificMaxInterval.containsKey(host)){
220-
return hostSpecificMaxInterval.get(host);
269+
if (!hostSpecificMaxInterval.containsKey(host)) {
270+
return null;
221271
}
222-
return defaultMaxInterval;
272+
return hostSpecificMaxInterval.get(host);
223273
}
224274

225275
/**
226-
* Returns the min_interval for this URL, which might depend on the host.
276+
* Returns the custom min. refetch interval for this URL,
277+
* if specified for the corresponding hostname.
227278
*
228279
* @param url the URL to be scheduled
229-
* @param defaultMinInterval the value to which to default if min_interval has not been configured for this host
230-
* @return the configured minimum interval or the default interval
280+
* @return the configured min. interval or null
231281
*/
232-
public float getMinInterval(Text url, float defaultMinInterval){
282+
public Float getCustomMinInterval(Text url) {
233283
if (hostSpecificMinInterval.isEmpty()) {
234-
return defaultMinInterval;
284+
return null;
235285
}
236286
String host;
237287
try {
238288
host = getHostName(url.toString());
239289
} catch (URISyntaxException e){
240-
return defaultMinInterval;
290+
return null;
241291
}
242-
if (hostSpecificMinInterval.containsKey(host)){
243-
return hostSpecificMinInterval.get(host);
292+
if (!hostSpecificMinInterval.containsKey(host)) {
293+
return null;
244294
}
245-
return defaultMinInterval;
295+
return hostSpecificMinInterval.get(host);
246296
}
247297

248298
@Override
@@ -285,14 +335,13 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
285335
refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
286336
}
287337

288-
// replace min_interval and max_interval with a domain-specific ones,
289-
// if so configured.
290-
float newMaxInterval = getMaxInterval(url, MAX_INTERVAL);
291-
float newMinInterval = getMinInterval(url, MIN_INTERVAL);
292-
if (interval < newMinInterval) {
293-
interval = newMinInterval;
294-
} else if (interval > newMaxInterval) {
295-
interval = newMaxInterval;
338+
// Ensure the interval does not fall outside of bounds
339+
float minInterval = (getCustomMinInterval(url) != null) ? getCustomMinInterval(url) : MIN_INTERVAL;
340+
float maxInterval = (getCustomMaxInterval(url) != null) ? getCustomMaxInterval(url) : MAX_INTERVAL;
341+
if (interval < minInterval) {
342+
interval = minInterval;
343+
} else if (interval > maxInterval) {
344+
interval = maxInterval;
296345
}
297346
}
298347

0 commit comments

Comments
 (0)