Commit bc137a4

Make isExplicitlyDisallowed() return undefined if invalid URL and add caution. (#39)
Update the behaviour to match the documentation and add a caution to the documentation about usage.
1 parent: f07168c

3 files changed: +34 additions, -9 deletions

README.md

Lines changed: 10 additions & 0 deletions
```diff
@@ -67,9 +67,19 @@ This will return `undefined` if the URL isn't valid for this robots.txt.
 
 **boolean or undefined**
 
+> [!CAUTION]
+> This is not part of the robots.txt specification and should only be used with
+> the website owner's permission.
+> This method is only intended for special purposes where a user-agent shouldn't
+> fall back to matching against global (*) rules.
+>
+> An example of this behaviour is [Google AdsBot](https://developers.google.com/search/docs/crawling-indexing/google-special-case-crawlers)
+> which must be explicitly excluded. This is done with the website owner's permission.
+
 Returns true if explicitly disallowed for the specified user agent (User Agent wildcards are discarded).
 
 This will return undefined if the URL is not valid for this robots.txt file.
+
 ### getMatchingLineNumber(url, [ua])
 
 **number or undefined**
```
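To make the documented behaviour concrete, here is a short usage sketch. The robots.txt contents and the `ExampleAdsBot` user agent are illustrative, not taken from this commit:

```js
var robotsParser = require('robots-parser');

// Illustrative robots.txt with a rule for one specific user agent.
var robots = robotsParser('http://example.com/robots.txt', [
  'User-agent: ExampleAdsBot',
  'Disallow: /private'
].join('\n'));

// Explicitly disallowed by the ExampleAdsBot rules.
robots.isExplicitlyDisallowed('http://example.com/private', 'ExampleAdsBot'); // true

// No rule for this path, so not explicitly disallowed.
robots.isExplicitlyDisallowed('http://example.com/public', 'ExampleAdsBot'); // false

// URL is not valid for this robots.txt file (different host),
// so the result is now undefined rather than true.
robots.isExplicitlyDisallowed('http://other.example/private', 'ExampleAdsBot'); // undefined
```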

Robots.js

Lines changed: 9 additions & 7 deletions
```diff
@@ -376,9 +376,9 @@ Robots.prototype._getRule = function (url, ua, explicit) {
 
   var rules = this._rules[userAgent];
   if (!explicit) {
-    rules = rules || this._rules['*']
+    rules = rules || this._rules['*'];
   }
-  rules = rules || []
+  rules = rules || [];
 
   var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
   var rule = findRule(path, rules);
@@ -438,21 +438,23 @@ Robots.prototype.isDisallowed = function (url, ua) {
 };
 
 /**
- * Returns true if explicitly disallowed
+ * Returns true if explicitly disallowed
  * for the specified user agent (User Agent wildcards are discarded).
- *
+ *
  * This will return undefined if the URL is not valid for this robots.txt file.
+ *
  * @param {string} url
  * @param {string} ua
  * @return {boolean?}
  */
-Robots.prototype.isExplicitlyDisallowed = function(url, ua) {
+Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
   var rule = this._getRule(url, ua, true);
   if (typeof rule === 'undefined') {
-    return true;
+    return;
   }
+
   return !(!rule || rule.allow);
-}
+};
 
 /**
  * Gets the crawl delay if there is one.
```
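Because `undefined` is falsy, callers that treated any truthy result as a definite disallow are unaffected, but code that needs to tell the three outcomes apart should now check explicitly. A minimal sketch, reusing the URL and user agent from the new test below:

```js
var robotsParser = require('robots-parser');

// Same setup as the new test: the robots.txt belongs to example.com,
// but the URL being checked is on www.example.com.
var robots = robotsParser('http://example.com', 'User-agent: SomeBot\nDisallow: /');
var verdict = robots.isExplicitlyDisallowed('https://www.example.com/hello', 'SomeBot');

if (typeof verdict === 'undefined') {
  // URL is not valid for this robots.txt file
  // (before this commit the method returned true here).
} else if (verdict) {
  // Explicitly disallowed for this user agent.
} else {
  // Not explicitly disallowed for this user agent.
}
```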

test/Robots.js

Lines changed: 15 additions & 2 deletions
```diff
@@ -873,7 +873,7 @@ describe('Robots', function () {
     var robots = robotsParser(url, contents);
 
     expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false)
-  })
+  });
 
   it('should be disallowed when user agent equal robots rule in explicit mode', function () {
     var contents = [
@@ -886,5 +886,18 @@ describe('Robots', function () {
     var robots = robotsParser(url, contents);
 
     expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true)
-  })
+  });
+
+  it('should return undefined when given an invalid URL in explicit mode', function () {
+    var contents = [
+      'User-agent: SomeBot',
+      'Disallow: /',
+    ].join('\n')
+
+    var url = 'https://www.example.com/hello'
+    var userAgent = 'SomeBot';
+    var robots = robotsParser('http://example.com', contents);
+
+    expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(undefined)
+  });
 });
```
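The README caution about wildcard fallback can also be seen directly in `_getRule` above: only the non-explicit path falls back to `this._rules['*']`. A sketch of the difference, with illustrative robots.txt contents:

```js
var robotsParser = require('robots-parser');

// Only a global wildcard rule; no rules name SomeBot directly.
var robots = robotsParser('http://example.com/robots.txt', 'User-agent: *\nDisallow: /');

robots.isDisallowed('http://example.com/page', 'SomeBot');           // true (falls back to the * rules)
robots.isExplicitlyDisallowed('http://example.com/page', 'SomeBot'); // false (SomeBot is never named explicitly)
```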
