Skip to content

Commit d5f8b28

Browse files
add explicit disallow feature
1 parent 21c66bf commit d5f8b28

File tree

4 files changed

+89
-9
lines changed

4 files changed

+89
-9
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ The parser currently supports:
66

77
- User-agent:
88
- Allow:
9-
- Disallow:
9+
- Disallow (with explicit mode support):
1010
- Sitemap:
1111
- Crawl-delay:
1212
- Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
4141
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
4242
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
4343
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
44+
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
4445
robots.getCrawlDelay('Sams-Bot/1.0'); // 1
4546
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
4647
robots.getPreferredHost(); // example.com
@@ -54,11 +55,12 @@ Returns true if crawling the specified URL is allowed for the specified user-age
5455

5556
This will return `undefined` if the URL isn't valid for this robots.txt.
5657

57-
### isDisallowed(url, [ua])
58+
### isDisallowed(url, [ua], [explicit])
5859

5960
**boolean or undefined**
6061

6162
Returns true if crawling the specified URL is not allowed for the specified user-agent.
63+
In explicit mode, rules declared under the wildcard user-agent (`*`) are ignored; only rules that name the given user-agent are considered.
6264

6365
This will return `undefined` if the URL isn't valid for this robots.txt.
6466

Robots.js

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
361361
this._preferredHost = url;
362362
};
363363

364-
Robots.prototype._getRule = function (url, ua) {
364+
Robots.prototype._getRule = function (url, ua, explicit) {
365365
var parsedUrl = parseUrl(url) || {};
366366
var userAgent = formatUserAgent(ua || '*');
367367

@@ -374,7 +374,12 @@ Robots.prototype._getRule = function (url, ua) {
374374
return;
375375
}
376376

377-
var rules = this._rules[userAgent] || this._rules['*'] || [];
377+
var rules = this._rules[userAgent];
378+
if (!explicit) {
379+
rules = rules || this._rules['*']
380+
}
381+
rules = rules || []
382+
378383
var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
379384
var rule = findRule(path, rules);
380385

@@ -422,16 +427,51 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
422427
};
423428

424429
/**
425-
* Returns the opposite of isAllowed()
426-
*
430+
* In standard mode, it returns the opposite of isAllowed().
431+
* In explicit mode, it will return:
432+
* - true if the agent is explicitly disallowed (wildcard not included),
433+
* - throws an error if the user agent is not specified,
434+
* - and false otherwise.
427435
* @param {string} url
428436
* @param {string} ua
429437
* @return {boolean}
430438
*/
431-
Robots.prototype.isDisallowed = function (url, ua) {
432-
return !this.isAllowed(url, ua);
439+
Robots.prototype.isDisallowed = function (url, ua, explicit) {
440+
if ((explicit === true) && (ua === undefined)) {
441+
throw new Error("User Agent must be specified in explicit mode")
442+
}
443+
444+
var rule = this._getRule(url, ua, explicit);
445+
if (typeof rule === 'undefined') {
446+
return true;
447+
}
448+
return !(!rule || rule.allow);
433449
};
434450

451+
Robots.prototype.isExplicitlyDisallowed = function(url, ua) {
452+
var parsedUrl = parseUrl(url) || {};
453+
var userAgent = formatUserAgent(ua);
454+
455+
// The base URL must match otherwise this robots.txt is not valid for it.
456+
if (
457+
parsedUrl.protocol !== this._url.protocol ||
458+
parsedUrl.hostname !== this._url.hostname ||
459+
parsedUrl.port !== this._url.port
460+
) {
461+
return;
462+
}
463+
464+
var rules = this._rules[userAgent] || [];
465+
var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
466+
var rule = findRule(path, rules);
467+
468+
if (typeof rule === 'undefined') {
469+
return;
470+
}
471+
472+
return !(!rule || rule.allow);
473+
}
474+
435475
/**
436476
* Gets the crawl delay if there is one.
437477
*

index.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ declare module 'robots-parser';
22

33
interface Robot {
44
isAllowed(url: string, ua?: string): boolean | undefined;
5-
isDisallowed(url: string, ua?: string): boolean | undefined;
5+
isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
66
getMatchingLineNumber(url: string, ua?: string): number;
77
getCrawlDelay(ua?: string): number | undefined;
88
getSitemaps(): string[];

test/Robots.js

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,4 +861,42 @@ describe('Robots', function () {
861861

862862
testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
863863
});
864+
865+
it('should not be disallowed when wildcard is used in explicit mode', function () {
866+
var contents = [
867+
'User-agent: *',
868+
'Disallow: /',
869+
].join('\n')
870+
871+
var url = 'https://www.example.com/hello'
872+
var userAgent = 'SomeBot';
873+
var robots = robotsParser(url, contents);
874+
875+
expect(robots.isDisallowed(url, userAgent, true)).to.equal(false)
876+
})
877+
878+
it('should be disallowed when user agent equal robots rule in explicit mode', function () {
879+
var contents = [
880+
'User-agent: SomeBot',
881+
'Disallow: /',
882+
].join('\n')
883+
884+
var url = 'https://www.example.com/hello'
885+
var userAgent = 'SomeBot';
886+
var robots = robotsParser(url, contents);
887+
888+
expect(robots.isDisallowed(url, userAgent, true)).to.equal(true)
889+
})
890+
891+
it('should throw an error when user agent is not set in explicit mode', function () {
892+
var contents = [
893+
'User-agent: SomeBot',
894+
'Disallow: /',
895+
].join('\n')
896+
897+
var url = 'https://www.example.com/hello'
898+
var robots = robotsParser(url, contents);
899+
900+
expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw("User Agent must be specified in explicit mode")
901+
})
864902
});

0 commit comments

Comments
 (0)