
Commit f07168c

Add explicit disallow feature (#36)
* add explicit disallow feature isExplicitlyDisallowed()
1 parent 982657e commit f07168c

4 files changed: +63 -6 lines


Diff for: README.md

+9 -1

```diff
@@ -6,7 +6,7 @@ The parser currently supports:
 
 - User-agent:
 - Allow:
-- Disallow:
+- Disallow (with explicit mode support):
 - Sitemap:
 - Crawl-delay:
 - Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
 robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
 robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
 robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
+robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
 robots.getCrawlDelay('Sams-Bot/1.0'); // 1
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
 robots.getPreferredHost(); // example.com
@@ -62,6 +63,13 @@ Returns true if crawling the specified URL is not allowed for the specified user
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
+### isExplicitlyDisallowed(url, ua)
+
+**boolean or undefined**
+
+Returns true if the URL is explicitly disallowed for the specified user agent (user agent wildcards are discarded).
+
+This will return `undefined` if the URL isn't valid for this robots.txt.
 ### getMatchingLineNumber(url, [ua])
 
 **number or undefined**
```
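A short usage sketch of the difference (the robots.txt contents and bot name below are invented for illustration): a rule group under `User-agent: *` still affects `isDisallowed`, but is discarded by `isExplicitlyDisallowed`.

```js
var robotsParser = require('robots-parser');

// Hypothetical robots.txt with only a wildcard group:
var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /private/'
].join('\n'));

robots.isDisallowed('http://www.example.com/private/a.html', 'SomeBot');           // true, via *
robots.isExplicitlyDisallowed('http://www.example.com/private/a.html', 'SomeBot'); // false, wildcard discarded
```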

Diff for: Robots.js

+27 -5

```diff
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
 	this._preferredHost = url;
 };
 
-Robots.prototype._getRule = function (url, ua) {
+Robots.prototype._getRule = function (url, ua, explicit) {
 	var parsedUrl = parseUrl(url) || {};
 	var userAgent = formatUserAgent(ua || '*');
 
@@ -374,7 +374,12 @@ Robots.prototype._getRule = function (url, ua) {
 		return;
 	}
 
-	var rules = this._rules[userAgent] || this._rules['*'] || [];
+	var rules = this._rules[userAgent];
+	if (!explicit) {
+		rules = rules || this._rules['*'];
+	}
+	rules = rules || [];
+
 	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
 	var rule = findRule(path, rules);
 
@@ -392,7 +397,7 @@ Robots.prototype._getRule = function (url, ua) {
  * @return {boolean?}
  */
 Robots.prototype.isAllowed = function (url, ua) {
-	var rule = this._getRule(url, ua);
+	var rule = this._getRule(url, ua, false);
 
 	if (typeof rule === 'undefined') {
 		return;
@@ -416,7 +421,7 @@ Robots.prototype.isAllowed = function (url, ua) {
  * @return {number?}
  */
 Robots.prototype.getMatchingLineNumber = function (url, ua) {
-	var rule = this._getRule(url, ua);
+	var rule = this._getRule(url, ua, false);
 
 	return rule ? rule.lineNumber : -1;
 };
@@ -425,13 +430,30 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
  * Returns the opposite of isAllowed()
  *
  * @param {string} url
- * @param {string} ua
+ * @param {string?} ua
  * @return {boolean}
  */
 Robots.prototype.isDisallowed = function (url, ua) {
 	return !this.isAllowed(url, ua);
 };
 
+/**
+ * Returns true if the URL is explicitly disallowed
+ * for the specified user agent (user agent wildcards are discarded).
+ *
+ * This will return undefined if the URL is not valid for this robots.txt file.
+ * @param {string} url
+ * @param {string} ua
+ * @return {boolean?}
+ */
+Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
+	var rule = this._getRule(url, ua, true);
+	if (typeof rule === 'undefined') {
+		return;
+	}
+	return !(!rule || rule.allow);
+};
+
 /**
  * Gets the crawl delay if there is one.
  *
```
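The heart of the change is the rule-group lookup in `_getRule`. A minimal standalone sketch of that lookup order (a hypothetical `selectRules` helper; the real code reads `this._rules` in place):

```js
// In normal mode a missing named group falls back to the '*' group;
// in explicit mode a missing named group yields no rules at all.
function selectRules(rulesByAgent, userAgent, explicit) {
	var rules = rulesByAgent[userAgent];
	if (!explicit) {
		rules = rules || rulesByAgent['*'];
	}
	return rules || [];
}

var byAgent = { '*': [{ pattern: '/', allow: false }] };
selectRules(byAgent, 'somebot', false); // [{ pattern: '/', allow: false }]
selectRules(byAgent, 'somebot', true);  // []
```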

Diff for: index.d.ts

+1

```diff
@@ -3,6 +3,7 @@ declare module 'robots-parser';
 interface Robot {
 	isAllowed(url: string, ua?: string): boolean | undefined;
 	isDisallowed(url: string, ua?: string): boolean | undefined;
+	isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
 	getMatchingLineNumber(url: string, ua?: string): number;
 	getCrawlDelay(ua?: string): number | undefined;
 	getSitemaps(): string[];
```

Diff for: test/Robots.js

+26

```diff
@@ -861,4 +861,30 @@ describe('Robots', function () {
 
 		testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
 	});
+
+	it('should not be disallowed when wildcard is used in explicit mode', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
+	});
+
+	it('should be disallowed when the user agent matches the rule in explicit mode', function () {
+		var contents = [
+			'User-agent: SomeBot',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
+	});
 });
```
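Combining the two test cases into one file shows how explicit mode treats mixed groups (contents invented for illustration; behavior follows the lookup logic in `_getRule` above):

```js
var robotsParser = require('robots-parser');

// Hypothetical robots.txt with both a wildcard and a bot-specific group:
var robots = robotsParser('https://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /',
	'',
	'User-agent: SomeBot',
	'Disallow: /private/'
].join('\n'));

robots.isExplicitlyDisallowed('https://www.example.com/private/x', 'SomeBot'); // true, own group matches
robots.isExplicitlyDisallowed('https://www.example.com/hello', 'SomeBot');     // false, no matching rule
robots.isExplicitlyDisallowed('https://www.example.com/hello', 'OtherBot');    // false, only * would match
```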
