10 changes: 9 additions & 1 deletion README.md
@@ -6,7 +6,7 @@ The parser currently supports:

- User-agent:
- Allow:
- Disallow:
- Disallow (with explicit mode support):
- Sitemap:
- Crawl-delay:
- Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
robots.getCrawlDelay('Sams-Bot/1.0'); // 1
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
robots.getPreferredHost(); // example.com
@@ -62,6 +63,13 @@ Returns true if crawling the specified URL is not allowed for the specified user

This will return `undefined` if the URL isn't valid for this robots.txt.

### isExplicitlyDisallowed(url, ua)

**boolean or undefined**

Returns true if the URL is explicitly disallowed for the specified user agent (user agent wildcards are discarded).

This will return `undefined` if the URL isn't valid for this robots.txt.

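For example, as a minimal sketch of how it differs from `isDisallowed` (the robots.txt content and bot name below are hypothetical, not the example used elsewhere in this README):

```js
// Parser created from a hypothetical robots.txt that contains only a wildcard group:
//   User-agent: *
//   Disallow: /
robots.isDisallowed('http://www.example.com/test.html', 'Sams-Bot/1.0');           // true (matched via the * group)
robots.isExplicitlyDisallowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // false (no group names Sams-Bot)
```
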
### getMatchingLineNumber(url, [ua])

**number or undefined**
32 changes: 27 additions & 5 deletions Robots.js
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
this._preferredHost = url;
};

Robots.prototype._getRule = function (url, ua) {
Robots.prototype._getRule = function (url, ua, explicit) {
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua || '*');

@@ -374,7 +374,12 @@ Robots.prototype._getRule = function (url, ua) {
return;
}

var rules = this._rules[userAgent] || this._rules['*'] || [];
var rules = this._rules[userAgent];
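// In explicit mode, only rules declared for this exact user agent apply; the wildcard (*) group is not used as a fallback.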
if (!explicit) {
rules = rules || this._rules['*'];
}
rules = rules || [];

var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);

@@ -392,7 +397,7 @@
* @return {boolean?}
*/
Robots.prototype.isAllowed = function (url, ua) {
var rule = this._getRule(url, ua);
var rule = this._getRule(url, ua, false);

if (typeof rule === 'undefined') {
return;
@@ -416,7 +421,7 @@ Robots.prototype.isAllowed = function (url, ua) {
* @return {number?}
*/
Robots.prototype.getMatchingLineNumber = function (url, ua) {
var rule = this._getRule(url, ua);
var rule = this._getRule(url, ua, false);

return rule ? rule.lineNumber : -1;
};
@@ -425,13 +430,30 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
* Returns the opposite of isAllowed()
*
* @param {string} url
* @param {string} ua
* @param {string?} ua
* @return {boolean}
*/
Robots.prototype.isDisallowed = function (url, ua) {
return !this.isAllowed(url, ua);
};

/**
* Returns true if the URL is explicitly disallowed
* for the specified user agent (user agent wildcards are discarded).
*
* This will return undefined if the URL is not valid for this robots.txt file.
* @param {string} url
* @param {string} ua
* @return {boolean?}
*/
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
var rule = this._getRule(url, ua, true);
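// The URL isn't valid for this robots.txt file (different origin).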
if (typeof rule === 'undefined') {
return;
}
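// Not explicitly disallowed when no rule for this exact user agent matches, or when the matching rule is an Allow.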
return !(!rule || rule.allow);
};

/**
* Gets the crawl delay if there is one.
*
1 change: 1 addition & 0 deletions index.d.ts
@@ -3,6 +3,7 @@ declare module 'robots-parser';
interface Robot {
isAllowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string): boolean | undefined;
isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
getMatchingLineNumber(url: string, ua?: string): number;
getCrawlDelay(ua?: string): number | undefined;
getSitemaps(): string[];
26 changes: 26 additions & 0 deletions test/Robots.js
@@ -861,4 +861,30 @@ describe('Robots', function () {

testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
});

it('should not be disallowed when a wildcard is used in explicit mode', function () {
var contents = [
'User-agent: *',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
});

it('should be disallowed when the user agent matches the rule in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
});
});