
Commit 3db2cce

Read the robots.txt while creating sitemaps
If a robots.txt file already exists, read it and ignore ROBOTS_EXCLUSIONS.
1 parent 8833af2 commit 3db2cce
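
The new behaviour leans on urllib.robotparser. A minimal standalone sketch of the idea, outside of Nikola (the file path, user agent and URL below are illustrative, not taken from the commit):

    import urllib.robotparser as robotparser

    def parse_robotstxt(path):
        # Build a parser from the rules in an existing robots.txt.
        robot = robotparser.RobotFileParser()
        with open(path, 'r', encoding='utf-8') as fh:
            robot.parse(fh.readlines())
        return robot

    # The idea: a URL only belongs in the sitemap if the robots.txt rules
    # say crawlers may fetch it.
    rules = parse_robotstxt('output/robots.txt')
    print(rules.can_fetch('*', 'https://example.com/drafts/page.html'))

Because the file's own rules are used directly, the ROBOTS_EXCLUSIONS setting is skipped whenever the file is present.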


1 file changed (+22, -4 lines)


Diff for: nikola/plugins/task/sitemap/__init__.py

@@ -40,7 +40,7 @@
 import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
-from nikola.utils import apply_filters, config_changed, encodelink
+from nikola.utils import apply_filters, config_changed, encodelink, get_asset_path
 
 
 urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
@@ -118,6 +118,7 @@ def gen_tasks(self):
             "base_url": self.site.config["BASE_URL"],
             "site_url": self.site.config["SITE_URL"],
             "output_folder": self.site.config["OUTPUT_FOLDER"],
+            "files_folders": self.site.config['FILES_FOLDERS'],
             "strip_indexes": self.site.config["STRIP_INDEXES"],
             "index_file": self.site.config["INDEX_FILE"],
             "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
@@ -140,9 +141,8 @@ def gen_tasks(self):
         sitemapindex = {}
         urlset = {}
 
-        def scan_locs():
+        def scan_locs(robots_rules):
             """Scan site locations."""
-            robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
             for root, dirs, files in os.walk(output, followlinks=True):
                 if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                     continue  # Totally empty, not on sitemap
@@ -228,6 +228,16 @@ def scan_locs():
                         alternates.append(alternates_format.format(lang, alt_url))
                     urlset[loc] = loc_format.format(encodelink(loc), lastmod, '\n'.join(alternates))
 
+        def parse_robotstxt(path):
+            robot = robotparser.RobotFileParser()
+            fh = io.open(path, 'r', encoding='utf-8')
+            rules = fh.readlines()
+            if sys.version_info[0] == 2:
+                rules = [ line.encode('utf-8') for line in rules ]
+            fh.close()
+            robot.parse(rules)
+            return robot
+
         def parse_robots_exclusions(exclusions):
             """Parse rules to check fetchable."""
             rules = []
@@ -268,7 +278,12 @@ def scan_locs_task():
             Other tasks can depend on this output, instead of having
             to scan locations.
             """
-            scan_locs()
+            robotstxt = get_asset_path("robots.txt", [], files_folders=kw["files_folders"])
+            if robotstxt:
+                robots_rules = parse_robotstxt(robotstxt)
+            else:
+                robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
+            scan_locs(robots_rules)
 
             # Generate a list of file dependencies for the actual generation
             # task, so rebuilds are triggered. (Issue #1032)
@@ -289,6 +304,9 @@ def scan_locs_task():
                 if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
                     file_dep.append(p + 'index.html')
 
+            if robotstxt:
+                file_dep.append(os.path.join(output, 'robots.txt'))
+
             return {'file_dep': file_dep}
 
         yield {
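
For the new path to trigger, the robots.txt found by get_asset_path has to be one of the site's assets. A plausible setup, assuming Nikola's default FILES_FOLDERS mapping (the Disallow rule is only an example):

    # conf.py -- with the default mapping, everything under "files/" is copied
    # into the output root, so files/robots.txt is deployed next to sitemap.xml.
    FILES_FOLDERS = {'files': ''}

With a files/robots.txt containing, say, "User-agent: *" and "Disallow: /drafts/", the sitemap task now parses that file and keeps the disallowed paths out of the sitemap; if no robots.txt is found, it falls back to ROBOTS_EXCLUSIONS exactly as before.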
