import urllib.robotparser as robotparser  # NOQA

from nikola.plugin_categories import LateTask
-from nikola.utils import apply_filters, config_changed, encodelink
+from nikola.utils import apply_filters, config_changed, encodelink, get_asset_path


urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
@@ -118,6 +118,7 @@ def gen_tasks(self):
            "base_url": self.site.config["BASE_URL"],
            "site_url": self.site.config["SITE_URL"],
            "output_folder": self.site.config["OUTPUT_FOLDER"],
+            "files_folders": self.site.config['FILES_FOLDERS'],
            "strip_indexes": self.site.config["STRIP_INDEXES"],
            "index_file": self.site.config["INDEX_FILE"],
            "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
@@ -140,9 +141,8 @@ def gen_tasks(self):
        sitemapindex = {}
        urlset = {}

-        def scan_locs():
+        def scan_locs(robots_rules):
            """Scan site locations."""
-            robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
            for root, dirs, files in os.walk(output, followlinks=True):
                if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                    continue  # Totally empty, not on sitemap
@@ -228,6 +228,16 @@ def scan_locs():
                        alternates.append(alternates_format.format(lang, alt_url))
                    urlset[loc] = loc_format.format(encodelink(loc), lastmod, '\n'.join(alternates))

+        def parse_robotstxt(path):
+            robot = robotparser.RobotFileParser()
+            fh = io.open(path, 'r', encoding='utf-8')
+            rules = fh.readlines()
+            if sys.version_info[0] == 2:
+                rules = [line.encode('utf-8') for line in rules]
+            fh.close()
+            robot.parse(rules)
+            return robot
+
        def parse_robots_exclusions(exclusions):
            """Parse rules to check fetchable."""
            rules = []
@@ -268,7 +278,12 @@ def scan_locs_task():
            Other tasks can depend on this output, instead of having
            to scan locations.
            """
-            scan_locs()
+            robotstxt = get_asset_path("robots.txt", [], files_folders=kw["files_folders"])
+            if robotstxt:
+                robots_rules = parse_robotstxt(robotstxt)
+            else:
+                robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
+            scan_locs(robots_rules)

            # Generate a list of file dependencies for the actual generation
            # task, so rebuilds are triggered. (Issue #1032)
@@ -289,6 +304,9 @@ def scan_locs_task():
                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
                    file_dep.append(p + 'index.html')

+            if robotstxt:
+                file_dep.append(os.path.join(output, 'robots.txt'))
+
            return {'file_dep': file_dep}

        yield {
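
For context, here is a minimal standalone sketch of the mechanism this patch relies on: feeding robots.txt lines to the stdlib urllib.robotparser and asking can_fetch() per URL, which is how scan_locs() now decides whether a location belongs in the sitemap. It assumes Python 3 (the diff's sys.version_info branch covers Python 2), and the sample rules and example.com URLs are illustrative, not taken from the PR:

# Standalone sketch (not Nikola code): parse robots.txt rules, then check URLs.
import urllib.robotparser as robotparser

rules = robotparser.RobotFileParser()
rules.parse([
    "User-agent: *",       # hypothetical rules for illustration
    "Disallow: /private/",
])

# A sitemap pass would skip any location the rules reject:
for loc in ("https://example.com/blog/", "https://example.com/private/page.html"):
    print(loc, rules.can_fetch("*", loc))
# https://example.com/blog/ True
# https://example.com/private/page.html False

The Python 2 branch in parse_robotstxt presumably encodes the lines first because the old robotparser module expects byte strings; on Python 3 the decoded lines can be passed straight to parse().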