@@ -52,18 +52,29 @@ def _check_mirror_exists(mirror):
5252 .format (mirror ))
5353
5454
55- def _format_download_uri (etextno , mirror = None ):
55+ def _format_download_uri (etextno , mirror = None , prefer_ascii = False ):
5656 """Returns the download location on the Project Gutenberg servers for a
5757 given text.
5858
59+ Set prefer_ascii to True to try the plaintext us-ascii file first (the
60+ former default); by default UTF-8 is tried first, then 8-bit, then
61+ plaintext us-ascii.
62+
5963 Raises:
6064 UnknownDownloadUri: If no download location can be found for the text.
6165 """
6266 uri_root = mirror or _GUTENBERG_MIRROR
6367 uri_root = uri_root .strip ().rstrip ('/' )
6468 _check_mirror_exists (uri_root )
6569
66- extensions = ('.txt' , '-8.txt' , '-0.txt' )
70+ # See https://www.gutenberg.org/files/ for details about the available
71+ # extensions:
72+ # - .txt is plaintext us-ascii
73+ # - -8.txt is 8-bit plaintext, multiple encodings
74+ # - -0.txt is UTF-8
75+ ascii_first = ('.txt' , '-0.txt' , '-8.txt' )
76+ utf8_first = ('-0.txt' , '-8.txt' , '.txt' )
77+ extensions = ascii_first if prefer_ascii else utf8_first
6778 for extension in extensions :
6879 path = _etextno_to_uri_subdirectory (etextno )
6980 uri = '{root}/{path}/{etextno}{extension}' .format (
@@ -79,7 +90,7 @@ def _format_download_uri(etextno, mirror=None):
7990 .format (etextno , uri_root ))
8091
8192
82- def load_etext (etextno , refresh_cache = False , mirror = None ):
93+ def load_etext (etextno , refresh_cache = False , mirror = None , prefer_ascii = False ):
8394 """Returns a unicode representation of the full body of a Project Gutenberg
8495 text. After making an initial remote call to Project Gutenberg's servers,
8596 the text is persisted locally.
@@ -92,8 +103,18 @@ def load_etext(etextno, refresh_cache=False, mirror=None):
92103 remove (cached )
93104 if not os .path .exists (cached ):
94105 makedirs (os .path .dirname (cached ))
95- download_uri = _format_download_uri (etextno , mirror )
106+ download_uri = _format_download_uri (etextno , mirror , prefer_ascii )
96107 response = requests .get (download_uri )
108+ # Make sure the text is decoded correctly before it is cached as UTF-8.
109+ # Some ebooks or mirrors advertise the wrong encoding, which breaks
110+ # downstream usage. For example, #55517 from aleph.gutenberg.org:
111+ #
112+ # from gutenberg.acquire import load_etext
113+ # print(load_etext(55517, refresh_cache=True)[0:1000])
114+ #
115+ # response.encoding will be 'ISO-8859-1' while the file is UTF-8
116+ if response .encoding != response .apparent_encoding :
117+ response .encoding = response .apparent_encoding
97118 text = response .text
98119 with closing (gzip .open (cached , 'w' )) as cache :
99120 cache .write (text .encode ('utf-8' ))
@@ -115,12 +136,15 @@ def _main():
115136 parser .add_argument ('etextno' , type = int )
116137 parser .add_argument ('outfile' , type = FileType ('w' ))
117138 parser .add_argument ('--mirror' , '-m' , type = str )
139+ parser .add_argument ('--prefer-ascii' , '-a' , type = bool , default = False )
118140 args = parser .parse_args ()
119141
120142 mirror = args .mirror or os .environ .get ('GUTENBERG_MIRROR' )
121143
122144 try :
123- text = load_etext (args .etextno , mirror = mirror )
145+ text = load_etext (args .etextno ,
146+ mirror = mirror ,
147+ prefer_ascii = args .prefer_ascii )
124148 with reopen_encoded (args .outfile , 'w' , 'utf8' ) as outfile :
125149 outfile .write (text )
126150 except Error as error :
0 commit comments