Skip to content
This repository was archived by the owner on Jan 12, 2023. It is now read-only.

Commit 36cc023

Browse files
lissyx authored and c-w committed
Changing default ordering of extensions for fetching ebooks (#103)
Resolves #102
1 parent e76d96e commit 36cc023

File tree

3 files changed

+105
-6
lines changed

3 files changed

+105
-6
lines changed

gutenberg/acquire/text.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,29 @@ def _check_mirror_exists(mirror):
5252
.format(mirror))
5353

5454

55-
def _format_download_uri(etextno, mirror=None):
55+
def _format_download_uri(etextno, mirror=None, prefer_ascii=False):
5656
"""Returns the download location on the Project Gutenberg servers for a
5757
given text.
5858
59+
Use prefer_ascii to control whether the plaintext us-ascii file is fetched
60+
first (the previous default behavior) or whether UTF-8 is preferred, then
61+
8-bit, then plaintext us-ascii (the current default).
62+
5963
Raises:
6064
UnknownDownloadUri: If no download location can be found for the text.
6165
"""
6266
uri_root = mirror or _GUTENBERG_MIRROR
6367
uri_root = uri_root.strip().rstrip('/')
6468
_check_mirror_exists(uri_root)
6569

66-
extensions = ('.txt', '-8.txt', '-0.txt')
70+
# Check https://www.gutenberg.org/files/ for details about available
71+
# extensions:
72+
# - .txt is plaintext us-ascii
73+
# - -8.txt is 8-bit plaintext, multiple encodings
74+
# - -0.txt is UTF-8
75+
ascii_first = ('.txt', '-0.txt', '-8.txt')
76+
utf8_first = ('-0.txt', '-8.txt', '.txt')
77+
extensions = ascii_first if prefer_ascii else utf8_first
6778
for extension in extensions:
6879
path = _etextno_to_uri_subdirectory(etextno)
6980
uri = '{root}/{path}/{etextno}{extension}'.format(
@@ -79,7 +90,7 @@ def _format_download_uri(etextno, mirror=None):
7990
.format(etextno, uri_root))
8091

8192

82-
def load_etext(etextno, refresh_cache=False, mirror=None):
93+
def load_etext(etextno, refresh_cache=False, mirror=None, prefer_ascii=False):
8394
"""Returns a unicode representation of the full body of a Project Gutenberg
8495
text. After making an initial remote call to Project Gutenberg's servers,
8596
the text is persisted locally.
@@ -92,8 +103,18 @@ def load_etext(etextno, refresh_cache=False, mirror=None):
92103
remove(cached)
93104
if not os.path.exists(cached):
94105
makedirs(os.path.dirname(cached))
95-
download_uri = _format_download_uri(etextno, mirror)
106+
download_uri = _format_download_uri(etextno, mirror, prefer_ascii)
96107
response = requests.get(download_uri)
108+
# Ensure proper UTF-8 saving. There might be instances of ebooks or
109+
# mirrors which advertise a broken encoding, and this will break
110+
# downstream usages. For example, #55517 from aleph.gutenberg.org:
111+
#
112+
# from gutenberg.acquire import load_etext
113+
# print(load_etext(55517, refresh_cache=True)[0:1000])
114+
#
115+
# response.encoding will be 'ISO-8859-1' while the file is UTF-8
116+
if response.encoding != response.apparent_encoding:
117+
response.encoding = response.apparent_encoding
97118
text = response.text
98119
with closing(gzip.open(cached, 'w')) as cache:
99120
cache.write(text.encode('utf-8'))
@@ -115,12 +136,15 @@ def _main():
115136
parser.add_argument('etextno', type=int)
116137
parser.add_argument('outfile', type=FileType('w'))
117138
parser.add_argument('--mirror', '-m', type=str)
139+
parser.add_argument('--prefer-ascii', '-a', type=bool, default=False)
118140
args = parser.parse_args()
119141

120142
mirror = args.mirror or os.environ.get('GUTENBERG_MIRROR')
121143

122144
try:
123-
text = load_etext(args.etextno, mirror=mirror)
145+
text = load_etext(args.etextno,
146+
mirror=mirror,
147+
prefer_ascii=args.prefer_ascii)
124148
with reopen_encoded(args.outfile, 'w', 'utf8') as outfile:
125149
outfile.write(text)
126150
except Error as error:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def requirements_for(version=None):
3939

4040
setup(
4141
name='Gutenberg',
42-
version='0.6.1',
42+
version='0.7.0',
4343
author='Clemens Wolff',
4444
author_email='[email protected]',
4545
packages=find_packages(exclude=['tests']),

tests/test_acquire.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,81 @@ def test_unreachable_mirror(self):
8080
with self.assertRaises(UnknownDownloadUriException):
8181
text.load_etext(1)
8282

83+
class TestExtensionsLoadEtext(unittest.TestCase):
84+
def setUp(self):
85+
self._original_head = text.requests.head
86+
self._original_check = text._check_mirror_exists
87+
88+
def tearDown(self):
89+
text.requests.head = self._original_head
90+
text._check_mirror_exists = self._original_check
91+
92+
def request_head_response(self, valid_files):
93+
response = namedtuple('Response', 'ok')
94+
95+
def head(*args, **kwargs):
96+
req_file = args[0].split('/')[-1]
97+
return response(req_file in valid_files)
98+
text.requests.head = head
99+
100+
def mirror_exist(*args, **kwargs):
101+
return response(True)
102+
text._check_mirror_exists = mirror_exist
103+
104+
def test_extensions_order_utf8_only(self):
105+
utf8_filename = '12345-0.txt'
106+
self.request_head_response(valid_files=[utf8_filename])
107+
108+
extensions = text._format_download_uri(12345)
109+
self.assertEqual(extensions.split('/')[-1], utf8_filename)
110+
111+
extensions = text._format_download_uri(12345, prefer_ascii=False)
112+
self.assertEqual(extensions.split('/')[-1], utf8_filename)
113+
114+
def test_extensions_order_ascii_only(self):
115+
ascii_filename = '12345.txt'
116+
self.request_head_response(valid_files=[ascii_filename])
117+
118+
extensions = text._format_download_uri(12345)
119+
self.assertEqual(extensions.split('/')[-1], ascii_filename)
120+
121+
extensions = text._format_download_uri(12345, prefer_ascii=True)
122+
self.assertEqual(extensions.split('/')[-1], ascii_filename)
123+
124+
def test_extensions_order_utf8_first(self):
125+
utf8_filename = '12345-0.txt'
126+
all_files = ['12345.txt', '12345-8.txt', '12345-0.txt']
127+
self.request_head_response(valid_files=all_files)
128+
129+
extensions = text._format_download_uri(12345)
130+
self.assertEqual(extensions.split('/')[-1], utf8_filename)
131+
132+
extensions = text._format_download_uri(12345, prefer_ascii=False)
133+
self.assertEqual(extensions.split('/')[-1], utf8_filename)
134+
135+
def test_extensions_order_ascii_first(self):
136+
ascii_filename = '12345.txt'
137+
all_files = ['12345-8.txt', '12345-0.txt', '12345.txt']
138+
self.request_head_response(valid_files=all_files)
139+
140+
extensions = text._format_download_uri(12345)
141+
self.assertNotEqual(extensions.split('/')[-1], ascii_filename)
142+
143+
extensions = text._format_download_uri(12345, prefer_ascii=True)
144+
self.assertEqual(extensions.split('/')[-1], ascii_filename)
145+
146+
def test_extensions_order_eightbit_first(self):
147+
eightbit_filename = '12345-8.txt'
148+
ascii_filename = '12345.txt'
149+
all_files = ['12345-8.txt', '12345.txt']
150+
self.request_head_response(valid_files=all_files)
151+
152+
extensions = text._format_download_uri(12345)
153+
self.assertEqual(extensions.split('/')[-1], eightbit_filename)
154+
155+
extensions = text._format_download_uri(12345, prefer_ascii=True)
156+
self.assertEqual(extensions.split('/')[-1], ascii_filename)
157+
83158

84159
if __name__ == '__main__':
85160
unittest.main()

0 commit comments

Comments
 (0)