Skip to content

Commit ce8731f

Browse files
Don't try to download missing files (#316)
* Don't try to download missing files. Fix dataset URL path for experiment downloading.
* Add SUPER fancy fast mirror selection. Should save the average user over 1 second of time!
1 parent 3c67e0e commit ce8731f

File tree

3 files changed

+33
-34
lines changed

3 files changed

+33
-34
lines changed

minerl/data/download.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
logger = logging.getLogger(__name__)
2424

2525

26-
27-
2826
def download(directory=None, resolution='low', texture_pack=0, update_environment_variables=True, disable_cache=False,
2927
experiment=None, minimal=False):
3028
"""Downloads MineRLv0 to specified directory. If directory is None, attempts to
@@ -79,7 +77,7 @@ def download(directory=None, resolution='low', texture_pack=0, update_environmen
7977
mirrors = [
8078
"https://minerl.s3.amazonaws.com/",
8179
"https://minerl-asia.s3.amazonaws.com/",
82-
"https://minerl-europe.s3.amazonaws.com/"] # , "https://router2.sneakywines.me/"]
80+
"https://minerl-europe.s3.amazonaws.com/"]
8381

8482
if experiment is None:
8583
min_str = '_minimal' if minimal else ''
@@ -91,7 +89,7 @@ def download(directory=None, resolution='low', texture_pack=0, update_environmen
9189
if os.path.exists(os.path.join(directory, experiment)):
9290
logger.warning("{} exists - skipping re-download!".format(os.path.join(directory, experiment)))
9391
return directory
94-
filename = "minerl/v{}/{}.tar".format(DATA_VERSION, experiment)
92+
filename = "v{}/{}.tar".format(DATA_VERSION, experiment)
9593
urls = [mirror + filename for mirror in mirrors]
9694
try:
9795
logger.info("Fetching download hash ...")
@@ -103,10 +101,9 @@ def download(directory=None, resolution='low', texture_pack=0, update_environmen
103101
os.makedirs(os.path.dirname(dest_file), exist_ok=True)
104102
download_with_resume(urls, dest_file)
105103
except HTTPError as e:
106-
logger.error("HTTP error encountered when downloading")
104+
logger.error("HTTP {} error encountered when downloading files!".format(e.code))
107105
if experiment is not None:
108-
logger.error("is {} a valid minerl environment?".format(experiment))
109-
logger.error(e.errno)
106+
logger.error("Is \"{}\" a valid minerl environment?".format(experiment))
110107
return None
111108
except URLError as e:
112109
logger.error("URL error encountered when downloading - please try again")

minerl/data/util/__init__.py

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os.path
2+
import sys
23
import urllib
34
import atexit
45
import requests
@@ -9,6 +10,8 @@
910
import time
1011
import numpy as np
1112

13+
from urllib.error import HTTPError
14+
1215
import queue
1316
import concurrent.futures
1417
import threading
@@ -59,24 +62,28 @@ def validate_file(file_path, hash):
5962
return m.hexdigest() == hash
6063

6164

62-
def time_request(url, max_n=2, timeout=0.15):
63-
times = 0
64-
n = max_n
65-
for i in range(max_n):
66-
try:
67-
req = requests.head(url, timeout=timeout)
68-
times += req.elapsed.seconds
69-
if req.status_code != 200:
70-
n -= 1
71-
except requests.Timeout:
72-
n -= 1
73-
except (requests.exceptions.BaseHTTPError, urllib.error.URLError) as e:
74-
logging.log(logging.WARNING, e)
75-
n -= 1
76-
if n == 0:
77-
return 1000 * 1000 * 1000 + times
65+
def get_mirror(urls) -> requests.Response:
66+
# Interactive python downloads dont get fancy as_completed support =(
67+
if bool(getattr(sys, 'ps1', sys.flags.interactive)):
68+
reqs = [requests.head(url) for url in urls]
69+
successes = [req for req in reqs if req.status_code == 200]
70+
if len(successes) > 0:
71+
return min(successes, key=lambda r: r.elapsed.seconds)
72+
else:
73+
req = min(reqs, key=lambda r: r.elapsed.seconds)
74+
raise HTTPError(req.url, req.status_code, "resource not found", req.headers, None)
7875
else:
79-
return times / n
76+
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as worker_pool:
77+
futures = [worker_pool.submit(requests.head, url) for url in urls]
78+
first_request = None
79+
for future in concurrent.futures.as_completed(futures):
80+
request = future.result()
81+
first_request = request if first_request is None else first_request
82+
if request.status_code == 200:
83+
return request
84+
else:
85+
logging.warning('Mirror {} returned status code {}'.format(request.url, request.status_code))
86+
raise HTTPError(first_request.url, first_request.status_code, "resource not found", first_request.headers, None)
8087

8188

8289
def download_with_resume(urls, file_path, hash=None, timeout=10):
@@ -103,23 +110,18 @@ def download_with_resume(urls, file_path, hash=None, timeout=10):
103110
# urllib can be verbose
104111
logging.getLogger("urllib3").setLevel(logging.WARNING)
105112

106-
latency = [time_request(url) for url in urls]
107-
if min(latency) < 1000 * 1000 * 1000:
108-
i = np.argmin(latency)
109-
else:
110-
logging.warning('Re-checking mirrors, latency above 0.1s')
111-
i = np.argmin([time_request(url, timeout=30) for url in urls])
113+
mirror = get_mirror(urls)
114+
url, ping_ms = mirror.url, mirror.elapsed.microseconds/1000
112115

113-
logging.debug('Picked {}'.format(urls[i]))
114-
url = urls[i]
116+
logging.debug('Picked {} ping={}ms'.format(url, ping_ms))
115117

116118
try:
117119
logging.debug('Starting download at %.1fMB' % (first_byte / 1e6))
118120

119121
head = requests.head(url)
120122
file_size = int(head.headers['Content-length'])
121123

122-
logging.debug('File size is %s' % file_size)
124+
logging.debug('File size is %.1fMB' % (file_size / 1e6))
123125
headers = {"Range": "bytes=%s-" % first_byte}
124126

125127
disp = tqdm.tqdm(total=file_size / 1e6, desc='Download: {}'.format(url), unit='MB', )

tests/local/handler_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,4 +254,4 @@ def test_env(environment='MineRLObtainTest-v0', interactive=False):
254254

255255

256256
if __name__ == '__main__':
257-
test_env()
257+
test_wrapped_env()

0 commit comments

Comments
 (0)