11import os .path
2+ import sys
23import urllib
34import atexit
45import requests
910import time
1011import numpy as np
1112
13+ from urllib .error import HTTPError
14+
1215import queue
1316import concurrent .futures
1417import threading
@@ -59,24 +62,28 @@ def validate_file(file_path, hash):
5962 return m .hexdigest () == hash
6063
6164
def time_request(url, max_n=2, timeout=0.15):
    """Probe `url` with up to `max_n` HEAD requests and return its mean latency.

    A probe that times out, errors, or returns a non-200 status is discarded.
    When every probe fails, a huge sentinel latency is returned so that a
    caller choosing the minimum-latency mirror will avoid this URL.

    Args:
        url: URL to probe.
        max_n: number of HEAD probes to issue.
        timeout: per-probe timeout in seconds.

    Returns:
        Mean elapsed time (seconds, float) of the successful probes, or
        ``1e9 + accumulated time`` when no probe succeeded.
    """
    times = 0.0
    n = max_n
    for _ in range(max_n):
        try:
            req = requests.head(url, timeout=timeout)
            # Fix: elapsed.seconds truncates to whole seconds, which is always
            # 0 for sub-second probes (timeout here is 0.15s) and made all
            # mirrors look identical; total_seconds() keeps the fraction.
            times += req.elapsed.total_seconds()
            if req.status_code != 200:
                n -= 1
        except requests.Timeout:
            n -= 1
        # Fix: requests.exceptions.BaseHTTPError no longer exists in modern
        # requests; RequestException is the documented base class, and it also
        # covers connection errors that previously escaped uncaught.
        except (requests.exceptions.RequestException, urllib.error.URLError) as e:
            logging.log(logging.WARNING, e)
            n -= 1
    if n == 0:
        # Sentinel "infinite" latency; adding `times` preserves relative
        # ordering between fully-failed mirrors.
        return 1000 * 1000 * 1000 + times
    else:
        return times / n
def get_mirror(urls, timeout=10) -> requests.Response:
    """Return the HEAD response of the fastest mirror answering status 200.

    In an interactive interpreter the mirrors are probed sequentially
    (as_completed offers no benefit there); otherwise they are probed
    concurrently and the first 200 response wins.

    Args:
        urls: iterable of candidate mirror URLs.
        timeout: per-request timeout in seconds (new parameter with a default,
            so existing callers are unaffected; previously a hung mirror could
            block forever because no timeout was passed to requests.head).

    Returns:
        The ``requests.Response`` of the chosen mirror.

    Raises:
        HTTPError: when no mirror returns status 200, or none is reachable.
    """
    def _probe(url):
        # One HEAD probe; network-level failures are logged and reported as
        # None instead of aborting the whole mirror scan.
        try:
            return requests.head(url, timeout=timeout)
        except requests.RequestException as e:
            logging.warning('Mirror {} failed: {}'.format(url, e))
            return None

    # Interactive python downloads dont get fancy as_completed support =(
    if bool(getattr(sys, 'ps1', sys.flags.interactive)):
        reqs = [r for r in (_probe(url) for url in urls) if r is not None]
        successes = [req for req in reqs if req.status_code == 200]
        if len(successes) > 0:
            # Fix: elapsed.seconds truncates to whole seconds (0 for any fast
            # mirror), making min() arbitrary; total_seconds() keeps precision.
            return min(successes, key=lambda r: r.elapsed.total_seconds())
        elif len(reqs) > 0:
            req = min(reqs, key=lambda r: r.elapsed.total_seconds())
            raise HTTPError(req.url, req.status_code, "resource not found", req.headers, None)
        else:
            raise HTTPError(next(iter(urls), None), None, "resource not found", None, None)
    else:
        first_request = None
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as worker_pool:
            futures = [worker_pool.submit(_probe, url) for url in urls]
            for future in concurrent.futures.as_completed(futures):
                request = future.result()
                if request is None:
                    # Probe raised (connection error etc.); try the next one.
                    continue
                # NOTE: Response.__bool__ is status-based, so a non-200
                # response is falsy — the None check must stay explicit.
                first_request = request if first_request is None else first_request
                if request.status_code == 200:
                    return request
                else:
                    logging.warning('Mirror {} returned status code {}'.format(request.url, request.status_code))
        # Fix: previously first_request could be None here (first future
        # raised), turning the raise below into an AttributeError.
        if first_request is None:
            raise HTTPError(next(iter(urls), None), None, "resource not found", None, None)
        raise HTTPError(first_request.url, first_request.status_code, "resource not found", first_request.headers, None)
8087
8188
8289def download_with_resume (urls , file_path , hash = None , timeout = 10 ):
@@ -103,23 +110,18 @@ def download_with_resume(urls, file_path, hash=None, timeout=10):
103110 # urllib can be verbose
104111 logging .getLogger ("urllib3" ).setLevel (logging .WARNING )
105112
106- latency = [time_request (url ) for url in urls ]
107- if min (latency ) < 1000 * 1000 * 1000 :
108- i = np .argmin (latency )
109- else :
110- logging .warning ('Re-checking mirrors, latency above 0.1s' )
111- i = np .argmin ([time_request (url , timeout = 30 ) for url in urls ])
113+ mirror = get_mirror (urls )
114+ url , ping_ms = mirror .url , mirror .elapsed .microseconds / 1000
112115
113- logging .debug ('Picked {}' .format (urls [i ]))
114- url = urls [i ]
116+ logging .debug ('Picked {} ping={}ms' .format (url , ping_ms ))
115117
116118 try :
117119 logging .debug ('Starting download at %.1fMB' % (first_byte / 1e6 ))
118120
119121 head = requests .head (url )
120122 file_size = int (head .headers ['Content-length' ])
121123
122- logging .debug ('File size is %s ' % file_size )
124+ logging .debug ('File size is %.1fMB ' % ( file_size / 1e6 ) )
123125 headers = {"Range" : "bytes=%s-" % first_byte }
124126
125127 disp = tqdm .tqdm (total = file_size / 1e6 , desc = 'Download: {}' .format (url ), unit = 'MB' , )
0 commit comments