|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# license: Do What the Fuck You Want to Public License version 2 |
| 3 | +# [http://wtfpl.net] |
| 4 | + |
| 5 | +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter |
| 6 | +from fractions import Fraction |
| 7 | +from io import BytesIO |
| 8 | +import logging |
| 9 | +import re |
| 10 | +from urllib.parse import urljoin |
| 11 | + |
| 12 | +from PIL import Image |
| 13 | +from weboob.browser import PagesBrowser, URL |
| 14 | +from weboob.browser.cache import CacheMixin |
| 15 | +from weboob.browser.pages import HTMLPage, RawPage |
| 16 | + |
| 17 | + |
class MimeURL(URL):
    """URL pattern that only handles responses of selected MIME types.

    ``types`` is a keyword-only iterable whose items are either plain strings
    (compared for equality) or compiled regular expressions (which must match
    the whole media type). Every other argument is forwarded to ``URL``.
    """

    def __init__(self, *args, types, **kwargs):
        super(MimeURL, self).__init__(*args, **kwargs)
        self.types = types

    def handle(self, response):
        """Delegate to ``URL.handle`` when the response MIME type is accepted.

        Returns ``None`` when the response has no usable ``Content-Type``
        header or when its media type matches none of ``self.types``;
        otherwise returns whatever the parent ``handle`` returns.
        """
        content_type = response.headers.get('Content-Type')
        if not content_type:
            return None

        # Keep only the media type: drop parameters ("; charset=utf-8"),
        # surrounding whitespace, and fold case because media types are
        # case-insensitive. Patterns are therefore matched against the
        # lower-cased type.
        mime = content_type.partition(';')[0].strip().lower()
        if not mime:
            # header such as ";charset=utf-8" carries no media type at all
            return None

        for accepted_type in self.types:
            if isinstance(accepted_type, str):
                if accepted_type.lower() == mime:
                    break
            elif isinstance(accepted_type, re.Pattern) and accepted_type.fullmatch(mime):
                break
        else:
            # not found any match
            return None

        return super(MimeURL, self).handle(response)
| 39 | + |
| 40 | + |
class HPage(HTMLPage):
    """HTML page with several strategies for finding images worth downloading.

    All URLs produced here are made absolute against ``self.url``. Elements
    lacking the attribute that carries the URL (``<img>`` without ``src``,
    ``<a>`` without ``href``) are skipped instead of raising ``KeyError``.
    """

    def search_thumbs(self):
        """Yield ``(link_url, image_url)`` for links wrapping exactly one image."""
        for link_el in self.doc.xpath('//a[@href][.//img]'):
            imgs = link_el.xpath('.//img')
            if len(imgs) != 1:
                continue

            src = imgs[0].attrib.get('src')
            if src is None:
                # e.g. lazy-loaded images that only carry data-* attributes
                continue

            yield self._url_of(link_el, 'href'), urljoin(self.url, src)

    def search_big_images(self):
        """Yield every image URL accepted by ``self.browser.test_image_link``."""
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            if self.browser.test_image_link(img):
                yield img

    def search_big_image_old(self):
        """Return the first image URL accepted by ``test_image_link``.

        When that image sits inside exactly one ``<a href>`` ancestor, the
        link target is only logged as a possible higher-resolution version;
        the image URL itself is returned either way. Returns ``None`` when no
        image qualifies.
        """
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            if not self.browser.test_image_link(img):
                continue

            # found, now look if we're on a link to hi-res
            link_el = self._container_link_el(img_el)
            if link_el is not None:
                link = self._url_of(link_el, 'href')
                logging.debug(f'[-] found higher res? {link} for {img}')
            return img

    def search_big_image(self):
        """Return the first image URL accepted by ``test_image_link``, else ``None``."""
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            if self.browser.test_image_link(img):
                return img

    def _container_link_el(self, img_el):
        """Return the single ``<a href>`` ancestor of ``img_el``.

        Returns ``None`` when there is no such ancestor or more than one.
        """
        links = img_el.xpath('./ancestor::a[@href]')
        if len(links) != 1:
            return None
        return links[0]

    def _url_of(self, el, attr):
        """Return attribute ``attr`` of ``el`` as a URL joined with ``self.url``.

        Raises ``KeyError`` when the element does not have that attribute.
        """
        return urljoin(self.url, el.attrib[attr])

    def search_2(self):
        """Yield the best image URL found for each candidate thumbnail.

        For every ``<img src>`` accepted by ``self.browser.test_min_thumb``,
        the candidates are tried in this order and the first hit is yielded:

        1. the enclosing link, when it points straight at a big image;
        2. the big image found on the page the enclosing link points to;
        3. the image itself, when it is already big enough.
        """
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            if not self.browser.test_min_thumb(img):
                # doesn't even qualify as probable thumbnail
                continue

            link_el = self._container_link_el(img_el)
            if link_el is not None:
                link = self._url_of(link_el, 'href')
                if self.browser.test_image_link(link):
                    # img is link to bigger image
                    yield link
                    continue

                sub = self.browser.get_page_image(link)
                if sub:
                    # img is link to page with bigger image
                    yield sub
                    continue

            if self.browser.test_image_link(img):
                # img is already the big image
                yield img
| 127 | + |
| 128 | + |
class IPage(RawPage):
    """Raw page whose document is built by opening the content with PIL."""

    def build_doc(self, content):
        """Open ``content`` (bytes) as a PIL image through an in-memory buffer."""
        buffer = BytesIO(content)
        return Image.open(buffer)

    @property
    def size(self):
        """The ``size`` attribute of ``self.doc``.

        NOTE(review): presumably ``self.doc`` is the image returned by
        ``build_doc``, making this a ``(width, height)`` pair -- confirm
        against the page base class.
        """
        image = self.doc
        return image.size
| 136 | + |
| 137 | + |
class MyBrowser(CacheMixin, PagesBrowser):
    """Browser that finds images on a page and saves them to the current directory.

    Responses are mapped to ``HPage`` or ``IPage`` by MIME type (see
    ``hmatch`` / ``imatch``). The size thresholds are read from the
    module-level ``args`` namespace.
    """

    BASEURL = 'http://example.com'

    hmatch = MimeURL('https?://.*', HPage, types=['text/html'])
    imatch = MimeURL('https?://.*', IPage, types=[re.compile('image/(?!svg).*')])

    def __init__(self, *args, **kwargs):
        super(MyBrowser, self).__init__(*args, **kwargs)
        self.is_updatable = False  # cache requests without caring about ETags

    @staticmethod
    def _is_fetchable(url):
        """Tell whether ``url`` is an http(s) URL.

        Links scraped from pages may use ``data:``, ``javascript:``,
        ``mailto:`` and similar schemes; those must never be passed to
        ``self.open``.
        """
        return url.lower().startswith(('http://', 'https://'))

    def _is_image_of_size(self, url, min_size):
        """Tell whether ``url`` loads as an ``IPage`` passing ``bigger_than``."""
        if not self._is_fetchable(url):
            return False

        imgpage = self.open(url).page
        return isinstance(imgpage, IPage) and bigger_than(imgpage.size, min_size)

    def test_min_thumb(self, url):
        """Tell whether ``url`` is an image of at least ``args.min_thumb_size``."""
        return self._is_image_of_size(url, args.min_thumb_size)

    def test_image_link(self, url):
        """Tell whether ``url`` is an image of at least ``args.min_image_size``."""
        return self._is_image_of_size(url, args.min_image_size)

    def get_page_image(self, url):
        """Return the big image found on the HTML page at ``url``, else ``None``."""
        if not self._is_fetchable(url):
            return None

        hpage = self.open(url).page
        if isinstance(hpage, HPage):
            return hpage.search_big_image()
        return None

    def crawl_old(self, url):
        """Previous crawling strategy, driven by ``HPage.search_thumbs``.

        Downloads what each qualifying thumbnail links to (a direct image, or
        the big image of a linked HTML page), then every big image embedded
        in the start page itself.
        """
        self.location(url)
        if not isinstance(self.page, HPage):
            logging.warning(f'{url} did not load as an HTML page, nothing to crawl')
            return

        for link, thumb in self.page.search_thumbs():
            logging.debug(f'[-] testing {thumb}')

            if not self.test_min_thumb(thumb):
                continue
            logging.debug(f'[-] {link}')

            if not self._is_fetchable(link):
                continue

            imgpage = self.open(link).page
            if isinstance(imgpage, IPage) and bigger_than(imgpage.size, args.min_image_size):
                logging.debug(f'[+] found direct {link}')
                self.download(link)
                continue
            elif not isinstance(imgpage, HPage):
                logging.debug(f'[-] too bad, {link} was not html or image')
                continue

            bigimg = imgpage.search_big_image()
            if bigimg:
                logging.debug(f'[+] found {bigimg}')
                self.download(bigimg)

        for img in self.page.search_big_images():
            logging.debug(f'[+] found embedded {img}')
            self.download(img)

    def crawl(self, url):
        """Load ``url`` and download each distinct image from ``HPage.search_2``."""
        self.location(url)
        if not isinstance(self.page, HPage):
            logging.warning(f'{url} did not load as an HTML page, nothing to crawl')
            return

        seen = set()  # the same image may be reachable from several thumbnails
        for img in self.page.search_2():
            if img in seen:
                continue
            seen.add(img)
            self.download(img)

    def download(self, url):
        """Save the content of ``url`` to a file named after its last path segment.

        The query string and fragment are not part of the file name. URLs
        without a usable last segment (for example ones ending in ``/``) are
        skipped with a warning.
        """
        name_re = re.compile(r'/([^/?#]+)(?:[?#].*)?$')
        match = name_re.search(url)
        if match is None:
            logging.warning(f'cannot derive a file name from {url}, skipping')
            return

        # Fetch before opening the file so a failed request leaves no empty file.
        content = self.open(url).content
        with open(match[1], 'wb') as fd:
            logging.info(f'writing to {fd.name}')
            fd.write(content)
| 206 | + |
| 207 | + |
def bigger_than(test, expected):
    """Tell whether size ``test`` is large enough and not too elongated.

    ``test`` and ``expected`` are ``(width, height)`` pairs. The result is
    ``True`` when both dimensions of ``test`` reach those of ``expected`` and
    the ratio of its longer side to its shorter side does not exceed the
    module-level ``args.max_aspect_ratio``. Sizes with a zero (or negative)
    dimension are rejected instead of raising ``ZeroDivisionError``.
    """
    width, height = test[0], test[1]
    if width <= 0 or height <= 0:
        return False

    if width < expected[0] or height < expected[1]:
        return False

    # longer / shorter <= limit, written without a division
    longer, shorter = max(width, height), min(width, height)
    return longer <= args.max_aspect_ratio * shorter
| 217 | + |
| 218 | + |
def build_tuple_maker(sep):
    """Build a converter turning ``'<int><sep><int>'`` strings into int pairs.

    ``sep`` is a regular-expression fragment (without capturing groups) that
    matches the separator, e.g. ``'x'`` or ``'[:/]'``. The returned function
    is suitable as an argparse ``type=`` callable: it returns a
    ``(int, int)`` tuple and raises ``ValueError`` for malformed input, which
    argparse reports as a usage error.
    """
    pattern = re.compile(fr'(\d+)(?:{sep})(\d+)')

    def arg2size(s):
        m = pattern.fullmatch(s)
        if m is None:
            raise ValueError(f'expected two integers separated by {sep!r}: {s!r}')
        return (int(m[1]), int(m[2]))

    return arg2size
| 224 | + |
| 225 | + |
def parse_cookie(cstr):
    """Split ``cstr`` on its first ``=`` and return ``(name, value)``.

    Without any ``=`` the whole string is the name and the value is empty.
    """
    name, _sep, value = cstr.partition('=')
    return name, value
| 229 | + |
| 230 | + |
# Script entry: configure logging, parse the command line, then crawl the URL.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(filename)s:%(lineno)s %(message)s',
)

parser = ArgumentParser(
    formatter_class=ArgumentDefaultsHelpFormatter,
    description='Extract images from a page',
)
parser.add_argument('url')
parser.add_argument(
    '--min-thumb-size', type=build_tuple_maker('x'), default=(128, 128),
    metavar='WIDTHxHEIGHT',
)
parser.add_argument(
    '--min-image-size', type=build_tuple_maker('x'), default=(400, 400),
    metavar='WIDTHxHEIGHT',
)
parser.add_argument(
    '--max-aspect-ratio', type=build_tuple_maker('[:/]'), default=(4, 1),
    help="Max ratio between width/height to skip banners, ads etc. "
         "(and height/width for portrait format)",
    metavar='NUM:DENOM',
)
parser.add_argument('--cookie', type=parse_cookie)

# `args` is deliberately a module-level name: the size checks above read it.
args = parser.parse_args()

# Normalise the ratio to a Fraction >= 1 so that 1:4 and 4:1 mean the same
# limit. A zero on either side cannot be normalised and is reported as a
# usage error rather than a ZeroDivisionError traceback.
try:
    args.max_aspect_ratio = Fraction(*args.max_aspect_ratio)
    if args.max_aspect_ratio < 1:
        args.max_aspect_ratio = 1 / args.max_aspect_ratio
except ZeroDivisionError:
    parser.error('--max-aspect-ratio: neither side of the ratio may be zero')

b = MyBrowser()
if args.cookie:
    b.session.cookies[args.cookie[0]] = args.cookie[1]

b.crawl(args.url)
0 commit comments