Skip to content

Commit de70fdf

Browse files
committed
first version of img-lurker
Committing to track versions.
1 parent 2b22731 commit de70fdf

File tree

2 files changed

+280
-0
lines changed

2 files changed

+280
-0
lines changed

COPYING.wtfpl

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
2+
Version 2, December 2004
3+
4+
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
5+
6+
Everyone is permitted to copy and distribute verbatim or modified
7+
copies of this license document, and changing it is allowed as long
8+
as the name is changed.
9+
10+
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
11+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
12+
13+
0. You just DO WHAT THE FUCK YOU WANT TO.
14+

img-lurker.py

+266
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
#!/usr/bin/env python3
2+
# license: Do What the Fuck You Want to Public License version 2
3+
# [http://wtfpl.net]
4+
5+
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
6+
from fractions import Fraction
7+
from io import BytesIO
8+
import logging
9+
import re
10+
from urllib.parse import urljoin
11+
12+
from PIL import Image
13+
from weboob.browser import PagesBrowser, URL
14+
from weboob.browser.cache import CacheMixin
15+
from weboob.browser.pages import HTMLPage, RawPage
16+
17+
18+
class MimeURL(URL):
    """URL pattern that additionally filters responses on their MIME type.

    ``types`` (keyword-only) is an iterable of accepted media types. Each
    entry is either a plain string, compared case-insensitively for equality,
    or a compiled regular expression that must fully match the lower-cased
    media type of the response.
    """

    def __init__(self, *args, types, **kwargs):
        super(MimeURL, self).__init__(*args, **kwargs)
        self.types = types

    def _accepts(self, media_type):
        """Tell whether the (already normalised) *media_type* is accepted."""
        for accepted_type in self.types:
            if isinstance(accepted_type, str):
                if accepted_type.lower() == media_type:
                    return True
            elif isinstance(accepted_type, re.Pattern):
                if accepted_type.fullmatch(media_type):
                    return True
        return False

    def handle(self, response):
        """Delegate to the parent ``handle`` only for accepted content types.

        Returns ``None`` when the ``Content-Type`` header is missing, empty
        or not accepted; otherwise returns whatever the parent class returns.
        """
        response_type = response.headers.get('Content-Type')
        if not response_type:
            return

        # Keep only the media type itself: drop the parameters
        # ("; charset=..."), the optional whitespace around the type, and
        # normalise the case because media types are case-insensitive.
        response_type = response_type.partition(';')[0].strip().lower()
        if not response_type:
            # header such as ";charset=utf-8" carries no usable type
            return

        if not self._accepts(response_type):
            # not found any match
            return

        return super(MimeURL, self).handle(response)
39+
40+
41+
class HPage(HTMLPage):
    """HTML page offering several strategies to find images worth keeping.

    Size checks are delegated to the browser helpers ``test_min_thumb``,
    ``test_image_link`` and ``get_page_image``. Elements lacking the
    attribute that is needed (``src`` on images, ``href`` on links) are
    ignored rather than raising ``KeyError``.
    """

    def search_thumbs(self):
        """Yield ``(link_url, img_url)`` for links wrapping exactly one image."""
        for link_el in self.doc.xpath('//a[@href][.//img]'):
            imgs = link_el.xpath('.//img')
            if len(imgs) != 1:
                continue
            if not imgs[0].attrib.get('src'):
                # e.g. lazy-loaded image: no usable thumbnail URL
                continue

            yield self._url_of(link_el, 'href'), self._url_of(imgs[0], 'src')

    def search_big_images(self):
        """Yield the URL of every embedded image passing the big-image test."""
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            # test_image_link also rejects data: URIs, which cannot be opened
            if self.browser.test_image_link(img):
                yield img

    def search_big_image_old(self):
        """Return the URL of the first embedded big image, or ``None``.

        When the image sits inside exactly one link, that link is only
        logged as a potential higher-resolution version.
        """
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            if not self.browser.test_image_link(img):
                continue

            # found, now look if we're on a link to hi-res
            links = img_el.xpath('./ancestor::a[@href]')
            if len(links) == 1:
                link = self._url_of(links[0], 'href')
                logging.debug(f'[-] found higher res? {link} for {img}')
            return img

    def search_big_image(self):
        """Return the URL of the first embedded big image, or ``None``."""
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            if self.browser.test_image_link(img):
                return img

    def _container_link_el(self, img_el):
        """Return the single ``<a href>`` ancestor of *img_el*, else ``None``."""
        links = img_el.xpath('./ancestor::a[@href]')

        try:
            link_el, = links
        except ValueError:
            # zero or several enclosing links: nothing unambiguous to follow
            return

        return link_el

    def _url_of(self, el, attr):
        """Return attribute *attr* of *el* resolved against the page URL."""
        return urljoin(self.url, el.attrib[attr])

    def search_2(self):
        """Yield the URL of the best image found for each probable thumbnail.

        For each image that is at least thumbnail-sized, in order of
        preference: the enclosing link if it is itself a big image, the
        first big image of the page the link leads to, and finally the
        image itself if it is already big enough.
        """
        for img_el in self.doc.xpath('//img[@src]'):
            img = self._url_of(img_el, 'src')

            if not self.browser.test_min_thumb(img):
                # doesn't even qualify as probable thumbnail
                continue

            link_el = self._container_link_el(img_el)
            if link_el is not None:
                link = self._url_of(link_el, 'href')
                if self.browser.test_image_link(link):
                    # img is link to bigger image
                    yield link
                    continue

                sub = self.browser.get_page_image(link)
                if sub:
                    # img is link to page with bigger image
                    yield sub
                    continue

            if self.browser.test_image_link(img):
                # img is already the big image
                yield img
                continue
127+
128+
129+
class IPage(RawPage):
    """Raw page whose document is the response body opened as a PIL image."""

    def build_doc(self, content):
        """Wrap the raw bytes in a buffer and open it with PIL."""
        stream = BytesIO(content)
        return Image.open(stream)

    @property
    def size(self):
        """Size of the opened image, as exposed by its ``size`` attribute."""
        image = self.doc
        return image.size
136+
137+
138+
class MyBrowser(CacheMixin, PagesBrowser):
    """Browser crawling one page and saving the big images it leads to.

    Responses are dispatched on their MIME type: HTML goes to ``HPage`` and
    non-SVG images go to ``IPage``. Size thresholds come from the
    module-level ``args`` namespace.
    """

    BASEURL = 'http://example.com'

    hmatch = MimeURL('https?://.*', HPage, types=['text/html'])
    imatch = MimeURL('https?://.*', IPage, types=[re.compile('image/(?!svg).*')])

    # last path segment of a URL, ignoring a trailing query string
    _NAME_RE = re.compile(r'/([^/?]+)(\?.*)?$')
    # explicit URL scheme ("http:", "data:", "javascript:", ...)
    _SCHEME_RE = re.compile(r'([a-zA-Z][a-zA-Z0-9+.\-]*):')

    def __init__(self, *args, **kwargs):
        super(MyBrowser, self).__init__(*args, **kwargs)
        self.is_updatable = False  # cache requests without caring about ETags

    @classmethod
    def _is_fetchable(cls, url):
        """Tell whether *url* can be requested over HTTP(S).

        URLs without an explicit scheme are let through; ``data:``,
        ``javascript:``, ``mailto:`` and the like are rejected.
        """
        scheme = cls._SCHEME_RE.match(url)
        return scheme is None or scheme[1].lower() in ('http', 'https')

    def _is_image_at_least(self, url, min_size):
        """Tell whether *url* is an image satisfying ``bigger_than`` *min_size*."""
        if not self._is_fetchable(url):
            return False

        imgpage = self.open(url).page
        return isinstance(imgpage, IPage) and bigger_than(imgpage.size, min_size)

    def test_min_thumb(self, url):
        """Tell whether *url* is an image at least as big as a thumbnail."""
        return self._is_image_at_least(url, args.min_thumb_size)

    def test_image_link(self, url):
        """Tell whether *url* is an image big enough to be downloaded."""
        return self._is_image_at_least(url, args.min_image_size)

    def get_page_image(self, url):
        """Return the first big image of the HTML page at *url*, else ``None``."""
        if not self._is_fetchable(url):
            return

        hpage = self.open(url).page
        if isinstance(hpage, HPage):
            return hpage.search_big_image()

    def crawl_old(self, url):
        """Previous crawling strategy: follow thumbnails, then embedded images."""
        self.location(url)
        if not isinstance(self.page, HPage):
            logging.error(f'{url} is not an HTML page, nothing to crawl')
            return

        for link, thumb in self.page.search_thumbs():
            logging.debug(f'[-] testing {thumb}')

            if not self.test_min_thumb(thumb):
                continue
            logging.debug(f'[-] {link}')

            if not self._is_fetchable(link):
                logging.debug(f'[-] too bad, {link} was not html or image')
                continue

            imgpage = self.open(link).page
            if isinstance(imgpage, IPage) and bigger_than(imgpage.size, args.min_image_size):
                logging.debug(f'[+] found direct {link}')
                self.download(link)
                continue
            elif not isinstance(imgpage, HPage):
                logging.debug(f'[-] too bad, {link} was not html or image')
                continue

            bigimg = imgpage.search_big_image()
            if bigimg:
                logging.debug(f'[+] found {bigimg}')
                self.download(bigimg)

        for img in self.page.search_big_images():
            logging.debug(f'[+] found embedded {img}')
            self.download(img)

    def crawl(self, url):
        """Open *url* and download every image reported by ``search_2``."""
        self.location(url)
        if not isinstance(self.page, HPage):
            logging.error(f'{url} is not an HTML page, nothing to crawl')
            return

        for img in self.page.search_2():
            self.download(img)

    def download(self, url):
        """Save the content of *url* in the current directory.

        The file is named after the last path segment of the URL. URLs
        without such a segment (e.g. ending with ``/``) are skipped with a
        warning.
        """
        name = self._NAME_RE.search(url)
        if name is None:
            logging.warning(f'cannot derive a file name from {url}, skipping')
            return

        # fetch before opening the file so a failed request leaves no empty file
        content = self.open(url).content
        with open(name[1], 'wb') as fd:
            logging.info(f'writing to {fd.name}')
            fd.write(content)
206+
207+
208+
def bigger_than(test, expected):
    """Check a ``(width, height)`` size against a minimum size and max ratio.

    Returns ``False`` when either dimension of *test* is below the matching
    dimension of *expected*. Otherwise returns whether the aspect ratio of
    *test* (always expressed as a value >= 1) is within
    ``args.max_aspect_ratio``.
    """
    if not (test[0] >= expected[0] and test[1] >= expected[1]):
        return False

    aspect = Fraction(test[0], test[1])
    # portrait sizes are compared through the inverse ratio
    aspect = 1 / aspect if aspect < 1 else aspect

    return aspect <= args.max_aspect_ratio
217+
218+
219+
def build_tuple_maker(sep):
    """Build an argparse ``type`` callable parsing ``<int><sep><int>`` pairs.

    *sep* is inserted verbatim into a regular expression, so it may be a
    literal such as ``'x'`` or a character class such as ``'[:/]'``.

    The returned callable converts a matching string into a tuple of two
    ints and raises ``ValueError`` otherwise, which argparse reports as an
    invalid argument value.
    """
    pair_re = re.compile(fr'(\d+){sep}(\d+)')

    def arg2size(s):
        m = pair_re.fullmatch(s)
        if m is None:
            raise ValueError(f'expected two integers separated by {sep!r}, got {s!r}')
        return (int(m[1]), int(m[2]))

    # the factory must hand the parser back, else argparse gets type=None
    return arg2size
224+
225+
226+
def parse_cookie(cstr):
    """Split a ``NAME=VALUE`` string at its first ``=`` into ``(name, value)``.

    Without any ``=``, the whole string is the name and the value is empty.
    """
    name, _sep, value = cstr.partition('=')
    return name, value
229+
230+
231+
# Script entry: configure logging, parse the command line, then crawl.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(filename)s:%(lineno)s %(message)s',
)

parser = ArgumentParser(
    formatter_class=ArgumentDefaultsHelpFormatter,
    description='Extract images from a page',
)
parser.add_argument('url')
parser.add_argument(
    '--min-thumb-size', type=build_tuple_maker('x'), default=(128, 128),
    metavar='WIDTHxHEIGHT',
)
parser.add_argument(
    '--min-image-size', type=build_tuple_maker('x'), default=(400, 400),
    metavar='WIDTHxHEIGHT',
)
parser.add_argument(
    '--max-aspect-ratio', type=build_tuple_maker('[:/]'), default=(4, 1),
    help="Max ratio between width/height to skip banners, ads etc. "
         "(and height/width for portrait format)",
    metavar='NUM:DENOM',
)
parser.add_argument('--cookie', type=parse_cookie)

# `args` is read as a module-level global by the size checks above.
args = parser.parse_args()
try:
    args.max_aspect_ratio = Fraction(*args.max_aspect_ratio)
    # normalise to a ratio >= 1 so "1:4" and "4:1" mean the same limit
    if args.max_aspect_ratio < 1:
        args.max_aspect_ratio = 1 / args.max_aspect_ratio
except ZeroDivisionError:
    # "N:0" or "0:N": report a usage error instead of a traceback
    parser.error('--max-aspect-ratio: numerator and denominator must be non-zero')

b = MyBrowser()
if args.cookie:
    b.session.cookies[args.cookie[0]] = args.cookie[1]

b.crawl(args.url)

0 commit comments

Comments
 (0)