#!/usr/bin/env python3
import argparse
import hashlib
import io
import json
import os
import urllib.parse

import bs4
import pandas
import pyexifinfo
import requests
import waybackpack

# Author :
# Source : http://www.automatingosint.com/blog/2016/12/vacuuming-image-metadata-from-the-wayback-machine/
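#
# Example invocation (hypothetical domain and output folder):
#   ./waybackimages.py example.com --output wayback_images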
#
# Searches the Wayback machine for the provided URL
#
def search_archive(url):
    # search for all unique captures for the URL
    results = waybackpack.search(url, uniques_only=True)

    timestamps = []

    # build a list of timestamps for captures
    for snapshot in results:
        timestamps.append(snapshot['timestamp'])

    # request a list of archives for each timestamp
    packed_results = waybackpack.Pack(url, timestamps=timestamps)

    return packed_results

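# Note: each snapshot returned by waybackpack.search() is a CDX record; the
# only field used above is 'timestamp', a 14-digit string such as
# "20161201000000" (an illustrative value, not taken from a real capture).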
#
# Retrieve the archived page and extract the images from it.
#
def get_image_paths(packed_results):
    images = []
    count = 1

    for asset in packed_results.assets:
        # get the location of the archived URL
        archive_url = asset.get_archive_url()

        print("[*] Retrieving %s (%d of %d)" % (archive_url, count, len(packed_results.assets)))

        # grab the HTML from the Wayback machine
        result = asset.fetch()

        # parse out all image tags
        soup = bs4.BeautifulSoup(result, "html.parser")
        image_list = soup.find_all("img")

        # loop over the images and build full URLs out of them
        for image in image_list:
            src = image.attrs.get('src')
            if src is None:
                continue

            if not src.startswith("http"):
                image_path = urllib.parse.urljoin(archive_url, src)
            else:
                image_path = src

            if image_path not in images:
                print("[+] Adding new image: %s" % image_path)
                images.append(image_path)

        count += 1

    return images

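# Archive URLs follow the pattern https://web.archive.org/web/<timestamp>/<original URL>,
# so joining a relative src such as "images/logo.png" (hypothetical) against the
# archive URL keeps the request pointed at the archived copy rather than the live site.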
#
# Download the images and extract the EXIF data.
#
def download_images(image_list, url, output_path):
    image_results = []
    image_hashes = []

    for image in image_list:
        # this filters out images not from our target domain
        if url not in image:
            continue

        try:
            print("[v] Downloading %s" % image)
            response = requests.get(image)
        except requests.exceptions.RequestException:
            print("[!] Failed to download: %s" % image)
            continue

        if "image" in response.headers.get('content-type', ''):
            sha1 = hashlib.sha1(response.content).hexdigest()

            # skip images we have already downloaded (same bytes, different URL)
            if sha1 not in image_hashes:
                image_hashes.append(sha1)

                image_path = os.path.join(output_path, "%s-%s" % (sha1, image.split("/")[-1]))

                with open(image_path, "wb") as fd:
                    fd.write(response.content)

                print("[*] Saved %s" % image)

                # extract the EXIF metadata and tag it with the content hash
                info = pyexifinfo.get_json(image_path)
                info[0]['ImageHash'] = sha1

                image_results.append(info[0])

    return image_results

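# Saved files are named "<sha1>-<original filename>", so the same image served
# from several archived pages is only written (and EXIF-parsed) once.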
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process some URLs')
    parser.add_argument('URL', help='URL to look for')
    parser.add_argument('--output', '-o', default='waybackimages',
                        help='output folder')
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    results = search_archive(args.URL)
    print("[*] Retrieved %d possible stored URLs" % len(results.assets))

    image_paths = get_image_paths(results)
    print("[*] Retrieved %d image paths." % len(image_paths))

    image_results = download_images(image_paths, args.URL, output_path)

    # round-trip through JSON and have pandas build a CSV
    image_results_json = json.dumps(image_results)
    data_frame = pandas.read_json(io.StringIO(image_results_json))
    data_frame.to_csv("results.csv")

    print("[*] Finished writing CSV to results.csv")