property_scraper.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Airbnb listing scraping"""
import sys
import threading
import json
from collections import namedtuple
import pickle
try:
    import Queue
except ImportError:
    import queue as Queue
import requests
import sqlite3
import click
from lxml import html
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass
import validators

__author__ = "Carter Harwood"
__copyright__ = "2017"
__credits__ = ["Carter Harwood"]
__license__ = "MIT"
__version__ = "0.0.3"
__maintainer__ = "Carter Harwood"
__email__ = "[email protected]"
__status__ = "Prototype"
# Source: https://stackoverflow.com/a/15882054
def _json_object_hook(d):
    return namedtuple('X', d.keys())(*d.values())

def json2obj(data):
    return json.loads(data, object_hook=_json_object_hook)
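# Illustrative example (the sample JSON below is assumed, not taken from an
# actual Airbnb response): json2obj('{"name": "Cozy Loft", "market": "Portland"}')
# yields an object whose fields are read as attributes, e.g. obj.name and
# obj.market, which is how the listing object is accessed throughout this module.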
class PropertyScraper(object):
    """Handles scraping Airbnb pages and pulling out listing details

    Attributes:
        site_url: web url of Airbnb listing
    """

    def __init__(self, url):
        """Inits PropertyScraper with url of listing"""
        try:
            self.site_url = url
            self.html = self.__set_full_site_html()
            self.listing = self.__set_listing()
        except requests.exceptions.MissingSchema:
            print("Invalid URL (?)", url)
            sys.exit(1)
    def __set_full_site_html(self):
        """Retrieves full html contents of site_url"""
        return requests.get(self.site_url)

    def get_full_site_html(self):
        """Returns full html contents of site_url"""
        return self.html
    def __get_full_site_json(self):
        """Extracts json object from site html"""
        page = self.get_full_site_html()
        tree = html.fromstring(page.content)
        react_items = tree.xpath("//script[@type='application/json']")
        # The listing data lives in the third application/json script tag;
        # the slice removes the surrounding <!-- --> comment markers.
        try:
            return json.loads(react_items[2].text[4:-3])
        except IndexError:
            sys.stderr.write('Experiencing throttling from Airbnb. Stopping.\n')
            sys.exit(1)
    def get_listing_json(self):
        """Pulls only json related to listing

        NOTE: This is required before calling json2obj() as
        json['bootstrapData']['headerParams'] contains keys
        that can not be converted. Ex: shared.Home
        """
        return self.__get_full_site_json()['bootstrapData']['listing']

    def __set_listing(self):
        """Sets self.listing"""
        return json2obj(json.dumps(self.get_listing_json()))

    def get_listing(self):
        """Returns self.listing"""
        return self.listing
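# Minimal standalone usage sketch (the URL below is hypothetical):
#
#     ps = PropertyScraper('https://www.airbnb.com/rooms/12345')
#     listing = ps.get_listing()
#     print(listing.name, listing.market)
#
# DownloadManager below does the same work per URL, but from worker threads.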
class DownloadManager(object):
    """Handles init and threading of scraping"""

    class Worker(threading.Thread):
        """Thread worker"""

        def __init__(self, db, queue):
            """Constructor"""
            threading.Thread.__init__(self)
            self.db = db
            self.db_conn = sqlite3.connect(self.db, check_same_thread=False)
            self.queue = queue

        def __store_db(self, url, listing):
            """Insert listing into db"""
            c = self.db_conn.cursor()
            amenities = []
            for amenity in listing.listing_amenities:
                if amenity.is_present:
                    amenities.append(amenity.name)
            c.execute(
                'insert into listings values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                (url, listing.name, listing.market,
                 str(listing.review_details_interface.review_score),
                 str(listing.review_details_interface.review_count),
                 listing.room_and_property_type, listing.bed_label, listing.bathroom_label,
                 listing.guest_label, listing.price_formatted_for_embed,
                 listing.photos[0].large_cover.split('?')[0], listing.description,
                 ','.join(amenities)))
            self.db_conn.commit()
            c.close()

        def run(self):
            """Thread Main"""
            while True:
                try:
                    url = self.queue.get_nowait()
                except Queue.Empty:
                    break
                ps = PropertyScraper(url)
                self.__store_db(url, ps.listing)
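    # Each Worker drains the shared queue with get_nowait() and exits on
    # Queue.Empty, so spawning min(conn, number of urls) workers in run()
    # below processes every URL exactly once with no extra coordination.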
    def __init__(self, db, conn, urls):
        """Constructor"""
        self.db = db
        self.db_conn = sqlite3.connect(self.db)
        self.__init_db()
        self.conn = conn
        self.urls = urls
        self.queue = self.__process_input()
        self.threads = []
    def __init_db(self):
        """Sets up the db if the table does not already exist"""
        c = self.db_conn.cursor()
        try:
            # Create table
            c.execute('''create table listings (url text, name text,
                market text, rating num, review_count num, type text,
                bed text, bath text, guest text, price text,
                photo_url text, description text, amenities text)''')
            self.db_conn.commit()
        except sqlite3.OperationalError:
            pass
        c.close()
    def __process_input(self):
        """Builds queue from input urls"""
        queue = Queue.Queue()
        print('Processing urls...')
        sources = []
        try:
            sources = click.open_file(self.urls, 'r')
        except FileNotFoundError:
            # Input is not a readable file; treat it as a single URL.
            if validators.url(self.urls):
                sources.append(self.urls.strip())
        with click.progressbar(sources) as url_file:
            for url in url_file:
                url = url.strip()
                if not url or url[0] == "#":
                    continue
                queue.put(url)
        return queue
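    # The input is assumed to be either a file with one listing URL per line
    # (blank lines and lines starting with '#' are skipped) or a single URL.
    # Illustrative file contents (URLs are hypothetical):
    #
    #     # saved listings
    #     https://www.airbnb.com/rooms/12345
    #     https://www.airbnb.com/rooms/67890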
    def run(self):
        """Spawns threads based on the number of urls and desired concurrent threads."""
        try:
            assert self.queue.queue, "no URLs given"
        except AssertionError:
            sys.stderr.write('No URLs given\n')
            sys.exit(1)
        n_conn = min(self.conn, len(self.queue.queue))
        try:
            assert 1 <= n_conn <= 10000, "invalid number of concurrent connections"
        except AssertionError:
            sys.stderr.write('invalid number of concurrent connections\n')
            sys.exit(1)
        for _ in range(n_conn):
            thread = self.Worker(self.db, self.queue)
            thread.start()
            self.threads.append(thread)
        print('Scraping begins...')
        with click.progressbar(self.threads) as threads:
            for thread in threads:
                thread.join()
def read_urls_file(file):
    """Read urls from file"""
    with open(file) as f:
        for line in f:
            print(line, __is_valid_url(line))

def __is_valid_url(string):
    """Returns a truthy result if string is a valid url"""
    return validators.url(string)
# Assumed module-level list of import failures; nothing in this file
# populates it, so the check below only fires if a caller fills it in.
missing_modules = []

def __handle_any_missing_modules():
    """Prints error when required python modules are missing"""
    if missing_modules:
        sys.stderr.write('Missing required modules. Fix:\n')
        sys.stderr.write('  pip3 install ' + ' '.join(missing_modules) + '\n')
        sys.exit(1)
@click.command()
@click.option(
    '--conn',
    default=10,
    nargs=1,
    help='Number of simultaneous connections, defaults to 10.')
@click.option(
    '--db',
    default='listings.db',
    type=click.Path(
        exists=False, file_okay=True, dir_okay=False, writable=True, readable=False),
    help='SQLite database to create/use, defaults to listings.db.')
@click.argument('file', nargs=1)
def main(conn, db, file):
    """Scrapes the given Airbnb listing URLs into an SQLite database."""
    dm = DownloadManager(db, conn, file)
    dm.run()

if __name__ == "__main__":
    # Entry point
    main()
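# Illustrative invocation (the file name is hypothetical):
#
#     python3 property_scraper.py --conn 10 --db listings.db urls.txt
#
# where urls.txt lists one Airbnb listing URL per line.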