-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimmoscout_at_scraper.py
105 lines (83 loc) · 3.86 KB
/
immoscout_at_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from datetime import datetime
import asyncio
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
from tinydb import Query
from alarm import sound_alarm
# Define the format for the timestamp
timestamp_format = '%Y%m%d %H:%M:%S'
def scrape(webdriver, search_address, database, chat_id):
# Open the provided search address in the WebDriver
webdriver.get(search_address)
# Find the result list element using its XPath
result_list_xpath = '//*[@id="cope_ref"]/section[2]/ol'
result_list = webdriver.find_element(By.XPATH, result_list_xpath)
# Retrieve all the individual result items from the result list
results = result_list.find_elements(By.CSS_SELECTOR, 'li')
# Initialize a counter to keep track of added listings
added_listings = 0
for result in results:
# Filter out ads
if result.get_attribute("class") != "J04SL w-full":
continue
attempts = 0
# Initialize a dictionary to store the listing details
property_listing = {
'link': '',
'price': '',
'address': '',
'type': ''
}
# Initialize variables to track errors during scraping
errors = ""
error_occurred = False
while attempts < 2:
error_occurred = False
try:
# Retrieve the link element for the property listing
link = result.find_element(By.CSS_SELECTOR, 'a')
property_listing['link'] = link.get_property("href")
except WebDriverException as e:
errors += e.msg + '\n'
error_occurred = True
try:
# Retrieve the information container for the property listing
info = result.find_element(By.XPATH, './a/section/section[1]/ul')
except WebDriverException as e:
errors += e.msg + '\n'
error_occurred = True
else:
try:
info_elements = info.find_elements(By.CSS_SELECTOR, 'li')
# Retrieve the price information for the property listing
price = info_elements.pop(-1)
property_listing['price'] = price.text.replace('\u20ac', '')
for info_element in info_elements:
property_listing['type'] += info_element.text + ' | '
except WebDriverException as e:
errors += e.msg + '\n'
error_occurred = True
try:
# Retrieve the address information for the property listing
address = result.find_element(By.CSS_SELECTOR, 'address')
property_listing['address'] = address.text
except WebDriverException as e:
errors += e.msg + '\n'
error_occurred = True
if not error_occurred:
break
attempts += 1
# Add the timestamp to the property listing
property_listing['time'] = datetime.now().strftime(timestamp_format)
entries = Query()
if error_occurred:
# Print any errors that occurred during scraping
print(errors)
elif not database.contains(entries.link == property_listing['link']):
# Insert the property listing into the database if it's not a duplicate
database.insert(property_listing)
added_listings += 1
# Run the sound_alarm function asynchronously to trigger a notification
asyncio.run(sound_alarm(property_listing, chat_id))
# Print the number of added listings after the scraping is complete
print("Added " + str(added_listings) + " listings")