-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
36 lines (30 loc) · 1.28 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import json

import requests
from bs4 import BeautifulSoup
# Scrape the listed Goodreads tag pages and collect unique (author, quote)
# tuples into ``all_quotes``.
all_quotes = []
# Each entry is (tag listing URL, number of pages to request for that tag).
URLs = [("https://www.goodreads.com/quotes/tag/zen-buddhism", 9), ("https://www.goodreads.com/quotes/tag/zen", 58)]
# The same headers are sent with every request, so build them once.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:125.0) Gecko/20100101 Firefox/125.0'}
# Seconds to wait for a response; without a timeout a stalled connection
# would block the script indefinitely.
REQUEST_TIMEOUT = 30

for URL, page_count in URLs:
    print("Scraping", URL)
    # Pages are requested as ?page=1 .. ?page=page_count.
    for page_number in range(1, page_count + 1):
        try:
            page = requests.get(f"{URL}?page={page_number}", headers=headers, timeout=REQUEST_TIMEOUT)
            page.raise_for_status()
        except requests.RequestException as error:
            # A single failed page should not throw away everything that
            # was already collected; report it and move on.
            print("Skipping page", page_number, "of", URL, "-", error)
            continue

        soup = BeautifulSoup(page.content, 'html.parser')
        for quote in soup.find_all('div', class_='quoteDetails'):
            quote_text = quote.find('div', class_='quoteText')
            quote_author = quote.find('span', class_='authorOrTitle')
            if quote_text is None or quote_author is None:
                # find() returns None when the element is absent; skip
                # entries that lack either part instead of crashing.
                continue

            # Keep only what precedes the horizontal bar that introduces
            # the attribution.
            text = quote_text.text.split('―')[0].strip()
            # Drop the first and last character (presumably the quotation
            # marks wrapping the quote on the page - TODO confirm).
            text = text[1:-1]
            text = text.replace('–', '-')
            # NOTE: straight double quotes inside the text are kept verbatim;
            # whatever serializes these strings is responsible for escaping.
            author = quote_author.text.strip().replace(',', '')
            all_quotes.append((author, text))

# Remove duplicates while keeping first-seen order, so repeated runs over the
# same pages produce the same ordering.
all_quotes = list(dict.fromkeys(all_quotes))
print(all_quotes)
# Write the collected (author, quote) pairs to data.json as a JSON array of
# {"author": ..., "quote": ...} objects.
# json.dump escapes double quotes, backslashes and control characters inside
# the strings, which plain string interpolation does not, so the file is
# always valid JSON. An empty ``all_quotes`` yields "[]".
records = [{"author": author, "quote": text} for author, text in all_quotes]
# Explicit UTF-8 so non-ASCII characters are written the same way on every
# platform; ensure_ascii=False keeps them readable rather than \u-escaped.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False)