-
Notifications
You must be signed in to change notification settings - Fork 3
/
poem-scraper.py
44 lines (33 loc) · 1.31 KB
/
poem-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# First page of the poet's poem index; relative hrefs found on the index pages
# are resolved against this URL.
base_url = "https://www.poemhunter.com/john-keats/poems/"

# Seconds to wait on a single index-page request. Without a timeout,
# requests.get can block indefinitely on a stalled connection.
INDEX_TIMEOUT = 30

# Walk the paginated index, collecting the absolute URL of every poem link
# ("td.title a") until a page has no "next" button.
links = []
next_url = base_url
visited_pages = set()
# The visited set stops the crawl if a "next" link ever points back to a page
# that was already fetched, which would otherwise loop forever.
while next_url not in visited_pages:
    visited_pages.add(next_url)
    page = requests.get(next_url, timeout=INDEX_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero links.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    links.extend(
        urljoin(base_url, link["href"]) for link in soup.select("td.title a")
    )
    next_buttons = soup.select(".poets-poems li.next a")
    if not next_buttons:
        break
    next_url = urljoin(base_url, next_buttons[0]["href"])
# Seconds to wait on a single poem-page request. Without a timeout,
# requests.get can block indefinitely on a stalled connection.
POEM_TIMEOUT = 30

# Per-poem files are written under "poems/"; create it up front so the first
# open() does not fail when the directory is missing.
os.makedirs("poems", exist_ok=True)

# Download every poem in `links` and write its cleaned text to three places:
#   * poems/<title>.txt            - one file per poem
#   * all-keats-poems-token.txt    - all poems, with "<|endoftext|>" markers
#   * all-keats-poems-notoken.txt  - all poems, no markers
# Files are opened as UTF-8 explicitly so typographic characters in the poems
# do not raise UnicodeEncodeError under a non-UTF-8 default locale.
with open("all-keats-poems-token.txt", "w", encoding="utf-8") as master_file, \
        open("all-keats-poems-notoken.txt", "w", encoding="utf-8") as master_file2:
    for link in links:
        page = requests.get(link, timeout=POEM_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of an IndexError from an error page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        title = soup.select("h1.title")[0].text
        print(title)

        lowered_title = title.lower()
        if lowered_title == "hyperion" or "excerpt" in lowered_title:
            continue  # duplicates

        # Locate the poem body before creating the output file, so a page
        # without the expected markup does not leave an empty file behind.
        element = soup.select(".poem-detail .KonaBody [itemprop='text']")[0]
        text = element.get_text("\n")

        # Drop blank lines and collapse runs of whitespace inside each line.
        poem_lines = [
            " ".join(raw_line.split())
            for raw_line in text.split("\n")
            if raw_line and not raw_line.isspace()
        ]
        poem_text = "".join(line + "\n" for line in poem_lines)

        # A path separator inside the title would point into a subdirectory
        # that does not exist; replace it so the file stays in "poems/".
        safe_title = title
        for separator in (os.sep, os.altsep):
            if separator:
                safe_title = safe_title.replace(separator, "-")

        with open("poems/{}.txt".format(safe_title), "w", encoding="utf-8") as f:
            f.write(poem_text)
        master_file.write(poem_text)
        master_file2.write(poem_text)

        # NOTE(review): marker is written once per poem, to the "token" file
        # only; placement inferred from a source whose indentation was lost -
        # confirm against the original script.
        master_file.write("<|endoftext|>\n")