forked from AliShug/musicleague-scraper
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.py
83 lines (71 loc) · 3.17 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from selenium import webdriver
import pickle
import json
import os
from process import process
def do_login(browser, url):
    """Open *url*, wait for the user to log in by hand, then save the cookies.

    Args:
        browser: Driver object exposing ``get(url)`` and ``get_cookies()``
            (the script entry point passes a ``webdriver.Chrome``).
        url: Page to open so the user can complete the login in the browser.

    Side effects:
        Blocks on ``input()`` until the user presses Enter, then writes the
        browser's cookies to ``cookies.pickle`` in the current working
        directory, overwriting any existing file.
    """
    browser.get(url)
    input('Login in the browser, then press Enter to continue here...')
    # Persist cookies only after the user confirms the login. Writing them
    # earlier would leave a file of pre-login cookies behind if the prompt is
    # interrupted, and scrape() only checks that the file exists before
    # deciding to skip the login step.
    cookies = browser.get_cookies()
    with open('cookies.pickle', 'wb') as f:
        pickle.dump(cookies, f)
# WebDriver locator strategy names, passed to find_element(s)(by, value).
# The find_element(s)_by_* helpers used previously were removed in
# Selenium 4.3; the generic form is available in both Selenium 3 and 4.
_BY_LINK_TEXT = 'link text'
_BY_CLASS_NAME = 'class name'
_RESULTS_LINK_TEXT = 'Round Results'


def _scrape_round_songs(browser):
    """Return the song dicts found on the round-results page now loaded."""
    song_names = [
        el.text
        for el in browser.find_elements(_BY_CLASS_NAME, 'name')
        if el.tag_name == 'a' and el.text
    ]
    # NOTE(review): [13:] presumably drops a fixed 13-character label such as
    # 'Submitted by ' in front of the submitter's name - confirm against the
    # page markup.
    submitters = [
        el.text[13:]
        for el in browser.find_elements(_BY_CLASS_NAME, 'submitter')
        if el.tag_name == 'span' and el.text
    ]

    all_voters = []
    all_vote_counts = []
    for container in browser.find_elements(_BY_CLASS_NAME, 'vote-breakdown'):
        upvotes = container.find_elements(_BY_CLASS_NAME, 'upvote')
        # Per the original author, half of the matched elements are hidden;
        # only the first half is kept.
        upvotes = upvotes[:len(upvotes) // 2]
        all_vote_counts.append([
            int(upvote.find_element(_BY_CLASS_NAME, 'vote-count').text)
            for upvote in upvotes
        ])
        all_voters.append([
            upvote.find_element(_BY_CLASS_NAME, 'voter').text
            for upvote in upvotes
        ])

    # zip truncates to the shortest of the four lists, so the result has one
    # entry per position that is present in all of them.
    return [
        {
            'name': song_name,
            'submitter': submitter,
            'votes': dict(zip(voters, vote_counts)),
        }
        for song_name, submitter, voters, vote_counts
        in zip(song_names, submitters, all_voters, all_vote_counts)
    ]


def scrape(browser, url):
    """Scrape every round of the league at *url* and return the results.

    Args:
        browser: Selenium-style driver exposing ``get``, ``add_cookie`` and
            ``find_elements(by, value)``.
        url: URL of the league overview page.

    Returns:
        A list with one dict per round, in page order:
        ``{'name': <round title>, 'songs': [{'name': ..., 'submitter': ...,
        'votes': {<voter>: <int count>}}, ...]}``.

    Raises:
        AssertionError: If the number of 'Round Results' links differs from
            the number of round titles found on the league page.

    Side effects:
        Calls ``do_login`` when ``cookies.pickle`` does not exist, navigates
        the browser, prints ``Logged in``, and writes the result to
        ``data.json`` in the current working directory.
    """
    if not os.path.isfile('cookies.pickle'):
        do_login(browser, url)

    # NOTE(review): the site is opened before the cookies are added,
    # presumably because WebDriver only accepts cookies for the domain that
    # is currently loaded - confirm.
    browser.get('https://musicleague.app')
    # NOTE(review): pickle.load can execute arbitrary code from a crafted
    # file. cookies.pickle is written by do_login in this same script; do not
    # point this at a file obtained from anywhere else.
    with open('cookies.pickle', 'rb') as f:
        cookies = pickle.load(f)
    for cookie in cookies:
        browser.add_cookie(cookie)
    print('Logged in')
    browser.get(url)

    round_count = len(browser.find_elements(_BY_LINK_TEXT, _RESULTS_LINK_TEXT))
    round_names = [
        title.text
        for title in browser.find_elements(_BY_CLASS_NAME, 'round-title')
    ]
    # Explicit raise instead of a bare assert so the check still runs under
    # `python -O`; the exception type is unchanged.
    if round_count != len(round_names):
        raise AssertionError(
            'found %d "Round Results" links but %d round titles'
            % (round_count, len(round_names))
        )

    rounds = []
    for round_num, round_name in enumerate(round_names):
        # Reload the league page each time and look the links up again:
        # clicking a link navigates away from the page they were found on.
        browser.get(url)
        browser.find_elements(_BY_LINK_TEXT, _RESULTS_LINK_TEXT)[round_num].click()
        rounds.append({
            'name': round_name,
            'songs': _scrape_round_songs(browser),
        })

    with open('data.json', 'w') as f:
        json.dump(rounds, f)
    return rounds
# Command-line entry point: scrape the league at the given URL, optionally
# forcing a fresh login first and optionally post-processing the result.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('url', help='URL for the music league - https://musicleague.app/l/<base64 crap>/')
    parser.add_argument('--login', '-l', action='store_true', help="Login to music league. Try this if the scraper isn't running correctly.")
    parser.add_argument('--process', '-p', action='store_true', help="Immediately process the data to csv (semicolon-separated) and print this.")
    args = parser.parse_args()

    browser = webdriver.Chrome()
    try:
        if args.login:
            do_login(browser, args.url)
        data = scrape(browser, args.url)
        if args.process:
            process(data)
    finally:
        # Always release the browser, including when login or scraping
        # raises. quit() ends the whole driver session, whereas close() only
        # closes the current window.
        browser.quit()