-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrapeSpotify.py
executable file
·181 lines (168 loc) · 7.59 KB
/
scrapeSpotify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python
# Scrape the Spotify Web Player for research.
# Author: Yuxuan "Ethan" Chen
# Date: April 30, 2014
# Version: 0.9.5
#
# ===================================================
# VERSION HISTORY
# ===================================================
# Version 0.9.5 Posted Apr 30, 2014
#
# ___________________________________________________
# Version 0.9.4 Posted Nov 13, 2014
# - Removed time.sleep(10) calls, explicit waits now
# - Changed scraping to class SpotifyScrape in
# anticipation of logging and database storage
# - Simplified iframe transitions
# - Created continuous queue of profile scrapes
# - Can support users with no recent artists
# - Can support users with no public playlists
# - Scrapes users' followings as well as followers
# - Can support users without followings
# - Improved scrolling capability
# - Created the wrapper function 'gather' which
# automatically scrolls down and scrapes all
# elements in playlists, followers, and following
# - Enabled MySQL database storage of results
# ___________________________________________________
# Version 0.9.3 Posted Nov 10, 2014
# - Can scrape the followers.
# - Can load all the playlists and scrape them.
# ___________________________________________________
# Version 0.9.2 Posted Nov 8, 2014
# - Can scroll to the bottom
# ___________________________________________________
# Version 0.9.1 Posted Nov 7, 2014
# - Can switch to the logged-in Spotify browse page
# - Eliminate the unsupported command-line flag
# - Can switch to the user profile page
# - Can get iframes
# - Can scrape user name
# - Can click on the tabs on the user profile
# - Can get recently played artists
# - Can get public playlists
# ___________________________________________________
# Version 0.9 Posted Nov 5, 2014
# - Can log in Spotify
# ===================================================
import getpass
import string
import time

from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class SpotifyScraper(object):
    """Crawl public Spotify Web Player profiles, following the social graph.

    The crawl starts from the 'spotify' account. For every user popped from
    the queue ``q`` the scraper records recently played artists, public
    playlists, followings and followers, and queues every following and
    follower that has not been queued before.

    Attributes:
        q: FIFO list of user names still to be scraped.
        seen: set of every user name that has ever been queued.
        driver: the Chrome webdriver, or None while not connected.
        display: the virtual display Chrome is started in.
    """

    def __init__(self):
        # Crawl frontier, seeded with the starting account.
        self.q = ['spotify']
        # Users already queued; without this, mutual follows would be
        # re-queued forever and the crawl would never terminate.
        self.seen = set(self.q)
        self.driver = None
        self.display = Display(visible=0, size=(800, 600))
        # TODO: MySQL storage is currently disabled:
        # self.db = MySQLdb.connect("localhost", "ubuntu", "", "spotify")

    def connect(self):
        """Start Chrome in the virtual display and log in via Facebook.

        Prompts on the terminal for the Facebook e-mail/phone and password,
        then waits for the logged-in Spotify page to become usable.

        Raises:
            TimeoutException: if an awaited page element or the Facebook
                login window does not appear within 10 seconds.
        """
        print('Spotify Social Network Project')
        print('==============================')

        # Open a browser and navigate to the Spotify player
        print('Creating webdriver ...')
        options = webdriver.ChromeOptions()
        # Suppress a command-line flag
        options.add_experimental_option("excludeSwitches",
                                        ["ignore-certificate-errors"])
        self.display.start()
        try:
            self.driver = webdriver.Chrome(
                chrome_options=options,
                service_args=["--verbose", "--log-path=webdriver.log"])
        finally:
            if self.driver is None:
                # Chrome failed to launch; do not leave the display running.
                self.display.stop()
        self.driver.implicitly_wait(2)

        print('Navigating to Spotify ...')
        self.driver.get('http://play.spotify.com/')

        # Click the "Already have an account" link
        action_chains = ActionChains(self.driver)
        login = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'has-account')))
        action_chains.double_click(login).perform()

        # Type in credentials at the command line to log in to Spotify
        # with Facebook
        fb_login = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'fb-login-btn')))
        fb_login.click()
        # The login form opens in a second window; wait until it exists
        # instead of indexing window_handles[1] immediately.
        WebDriverWait(self.driver, 10).until(
            lambda d: len(d.window_handles) > 1)
        self.driver.switch_to_window(self.driver.window_handles[1])
        print('Logging in via Facebook ...')
        email_blank = self.driver.find_element_by_id('email')
        pass_blank = self.driver.find_element_by_id('pass')
        input_email = raw_input('Email or Phone: ')
        input_pass = getpass.getpass('      Password: ')
        email_blank.send_keys(input_email)
        pass_blank.send_keys(input_pass)
        email_blank.submit()

        # Navigate from the browse page to the user page
        print('Waiting for Spotify to load ...')
        self.driver.switch_to_window(self.driver.window_handles[0])
        WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//li[@class='item-profile etched-top has-extra-bottom-row show show show show']")))
        print('Connection complete ...')
        print('==============================')

    def enqueue(self, user):
        """Queue ``user`` for scraping unless it has been queued before.

        Returns:
            True if the user was added to ``q``, False if it was empty or
            already seen.
        """
        if not user or user in self.seen:
            return False
        self.seen.add(user)
        self.q.append(user)
        return True

    def scrape(self):
        """Scrape the next queued user and queue their social neighbours.

        Does nothing when the queue is empty. Every scraped item is passed
        to ``store``; followings and followers are additionally handed to
        ``enqueue``.
        """
        if not self.q:
            return

        # Load user page
        user = self.q.pop(0)
        print('Scraping user: ' + user)
        self.driver.get('http://play.spotify.com/user/' + user)
        WebDriverWait(self.driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[contains(@id, 'user')]")))

        # Scrape user name; until() returns the first non-empty title text.
        print('Scraping user name ...')
        name = WebDriverWait(self.driver, 20).until(
            lambda d: d.find_element_by_xpath("//h1[@class='h-title']").text)
        print(name)

        # Scrape recently played artists
        for artist in self.gather('recently-played-artists'):
            self.store('recent_artists', ['id', 'user'], [artist, user])

        # Scrape public playlists
        for playlist in self.gather('public-playlists'):
            self.store('playlists', ['id', 'user'], [playlist, user])

        # Scrape following
        for follow in self.gather('following'):
            self.store('follows', ['outgoing', 'incoming'], [user, follow])
            self.enqueue(follow)

        # Scrape followers
        for follower in self.gather('followers'):
            self.store('follows', ['outgoing', 'incoming'], [follower, user])
            self.enqueue(follower)
        # self.db.commit()

    def gather(self, type):
        """Open one profile tab and return the ids of all items listed in it.

        Args:
            type: the tab's ``data-navbar-item-id``, which is also the class
                of the section holding its items (e.g. 'followers'). The
                name shadows the builtin but is kept for compatibility.

        Returns:
            A list with the last path segment of each item link, or an
            empty list when the profile has no such tab.
        """
        print('Scraping ' + type + ' ...')
        links_xpath = ("//section[@class='" + type +
                       "']/descendant::a[contains(@class, 'title')]")
        try:
            tab = self.driver.find_element_by_xpath(
                "//li[@data-navbar-item-id='" + type + "']")
        except NoSuchElementException:
            print('No ' + type)
            return []
        tab.click()

        # Only scroll when more than 20 items are shown -- presumably the
        # size of the first page the player loads (TODO confirm).
        if len(self.driver.find_elements_by_xpath(links_xpath)) > 20:
            self._scroll_to_load_all()

        ids = []
        for link in self.driver.find_elements_by_xpath(links_xpath):
            item_id = self._id_from_href(link.get_attribute('href'))
            if item_id:
                ids.append(item_id)
        return ids

    def _scroll_to_load_all(self):
        """Scroll to the bottom until the page stops growing, then to the top."""
        while True:
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            try:
                # The page grew if, within 2 seconds, the viewport is no
                # longer at the bottom of the document.
                WebDriverWait(self.driver, 2).until(lambda d: d.execute_script('return document.body.scrollHeight != document.body.scrollTop + window.innerHeight;'))
            except TimeoutException:
                # Nothing new was loaded: the whole list is on the page.
                break
        self.driver.execute_script('window.scrollTo(0, 0);')

    @staticmethod
    def _id_from_href(href):
        """Return the last path segment of ``href``, or None if there is none."""
        if not href:
            return None
        return href.rstrip('/').split('/')[-1] or None

    def store(self, table, fields, values):
        """Record one scraped row; currently this only prints ``values``.

        TODO: when MySQL storage is re-enabled, insert with a parameterized
        query (cursor.execute with placeholders) rather than building the
        SQL text from the scraped values.
        """
        print(values)

    def close(self):
        """Quit the browser, if one was started, and stop the display."""
        try:
            if self.driver is not None:
                # quit() ends the whole session (all windows and the
                # chromedriver process); close() would only close the
                # current window.
                self.driver.quit()
                self.driver = None
        finally:
            self.display.stop()
        # self.db.close()
# Script entry point: log in, then scrape until the queue runs dry.
if __name__ == '__main__':
    scraper = SpotifyScraper()
    try:
        scraper.connect()
        while scraper.q:
            scraper.scrape()
    finally:
        # Release the browser and display even when scraping fails.
        # connect() may raise before a browser exists, in which case
        # there is no driver to close.
        if scraper.driver is not None:
            scraper.close()