-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindeed.py
102 lines (83 loc) · 3.77 KB
/
indeed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import time

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Launch a Firefox session and run an Indeed search for the target job/location.
driver = webdriver.Firefox()
driver.get('https://www.indeed.com/')

# Selenium 4 removed the find_element_by_* helpers; use find_element(By.XPATH, ...).
input_job_name = driver.find_element(By.XPATH, '//*[@id="text-input-what"]')
input_job_name.send_keys('data analyst')
input_location = driver.find_element(By.XPATH, '//*[@id="text-input-where"]')
input_location.send_keys('New York, NY')

# Give the page a moment to settle before submitting the search form.
time.sleep(5)
# .click() returns None, so binding it to a variable (old `find_job`) was dead code.
driver.find_element(By.XPATH, '/html/body/div/div[2]/div[3]/div[1]/div/div/div/form/div[3]').click()
# Scrape every result page: collect one record per job card, following the
# "Next" pagination link until it no longer exists.
rows = []
while True:
    # Parse the HTML of the page Selenium is currently on.
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Each job posting is rendered as one "SerpJobCard" div.
    postings = soup.find_all('div', class_='jobsearch-SerpJobCard unifiedRow row result clickcard')
    for post in postings:
        link = post.find('a', class_='jobtitle turnstileLink').get('href')
        link_full = 'https://www.indeed.com/' + link
        name = post.find('h2', class_='title').text.strip()
        company = post.find('span', class_='company').text.strip()
        # Location and salary are not present on every card; when find()
        # returns None, .text raises AttributeError -> fall back to 'N/A'.
        try:
            location = post.find('div', class_='location accessible-contrast-color-location').text.strip()
        except AttributeError:
            location = 'N/A'
        date = post.find('span', class_='date').text.strip()
        try:
            salary = post.find('span', class_='salaryText').text.strip()
        except AttributeError:
            salary = 'N/A'
        rows.append({'Link': link_full, 'Job Title': name, 'Company': company,
                     'Location': location, 'Salary': salary, 'Date': date})
    # Follow the "Next" link if present; otherwise we are on the last page.
    next_link = soup.find('a', attrs={'aria-label': 'Next'})
    if next_link is None:
        break
    driver.get('https://www.indeed.com/' + next_link.get('href'))

# Build the frame once at the end: DataFrame.append was removed in pandas 2.0,
# and the old seed row of empty strings left a junk first record in the CSV.
df = pd.DataFrame(rows, columns=['Link', 'Job Title', 'Company', 'Location', 'Salary', 'Date'])
# First two characters of the posting date hold the day count (e.g. '30+ days ago' -> '30').
df['Date_num'] = df['Date'].map(lambda posted: posted[:2].strip())
def integer(x):
    """Best-effort conversion of *x* to int; return *x* unchanged if it cannot be parsed.

    A bare ``except:`` here would also swallow KeyboardInterrupt/SystemExit;
    int() can only raise TypeError or ValueError, so catch exactly those.
    """
    try:
        return int(x)
    except (TypeError, ValueError):
        # Non-numeric date prefixes (e.g. 'To' from 'Today') stay as strings.
        return x
# Sort newest postings first.  Sorting the raw mixed int/str column would raise
# TypeError (int vs str comparisons) under modern pandas/NumPy, so coerce to
# numeric instead.  Non-numeric prefixes -- presumably 'Today'/'Just posted',
# i.e. the freshest results (TODO confirm against live markup) -- become NaN
# and are mapped to -1 so they sort ahead of '1 day ago'.
df['Date_new'] = pd.to_numeric(df['Date_num'], errors='coerce').fillna(-1)
df.sort_values(by=['Date_new', 'Salary'], inplace=True)
# Drop the helper columns before export.
df = df[['Link', 'Job Title', 'Company', 'Location', 'Salary', 'Date']]
df.to_csv('~/Scraped-Data/indeed_scraped_data.csv')
#####################################################################################
# Email the scraped CSV to the receiver via Gmail's SSL SMTP endpoint.
import smtplib, ssl
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email import encoders

# Account that sends the email and the address that receives it.
sender = '[email protected]'
receiver = '[email protected]'

# Build the message envelope: subject, sender, recipient.
msg = MIMEMultipart()
msg['Subject'] = 'New Jobs on Indeed'
msg['From'] = sender
# BUG FIX: receiver is a single string, so ','.join(receiver) interleaved every
# character with commas ('r,e,c,...') and produced a broken To: header.
msg['To'] = receiver

# Attach the CSV (indeed_jobs.csv) to the email; `with` guarantees the file
# handle is closed (the old bare open() leaked it).
part = MIMEBase('application', 'octet-stream')
with open('A/File/Path/indeed_jobs.csv', 'rb') as attachment:
    part.set_payload(attachment.read())
encoders.encode_base64(part)
# No space around '=' -- 'filename =' is a malformed header parameter.
part.add_header('Content-Disposition', 'attachment; filename="indeed_jobs.csv"')
msg.attach(part)

# Log in and send; the context manager closes the connection even on error.
# NOTE(review): never hard-code a real password -- read it from an environment
# variable, and use a Gmail app password rather than the account password.
context = ssl.create_default_context()
with smtplib.SMTP_SSL(host='smtp.gmail.com', port=465, context=context) as server:
    server.login(user='[email protected]', password='input your password')
    server.sendmail(sender, receiver, msg.as_string())