# scraper_linkedin.py (forked from kislaykumarkk/Scraper)
from selenium import webdriver
# Imports needed only by the commented-out Amazon review scraper below:
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup as BS
import pandas as pd  # used only by the commented-out CSV export below
import numpy as np   # used only by the commented-out CSV export below
#import pdfkit
# On macOS, run this in your terminal to remove the quarantine flag from chromedriver:
#   xattr -d com.apple.quarantine chromedriver
# Shared state for the commented-out Amazon review loop below.
i = 1
big_username_array = []
big_review_titles_array = []
chrome_options = webdriver.ChromeOptions()
chrome_options.add_extension("extension_3_2_3_0.crx")  # load a packed Chrome extension
# Selenium 4 renamed the keyword argument from chrome_options to options.
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://www.linkedin.com/in/zachariah-mithani-4212271b2/")
#driver.switch_to.window(driver.window_handles[0])
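# The script currently stops after loading the profile. A minimal sketch of
# what parsing could look like, assuming the profile renders without a login
# wall; the "h1" selector is an assumption (LinkedIn's markup changes often):
soup = BS(driver.page_source, "html.parser")
name_tag = soup.select_one("h1")  # hypothetical selector for the profile name
if name_tag is not None:
    print(name_tag.get_text(strip=True))
driver.quit()  # close the browser once parsing is done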
# Legacy Amazon review scraper, kept for reference:
# while i <= 2:
#     # ChromeDriver opens the browser. Download it from
#     # https://sites.google.com/a/chromium.org/chromedriver/downloads
#     # and point Service at the local binary.
#     s = Service('/Users/kaustav/Downloads/test/Scraper/chromedriver')
#     driver = webdriver.Chrome(service=s)
#     # URL of the review page to scrape, one page per iteration.
#     url = "https://www.amazon.com/product-reviews/1338596705/ref=cm_cr_arp_d_viewopt_sr?pageNumber="
#     url = url + str(i)
#     driver.get(url)
#     # Parse the rendered HTML with BeautifulSoup (an explicit parser avoids a warning).
#     soup = BS(driver.page_source, "html.parser")
#     # Usernames
#     username_array = []
#     for username in soup.select('#cm_cr-review_list .a-profile-name'):
#         username_array.append(username.text.strip())
#     big_username_array.extend(username_array)
#     # Review titles
#     review_titles_array = []
#     for review_title in soup.select('#cm_cr-review_list .review-title'):
#         review_titles_array.append(review_title.text.strip())
#     big_review_titles_array.extend(review_titles_array)
#     driver.quit()  # close the browser for this page
#     i = i + 1
# Status / CSV export for the legacy scraper:
# print(big_username_array)
# a = np.array(big_username_array)
# b = np.array(big_review_titles_array)
# df = pd.DataFrame({"Username": a, "ReviewTitle": b})
# df.to_csv("output.csv", index=False)
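# If the loop above is re-enabled, the export can skip the numpy round-trip,
# since pandas accepts plain lists directly. A sketch, assuming both lists
# end up the same length (pandas raises a ValueError otherwise):
if big_username_array and big_review_titles_array:
    df = pd.DataFrame({"Username": big_username_array,
                       "ReviewTitle": big_review_titles_array})
    df.to_csv("output.csv", index=False)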