-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscraper.py
93 lines (93 loc) · 3.47 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole process.
pd.options.mode.chained_assignment = None
def delay(min_seconds=3, max_seconds=10):
    """Pause execution for a random whole number of seconds.

    Args:
        min_seconds: Shortest possible pause in seconds (inclusive integer).
            Defaults to 3.
        max_seconds: Longest possible pause in seconds (inclusive integer).
            Defaults to 10.

    Raises:
        ValueError: If ``min_seconds`` is greater than ``max_seconds``
            (propagated from ``random.randint``).
    """
    time.sleep(random.randint(min_seconds, max_seconds))
def lazy_loading(page_downs=20):
    """Scroll down the page currently open in the module-level ``driver``.

    Sends the PAGE_DOWN key to the page's ``<body>`` element ``page_downs``
    times and calls ``delay()`` after every key press, giving content that is
    loaded on scroll time to appear.

    Args:
        page_downs: Number of PAGE_DOWN key presses to send. Defaults to 20.
    """
    body = driver.find_element(By.TAG_NAME, 'body')
    for _ in range(page_downs):
        body.send_keys(Keys.PAGE_DOWN)
        delay()
def extract_content(url, base_url='https://silverstones.pk'):
    """Open a page in the module-level ``driver`` and return its parsed HTML.

    Args:
        url: Absolute URL, or a URL relative to ``base_url``.
        base_url: Site root used to resolve relative URLs. Defaults to the
            silverstones.pk home page.

    Returns:
        A ``BeautifulSoup`` tree built from the driver's page source, or
        ``None`` when loading or parsing the page raised an exception.
    """
    # Imported here so this function does not rely on the file's import header.
    from urllib.parse import urljoin

    # urljoin keeps absolute URLs as they are and resolves relative ones
    # correctly, including "path" (no leading slash) and "//host/path".
    url = urljoin(base_url, url)
    print("Attempting to visit:", url)
    try:
        driver.get(url)
        return BeautifulSoup(driver.page_source, 'html.parser')
    except Exception as e:
        # Best-effort by design: one failing page is reported and skipped so
        # the caller can carry on with the remaining pages.
        print(f"Error while visiting {url}: {e}")
        return None
def _text_or_na(element):
    """Return the element's stripped text, or 'N/A' when the element is None."""
    return 'N/A' if element is None else element.text.strip()


def extract_product_details(soup):
    """Collect the product fields from a parsed product page.

    Every field falls back to ``'N/A'`` when its element is not found.

    Args:
        soup: Parsed page exposing ``find`` (e.g. a ``BeautifulSoup`` object).

    Returns:
        A dict with the keys ``'Name'``, ``'Regular price'``, ``'SKU'``,
        ``'Description'``, ``'Categories'``, ``'Images'`` and ``'URL'``.
        ``'URL'`` is always ``'N/A'`` here; the caller is expected to set it.
    """
    image = soup.find('img', {'class': 'wp-post-image'})
    return {
        'Name': _text_or_na(soup.find('h1', {'class': 'product_title'})),
        'Regular price': _text_or_na(
            soup.find('span', {'class': 'woocommerce-Price-amount'})
        ),
        'SKU': _text_or_na(soup.find('span', {'class': 'sku'})),
        # The description tab fills 'Description'. It used to be stored under
        # a separate 'Long Description' key, which left 'Description' as
        # 'N/A' for every product.
        'Description': _text_or_na(soup.find(id='tab-description')),
        'Categories': _text_or_na(soup.find('span', {'class': 'posted_in'})),
        # .get() so an <img> without a src attribute yields 'N/A' instead of
        # raising KeyError.
        'Images': 'N/A' if image is None else image.get('src', 'N/A'),
        'URL': 'N/A',
    }
# Start pages that are crawled for product links.
urls = [
    'https://silverstones.pk',
]

# Module-level browser session shared by lazy_loading() and extract_content().
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
all_product_details = []
seen_hrefs = set()

try:
    for start_url in urls:
        driver.get(start_url)
        lazy_loading()
        homepage_soup = BeautifulSoup(driver.page_source, 'html.parser')
        product_links = homepage_soup.find_all(
            'a', {'class': 'woocommerce-LoopProduct-link'}
        )
        for link in product_links:
            # .get() so an anchor without an href is skipped instead of
            # raising KeyError.
            href = link.get('href')
            # Skip links already visited so a product that is linked more
            # than once is fetched and recorded a single time.
            if not href or href in seen_hrefs:
                continue
            seen_hrefs.add(href)

            product_content = extract_content(href)
            if product_content is None:
                continue
            product_details = extract_product_details(product_content)
            product_details['URL'] = href
            all_product_details.append(product_details)
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # current window), and the finally clause runs it even if scraping fails.
    driver.quit()

data = pd.DataFrame(all_product_details)
# Put a 1-based ID column in front of the scraped columns.
data.insert(0, 'ID', range(1, len(data) + 1))
data.to_csv('woocommerce_data.csv', index=False)
print("Scraping complete!")