Skip to content

Add Pandas v2 support, improve naming in example, add type hinting, upgrade gitignore #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,28 @@
venv
venv/
.idea
example.py
parents.csv
children.csv
images.csv
products.csv
variants.csv
images.csv

**/__pycache__/

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
29 changes: 20 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,37 @@ To install ShopifyScraper, run the following command:
pip3 install git+https://github.com/practical-data-science/ShopifyScraper.git
```

#### Development
To install ShopifyScraper in development mode, run the following commands:

```bash
git clone https://github.com/practical-data-science/ShopifyScraper.git
cd ShopifyScraper
pip3 install -e .
```

### Usage

```python
from shopify_scraper import scraper

url = "https://yourshopifydomain.com"

parents = scraper.get_products(url)
parents.to_csv('parents.csv', index=False)
print('Parents: ', len(parents))
products = scraper.get_products(url)
products.to_csv('products.csv', index=False)
print('Products: count=', len(products))


children = scraper.get_variants(parents)
children.to_csv('children.csv', index=False)
print('Children: ', len(children))
variants = scraper.get_variants(products)
variants.to_csv('variants.csv', index=False)
print('Variants: count=', len(variants))


images = scraper.get_images(parents)
images = scraper.get_images(products)
images.to_csv('images.csv', index=False)
print('Images: ', len(images))

print('Images: count=', len(images))
```

Note that variants have a many-to-one relationship with products: each variant belongs to exactly one product, joined on `variant.product_id = product.id`.

Note that images have a many-to-one relationship with products: each image belongs to exactly one product, joined on `image.product_id = product.id`.
10 changes: 10 additions & 0 deletions shopify_scraper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

# Re-export the public scraper functions so that every name listed in
# __all__ is actually bound in the package namespace. Without this import,
# `from shopify_scraper import *` raises AttributeError, because the names
# are neither attributes of the package nor importable submodules.
from .scraper import get_images, get_json, get_products, get_variants

# Public API of the package: the names exported by `from shopify_scraper import *`.
__all__ = [
    'get_json',
    'get_products',
    'get_variants',
    'get_images',
]

# excluded from __all__:
# '_flatten_column_to_dataframe', '_products_json_to_df'
70 changes: 21 additions & 49 deletions shopify_scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import pandas as pd
import requests


def get_json(url, page):
def get_json(url: str, page: int) -> str:
"""
Get Shopify products.json from a store URL.

Expand All @@ -20,26 +19,12 @@ def get_json(url, page):
products_json: Products.json from the store.
"""

try:
response = requests.get(f'{url}/products.json?limit=250&page={page}', timeout=5)
products_json = response.text
response.raise_for_status()
return products_json

except requests.exceptions.HTTPError as error_http:
print("HTTP Error:", error_http)

except requests.exceptions.ConnectionError as error_connection:
print("Connection Error:", error_connection)

except requests.exceptions.Timeout as error_timeout:
print("Timeout Error:", error_timeout)
response = requests.get(f'{url}/products.json?limit=250&page={page}', timeout=5)
products_json = response.text
response.raise_for_status()
return products_json

except requests.exceptions.RequestException as error:
print("Error: ", error)


def to_df(products_json):
def _products_json_to_df(products_json: str) -> pd.DataFrame:
"""
Convert products.json to a pandas DataFrame.

Expand All @@ -49,15 +34,11 @@ def to_df(products_json):
df: Pandas DataFrame of the products.json.
"""

try:
products_dict = json.loads(products_json)
df = pd.DataFrame.from_dict(products_dict['products'])
return df
except Exception as e:
print(e)

products_dict = json.loads(products_json)
df = pd.DataFrame.from_dict(products_dict['products'])
return df

def get_products(url):
def get_products(url: str) -> pd.DataFrame:
"""
Get all products from a store.

Expand All @@ -71,7 +52,7 @@ def get_products(url):

while results:
products_json = get_json(url, page)
products_dict = to_df(products_json)
products_dict = _products_json_to_df(products_json)

if len(products_dict) == 0:
break
Expand All @@ -82,9 +63,8 @@ def get_products(url):
df['url'] = f"{url}/products/" + df['handle']
return df


def get_variants(products):
"""Get variants from a list of products.
def get_variants(products: pd.DataFrame) -> pd.DataFrame:
"""Get variants from a table of products.

Args:
products (pd.DataFrame): Pandas dataframe of products from get_products()
Expand All @@ -97,19 +77,17 @@ def get_variants(products):
df_variants = pd.DataFrame()

for row in products.itertuples(index='True'):

for variant in getattr(row, 'variants'):
df_variants = pd.concat([df_variants, pd.DataFrame.from_records(variant, index=[0])])

df_variants['id'].astype(int)
df_variants['product_id'].astype(int)
df_parent_data = products[['id', 'title', 'vendor']]
df_parent_data = df_parent_data.rename(columns={'title': 'parent_title', 'id': 'parent_id'})
df_variants = df_variants.merge(df_parent_data, left_on='product_id', right_on='parent_id')
df_product_data = products[['id', 'title', 'vendor']]
df_product_data = df_product_data.rename(columns={'title': 'product_title', 'id': 'product_id', 'vendor': 'product_vendor'})
df_variants = df_variants.merge(df_product_data, left_on='product_id', right_on='product_id', validate='m:1')
return df_variants


def json_list_to_df(df, col):
def _flatten_column_to_dataframe(df: pd.DataFrame, col: str) -> pd.DataFrame:
"""Return a Pandas dataframe based on a column that contains a list of JSON objects.

Args:
Expand All @@ -120,15 +98,10 @@ def json_list_to_df(df, col):
Pandas dataframe: A new dataframe with the JSON objects expanded into columns.
"""

rows = []
for index, row in df[col].iteritems():
for item in row:
rows.append(item)
df = pd.DataFrame(rows)
return df

rows = [item for row in df[col] for item in row]
return pd.DataFrame(rows)

def get_images(df_products):
def get_images(df_products: pd.DataFrame) -> pd.DataFrame:
"""Get images from a list of products.

Args:
Expand All @@ -138,5 +111,4 @@ def get_images(df_products):
images (pd.DataFrame): Pandas dataframe of images
"""

return json_list_to_df(df_products, 'images')

return _flatten_column_to_dataframe(df_products, 'images')