Skip to content

Commit 50ef8b0

Browse files
authored
Merge pull request #12 from MidraLab/fix/remove_drive
chromedriverを使わない処理に置き換え
2 parents 2ca4b30 + 7853472 commit 50ef8b0

File tree

5 files changed

+74
-102
lines changed

5 files changed

+74
-102
lines changed

Dockerfile

Lines changed: 4 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,7 @@
11
FROM python:3.10
22
USER root
33

4-
# Install dependencies
5-
RUN apt-get update && apt-get install -y \
6-
wget \
7-
unzip \
8-
libxss1 \
9-
libappindicator1 \
10-
libappindicator3-1 \
11-
libasound2 \
12-
libdbus-glib-1-2 \
13-
libcairo2 \
14-
libcups2 \
15-
libfontconfig1 \
16-
libgdk-pixbuf2.0-0 \
17-
libgtk-3-0 \
18-
libnspr4 \
19-
libnss3 \
20-
libpango-1.0-0 \
21-
libpangocairo-1.0-0 \
22-
libx11-xcb1 \
23-
libxtst6 \
24-
libxss1 \
25-
libasound2 \
26-
fonts-liberation \
27-
xdg-utils \
28-
libdrm2 \
29-
libgbm1 \
30-
libu2f-udev \
31-
libvulkan1 \
32-
--no-install-recommends
33-
34-
# Install Google Chrome
35-
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
36-
&& dpkg -i google-chrome-stable_current_amd64.deb \
37-
&& apt-get install -yf \
38-
&& rm google-chrome-stable_current_amd64.deb
39-
40-
# Install ChromeDriver
41-
RUN CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE` \
42-
&& wget -N http://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P ~/ \
43-
&& unzip ~/chromedriver_linux64.zip -d ~/ \
44-
&& rm ~/chromedriver_linux64.zip \
45-
&& mv -f ~/chromedriver /usr/local/bin/chromedriver \
46-
&& chown root:root /usr/local/bin/chromedriver \
47-
&& chmod 0755 /usr/local/bin/chromedriver
48-
49-
RUN pip install --upgrade pip
50-
RUN pip install --upgrade setuptools
51-
RUN python -m pip install selenium
52-
RUN python -m pip install notion-client
53-
RUN python -m pip install beautifulsoup4
54-
RUN python -m pip install python-dotenv
4+
# Install the Python dependencies listed in requirements.txt
5+
COPY requirements.txt /tmp/
6+
RUN pip install --upgrade pip setuptools \
7+
&& pip install --requirement /tmp/requirements.txt

opt/main.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,35 @@
1+
import asyncio
12
from dotenv import load_dotenv
2-
from selenium import webdriver
3-
from selenium.webdriver.chrome.options import Options
43
import os
54
import sys
65

76
from zenn_scraper import ZennScraper
87
from notion_manager import NotionManager
98

10-
if __name__ == "__main__":
9+
10+
async def main():
    """Rebuild the Notion database from the Zenn articles of the listed users.

    Loads environment variables with ``load_dotenv()``, collects articles via
    ``ZennScraper.get_midra_lab_articles``, and, when at least one article was
    found, deletes every page of the Notion database and adds one page per
    article. When no article was found, a message is printed and the process
    exits without touching the database.

    Raises:
        KeyError: If ``MIDRA_LAB_NOTION_API`` or ``NOTION_DATABASE_ID`` is not
            present in the environment.
    """
    # NOTE(review): nothing in this coroutine is awaited; the scraper and the
    # Notion calls below are ordinary blocking calls.
    load_dotenv()

    notion_api_key = os.environ["MIDRA_LAB_NOTION_API"]
    notion_database_id = os.environ["NOTION_DATABASE_ID"]
    # TODO: change this list to match the users of the deployment.
    usernames = ["keisuke114", "ayousanz"]

    publication_url = "https://zenn.dev/p/midra_lab"

    zenn_scraper = ZennScraper(publication_url)
    zenn_scraper.get_midra_lab_articles(usernames)

    if zenn_scraper.is_articles_empty():
        print("記事がありません。")
        sys.exit()

    notion_manager = NotionManager(notion_api_key, notion_database_id)
    # The database is emptied first and then refilled from the fetched list.
    notion_manager.delete_all_pages()

    for article in zenn_scraper.articles:
        notion_manager.add_article(
            article['title'], article['url'], article['name'], article['created_at']
        )


if __name__ == "__main__":
    # Run only when executed as a script, so that importing this module does
    # not trigger the delete-and-rebuild of the Notion database.
    asyncio.run(main())

opt/notion_manager.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,48 @@
11
from concurrent.futures import ThreadPoolExecutor
2-
32
from notion_client import Client
43

54

65
class NotionManager:
    """Small wrapper around a Notion client bound to a single database."""

    def __init__(self, api_key, database_id):
        """Create the Notion API client and remember the target database ID.

        Args:
            api_key: Token passed to ``Client(auth=...)``.
            database_id: ID of the database whose pages are created/archived.
        """
        self.notion = Client(auth=api_key)
        self.database_id = database_id

    def add_article(self, title, url, name, date):
        """Create one page in the database describing an article.

        Args:
            title: Written to the ``Title`` title property.
            url: Written to the ``Link`` url property.
            name: Written to the ``Author`` rich-text property.
            date: Written as the ``start`` of the ``Date`` date property.
        """
        new_page = {
            "Title": {"title": [{"text": {"content": title}}]},
            "Link": {"url": url},
            "Author": {"rich_text": [{"text": {"content": name}}]},
            "Date": {"date": {"start": date}},
        }
        self.notion.pages.create(
            parent={"database_id": self.database_id}, properties=new_page
        )

    def get_tags_and_remove_default_tag(self, tags) -> list:
        """Convert tag names to ``{"name": tag}`` dicts, dropping default tags.

        The tags ``tech`` and ``idea`` are excluded; the order of the
        remaining tags is preserved.

        Args:
            tags: Iterable of tag names.

        Returns:
            A list of ``{"name": tag}`` dictionaries.
        """
        return [{"name": tag} for tag in tags if tag not in ("tech", "idea")]

    def delete_page(self, page_id):
        """Archive the page identified by ``page_id``."""
        self.notion.pages.update(page_id=page_id, archived=True)

    def delete_all_pages(self):
        """Archive every page of the database, deleting pages in parallel.

        The database query is paginated, so the query is repeated with the
        returned cursor until the response no longer reports more results.
        All page IDs are collected before any page is archived so that the
        deletions cannot disturb the pagination.

        Raises:
            Exception: Whatever ``delete_page`` raised for a page; failures
                are propagated instead of being silently dropped.
        """
        page_ids = []
        query = {"database_id": self.database_id}
        while True:
            response = self.notion.databases.query(**query)
            page_ids.extend(page['id'] for page in response['results'])
            if not response.get("has_more"):
                break
            query["start_cursor"] = response["next_cursor"]

        with ThreadPoolExecutor() as executor:
            # Consuming the iterator re-raises any exception from a worker.
            list(executor.map(self.delete_page, page_ids))

opt/zenn_scraper.py

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,48 @@
1-
from bs4 import BeautifulSoup
1+
import requests
22
from datetime import datetime
33

44

55
class ZennScraper:
    """Collects the articles of a Zenn publication through the articles API."""

    # Timestamp layouts accepted for ``published_at``; the first one is the
    # layout with fractional seconds, the second one the same without them.
    _TIMESTAMP_FORMATS = ('%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%dT%H:%M:%S%z')

    def __init__(self, url):
        """Store the publication URL and start with an empty article list.

        Args:
            url (str): URL of the Zenn publication.
        """
        self.url = url
        self.articles = []

    def get_midra_lab_articles(self, usernames, publication_name="midra_lab", timeout=10):
        """Fetch each user's articles and keep those of the publication.

        For every username the articles API is queried (latest first). Each
        returned article whose ``publication`` name equals
        ``publication_name`` and that has a ``published_at`` value is appended
        to ``self.articles`` as a dict with the keys ``title``, ``name``,
        ``url`` and ``created_at`` (formatted ``YYYY-MM-DD``).

        Args:
            usernames: Iterable of Zenn usernames to query.
            publication_name: Publication name an article must belong to.
            timeout: Seconds passed to ``requests.get`` as ``timeout``.

        Raises:
            ValueError: If a ``published_at`` value matches none of the
                supported timestamp layouts.
        """
        base_url = "https://zenn.dev/api/articles"

        for username in usernames:
            page = None
            while True:
                params = {'username': username, 'order': 'latest'}
                if page is not None:
                    params['page'] = page
                response = requests.get(base_url, params=params, timeout=timeout)
                if response.status_code != 200:
                    # Keep going with the other users, but make the gap visible.
                    print(f"Failed to fetch articles for {username}: HTTP {response.status_code}")
                    break

                data = response.json()
                for article in data["articles"]:
                    article_info = self._to_article_info(article, publication_name)
                    if article_info is not None:
                        self.articles.append(article_info)

                # NOTE(review): assumes the API reports further pages in a
                # `next_page` field; when it is absent or empty only the pages
                # read so far are used. Confirm against the live response.
                page = data.get("next_page")
                if not page:
                    break

    @staticmethod
    def _to_article_info(article, publication_name):
        """Return the article summary dict, or None if the article is skipped.

        An article is skipped when it has no publication, when the publication
        name differs from ``publication_name``, or when ``published_at`` is
        missing or empty.
        """
        publication = article.get("publication")
        if not publication or publication.get("name") != publication_name:
            return None

        published_at = article.get("published_at")
        if not published_at:
            return None

        return {
            'title': article["title"],
            'name': article["user"]["username"],
            'url': f"https://zenn.dev{article['path']}",
            'created_at': ZennScraper._format_published_date(published_at),
        }

    @staticmethod
    def _format_published_date(published_at):
        """Return the ``YYYY-MM-DD`` date of a timestamp string.

        Raises:
            ValueError: If the string matches none of ``_TIMESTAMP_FORMATS``.
        """
        for timestamp_format in ZennScraper._TIMESTAMP_FORMATS:
            try:
                parsed = datetime.strptime(published_at, timestamp_format)
            except ValueError:
                continue
            return parsed.strftime('%Y-%m-%d')
        raise ValueError(f"Unsupported published_at format: {published_at!r}")

    def is_articles_empty(self):
        """Report whether no article has been collected.

        Returns:
            bool: True if the article list is empty, otherwise False.
        """
        return not self.articles

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
notion-client==2.1.0
2+
python-dotenv==1.0.0
3+
requests

0 commit comments

Comments
 (0)