Merge pull request #12 from MidraLab/fix/remove_drive

ayutaz · web-flow · commit 50ef8b0e9aae · 2023-12-03T18:34:06.000+09:00
chromedriverを使わない処理に置き換え
diff --git a/Dockerfile b/Dockerfile
@@ -1,54 +1,7 @@
 FROM python:3.10
 USER root
 
-# Install dependencies
-RUN apt-get update && apt-get install -y \
-    wget \
-    unzip \
-    libxss1 \
-    libappindicator1 \
-    libappindicator3-1 \
-    libasound2 \
-    libdbus-glib-1-2 \
-    libcairo2 \
-    libcups2 \
-    libfontconfig1 \
-    libgdk-pixbuf2.0-0 \
-    libgtk-3-0 \
-    libnspr4 \
-    libnss3 \
-    libpango-1.0-0 \
-    libpangocairo-1.0-0 \
-    libx11-xcb1 \
-    libxtst6 \
-    libxss1 \
-    libasound2 \
-    fonts-liberation \
-    xdg-utils \
-    libdrm2 \
-    libgbm1 \
-    libu2f-udev \
-    libvulkan1 \
-    --no-install-recommends
-
-# Install Google Chrome
-RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
-    && dpkg -i google-chrome-stable_current_amd64.deb \
-    && apt-get install -yf \
-    && rm google-chrome-stable_current_amd64.deb
-
-# Install ChromeDriver
-RUN CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE` \
-    && wget -N http://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P ~/ \
-    && unzip ~/chromedriver_linux64.zip -d ~/ \
-    && rm ~/chromedriver_linux64.zip \
-    && mv -f ~/chromedriver /usr/local/bin/chromedriver \
-    && chown root:root /usr/local/bin/chromedriver \
-    && chmod 0755 /usr/local/bin/chromedriver
-
-RUN pip install --upgrade pip
-RUN pip install --upgrade setuptools
-RUN python -m pip install selenium
-RUN python -m pip install notion-client
-RUN python -m pip install beautifulsoup4
-RUN python -m pip install python-dotenv
+# Install Python dependencies (--no-cache-dir keeps pip's download cache out of the image layer)
+COPY requirements.txt /tmp/
+RUN pip install --no-cache-dir --upgrade pip setuptools \
+ && pip install --no-cache-dir --requirement /tmp/requirements.txt
diff --git a/opt/main.py b/opt/main.py
@@ -1,39 +1,35 @@
+import asyncio
 from dotenv import load_dotenv
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
 import os
 import sys
 
 from zenn_scraper import ZennScraper
 from notion_manager import NotionManager
 
-if __name__ == "__main__":
+
+async def main():
     load_dotenv()
 
     notion_api_key = os.environ["MIDRA_LAB_NOTION_API"]
-    notion_database_id = os.environ["NOTION_DATABASE_URL"]
+    notion_database_id = os.environ["NOTION_DATABASE_ID"]
+    # TODO: adjust this list of usernames for whoever runs the script
+    usernames = ["keisuke114", "ayousanz"]
 
     publication_url = "https://zenn.dev/p/midra_lab"
 
-    options = Options()
-    options.add_argument('--headless')
-    options.add_argument('--log-level=3')
-    options.add_argument('--no-sandbox')
-    options.add_argument('--disable-dev-shm-usage')
-    driver = webdriver.Chrome(options=options)
-
     zenn_scraper = ZennScraper(publication_url)
-    zenn_scraper.get_articles(driver)
+    zenn_scraper.get_midra_lab_articles(usernames)
 
     if zenn_scraper.is_articles_empty():
-        driver.quit()
+        print("記事がありません。")
         sys.exit()
 
     notion_manager = NotionManager(notion_api_key, notion_database_id)
     notion_manager.delete_all_pages()
 
     for article in zenn_scraper.articles:
-        details = zenn_scraper.get_article_details(driver, article)
-        notion_manager.add_article(article['title'], article['url'], details['tags'], article['name'], details['date'])
+        notion_manager.add_article(article['title'], article['url'], article['name'], article['created_at'])
+
 
-    driver.quit()
+if __name__ == "__main__":
+    asyncio.run(main())  # main() is a coroutine function, so run it on an event loop
diff --git a/opt/notion_manager.py b/opt/notion_manager.py
@@ -1,37 +1,48 @@
 from concurrent.futures import ThreadPoolExecutor
-
 from notion_client import Client
 
 
 class NotionManager:
     def __init__(self, api_key, database_id):
+        """
+        Create the Notion API client and store the target database ID.
+        """
         self.notion = Client(auth=api_key)
         self.database_id = database_id
 
-    def add_article(self, title, url, tags, name, date):
+    def add_article(self, title, url, name, date):
+        """
+        Create a page for one article (Title, Link, Author, Date) in the Notion database.
+        """
         new_page = {
             "Title": {"title": [{"text": {"content": title}}]},
-            "Tags": {"multi_select": self.get_tags_and_remove_default_tag(tags)},
             "Link": {"url": url},
             "Author": {"rich_text": [{"text": {"content": name}}]},
             "Date": {"date": {"start": date}}
         }
         self.notion.pages.create(parent={"database_id": self.database_id}, properties=new_page)
 
-
-    # Get the list of tags with zenn's default tag removed from tags
     def get_tags_and_remove_default_tag(self, tags) -> list:
+        """
+        Drop the default tags (`tech`, `idea`) from the given tag list and
+        return the remaining tags as a list of `{"name": tag}` dicts.
+        """
         notion_tags = []
         for tag in tags:
             if tag not in ["tech", "idea"]:
                 notion_tags.append({"name": tag})
         return notion_tags
 
-    def delete_page(self,page_id):
+    def delete_page(self, page_id):
+        """
+        Delete the page with the given `page_id` by marking it archived.
+        """
         self.notion.pages.update(page_id=page_id, archived=True)
 
     def delete_all_pages(self):
+        """Archive every page from one `databases.query` call, in parallel threads.
+        NOTE(review): only one query is issued; confirm it returns every page.
+        """
         pages = self.notion.databases.query(database_id=self.database_id)
         with ThreadPoolExecutor() as executor:
-            # ページを並列で削除する
             [executor.submit(self.delete_page, page['id']) for page in pages['results']]
diff --git a/opt/zenn_scraper.py b/opt/zenn_scraper.py
@@ -1,39 +1,48 @@
-from bs4 import BeautifulSoup
+import requests
 from datetime import datetime
 
 
 class ZennScraper:
     def __init__(self, url):
+        """
+        Construct a ZennScraper.
+        Stores the given URL and starts with an empty article list.
+
+        Args:
+            url (str): Zenn URL kept on the instance as `self.url`.
+        """
         self.url = url
         self.articles = []
 
-    def get_articles(self, driver):
-        driver.get(self.url)
-
-        soup = BeautifulSoup(driver.page_source, 'html.parser')
-        articles = soup.find_all('article', class_='ArticleCard_container__3qUYt')
-
-        for article in articles:
-            title = article.find('h3', class_='ArticleCard_title__UnBHE').text
-            url = "https://zenn.dev" + article.find('a', class_='ArticleCard_mainLink__X2TOE')['href']
-            name = article.find('div', class_='ArticleCard_userName__1q_wZ').text
-            self.articles.append({'title': title, 'url': url, 'name': name})
+    def get_midra_lab_articles(self, usernames):
+        """Collect each user's Zenn articles published under "midra_lab" into self.articles.
+
+        Users whose request is not HTTP 200 and articles lacking "published_at" are skipped.
+        """
+        base_url = "https://zenn.dev/api/articles"
+        for username in usernames:
+            # A timeout keeps an unresponsive server from hanging the run forever.
+            response = requests.get(base_url, params={'username': username, 'order': 'latest'}, timeout=10)
+            if response.status_code != 200:
+                continue
+            for article in response.json()["articles"]:
+                publication = article.get("publication")
+                published_at = article.get("published_at")
+                # Without a date there is nothing to format: skip instead of reusing a stale value.
+                if not publication or publication.get("name") != "midra_lab" or not published_at:
+                    continue
+                date_obj = datetime.strptime(published_at, '%Y-%m-%dT%H:%M:%S.%f%z')
+                self.articles.append({
+                    'title': article["title"],
+                    'name': article["user"]["username"],
+                    'url': f"https://zenn.dev{article['path']}",
+                    'created_at': date_obj.strftime('%Y-%m-%d'),
+                })
 
     def is_articles_empty(self):
-        return len(self.articles) == 0
+        """
+        Report whether the article list is empty.
 
-    def get_article_details(self, driver, article):
-        driver.get(article['url'])
-
-        soup = BeautifulSoup(driver.page_source, 'html.parser')
-        date = soup.find('span', class_='ArticleHeader_num__rSDj6').text
-        date_obj = datetime.strptime(date, '%Y/%m/%d')
-        formatted_date = date_obj.strftime('%Y-%m-%d')
-        tags_container = soup.find('div', class_='View_topics__OVMdM')
-        tags = tags_container.find_all('div', class_='View_topicName__rxKth')
-
-        results = []
-        for tag in tags:
-            results.append(tag.text)
-
-        return {"tags": results, "date": formatted_date}
+        Returns:
+            bool: True if the list is empty, otherwise False.
+        """
+        return len(self.articles) == 0
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+notion-client==2.1.0
+python-dotenv==1.0.0
+requests

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+notion-client==2.1.0`
	`2`	`+python-dotenv==1.0.0`
	`3`	`+requests`