tmp

yu23ki14 · yu23ki14 · commit 5fe1b489842e · 2025-07-11T16:09:42.000+09:00
diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py
@@ -221,6 +221,43 @@ class RowNoteStatusRecord(Base):
     timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
 
 
+class RowNoteRatingRecord(Base):  # ORM model for table "row_note_ratings"; at most one row per (note_id, rater_participant_id)
+    __tablename__ = "row_note_ratings"
+
+    note_id: Mapped[NoteId] = mapped_column(primary_key=True)  # composite primary key, part 1 of 2
+    rater_participant_id: Mapped[ParticipantId] = mapped_column(primary_key=True)  # composite primary key, part 2 of 2
+    created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False)  # presumably epoch milliseconds, going by the name; TODO confirm
+    version: Mapped[int] = mapped_column(nullable=False)  # NOTE(review): looks like the rating-form version; verify against the upstream data
+    agree: Mapped[BinaryBool] = mapped_column(nullable=False)  # every non-key column in this class is required (nullable=False)
+    disagree: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpfulness_level: Mapped[String] = mapped_column(nullable=False)  # the only string-typed rating column; the rest are BinaryBool flags
+    helpful_other: Mapped[BinaryBool] = mapped_column(nullable=False)  # helpful_*: presumably per-reason tags chosen by the rater; verify against the ratings TSV
+    helpful_informative: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful_clear: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful_empathetic: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful_good_sources: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful_unique_context: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful_addresses_claim: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful_important_context: Mapped[BinaryBool] = mapped_column(nullable=False)
+    helpful_unbiased_language: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_other: Mapped[BinaryBool] = mapped_column(nullable=False)  # not_helpful_*: presumably per-reason tags as well; TODO confirm
+    not_helpful_incorrect: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_sources_missing_or_unreliable: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_opinion_speculation_or_bias: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_missing_key_points: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_outdated: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_hard_to_understand: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_argumentative_or_biased: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_off_topic: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_spam_harassment_or_abuse: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_irrelevant_sources: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_opinion_speculation: Mapped[BinaryBool] = mapped_column(nullable=False)
+    not_helpful_note_not_needed: Mapped[BinaryBool] = mapped_column(nullable=False)
+    rated_on_tweet_id: Mapped[PostId] = mapped_column(nullable=False)  # typed as PostId; no ForeignKey is declared on this column
+
+
 class RowPostRecord(Base):
     __tablename__ = "row_posts"
 
@@ -237,6 +274,7 @@ class RowPostRecord(Base):
     lang: Mapped[String] = mapped_column()
     row_notes: Mapped["RowNoteRecord"] = relationship("RowNoteRecord", back_populates="row_post")
     user: Mapped["RowUserRecord"] = relationship("RowUserRecord", back_populates="row_post")
+    extracted_at: Mapped[TwitterTimestamp] = mapped_column(nullable=False)
 
 
 class RowPostMediaRecord(Base):
diff --git a/etl/src/birdxplorer_etl/constants.py b/etl/src/birdxplorer_etl/constants.py
@@ -0,0 +1,70 @@
+"""
+Shared constants for BirdXplorer ETL pipeline
+"""
+
+# Keywords used to filter notes by summary content (Japanese political terms); a note is kept if ANY keyword matches.
+TARGET_KEYWORDS = [  # each entry is used as a case-insensitive substring pattern (ilike "%keyword%") by the extract step
+    "公明党",  # political parties: full names followed by common abbreviations
+    "国民民主党",
+    "国民",
+    "民主",
+    "参政党",
+    "社会民主党",
+    "社民",
+    "自由民主党",
+    "自民",
+    "日本維新の会",
+    "維新",
+    "日本保守党",
+    "保守党",
+    "日本共産党",
+    "共産",
+    "みんなでつくる党",
+    "立憲民主党",
+    "立憲",
+    "れいわ新選組",
+    "れいわ",
+    "NHK",
+    "斉藤 鉄夫",  # NOTE(review): spaced full names only match text written with the same space; the bare surnames below cover the unspaced form
+    "玉木雄一郎",
+    "神谷 宗幣",
+    "福島 瑞穂",
+    "石破 茂",
+    "吉村 洋文",
+    "百田 尚樹",
+    "田村 智子",
+    "大津 綾香",
+    "野田 佳彦",
+    "山本 太郎",
+    "立花 孝志",
+    "斉藤",  # surnames of the people listed above
+    "玉木",
+    "神谷",
+    "福島",
+    "石破",
+    "吉村",
+    "百田",
+    "田村",
+    "大津",
+    "野田",
+    "山本",
+    "立花",
+    "選挙",  # general election vocabulary
+    "参議院",
+    "参院",
+    "投票",
+    "開票",
+    "期日前",
+    "演説",
+    "政党",
+    "当選",
+    "落選",
+    "チームみらい",  # additional parties and their representatives
+    "みらい",
+    "再生の道",
+    "再生",
+    "安野 貴博",
+    "安野",
+    "石丸 伸二",
+    "石丸",
+]
diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py
@@ -1,9 +1,12 @@
 import csv
 import logging
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 import requests
 import stringcase
+import zipfile
+import io
 from sqlalchemy.orm import Session
+import sqlalchemy
 from lib.x.postlookup import lookup
 from birdxplorer_common.storage import (
     RowNoteRecord,
@@ -12,8 +15,10 @@
     RowUserRecord,
     RowNoteStatusRecord,
     RowPostEmbedURLRecord,
+    RowNoteRatingRecord,
 )
 import settings
+from constants import TARGET_KEYWORDS
 
 
 def extract_data(sqlite: Session, postgresql: Session):
@@ -36,7 +41,7 @@ def extract_data(sqlite: Session, postgresql: Session):
             break
 
         dateString = date.strftime("%Y/%m/%d")
-        note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
+        note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.zip"
         if settings.USE_DUMMY_DATA:
             note_url = (
                 "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
@@ -46,48 +51,125 @@ def extract_data(sqlite: Session, postgresql: Session):
         res = requests.get(note_url)
 
         if res.status_code == 200:
-            # res.contentをsqliteのNoteテーブル
-            tsv_data = res.content.decode("utf-8").splitlines()
-            reader = csv.DictReader(tsv_data, delimiter="\t")
-            reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
-
-            rows_to_add = []
-            for index, row in enumerate(reader):
-                if sqlite.query(RowNoteRecord).filter(RowNoteRecord.note_id == row["note_id"]).first():
-                    continue
-                rows_to_add.append(RowNoteRecord(**row))
-                if index % 1000 == 0:
-                    sqlite.bulk_save_objects(rows_to_add)
-                    rows_to_add = []
-            sqlite.bulk_save_objects(rows_to_add)
-
-            status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
             if settings.USE_DUMMY_DATA:
-                status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
-
-            logging.info(status_url)
-            res = requests.get(status_url)
-
-            if res.status_code == 200:
+                # Handle dummy data as TSV
                 tsv_data = res.content.decode("utf-8").splitlines()
                 reader = csv.DictReader(tsv_data, delimiter="\t")
                 reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
 
                 rows_to_add = []
                 for index, row in enumerate(reader):
-                    for key, value in list(row.items()):
-                        if value == "":
-                            row[key] = None
-                    status = (
-                        sqlite.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).first()
-                    )
-                    if status is None or status.created_at_millis > int(datetime.now().timestamp() * 1000):
-                        sqlite.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).delete()
-                        rows_to_add.append(RowNoteStatusRecord(**row))
+                    if sqlite.query(RowNoteRecord).filter(RowNoteRecord.note_id == row["note_id"]).first():
+                        continue
+                    rows_to_add.append(RowNoteRecord(**row))
                     if index % 1000 == 0:
                         sqlite.bulk_save_objects(rows_to_add)
                         rows_to_add = []
                 sqlite.bulk_save_objects(rows_to_add)
+            else:
+                # Handle real data as zip file
+                try:
+                    with zipfile.ZipFile(io.BytesIO(res.content)) as zip_file:
+                        file_names = zip_file.namelist()
+                        if file_names:
+                            tsv_file_name = file_names[0]
+                            with zip_file.open(tsv_file_name) as tsv_file:
+                                tsv_data = tsv_file.read().decode("utf-8").splitlines()
+                                reader = csv.DictReader(tsv_data, delimiter="\t")
+                                reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
+
+                                rows_to_add = []
+                                for index, row in enumerate(reader):
+                                    if (
+                                        sqlite.query(RowNoteRecord)
+                                        .filter(RowNoteRecord.note_id == row["note_id"])
+                                        .first()
+                                    ):
+                                        continue
+                                    rows_to_add.append(RowNoteRecord(**row))
+                                    if index % 1000 == 0:
+                                        sqlite.bulk_save_objects(rows_to_add)
+                                        rows_to_add = []
+                                sqlite.bulk_save_objects(rows_to_add)
+                except zipfile.BadZipFile:
+                    logging.error(f"Invalid zip file from {note_url}")
+                    continue
+                except Exception as e:
+                    logging.error(f"Error processing note data from {note_url}: {e}")
+                    continue
+
+            status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.zip"
+            if settings.USE_DUMMY_DATA:
+                status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
+
+            logging.info(status_url)
+            res = requests.get(status_url)
+
+            if res.status_code == 200:
+                if settings.USE_DUMMY_DATA:
+                    # Handle dummy data as TSV
+                    tsv_data = res.content.decode("utf-8").splitlines()
+                    reader = csv.DictReader(tsv_data, delimiter="\t")
+                    reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
+
+                    rows_to_add = []
+                    for index, row in enumerate(reader):
+                        for key, value in list(row.items()):
+                            if value == "":
+                                row[key] = None
+                        status = (
+                            sqlite.query(RowNoteStatusRecord)
+                            .filter(RowNoteStatusRecord.note_id == row["note_id"])
+                            .first()
+                        )
+                        if status is None or status.created_at_millis > int(datetime.now().timestamp() * 1000):
+                            sqlite.query(RowNoteStatusRecord).filter(
+                                RowNoteStatusRecord.note_id == row["note_id"]
+                            ).delete()
+                            rows_to_add.append(RowNoteStatusRecord(**row))
+                        if index % 1000 == 0:
+                            sqlite.bulk_save_objects(rows_to_add)
+                            rows_to_add = []
+                    sqlite.bulk_save_objects(rows_to_add)
+                else:
+                    # Handle real data as zip file
+                    try:
+                        with zipfile.ZipFile(io.BytesIO(res.content)) as zip_file:
+                            file_names = zip_file.namelist()
+                            if file_names:
+                                tsv_file_name = file_names[0]
+                                with zip_file.open(tsv_file_name) as tsv_file:
+                                    tsv_data = tsv_file.read().decode("utf-8").splitlines()
+                                    reader = csv.DictReader(tsv_data, delimiter="\t")
+                                    reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
+
+                                    rows_to_add = []
+                                    for index, row in enumerate(reader):
+                                        for key, value in list(row.items()):
+                                            if value == "":
+                                                row[key] = None
+                                        status = (
+                                            sqlite.query(RowNoteStatusRecord)
+                                            .filter(RowNoteStatusRecord.note_id == row["note_id"])
+                                            .first()
+                                        )
+                                        if status is None or status.created_at_millis > int(
+                                            datetime.now().timestamp() * 1000
+                                        ):
+                                            sqlite.query(RowNoteStatusRecord).filter(
+                                                RowNoteStatusRecord.note_id == row["note_id"]
+                                            ).delete()
+                                            rows_to_add.append(RowNoteStatusRecord(**row))
+                                        if index % 1000 == 0:
+                                            sqlite.bulk_save_objects(rows_to_add)
+                                            rows_to_add = []
+                                    sqlite.bulk_save_objects(rows_to_add)
+                    except zipfile.BadZipFile:
+                        logging.error(f"Invalid zip file from {status_url}")
+                        continue
+                    except Exception as e:
+                        logging.error(f"Error processing note status data from {status_url}: {e}")
+                        continue
 
                 break
 
@@ -96,11 +178,20 @@ def extract_data(sqlite: Session, postgresql: Session):
     sqlite.commit()
 
     # Noteに紐づくtweetデータを取得
+    # Build keyword filter conditions using shared TARGET_KEYWORDS
+    keyword_conditions = []
+    for keyword in TARGET_KEYWORDS:
+        keyword_conditions.append(RowNoteRecord.summary.ilike(f"%{keyword}%"))
+
     postExtract_targetNotes = (
         sqlite.query(RowNoteRecord)
         .filter(RowNoteRecord.tweet_id != None)
         .filter(RowNoteRecord.created_at_millis >= settings.TARGET_TWITTER_POST_START_UNIX_MILLISECOND)
         .filter(RowNoteRecord.created_at_millis <= settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND)
+        .filter(
+            # Use OR condition to match any of the keywords
+            sqlalchemy.or_(*keyword_conditions)
+        )
         .all()
     )
     logging.info(f"Target notes: {len(postExtract_targetNotes)}")
@@ -119,8 +210,9 @@ def extract_data(sqlite: Session, postgresql: Session):
         if post == None or "data" not in post:
             continue
 
-        created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
+        created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
         created_at_millis = int(created_at.timestamp() * 1000)
+        now_millis = int(datetime.now(timezone.utc).timestamp() * 1000)
 
         is_userExist = (
             postgresql.query(RowUserRecord).filter(RowUserRecord.user_id == post["data"]["author_id"]).first()
@@ -166,6 +258,7 @@ def extract_data(sqlite: Session, postgresql: Session):
             quote_count=post["data"]["public_metrics"]["quote_count"],
             reply_count=post["data"]["public_metrics"]["reply_count"],
             lang=post["data"]["lang"],
+            extracted_at=now_millis,
         )
         postgresql.add(row_post)
 
diff --git a/etl/src/birdxplorer_etl/lib/sqlite/init.py b/etl/src/birdxplorer_etl/lib/sqlite/init.py
@@ -12,14 +12,22 @@
     RowPostEmbedURLRecord,
     RowNoteStatusRecord,
     RowPostMediaRecord,
+    RowNoteRatingRecord,
 )
 
 
 def init_sqlite():
     # ToDo: dbファイルをS3など外部に置く必要がある。
     db_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "data", "note.db"))
     logging.info(f"Initializing database at {db_path}")
-    engine = create_engine("sqlite:///" + db_path)
+    engine = create_engine(
+        "sqlite:///" + db_path,
+        pool_size=20,
+        max_overflow=30,
+        pool_timeout=60,
+        pool_recycle=3600,
+        connect_args={"check_same_thread": False, "timeout": 60},
+    )
 
     # 一時データベースのテーブル作成する
     # ToDo: noteテーブル以外に必要なものを追加
@@ -29,6 +37,9 @@ def init_sqlite():
     if not inspect(engine).has_table("row_note_status"):
         logging.info("Creating table note_status")
         RowNoteStatusRecord.metadata.create_all(engine)
+    if not inspect(engine).has_table("row_note_ratings"):
+        logging.info("Creating table note_ratings")
+        RowNoteRatingRecord.metadata.create_all(engine)
 
     Session = sessionmaker(bind=engine)
 
@@ -57,6 +68,9 @@ def init_postgresql():
     if not inspect(engine).has_table("row_post_media"):
         logging.info("Creating table post_media")
         RowPostMediaRecord.metadata.create_all(engine)
+    if not inspect(engine).has_table("row_note_ratings"):
+        logging.info("Creating table note_ratings")
+        RowNoteRatingRecord.metadata.create_all(engine)
 
     Session = sessionmaker(bind=engine)
 
diff --git a/etl/src/birdxplorer_etl/transform.py b/etl/src/birdxplorer_etl/transform.py
diff --git a/migrate/migration/versions/c356b162f2f7_test.py b/migrate/migration/versions/c356b162f2f7_test.py