Skip to content

Commit 5fe1b48

Browse files
committed
tmp
1 parent 437fe89 commit 5fe1b48

File tree

6 files changed

+336
-39
lines changed

6 files changed

+336
-39
lines changed

common/birdxplorer_common/storage.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,43 @@ class RowNoteStatusRecord(Base):
221221
timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
222222

223223

224+
class RowNoteRatingRecord(Base):
    """ORM model for the ``row_note_ratings`` table.

    One row is identified by the pair (``note_id``, ``rater_participant_id``),
    which together form the composite primary key. Every other column is
    declared ``nullable=False``, so a row cannot be stored with any field
    missing.

    NOTE(review): the column names look like a snake_case mirror of the
    Community Notes ratings export -- confirm against the loader that
    populates this table.
    """

    __tablename__ = "row_note_ratings"

    # Composite primary key: the rated note and the participant who rated it.
    note_id: Mapped[NoteId] = mapped_column(primary_key=True)
    rater_participant_id: Mapped[ParticipantId] = mapped_column(primary_key=True)

    # Creation timestamp (the name says milliseconds) and an integer version.
    created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False)
    version: Mapped[int] = mapped_column(nullable=False)

    # Top-level rating flags plus a free-form helpfulness level string.
    agree: Mapped[BinaryBool] = mapped_column(nullable=False)
    disagree: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful: Mapped[BinaryBool] = mapped_column(nullable=False)
    # NOTE(review): required here; verify that every source row actually
    # carries a helpfulness level before loading, otherwise inserts will fail.
    helpfulness_level: Mapped[String] = mapped_column(nullable=False)

    # "helpful_*" reason flags.
    helpful_other: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_informative: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_clear: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_empathetic: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_good_sources: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_unique_context: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_addresses_claim: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_important_context: Mapped[BinaryBool] = mapped_column(nullable=False)
    helpful_unbiased_language: Mapped[BinaryBool] = mapped_column(nullable=False)

    # "not_helpful_*" reason flags.
    not_helpful_other: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_incorrect: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_sources_missing_or_unreliable: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_opinion_speculation_or_bias: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_missing_key_points: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_outdated: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_hard_to_understand: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_argumentative_or_biased: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_off_topic: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_spam_harassment_or_abuse: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_irrelevant_sources: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_opinion_speculation: Mapped[BinaryBool] = mapped_column(nullable=False)
    not_helpful_note_not_needed: Mapped[BinaryBool] = mapped_column(nullable=False)

    # Identifier of the post (tweet) the rating was made on.
    rated_on_tweet_id: Mapped[PostId] = mapped_column(nullable=False)
259+
260+
224261
class RowPostRecord(Base):
225262
__tablename__ = "row_posts"
226263

@@ -237,6 +274,7 @@ class RowPostRecord(Base):
237274
lang: Mapped[String] = mapped_column()
238275
row_notes: Mapped["RowNoteRecord"] = relationship("RowNoteRecord", back_populates="row_post")
239276
user: Mapped["RowUserRecord"] = relationship("RowUserRecord", back_populates="row_post")
277+
extracted_at: Mapped[TwitterTimestamp] = mapped_column(nullable=False)
240278

241279

242280
class RowPostMediaRecord(Base):
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""
2+
Shared constants for BirdXplorer ETL pipeline
3+
"""
4+
5+
# Keywords to filter notes by summary content (Japanese political terms).
#
# The extract step turns every entry into a case-insensitive substring test
# (``summary ILIKE '%<keyword>%'``) and ORs all of them together, so a note is
# selected as soon as its summary contains any single entry. Consequently a
# short entry (for example a family name) already matches every longer entry
# that contains it; the longer forms are kept for readability and so that the
# list still works if the short forms are ever removed.
TARGET_KEYWORDS = [
    # Parties and their common abbreviations
    "公明党",
    "国民民主党",
    "国民",
    "民主",
    "参政党",
    "社会民主党",
    "社民",
    "自由民主党",
    "自民",
    "日本維新の会",
    "維新",
    "日本保守党",
    "保守党",
    "日本共産党",
    "共産",
    "みんなでつくる党",
    "立憲民主党",
    "立憲",
    "れいわ新選組",
    "れいわ",
    "NHK",
    # Individuals: full names
    "斉藤 鉄夫",
    "玉木雄一郎",
    "神谷 宗幣",
    "福島 瑞穂",
    "石破 茂",
    "吉村 洋文",
    "百田 尚樹",
    "田村 智子",
    "大津 綾香",
    "野田 佳彦",
    "山本 太郎",
    "立花 孝志",
    # Individuals: family names only
    "斉藤",
    "玉木",
    "神谷",
    "福島",
    "石破",
    "吉村",
    "百田",
    "田村",
    "大津",
    "野田",
    "山本",
    "立花",
    # Election-related terms
    "選挙",
    "参議院",
    "参院",
    "投票",
    "開票",
    "期日前",
    "演説",
    "政党",
    "当選",
    "落選",
    # Additional parties and individuals
    "チームみらい",
    "みらい",
    "再生の道",
    "再生",
    "安野 貴博",
    "安野",
    "石丸 伸二",  # spelling corrected (was "信二")
    "石丸",
]

etl/src/birdxplorer_etl/extract.py

Lines changed: 127 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import csv
22
import logging
3-
from datetime import datetime, timedelta
3+
from datetime import datetime, timedelta, timezone
44
import requests
55
import stringcase
6+
import zipfile
7+
import io
68
from sqlalchemy.orm import Session
9+
import sqlalchemy
710
from lib.x.postlookup import lookup
811
from birdxplorer_common.storage import (
912
RowNoteRecord,
@@ -12,8 +15,10 @@
1215
RowUserRecord,
1316
RowNoteStatusRecord,
1417
RowPostEmbedURLRecord,
18+
RowNoteRatingRecord,
1519
)
1620
import settings
21+
from constants import TARGET_KEYWORDS
1722

1823

1924
def extract_data(sqlite: Session, postgresql: Session):
@@ -36,7 +41,7 @@ def extract_data(sqlite: Session, postgresql: Session):
3641
break
3742

3843
dateString = date.strftime("%Y/%m/%d")
39-
note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
44+
note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.zip"
4045
if settings.USE_DUMMY_DATA:
4146
note_url = (
4247
"https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
@@ -46,48 +51,125 @@ def extract_data(sqlite: Session, postgresql: Session):
4651
res = requests.get(note_url)
4752

4853
if res.status_code == 200:
49-
# res.contentをsqliteのNoteテーブル
50-
tsv_data = res.content.decode("utf-8").splitlines()
51-
reader = csv.DictReader(tsv_data, delimiter="\t")
52-
reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
53-
54-
rows_to_add = []
55-
for index, row in enumerate(reader):
56-
if sqlite.query(RowNoteRecord).filter(RowNoteRecord.note_id == row["note_id"]).first():
57-
continue
58-
rows_to_add.append(RowNoteRecord(**row))
59-
if index % 1000 == 0:
60-
sqlite.bulk_save_objects(rows_to_add)
61-
rows_to_add = []
62-
sqlite.bulk_save_objects(rows_to_add)
63-
64-
status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
6554
if settings.USE_DUMMY_DATA:
66-
status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
67-
68-
logging.info(status_url)
69-
res = requests.get(status_url)
70-
71-
if res.status_code == 200:
55+
# Handle dummy data as TSV
7256
tsv_data = res.content.decode("utf-8").splitlines()
7357
reader = csv.DictReader(tsv_data, delimiter="\t")
7458
reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
7559

7660
rows_to_add = []
7761
for index, row in enumerate(reader):
78-
for key, value in list(row.items()):
79-
if value == "":
80-
row[key] = None
81-
status = (
82-
sqlite.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).first()
83-
)
84-
if status is None or status.created_at_millis > int(datetime.now().timestamp() * 1000):
85-
sqlite.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).delete()
86-
rows_to_add.append(RowNoteStatusRecord(**row))
62+
if sqlite.query(RowNoteRecord).filter(RowNoteRecord.note_id == row["note_id"]).first():
63+
continue
64+
rows_to_add.append(RowNoteRecord(**row))
8765
if index % 1000 == 0:
8866
sqlite.bulk_save_objects(rows_to_add)
8967
rows_to_add = []
9068
sqlite.bulk_save_objects(rows_to_add)
69+
else:
70+
# Handle real data as zip file
71+
try:
72+
with zipfile.ZipFile(io.BytesIO(res.content)) as zip_file:
73+
file_names = zip_file.namelist()
74+
if file_names:
75+
tsv_file_name = file_names[0]
76+
with zip_file.open(tsv_file_name) as tsv_file:
77+
tsv_data = tsv_file.read().decode("utf-8").splitlines()
78+
reader = csv.DictReader(tsv_data, delimiter="\t")
79+
reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
80+
81+
rows_to_add = []
82+
for index, row in enumerate(reader):
83+
if (
84+
sqlite.query(RowNoteRecord)
85+
.filter(RowNoteRecord.note_id == row["note_id"])
86+
.first()
87+
):
88+
continue
89+
rows_to_add.append(RowNoteRecord(**row))
90+
if index % 1000 == 0:
91+
sqlite.bulk_save_objects(rows_to_add)
92+
rows_to_add = []
93+
sqlite.bulk_save_objects(rows_to_add)
94+
except zipfile.BadZipFile:
95+
logging.error(f"Invalid zip file from {note_url}")
96+
continue
97+
except Exception as e:
98+
logging.error(f"Error processing note data from {note_url}: {e}")
99+
continue
100+
101+
status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.zip"
102+
if settings.USE_DUMMY_DATA:
103+
status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
104+
105+
logging.info(status_url)
106+
res = requests.get(status_url)
107+
108+
if res.status_code == 200:
109+
if settings.USE_DUMMY_DATA:
110+
# Handle dummy data as TSV
111+
tsv_data = res.content.decode("utf-8").splitlines()
112+
reader = csv.DictReader(tsv_data, delimiter="\t")
113+
reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
114+
115+
rows_to_add = []
116+
for index, row in enumerate(reader):
117+
for key, value in list(row.items()):
118+
if value == "":
119+
row[key] = None
120+
status = (
121+
sqlite.query(RowNoteStatusRecord)
122+
.filter(RowNoteStatusRecord.note_id == row["note_id"])
123+
.first()
124+
)
125+
if status is None or status.created_at_millis > int(datetime.now().timestamp() * 1000):
126+
sqlite.query(RowNoteStatusRecord).filter(
127+
RowNoteStatusRecord.note_id == row["note_id"]
128+
).delete()
129+
rows_to_add.append(RowNoteStatusRecord(**row))
130+
if index % 1000 == 0:
131+
sqlite.bulk_save_objects(rows_to_add)
132+
rows_to_add = []
133+
sqlite.bulk_save_objects(rows_to_add)
134+
else:
135+
# Handle real data as zip file
136+
try:
137+
with zipfile.ZipFile(io.BytesIO(res.content)) as zip_file:
138+
file_names = zip_file.namelist()
139+
if file_names:
140+
tsv_file_name = file_names[0]
141+
with zip_file.open(tsv_file_name) as tsv_file:
142+
tsv_data = tsv_file.read().decode("utf-8").splitlines()
143+
reader = csv.DictReader(tsv_data, delimiter="\t")
144+
reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
145+
146+
rows_to_add = []
147+
for index, row in enumerate(reader):
148+
for key, value in list(row.items()):
149+
if value == "":
150+
row[key] = None
151+
status = (
152+
sqlite.query(RowNoteStatusRecord)
153+
.filter(RowNoteStatusRecord.note_id == row["note_id"])
154+
.first()
155+
)
156+
if status is None or status.created_at_millis > int(
157+
datetime.now().timestamp() * 1000
158+
):
159+
sqlite.query(RowNoteStatusRecord).filter(
160+
RowNoteStatusRecord.note_id == row["note_id"]
161+
).delete()
162+
rows_to_add.append(RowNoteStatusRecord(**row))
163+
if index % 1000 == 0:
164+
sqlite.bulk_save_objects(rows_to_add)
165+
rows_to_add = []
166+
sqlite.bulk_save_objects(rows_to_add)
167+
except zipfile.BadZipFile:
168+
logging.error(f"Invalid zip file from {status_url}")
169+
continue
170+
except Exception as e:
171+
logging.error(f"Error processing note status data from {status_url}: {e}")
172+
continue
91173

92174
break
93175

@@ -96,11 +178,20 @@ def extract_data(sqlite: Session, postgresql: Session):
96178
sqlite.commit()
97179

98180
# Noteに紐づくtweetデータを取得
181+
# Build keyword filter conditions using shared TARGET_KEYWORDS
182+
keyword_conditions = []
183+
for keyword in TARGET_KEYWORDS:
184+
keyword_conditions.append(RowNoteRecord.summary.ilike(f"%{keyword}%"))
185+
99186
postExtract_targetNotes = (
100187
sqlite.query(RowNoteRecord)
101188
.filter(RowNoteRecord.tweet_id != None)
102189
.filter(RowNoteRecord.created_at_millis >= settings.TARGET_TWITTER_POST_START_UNIX_MILLISECOND)
103190
.filter(RowNoteRecord.created_at_millis <= settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND)
191+
.filter(
192+
# Use OR condition to match any of the keywords
193+
sqlalchemy.or_(*keyword_conditions)
194+
)
104195
.all()
105196
)
106197
logging.info(f"Target notes: {len(postExtract_targetNotes)}")
@@ -119,8 +210,9 @@ def extract_data(sqlite: Session, postgresql: Session):
119210
if post == None or "data" not in post:
120211
continue
121212

122-
created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
213+
created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
123214
created_at_millis = int(created_at.timestamp() * 1000)
215+
now_millis = int(datetime.now(timezone.utc).timestamp() * 1000)
124216

125217
is_userExist = (
126218
postgresql.query(RowUserRecord).filter(RowUserRecord.user_id == post["data"]["author_id"]).first()
@@ -166,6 +258,7 @@ def extract_data(sqlite: Session, postgresql: Session):
166258
quote_count=post["data"]["public_metrics"]["quote_count"],
167259
reply_count=post["data"]["public_metrics"]["reply_count"],
168260
lang=post["data"]["lang"],
261+
extracted_at=now_millis,
169262
)
170263
postgresql.add(row_post)
171264

etl/src/birdxplorer_etl/lib/sqlite/init.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,22 @@
1212
RowPostEmbedURLRecord,
1313
RowNoteStatusRecord,
1414
RowPostMediaRecord,
15+
RowNoteRatingRecord,
1516
)
1617

1718

1819
def init_sqlite():
1920
# ToDo: dbファイルをS3など外部に置く必要がある。
2021
db_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "data", "note.db"))
2122
logging.info(f"Initializing database at {db_path}")
22-
engine = create_engine("sqlite:///" + db_path)
23+
engine = create_engine(
24+
"sqlite:///" + db_path,
25+
pool_size=20,
26+
max_overflow=30,
27+
pool_timeout=60,
28+
pool_recycle=3600,
29+
connect_args={"check_same_thread": False, "timeout": 60},
30+
)
2331

2432
# 一時データベースのテーブル作成する
2533
# ToDo: noteテーブル以外に必要なものを追加
@@ -29,6 +37,9 @@ def init_sqlite():
2937
if not inspect(engine).has_table("row_note_status"):
3038
logging.info("Creating table note_status")
3139
RowNoteStatusRecord.metadata.create_all(engine)
40+
if not inspect(engine).has_table("row_note_ratings"):
41+
logging.info("Creating table note_ratings")
42+
RowNoteRatingRecord.metadata.create_all(engine)
3243

3344
Session = sessionmaker(bind=engine)
3445

@@ -57,6 +68,9 @@ def init_postgresql():
5768
if not inspect(engine).has_table("row_post_media"):
5869
logging.info("Creating table post_media")
5970
RowPostMediaRecord.metadata.create_all(engine)
71+
if not inspect(engine).has_table("row_note_ratings"):
72+
logging.info("Creating table note_ratings")
73+
RowNoteRatingRecord.metadata.create_all(engine)
6074

6175
Session = sessionmaker(bind=engine)
6276

0 commit comments

Comments
 (0)