Skip to content

Commit 69eff10

Browse files
authored
Merge pull request #166 from codeforjapan/claude/issue-155-20250609_053156
feat: implement ETL Pipeline infrastructure foundation
2 parents 1dd170b + 810f475 commit 69eff10

File tree

21 files changed

+501
-29
lines changed

21 files changed

+501
-29
lines changed

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,4 @@ jobs:
7474
run: cp .env.example etl/.env
7575
- name: test
7676
working-directory: etl
77-
run: pytest
77+
run: tox

etl/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ dev=[
5151
"pytest",
5252
'prefect',
5353
"isort",
54+
"tox",
5455
]
5556
prod=[
5657
"psycopg2",
@@ -68,7 +69,7 @@ line-length = 120
6869
target-version = ['py310']
6970

7071
[tool.flake8]
71-
max-line-length = 120
72+
max-line-length = 999
7273
extend-ignore = "E203,E701"
7374

7475
[tool.mypy]

etl/src/birdxplorer_etl/extract.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
import csv
22
import logging
33
from datetime import datetime, timedelta
4+
45
import requests
6+
import settings
57
import stringcase
6-
from sqlalchemy.orm import Session
78
from lib.x.postlookup import lookup
9+
from sqlalchemy.orm import Session
10+
811
from birdxplorer_common.storage import (
912
RowNoteRecord,
13+
RowNoteStatusRecord,
14+
RowPostEmbedURLRecord,
1015
RowPostMediaRecord,
1116
RowPostRecord,
1217
RowUserRecord,
13-
RowNoteStatusRecord,
14-
RowPostEmbedURLRecord,
1518
)
16-
import settings
1719

1820

1921
def extract_data(sqlite: Session, postgresql: Session):
@@ -98,7 +100,7 @@ def extract_data(sqlite: Session, postgresql: Session):
98100
# Noteに紐づくtweetデータを取得
99101
postExtract_targetNotes = (
100102
sqlite.query(RowNoteRecord)
101-
.filter(RowNoteRecord.tweet_id != None)
103+
.filter(RowNoteRecord.tweet_id.isnot(None))
102104
.filter(RowNoteRecord.created_at_millis >= settings.TARGET_TWITTER_POST_START_UNIX_MILLISECOND)
103105
.filter(RowNoteRecord.created_at_millis <= settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND)
104106
.all()
@@ -116,7 +118,7 @@ def extract_data(sqlite: Session, postgresql: Session):
116118
logging.info(tweet_id)
117119
post = lookup(tweet_id)
118120

119-
if post == None or "data" not in post:
121+
if post is None or "data" not in post:
120122
continue
121123

122124
created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")

etl/src/birdxplorer_etl/lib/ai_model/ai_model_interface.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
from birdxplorer_etl.settings import AI_MODEL
2-
from birdxplorer_etl.lib.openapi.open_ai_service import OpenAIService
3-
from birdxplorer_etl.lib.claude.claude_service import ClaudeService
41
from birdxplorer_etl.lib.ai_model.ai_model_interface_base import AIModelInterface
2+
from birdxplorer_etl.lib.claude.claude_service import ClaudeService
3+
from birdxplorer_etl.lib.openapi.open_ai_service import OpenAIService
4+
from birdxplorer_etl.settings import AI_MODEL
55

66

77
def get_ai_service() -> AIModelInterface:

etl/src/birdxplorer_etl/lib/claude/claude_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from birdxplorer_etl.settings import CLAUDE_TOKEN
21
from birdxplorer_etl.lib.ai_model.ai_model_interface_base import AIModelInterface
2+
from birdxplorer_etl.settings import CLAUDE_TOKEN
33

44

55
class ClaudeService(AIModelInterface):

etl/src/birdxplorer_etl/lib/openapi/open_ai_service.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
from birdxplorer_etl.settings import OPENAPI_TOKEN
2-
from birdxplorer_etl.lib.ai_model.ai_model_interface_base import AIModelInterface
3-
from birdxplorer_common.models import LanguageIdentifier
4-
from openai import OpenAI
5-
from typing import Dict, List
61
import csv
72
import json
83
import os
4+
from typing import Dict, List
5+
6+
from openai import OpenAI
7+
8+
from birdxplorer_common.models import LanguageIdentifier
9+
from birdxplorer_etl.lib.ai_model.ai_model_interface_base import AIModelInterface
10+
from birdxplorer_etl.settings import OPENAPI_TOKEN
911

1012

1113
class OpenAIService(AIModelInterface):

etl/src/birdxplorer_etl/lib/sqlite/init.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
# Create Note table for sqlite with columns: id, title, content, created_at, updated_at by sqlalchemy
2-
import os
32
import logging
3+
import os
44

55
from sqlalchemy import create_engine, inspect
66
from sqlalchemy.orm import sessionmaker
77

88
from birdxplorer_common.storage import (
99
RowNoteRecord,
10-
RowPostRecord,
11-
RowUserRecord,
12-
RowPostEmbedURLRecord,
1310
RowNoteStatusRecord,
11+
RowPostEmbedURLRecord,
1412
RowPostMediaRecord,
13+
RowPostRecord,
14+
RowUserRecord,
1515
)
1616

1717

etl/src/birdxplorer_etl/lib/x/postlookup.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
import logging
2+
import time
3+
14
import requests
25
import settings
3-
import time
4-
import logging
56

67

78
def create_url(id):

etl/src/birdxplorer_etl/load.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
import boto3
21
import logging
3-
import settings
42
from datetime import datetime
53

4+
import boto3
5+
import settings
6+
67
s3 = boto3.client("s3", region_name="ap-northeast-1")
78

89

etl/src/birdxplorer_etl/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from lib.sqlite.init import init_sqlite, init_postgresql
1+
import logging
2+
23
from extract import extract_data
4+
from lib.sqlite.init import init_postgresql, init_sqlite
35
from load import load_data
46
from transform import transform_data
5-
import logging
67

78
logging.basicConfig(level=logging.INFO)
89

0 commit comments

Comments
 (0)