Skip to content

Commit 122ed43

Browse files
committed
add kns_query + misc. minor improvements
1 parent 84e364f commit 122ed43

File tree

10 files changed

+3046
-696
lines changed

10 files changed

+3046
-696
lines changed

airflow/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ venv
44
data
55
*.egg-info
66
.airflow
7+
.venv

airflow/README.md

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,28 +12,25 @@ install Airflow unless you want to check some Airflow specific detail.
1212

1313
Prerequisites:
1414

15-
* Python 3.8
15+
* Python 3.8 + [uv](https://pypi.org/project/uv/)
1616
* Docker Compose
1717

1818
Create virtualenv and install dependencies
1919

2020
```
21-
python3.8 -m venv venv &&\
22-
. venv/bin/activate &&\
23-
pip install --upgrade pip setuptools wheel &&\
24-
pip install -e .
21+
uv sync
2522
```
2623

2724
Start a Database:
2825

2926
```
30-
docker-compose up -d db
27+
docker compose up -d db
3128
```
3229

3330
Run commands from the CLI:
3431

3532
```
36-
knesset-data-pipelines --help
33+
uv run knesset-data-pipelines --help
3734
```
3835

3936
Depending on the specific command, you will probably need to run dependant pipelines or download some packages or

airflow/docker-compose.yaml renamed to airflow/compose.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
version: '2.4'
2-
31
services:
42

53
db:

airflow/knesset_data_pipelines/run_pipeline.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,26 @@ class RequestThrottledException(Exception):
3939
pass
4040

4141

42+
def post_process_airflow_pipeline(pipeline_path, pipeline_name, pipeline):
43+
if 'schemas-bucket' not in pipeline:
44+
pipeline['schemas-bucket'] = pipeline_path
45+
return pipeline
46+
47+
4248
def get_pipeline_spec(pipeline_id):
4349
*pipeline_path, pipeline_name = pipeline_id.split("/")
4450
pipeline_path = '/'.join(pipeline_path)
4551
source_spec_yaml = os.path.join(config.KNESSET_DATA_PIPELINES_ROOT_DIR, pipeline_path, 'knesset.source-spec.yaml')
46-
assert os.path.exists(source_spec_yaml)
47-
with open(source_spec_yaml) as f:
48-
source_spec = yaml.safe_load(f)
49-
pipeline = source_spec[pipeline_name]
50-
return pipeline_name, pipeline
52+
if os.path.exists(source_spec_yaml):
53+
with open(source_spec_yaml) as f:
54+
source_spec = yaml.safe_load(f)
55+
if pipeline_name in source_spec:
56+
return pipeline_name, source_spec[pipeline_name]
57+
source_spec_yaml = os.path.join(config.KNESSET_DATA_PIPELINES_ROOT_DIR, 'airflow/pipelines', pipeline_path, f'{pipeline_name}.yaml')
58+
if os.path.exists(source_spec_yaml):
59+
with open(source_spec_yaml) as f:
60+
return pipeline_name, post_process_airflow_pipeline(pipeline_path, pipeline_name, yaml.safe_load(f))
61+
raise Exception(f'pipeline {pipeline_id} not found')
5162

5263

5364
def get_pipeline_params(pipeline_name, pipeline):
@@ -410,6 +421,13 @@ def _iterate_pipelines(filter_pipeline_ids=None):
410421
continue
411422
pipeline_name, pipeline = get_pipeline_spec(pipeline_id)
412423
yield pipeline_id, pipeline_name, pipeline
424+
for pipeline_yaml in glob(os.path.join(config.KNESSET_DATA_PIPELINES_ROOT_DIR, 'airflow/pipelines/**/*.yaml'), recursive=True):
425+
pipeline_path, pipeline_name = pipeline_yaml.split('/')[-2:]
426+
pipeline_name = pipeline_name.replace('.yaml', '')
427+
pipeline_id = f'{pipeline_path}/{pipeline_name}'
428+
with open(pipeline_yaml) as f:
429+
pipeline = post_process_airflow_pipeline(pipeline_path, pipeline_name, yaml.safe_load(f))
430+
yield pipeline_id, pipeline_name, pipeline
413431

414432

415433
def list_pipelines(full=False, filter_pipeline_ids=None, all_=False, with_dependencies=False):
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
pipeline-type: knesset dataservice
2+
dataservice-parameters:
3+
service-name: api
4+
method-name: "KNS_Query"
5+
fields:
6+
QueryID:
7+
source: "{name}"
8+
type: integer
9+
Number:
10+
source: "{name}"
11+
type: integer
12+
KnessetNum:
13+
source: "{name}"
14+
type: string
15+
Name:
16+
source: "{name}"
17+
type: string
18+
TypeID:
19+
source: "{name}"
20+
type: integer
21+
TypeDesc:
22+
source: "{name}"
23+
type: string
24+
StatusID:
25+
source: "{name}"
26+
type: integer
27+
PersonID:
28+
source: "{name}"
29+
type: integer
30+
GovMinistryID:
31+
source: "{name}"
32+
type: integer
33+
SubmitDate:
34+
source: "{name}"
35+
type: datetime
36+
ReplyMinisterDate:
37+
source: "{name}"
38+
type: datetime
39+
ReplyDatePlanned:
40+
source: "{name}"
41+
type: datetime
42+
LastUpdatedDate:
43+
source: "{name}"
44+
type: datetime

airflow/pyproject.toml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
[project]
2+
name = "knesset-data-pipelines"
3+
version = "0.1.0"
4+
requires-python = "==3.8.*"
5+
dependencies = [
6+
"SQLAlchemy==1.4.46",
7+
"click==8.1.3",
8+
"ruamel.yaml==0.17.21",
9+
"python-dotenv==0.21.1",
10+
"dataflows==0.3.16",
11+
"requests[socks]==2.28.2",
12+
"beautifulsoup4==4.11.2",
13+
"google-cloud-storage==2.7.0",
14+
"google-api-python-client==2.90.0",
15+
"apache-airflow-providers-cncf-kubernetes==7.2.0",
16+
"pyquery==1.4.0",
17+
]
18+
19+
#[dependency-groups]
20+
#dev = [
21+
# "pytest>=8.3.5",
22+
#]
23+
24+
[build-system]
25+
requires = ["hatchling"]
26+
build-backend = "hatchling.build"
27+
28+
[project.scripts]
29+
knesset-data-pipelines = "knesset_data_pipelines.cli:main"

airflow/requirements.in

Lines changed: 0 additions & 11 deletions
This file was deleted.

0 commit comments

Comments
 (0)