Skip to content

Commit 48e842a

Browse files
committed
add default step to convert geoms to multi
1 parent 525c35a commit 48e842a

File tree

9 files changed

+160
-17
lines changed

9 files changed

+160
-17
lines changed

dcpy/lifecycle/ingest/configure.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,10 @@ def get_config(
129129
)
130130
processing_steps.append(clean_column_names)
131131

132+
if "multi" not in processing_step_names and template.has_geom:
133+
multi = PreprocessingStep(name="multi")
134+
processing_steps.append(multi)
135+
132136
if mode:
133137
modes = {s.mode for s in processing_steps}
134138
if mode not in modes:

dcpy/lifecycle/ingest/transform.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,14 @@ def strip_columns(
182182
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
183183
return df
184184

185+
def multi(self, df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
186+
multi_gdf = df.copy()
187+
multi_gdf.set_geometry(
188+
gpd.GeoSeries([transform.multi(feature) for feature in multi_gdf.geometry]),
189+
inplace=True,
190+
)
191+
return multi_gdf
192+
185193
def pd_series_func(
186194
self,
187195
df: pd.DataFrame,
@@ -219,10 +227,6 @@ def pd_series_func(
219227
transformed[output_column_name or column_name] = func(**kwargs) # type: ignore
220228
return transformed
221229

222-
def no_arg_function(self, df: pd.DataFrame) -> pd.DataFrame:
223-
"""Dummy/stub for testing. Can be dropped if we implement actual function with no args other than df"""
224-
return df
225-
226230

227231
def validate_pd_series_func(
228232
*, function_name: str, column_name: str = "", **kwargs

dcpy/models/lifecycle/ingest.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,14 @@ class Template(BaseModel, extra="forbid"):
8989
## this is the original library template, included just for reference while we build out our new templates
9090
library_dataset: library.DatasetDefinition | None = None
9191

92+
@property
93+
def has_geom(self):
94+
match self.ingestion.file_format:
95+
case file.Shapefile() | file.Geodatabase() | file.GeoJson():
96+
return True
97+
case file.Csv() | file.Xlsx() | file.Json() as format:
98+
return format.geometry is not None
99+
92100

93101
class Config(SortedSerializedBase, extra="forbid"):
94102
"""
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
id: dob_now_permits
2+
acl: public-read
3+
4+
ingestion:
5+
source:
6+
type: s3
7+
bucket: edm-private
8+
key: dob_now/dob_now_permits/DOB_Now_Permit_Filing_File_{{ version }}.csv
9+
file_format:
10+
type: csv

dcpy/test/lifecycle/ingest/test_configure.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,28 +94,35 @@ def test_get_filename_invalid_source():
9494

9595

9696
class TestGetConfig:
97+
def test_standard_no_geom(self):
98+
config = configure.get_config("dob_now_permits", template_dir=TEMPLATE_DIR)
99+
# ensure no reprojection or multi step (no target_crs, no geom respectively)
100+
# ensure default 'clean_column_names' step is added
101+
assert len(config.ingestion.processing_steps) == 1
102+
assert config.ingestion.processing_steps[0].name == "clean_column_names"
103+
97104
def test_standard(self):
98105
config = configure.get_config(
99106
"dca_operatingbusinesses", template_dir=TEMPLATE_DIR
100107
)
101-
# ensure no reprojection step
102-
# ensure default 'clean_column_names' step is added
103-
assert len(config.ingestion.processing_steps) == 1
108+
# ensure no reprojection
109+
# ensure default 'clean_column_names' and multi steps are added
110+
assert len(config.ingestion.processing_steps) == 2
104111
assert config.ingestion.processing_steps[0].name == "clean_column_names"
105112

106113
def test_clean_column_names_defined(self):
107114
config = configure.get_config("bpl_libraries", template_dir=TEMPLATE_DIR)
108115
# ensure no reprojection step
109-
# ensure default 'clean_column_names' step is added
110-
assert len(config.ingestion.processing_steps) == 1
116+
# ensure default 'clean_column_names' and 'multi' steps are added
117+
assert len(config.ingestion.processing_steps) == 2
111118
assert config.ingestion.processing_steps[0].name == "clean_column_names"
112119
assert config.ingestion.processing_steps[0].args == {"replace": {"data.": ""}}
113120

114121
def test_reproject(self):
115122
config = configure.get_config(
116123
"dcp_addresspoints", version="24c", template_dir=TEMPLATE_DIR
117124
)
118-
assert len(config.ingestion.processing_steps) == 2
125+
assert len(config.ingestion.processing_steps) == 3
119126
assert config.ingestion.processing_steps[0].name == "reproject"
120127

121128
def test_no_mode(self):

dcpy/test/lifecycle/ingest/test_run.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import pandas as pd
1+
import geopandas as gpd
22
import pytest
33
from unittest import mock
44
import shutil
@@ -59,7 +59,6 @@ def test_run_update_freshness(mock_request_get, create_buckets, create_temp_file
5959
)
6060
config = recipes.get_config(DATASET, FAKE_VERSION)
6161
assert config.archival.check_timestamps == []
62-
6362
run(
6463
dataset_id=DATASET,
6564
version=FAKE_VERSION,
@@ -90,7 +89,7 @@ def test_run_update_freshness_fails_if_data_diff(
9089

9190
# this time, replace the dataframe with a different one in the middle of the ingest process
9291
with mock.patch("dcpy.utils.geospatial.parquet.read_df") as patch_read_df:
93-
patch_read_df.return_value = pd.DataFrame({"a": ["b"]})
92+
patch_read_df.return_value = gpd.GeoDataFrame({"a": [None]}).set_geometry("a")
9493
with pytest.raises(
9594
FileExistsError,
9695
match=f"Archived dataset 'id='{DATASET}' version='{FAKE_VERSION}'' already exists and has different data.",

dcpy/test/lifecycle/ingest/test_transform.py

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pathlib import Path
55
from pydantic import TypeAdapter, BaseModel
66
import pytest
7+
from shapely import Polygon, MultiPolygon
78
import yaml
89
from unittest import TestCase, mock
910

@@ -91,16 +92,24 @@ def test_to_parquet(file: dict, create_temp_filesystem: Path):
9192

9293
def test_validate_processing_steps():
9394
steps = [
94-
PreprocessingStep(name="no_arg_function"),
95+
PreprocessingStep(name="multi"),
9596
PreprocessingStep(name="drop_columns", args={"columns": ["col1", "col2"]}),
9697
]
9798
compiled_steps = transform.validate_processing_steps("test", steps)
9899
assert len(compiled_steps) == 2
99100

100-
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9]})
101+
df = gpd.GeoDataFrame(
102+
{
103+
"col1": [1, 2, 3],
104+
"col2": [4, 5, 6],
105+
"col3": gpd.GeoSeries([None, None, None]),
106+
}
107+
).set_geometry("col3")
101108
for step in compiled_steps:
102109
df = step(df)
103-
expected = pd.DataFrame({"col3": [7, 8, 9]})
110+
expected = gpd.GeoDataFrame(
111+
{"col3": gpd.GeoSeries([None, None, None])}
112+
).set_geometry("col3")
104113
assert df.equals(expected)
105114

106115

@@ -155,7 +164,7 @@ def test_invalid_function(self):
155164

156165
class TestPreprocessors(TestCase):
157166
proc = transform.Preprocessor(TEST_DATASET_NAME)
158-
gdf: gpd.GeoDataFrame = gpd.read_parquet(RESOURCES / TEST_DATA_DIR / "test.parquet")
167+
gdf = gpd.read_parquet(RESOURCES / TEST_DATA_DIR / "test.parquet")
159168
basic_df = pd.DataFrame({"a": [2, 3, 1], "b": ["b_1", "b_2", "c_3"]})
160169
messy_names_df = pd.DataFrame({"Column": [1, 2], "Two_Words": [3, 4]})
161170
dupe_df = pd.DataFrame({"a": [1, 1, 1, 2], "b": [3, 1, 3, 2]})
@@ -287,6 +296,44 @@ def test_rename_geodataframe(self):
287296
expected = gpd.read_parquet(RESOURCES / TEST_DATA_DIR / "renamed.parquet")
288297
assert transformed.equals(expected)
289298

299+
def test_multi(self):
300+
gdf = gpd.GeoDataFrame(
301+
{
302+
"a": [1, 2, 3],
303+
"wkt": gpd.GeoSeries(
304+
[
305+
None,
306+
Polygon([(0, 0), (0, 1), (1, 0), (0, 0)]),
307+
MultiPolygon(
308+
[
309+
Polygon([(0, 0), (0, 1), (1, 0), (0, 0)]),
310+
Polygon([(0, 0), (0, -1), (-1, 0), (0, 0)]),
311+
]
312+
),
313+
]
314+
),
315+
}
316+
).set_geometry("wkt")
317+
transformed = self.proc.multi(gdf)
318+
expected = gpd.GeoDataFrame(
319+
{
320+
"a": [1, 2, 3],
321+
"wkt": gpd.GeoSeries(
322+
[
323+
None,
324+
MultiPolygon([Polygon([(0, 0), (0, 1), (1, 0), (0, 0)])]),
325+
MultiPolygon(
326+
[
327+
Polygon([(0, 0), (0, 1), (1, 0), (0, 0)]),
328+
Polygon([(0, 0), (0, -1), (-1, 0), (0, 0)]),
329+
]
330+
),
331+
]
332+
),
333+
}
334+
)
335+
assert transformed.equals(expected)
336+
290337

291338
def test_preprocess_no_steps(create_temp_filesystem: Path):
292339
input = create_temp_filesystem / "input.txt"

dcpy/test/utils/test_geospatial.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@
33
import pandas as pd
44
import geopandas as gpd
55
import shapely
6+
from shapely import (
7+
Point,
8+
MultiPoint,
9+
LineString,
10+
MultiLineString,
11+
Polygon,
12+
MultiPolygon,
13+
)
614
from tempfile import TemporaryDirectory
715

816
from dcpy.models.geospatial import geometry
@@ -146,3 +154,38 @@ def test_read_parquet(self):
146154

147155
gdf = parquet.read_df(RESOURCES_DIR / "geo.parquet")
148156
assert isinstance(gdf, gpd.GeoDataFrame)
157+
158+
159+
@pytest.mark.parametrize(
160+
"input, expected",
161+
[
162+
(None, None),
163+
(Point(0, 1), MultiPoint([(0, 1)])),
164+
(MultiPoint([(2, 3), (4, 5)]), MultiPoint([(2, 3), (4, 5)])),
165+
(LineString([(0, 0), (1, 1)]), MultiLineString([[(0, 0), (1, 1)]])),
166+
(
167+
MultiLineString([[(0, 0), (-1, 1)], [(0, 0), (1, -1)]]),
168+
MultiLineString([[(0, 0), (-1, 1)], [(0, 0), (1, -1)]]),
169+
),
170+
(
171+
Polygon([(0, 0), (0, 1), (1, 0), (0, 0)]),
172+
MultiPolygon([Polygon([(0, 0), (0, 1), (1, 0), (0, 0)])]),
173+
),
174+
(
175+
MultiPolygon(
176+
[
177+
Polygon([(0, 0), (0, 1), (1, 0), (0, 0)]),
178+
Polygon([(0, 0), (0, -1), (-1, 0), (0, 0)]),
179+
]
180+
),
181+
MultiPolygon(
182+
[
183+
Polygon([(0, 0), (0, 1), (1, 0), (0, 0)]),
184+
Polygon([(0, 0), (0, -1), (-1, 0), (0, 0)]),
185+
]
186+
),
187+
),
188+
],
189+
)
190+
def test_multi(input, expected):
191+
assert transform.multi(input) == expected

dcpy/utils/geospatial/transform.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,33 @@
88
TextColumn,
99
TimeRemainingColumn,
1010
)
11+
from shapely import (
12+
Geometry,
13+
Point,
14+
MultiPoint,
15+
LineString,
16+
MultiLineString,
17+
Polygon,
18+
MultiPolygon,
19+
)
1120

1221
from dcpy.models import file
1322
from dcpy.models.geospatial import geometry as geom
1423
from dcpy.utils.logging import logger
1524

1625

26+
def multi(geom: Geometry | None) -> Geometry | None:
27+
match geom:
28+
case Point():
29+
return MultiPoint([geom])
30+
case LineString():
31+
return MultiLineString([geom])
32+
case Polygon():
33+
return MultiPolygon([geom])
34+
case _:
35+
return geom
36+
37+
1738
def df_to_gdf(df: pd.DataFrame, geometry: file.Geometry) -> gpd.GeoDataFrame:
1839
"""
1940
Convert a pandas DataFrame to a GeoDataFrame based on the provided geometry information.

0 commit comments

Comments
 (0)