Skip to content

Commit a713942

Browse files
Start tests for ENUSC data management and make small logistical changes.
1 parent 1c6bbc4 commit a713942

File tree

7 files changed

+349
-1329
lines changed

7 files changed

+349
-1329
lines changed

src/project_mbb/data_management/clean_enusc.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,37 @@
11
import pandas as pd
22

3-
pd.options.mode.copy_on_write = True
4-
pd.options.future.infer_string = True
5-
pd.options.plotting.backend = "plotly"
6-
73
from project_mbb.parameters import (
84
categories,
95
floats,
106
integers,
117
map_category,
12-
relevant_var,
138
rename_mapping,
149
strings,
1510
)
1611

12+
pd.options.mode.copy_on_write = True
13+
pd.options.future.infer_string = True
14+
pd.options.plotting.backend = "plotly"
15+
1716

1817
def clean_enusc(raw_enusc):
1918
enusc_filtered = _filter_enusc(raw_enusc)
20-
enusc_renamed = _rename_enusc(enusc_filtered)
19+
enusc_renamed = _rename_enusc(enusc_filtered, rename_mapping)
2120
enusc_category = _map_categories(enusc_renamed)
2221
# here check for no objects present
2322
enusc_filled = _fill_missings(enusc_category)
2423
enusc_dtypes = _set_data_types_of_numbers(enusc_filled)
2524
return enusc_dtypes
2625

2726

28-
def _filter_enusc(raw_enusc):
27+
def _filter_enusc(raw_enusc, relevant_var):
2928
enusc_filtered = raw_enusc[relevant_var]
3029
return enusc_filtered
3130

3231

33-
def _rename_enusc(enusc_lower):
34-
enusc_lower.columns = enusc_lower.columns.str.lower()
35-
enusc_renamed = enusc_lower.rename(columns=rename_mapping)
32+
def _rename_enusc(enusc_filtered, rename_mapping):
33+
enusc_filtered.columns = enusc_filtered.columns.str.lower()
34+
enusc_renamed = enusc_filtered.rename(columns=rename_mapping)
3635
return enusc_renamed
3736

3837

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
import pandas as pd
22

3+
from project_mbb.parameters import relevant_var, rename_mapping
4+
35
pd.options.mode.copy_on_write = True
46
pd.options.future.infer_string = True
57
pd.options.plotting.backend = "plotly"
68

7-
from project_mbb.parameters import relevant_var, rename_mapping
8-
99

10-
def create_labels_var_relation(raw_enusc, labels_raw):
11-
labels_var = _concat_labels_var(raw_enusc, labels_raw)
12-
labels_var_filt = _filter_labels_var(labels_var)
13-
labels_renamed = _rename_variables_eng(labels_var_filt)
10+
def create_labels_var_relation(raw_enusc, column_labels):
11+
labels_var = _concat_labels_var(raw_enusc, column_labels)
12+
labels_var_filt = _filter_labels_var(labels_var, relevant_var)
13+
labels_renamed = _rename_variables_eng(labels_var_filt, rename_mapping)
1414
return labels_renamed
1515

1616

@@ -24,12 +24,12 @@ def _concat_labels_var(raw_enusc, column_labels):
2424
return labels_var
2525

2626

27-
def _filter_labels_var(var_labels):
27+
def _filter_labels_var(var_labels, relevant_var):
2828
var_labels_filt = var_labels[var_labels["variable_name"].isin(relevant_var)]
2929
return var_labels_filt
3030

3131

32-
def _rename_variables_eng(labels_var_filt):
32+
def _rename_variables_eng(labels_var_filt, rename_mapping):
3333
labels_var_filt["variable_name"] = labels_var_filt["variable_name"].str.lower()
3434
labels_renamed = labels_var_filt.replace({"variable_name": rename_mapping})
3535
return labels_renamed

src/project_mbb/data_management/task_data_management.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ def task_clean_labels(
4141
labels_raw=BLD / "data" / "column_labels.csv",
4242
produces=BLD / "data" / "variable_labels_clean.pkl",
4343
):
44-
raw = pd.read_csv(raw)
44+
raw_enusc = pd.read_csv(raw)
4545
labels_raw = pd.read_csv(labels_raw)
46-
labels_var = create_labels_var_relation(raw, labels_raw)
46+
labels_var = create_labels_var_relation(raw_enusc, labels_raw)
4747
labels_var.to_pickle(produces)
4848

4949

src/project_mbb/parameters.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,6 @@
123123
"hur_contacto_mp": "contact_prosecutor_post_report",
124124
"hur_motiv_no_den": "reason_not_reporting_theft",
125125
# residency history
126-
"antig_sector": "residency_duration_sector",
127126
# crime perception indicators
128127
"pad": "crime_perception_national",
129128
"padc": "crime_perception_commune",
@@ -467,7 +466,6 @@
467466
}
468467

469468

470-
471469
floats = {
472470
"fact_pers_com",
473471
"fact_pers_reg",
@@ -511,10 +509,11 @@
511509
"household_theft_victim",
512510
"theft_reported",
513511
"education_level",
514-
"reason_not_reporting_theft"
512+
"reason_not_reporting_theft",
515513
}
516514

517515

518-
strings = {"person_id",
519-
"household_id",
520-
}
516+
strings = {
517+
"person_id",
518+
"household_id",
519+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import pandas as pd
2+
import pytest
3+
4+
from project_mbb.config import BLD
5+
from project_mbb.data_management.clean_enusc import (
6+
_filter_enusc,
7+
_rename_enusc,
8+
)
9+
from project_mbb.parameters import relevant_var, rename_mapping
10+
11+
12+
@pytest.fixture
13+
def enusc_raw():
14+
enusc_raw_path = BLD / "data" / "ENUSC_raw.csv"
15+
enusc_raw = pd.read_csv(enusc_raw_path)
16+
return enusc_raw
17+
18+
19+
@pytest.fixture
20+
def data_test():
21+
data = {
22+
"variable_name": ["rph_id", "idhogar", "enc_idr", "enc_region", "enc_rpc"],
23+
"label": [
24+
"Identificador de persona",
25+
"Identificador de hogar",
26+
"Folio de la vivienda",
27+
"Región",
28+
"Región, provincia, comuna",
29+
],
30+
}
31+
return pd.DataFrame(data)
32+
33+
34+
def test_check_if_filter_correct_columns(enusc_raw):
35+
enusc_filtered = _filter_enusc(enusc_raw, relevant_var)
36+
37+
actual_columns = set(enusc_filtered.columns)
38+
expected_columns = set(relevant_var)
39+
40+
missing_columns = expected_columns - actual_columns
41+
extra_columns = actual_columns - expected_columns
42+
43+
assert (
44+
expected_columns == actual_columns
45+
), f"Mismatch in columns. Missing: {missing_columns}, Extra: {extra_columns}"
46+
47+
48+
def test_rename_enusc(enusc_raw):
49+
enusc_filtered = _filter_enusc(enusc_raw, relevant_var)
50+
enusc_renamed = _rename_enusc(enusc_filtered, rename_mapping)
51+
52+
actual_columns = set(enusc_renamed.columns)
53+
expected_columns = set(rename_mapping.values())
54+
55+
missing_columns = expected_columns - actual_columns
56+
extra_columns = actual_columns - expected_columns
57+
58+
assert expected_columns.issubset(
59+
actual_columns
60+
), f"Missing columns: {missing_columns}, Extra: {extra_columns}"

tests/data_management/test_data_mgt_labels.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def test_concat_labels_var(enusc_raw, labels_raw):
4747
assert "label" in data_test.columns, "Column 'label' not found in DataFrame"
4848

4949
# indexes are original ones: s.t if we want to include other variables into
50-
# the filtering then we can continue using this test
50+
# the filtering we can continue using this test
5151
assert all(
5252
[
5353
data_test["variable_name"].iloc[0] == "rph_ID"

0 commit comments

Comments
 (0)