diff --git a/helpers/write_unittest_to_csv.py b/helpers/write_unittest_to_csv.py
new file mode 100644
index 000000000..a1b31ade7
--- /dev/null
+++ b/helpers/write_unittest_to_csv.py
@@ -0,0 +1,127 @@
+"""Helper script to write a dataframe built by a unit-test helper to a CSV file.
+
+Run from the repository root so that the ``tests`` package can be imported.
+"""
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+# The test modules are imported from the current working directory, so this
+# script assumes it is started from the repository root.
+sys.path.append(os.path.abspath(""))
+
+from tests.test_site_apportionment.test_site_apportionment import (
+    TestSplitSitesDf,
+    TestDeduplicateCodeValues
+)
+
+# Default output directory and file name used when run as a script.
+csv_path = "D:/coding_projects/randd_test_data/"
+
+filename = "expected_remaining_output.csv"
+
+
+def get_df_from_test(test_class, test_method):
+    """Build a dataframe by calling a helper method of a test class.
+
+    Args:
+        test_class (type): Test class to instantiate with no arguments.
+        test_method (str): Name of a no-argument method on that class that
+            returns a dataframe.
+
+    Returns:
+        pd.DataFrame: The dataframe returned by the method.
+
+    Raises:
+        AttributeError: If the class has no attribute named ``test_method``.
+        TypeError: If the method does not return a pandas DataFrame.
+    """
+    test_instance = test_class()
+    method = getattr(test_instance, test_method)
+    df = method()
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError(
+            f"{test_class.__name__}.{test_method} returned "
+            f"{type(df).__name__}, expected a pandas DataFrame"
+        )
+    return df
+
+
+def create_input_df():
+    """Create an input dataframe for the beginning of the site apportionment module."""
+    input_columns = [
+        "reference",
+        "instance",
+        "period",
+        "200",
+        "201",
+        "pg_numeric",
+        "formtype",
+        "211",
+        "251",
+        "601",
+        "602",
+        "status",
+        "imp_marker",
+        "postcodes_harmonised",
+        "itl"
+    ]
+
+    data = [
+        [1, 0, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"],
+        [1, 1, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"],
+        [1, 2, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"],
+        [2, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP20 6YY", "cym"],
+        [2, 1, "2020", "C", "A", 40, "0001", 100, "yes", "CB1 3NF", 60.0, "Clear", "R", "CB1 3NF", "cym"],
+        [2, 2, "2020", "C", "A", 40, "0001", 100, "yes", "BA1 5DA", 40.0, "Clear", "R", "BA1 5DA", "cym"],
+        [3, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Check needed", "TMI", "NP30 7ZZ", "cym"],
+        [3, 1, "2020", "C", "A", 40, "0001", 100, "yes", "DE72 3AU", np.nan, "Check needed", "TMI", "DE72 3AU", "cym"],
+        [3, 2, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "No mean found", "NP30 7ZZ", "cym"],
+        [4, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "TMI", "CF10 BZZ", "cym"],
+        [5, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "No mean found", "SA50 5BE", "cym"],
+        [6, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"],
+        [6, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"],
+        [6, 2, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"],
+        [7, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym"],
+        [7, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym"],
+        [7, 2, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym"],
+    ]
+
+    input_df = pd.DataFrame(data=data, columns=input_columns)
+    return input_df
+
+
+def write_test_data_to_csv(
+    filename,
+    output_dir=None,
+    test_class=TestSplitSitesDf,
+    test_method="create_exp_remaining_output",
+):
+    """Write the dataframe produced by a test helper method to a CSV file.
+
+    Args:
+        filename (str): Name of the CSV file to create.
+        output_dir (str, optional): Directory to write into. Defaults to the
+            module-level ``csv_path``.
+        test_class (type, optional): Test class that provides the dataframe.
+        test_method (str, optional): Name of the method on ``test_class`` that
+            returns the dataframe.
+
+    Returns:
+        str: Path of the CSV file that was written.
+    """
+    if output_dir is None:
+        output_dir = csv_path
+    df = get_df_from_test(test_class, test_method)
+
+    # os.path.join copes with a directory given with or without a trailing slash.
+    file_path = os.path.join(output_dir, filename)
+    df.to_csv(file_path, index=False)
+    print(f"Data written to {file_path}")
+    return file_path
+
+
+if __name__ == "__main__":
+    write_test_data_to_csv(filename)
diff --git a/src/aggregation/__init__.py b/src/aggregation/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/aggregation/aggregation_main.py b/src/aggregation/aggregation_main.py
deleted file mode 100644
index 83aa539ec..000000000
--- a/src/aggregation/aggregation_main.py
+++ /dev/null
@@ -1 +0,0 @@
-"""The main file for the Aggregation and Disclosure module."""
diff --git 
a/tests/test_aggregation/__init__.py b/tests/test_aggregation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_site_apportionment/test_site_apportionment.py b/tests/test_site_apportionment/test_site_apportionment.py index 9999c13d2..291138dee 100644 --- a/tests/test_site_apportionment/test_site_apportionment.py +++ b/tests/test_site_apportionment/test_site_apportionment.py @@ -6,8 +6,11 @@ import pandas as pd import numpy as np +from unittest.mock import patch + # Local Imports from src.site_apportionment.site_apportionment import ( + count_unique_postcodes_in_col, create_notnull_mask, set_percentages, split_sites_df, @@ -19,6 +22,7 @@ count_duplicate_sites, weight_values, create_category_df, + run_apportion_sites, ) # Define easier pandas usages @@ -26,16 +30,115 @@ assert_frame_equal = pd._testing.assert_frame_equal assert_series_equal = pd._testing.assert_series_equal +# Fixtures shared by several of the test classes below +@pytest.fixture +def create_input_df(): + """Create an input dataframe for the beginning of the site apportionment module.""" + input_columns = [ + "reference", + "instance", + "period", + "200", + "201", + "pg_numeric", + "formtype", + "211", + "251", + "601", + "602", + "status", + "imp_marker", + "postcodes_harmonised", + "itl" + ] + + data = [ + [111, 0, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [111, 1, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [111, 2, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [222, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP20 6YY", "cym"], + [222, 1, "2020", "C", "A", 40, "0001", 100, "yes", "CB1 3NF", 60.0, "Clear", "R", "CB1 3NF", "cym"], + [222, 2, "2020", "D", "ZZ", 80, "0001", 100, "yes", "BA1 5DA", 40.0, "Clear", "R", "BA1 5DA", "cym"], + [333, 0, 
"2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Check needed", "TMI", "NP30 7ZZ", "cym"], + [333, 1, "2020", "C", "A", 40, "0001", 100, "yes", "DE72 3AU", np.nan, "Check needed", "TMI", "DE72 3AU", "cym"], + [333, 2, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "No mean found", "NP30 7ZZ", "cym"], + [444, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "TMI", "CF10 BZZ", "cym"], + [555, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "No mean found", "SA50 5BE", "cym"], + # below, the case of short form in previous period and long form in current period + [666, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + [666, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + [666, 2, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + # as above but status "Form sent out" + [777, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym"], + [777, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym"], + [777, 2, "2020", "D", "XX", 70, "0001", 100, "yes", np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym"], + ] + + input_df = pandasDF(data=data, columns=input_columns) + return input_df + + +@pytest.fixture +def create_exp_postcode_count_output_df(): + """Create a dataframe for expected output of the count_unique_postcodes_in_col function test. + + NOTE: The columns 200, 201, 211, 251 and pg_numeric are dropped from the input before the function is called + for the sake of simplicity. + + This will also be used as input for the set_percentages function test. 
+ """ + exp_output_columns = [ + "reference", + "instance", + "period", + "formtype", + "601", + "602", + "status", + "imp_marker", + "postcodes_harmonised", + "itl", + "601_count" + ] + data = [ + [111, 0, "2020", "0006", np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym", np.nan], + [111, 1, "2020", "0006", np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym", np.nan], + [111, 2, "2020", "0006", np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym", np.nan], + [222, 0, "2020", "0001", np.nan, np.nan, "Clear", "R", "NP20 6YY", "cym", 2.0], + [222, 1, "2020", "0001", "CB1 3NF", 60.0, "Clear", "R", "CB1 3NF", "cym", 2.0], + [222, 2, "2020", "0001", "BA1 5DA", 40.0, "Clear", "R", "BA1 5DA", "cym", 2.0], + [333, 0, "2020", "0001", np.nan, np.nan, "Check needed", "TMI", "NP30 7ZZ", "cym", 1.0], + [333, 1, "2020", "0001", "DE72 3AU", np.nan, "Check needed", "TMI", "DE72 3AU", "cym", 1.0], + [333, 2, "2020", "0001", np.nan, np.nan, "Check needed", "No mean found", "NP30 7ZZ", "cym", 1.0], + [444, 1, "2020", "0001", np.nan, np.nan, "Form sent out", "TMI", "CF10 BZZ", "cym", np.nan], + [555, 1, "2020", "0001", np.nan, np.nan, "Form sent out", "No mean found", "SA50 5BE", "cym", np.nan], + # below, the case of short form in previous period and long form in current period + [666, 0, "2020", "0001", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym", np.nan], + [666, 1, "2020", "0001", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym", np.nan], + [666, 2, "2020", "0001", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym", np.nan], + # as above but status "Form sent out" + [777, 0, "2020", "0001", np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym", np.nan], + [777, 1, "2020", "0001", np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym", np.nan], + [777, 2, "2020", "0001", np.nan, np.nan, "Form sent out", "MoR", "NP10 5XX", "cym", np.nan], + ] + + exp_output_df = pandasDF(data=data, columns=exp_output_columns) + return exp_output_df + @pytest.fixture def 
create_exp_percent_test_output_df(): """Create a dataframe for expected output of the for the set_percentages function test. - NOTE: this dataframe will also be the input for the split_sites function test. + NOTE: The columns 200, 201, 211, 251 and pg_numeric are dropped from the input before the function is called + for the sake of simplicity. + + This will also be used as input for the split_sites_df function test. """ exp_output_columns = [ "reference", "instance", + "period", "formtype", "601", "602", @@ -46,65 +149,52 @@ def create_exp_percent_test_output_df(): "itl" ] data = [ - [1, 0, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [1, 1, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [1, 2, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [2, 0, "0001", np.nan, np.nan, 2.0, "Clear", "R", "NP20 6YY", "cym"], - [2, 1, "0001", "CB1 3NF", 60.0, 2.0, "Clear", "R", "CB1 3NF", "cym"], - [2, 2, "0001", "BA1 5DA", 40.0, 2.0, "Clear", "R", "BA1 5DA", "cym"], - [3, 0, "0001", np.nan, np.nan, 1.0, "Check needed", "TMI", "NP30 7ZZ", "cym"], - [3, 1, "0001", "DE72 3AU", 100.0, 1.0, "Check needed", "TMI", "DE72 3AU", "cym"], - [3, 2, "0001", np.nan, np.nan, 1.0, "Check needed", "No mean found", "NP30 7ZZ", "cym"], - [4, 1, "0001", "CF10 BZZ", 100.0, 1.0, "Form sent out", "TMI", "CF10 BZZ", "cym"], - [5, 1, "0001", "SA50 5BE", 100.0, 1.0, "Form sent out", "No mean found", "SA50 5BE", "cym"], + [111, 0, "2020", "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [111, 1, "2020", "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [111, 2, "2020", "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [222, 0, "2020", "0001", np.nan, np.nan, 2.0, "Clear", "R", "NP20 6YY", "cym"], + [222, 1, "2020", "0001", "CB1 3NF", 60.0, 2.0, "Clear", "R", "CB1 3NF", "cym"], + [222, 2, "2020", "0001", "BA1 5DA", 40.0, 2.0, "Clear", "R", "BA1 5DA", "cym"], + [333, 0, "2020", "0001", 
np.nan, np.nan, 1.0, "Check needed", "TMI", "NP30 7ZZ", "cym"], + [333, 1, "2020", "0001", "DE72 3AU", 100.0, 1.0, "Check needed", "TMI", "DE72 3AU", "cym"], + [333, 2, "2020", "0001", np.nan, np.nan, 1.0, "Check needed", "No mean found", "NP30 7ZZ", "cym"], + [444, 1, "2020", "0001", "CF10 BZZ", 100.0, 1.0, "Form sent out", "TMI", "CF10 BZZ", "cym"], + [555, 1, "2020", "0001", "SA50 5BE", 100.0, 1.0, "Form sent out", "No mean found", "SA50 5BE", "cym"], + # below, the case of short form in previous period and long form in current period + [666, 0, "2020", "0001", np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + [666, 1, "2020", "0001", np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + [666, 2, "2020", "0001", np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + # as above but status "Form sent out" + [777, 0, "2020", "0001", "NP10 5XX", np.nan, 1.0, "Form sent out", "MoR", "NP10 5XX", "cym"], + [777, 1, "2020", "0001", "NP10 5XX", 100.0, 1.0, "Form sent out", "MoR", "NP10 5XX", "cym"], + [777, 2, "2020", "0001", "NP10 5XX", 100.0, 1.0, "Form sent out", "MoR", "NP10 5XX", "cym"], ] exp_output_df = pandasDF(data=data, columns=exp_output_columns) return exp_output_df +class TestCountUniquePostcodesInCol: + """Tests for the count_unique_postcodes_in_col function.""" -class TestSetPercentages: - """Tests for the set_percentages function.""" + def test_count_unique_postcodes_in_col(self, create_input_df, create_exp_postcode_count_output_df): + """Test for the count_unique_postcodes_in_col function.""" + input_df = create_input_df + input_df = input_df.drop(columns=["200", "201", "pg_numeric", "211", "251"]) + result_df = count_unique_postcodes_in_col(input_df) + expected_output_df = create_exp_postcode_count_output_df - def create_input_df(self): - """Create an input dataframe for the test.""" - input_columns = [ - "reference", - "instance", - "formtype", - "601", - "602", - "601_count", - "status", - "imp_marker", 
- "postcodes_harmonised", - "itl" - ] - - data = [ - [1, 0, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [1, 1, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [1, 2, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [2, 0, "0001", np.nan, np.nan, 2.0, "Clear", "R", "NP20 6YY", "cym"], - [2, 1, "0001", "CB1 3NF", 60.0, 2.0, "Clear", "R", "CB1 3NF", "cym"], - [2, 2, "0001", "BA1 5DA", 40.0, 2.0, "Clear", "R", "BA1 5DA", "cym"], - [3, 0, "0001", np.nan, np.nan, 1.0, "Check needed", "TMI", "NP30 7ZZ", "cym"], - [3, 1, "0001", "DE72 3AU", np.nan, 1.0, "Check needed", "TMI", "DE72 3AU", "cym"], - [3, 2, "0001", np.nan, np.nan, 1.0, "Check needed", "No mean found", "NP30 7ZZ", "cym"], - [4, 1, "0001", np.nan, np.nan, np.nan, "Form sent out", "TMI", "CF10 BZZ", "cym"], - [5, 1, "0001", np.nan, np.nan, np.nan, "Form sent out", "No mean found", "SA50 5BE", "cym"], - ] + assert_frame_equal(result_df[expected_output_df.columns], expected_output_df) - input_df = pandasDF(data=data, columns=input_columns) - return input_df +class TestSetPercentages: + """Tests for the set_percentages function.""" - def test_set_percentage(self, create_exp_percent_test_output_df): + def test_set_percentage(self, create_exp_percent_test_output_df, create_exp_postcode_count_output_df): """Test for the set_percentages function.""" - input_df = self.create_input_df() + input_df = create_exp_postcode_count_output_df expected_output_df = create_exp_percent_test_output_df result_df = set_percentages(input_df) - assert_frame_equal(result_df, expected_output_df) + assert_frame_equal(result_df[expected_output_df.columns], expected_output_df) class TestSplitSitesDf: @@ -115,6 +205,7 @@ def create_exp_to_apportion_output(self): exp_output_cols1 = [ "reference", "instance", + "period", "formtype", "601", "602", @@ -126,12 +217,14 @@ def create_exp_to_apportion_output(self): ] data1 = [ - [2, 1, "0001", "CB1 3NF", 60.0, 2.0, "Clear", "R", "CB1 3NF", 
"cym"], - [2, 2, "0001", "BA1 5DA", 40.0, 2.0, "Clear", "R", "BA1 5DA", "cym"], - [3, 1, "0001", "DE72 3AU", 100.0, 1.0, "Check needed", "TMI", "DE72 3AU", "cym"], - [3, 2, "0001", np.nan, np.nan, 1.0, "Check needed", "No mean found", "NP30 7ZZ", "cym"], - [4, 1, "0001", "CF10 BZZ", 100.0, 1.0, "Form sent out", "TMI", "CF10 BZZ", "cym"], - [5, 1, "0001", "SA50 5BE", 100.0, 1.0, "Form sent out", "No mean found", "SA50 5BE", "cym"], + [222, 1, "2020", "0001", "CB1 3NF", 60.0, 2.0, "Clear", "R", "CB1 3NF", "cym"], + [222, 2, "2020", "0001", "BA1 5DA", 40.0, 2.0, "Clear", "R", "BA1 5DA", "cym"], + [333, 1, "2020", "0001", "DE72 3AU", 100.0, 1.0, "Check needed", "TMI", "DE72 3AU", "cym"], + [333, 2, "2020", "0001", np.nan, np.nan, 1.0, "Check needed", "No mean found", "NP30 7ZZ", "cym"], + [444, 1, "2020", "0001", "CF10 BZZ", 100.0, 1.0, "Form sent out", "TMI", "CF10 BZZ", "cym"], + [555, 1, "2020", "0001", "SA50 5BE", 100.0, 1.0, "Form sent out", "No mean found", "SA50 5BE", "cym"], + [777, 1, "2020", "0001", "NP10 5XX", 100.0, 1.0, "Form sent out", "MoR", "NP10 5XX", "cym"], + [777, 2, "2020", "0001", "NP10 5XX", 100.0, 1.0, "Form sent out", "MoR", "NP10 5XX", "cym"], ] exp_output_df1 = pandasDF(data=data1, columns=exp_output_cols1) @@ -146,6 +239,7 @@ def create_exp_remaining_output(self): exp_output_cols2 = [ "reference", "instance", + "period", "formtype", "601", "602", @@ -157,16 +251,23 @@ def create_exp_remaining_output(self): ] data2 = [ - [1, 0, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [1, 1, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [1, 2, "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], - [2, 0, "0001", np.nan, np.nan, 2.0, "Clear", "R", "NP20 6YY", "cym"], - [3, 0, "0001", np.nan, np.nan, 1.0, "Check needed", "TMI", "NP30 7ZZ", "cym"], + [111, 0, "2020", "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [111, 1, "2020", "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 
5XX", "cym"], + [111, 2, "2020", "0006", np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym"], + [222, 0, "2020", "0001", np.nan, np.nan, 2.0, "Clear", "R", "NP20 6YY", "cym"], + [333, 0, "2020", "0001", np.nan, np.nan, 1.0, "Check needed", "TMI", "NP30 7ZZ", "cym"], + # below, the case of short form in previous period and long form in current period + [666, 0, "2020", "0001", np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + [666, 1, "2020", "0001", np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + [666, 2, "2020", "0001", np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym"], + # as above but status "Form sent out" + [777, 0, "2020", "0001", "NP10 5XX", np.nan, 1.0, "Form sent out", "MoR", "NP10 5XX", "cym"], ] exp_output_df2 = pandasDF(data=data2, columns=exp_output_cols2) exp_output_df2 = exp_output_df2.astype({"601": object}) return exp_output_df2 + def test_split_sites_df(self, create_exp_percent_test_output_df): """Test for the split_sites_df function.""" input_df = create_exp_percent_test_output_df @@ -662,23 +763,23 @@ def sites_df_input(): input_cols = [ "reference", "instance", + "period", "601", "602", "601_count", "status", "imp_marker", "postcodes_harmonised", - "period", "itl" ] input_data = [ - [1, 1, "RH12 1XL", 100.0, np.nan, "Clear", "R", "RH12 1XL", "202101", "cym"], - [1, 2, "RH12 1XL", 125.0, np.nan, "Clear", "R", "RH12 1XL", "202101", "cym"], - [1, 3, "RH12 1XL", np.nan, np.nan, "Clear", "R", "RH12 1XL", "202101", "cym"], # Nan 602 - Ensure conv to 0 - [1, 4, "RH12 1XZ", 100.0, np.nan, "Clear", "R", "RH12 1XZ", "202101", "cym"], # different postcode - [2, 1, "NP44 2NZ", np.nan, 2.0, "Clear", "R", "NP44 2NZ", "202102", "cym"], - [2, 2, "NP44 2NZ", 50.0, 2.0, "Clear", "R", "NP44 2NZ", "202102", "cym"], - [3, 1, np.nan, np.nan, 1.0, "Check needed", "TMI", "NP30 7ZZ", "202102", "cym"], # NaN 601 - Ensure dropped + [111, 1, "2020", "RH12 1XL", 100.0, np.nan, "Clear", "R", "RH12 1XL", "cym"], + 
[111, 2, "2020", "RH12 1XL", 125.0, np.nan, "Clear", "R", "RH12 1XL", "cym"], + [111, 3, "2020", "RH12 1XL", np.nan, np.nan, "Clear", "R", "RH12 1XL", "cym"], # Nan 602 - Ensure conv to 0 + [111, 4, "2020", "RH12 1XZ", 100.0, np.nan, "Clear", "R", "RH12 1XZ", "cym"], # different postcode + [222, 1, "2020", "NP44 2NZ", np.nan, 2.0, "Clear", "R", "NP44 2NZ", "cym"], + [222, 2, "2020", "NP44 2NZ", 50.0, 2.0, "Clear", "R", "NP44 2NZ", "cym"], + [333, 1, "2020", np.nan, np.nan, 1.0, "Check needed", "TMI", "NP30 7ZZ", "cym"], # NaN 601 - Ensure dropped ] input_df = pandasDF(data=input_data, columns=input_cols) @@ -702,20 +803,20 @@ def test_create_sites_df_on_pass(self, sites_df_input): # assert the resultant dataframe is as expected exp_columns = [ "reference", + "instance", "period", "601", "postcodes_harmonised", - "instance", "itl", "602", ] exp_data = [ - [1, "202101", "RH12 1XL", "RH12 1XL", 1, "cym", 225.0], - [1, "202101", "RH12 1XZ", "RH12 1XZ", 4, "cym", 100.0], - [2, "202102", "NP44 2NZ", "NP44 2NZ", 1, "cym", 50.0] + [111, 1, "2020", "RH12 1XL", "RH12 1XL", "cym", 225.0], + [111, 4, "2020", "RH12 1XZ", "RH12 1XZ", "cym", 100.0], + [222, 1, "2020", "NP44 2NZ", "NP44 2NZ", "cym", 50.0], ] expected = pandasDF(data=exp_data, columns=exp_columns) - assert output.equals(expected), "create_sites_df not behaving as expected." 
+ assert_frame_equal(output[expected.columns], expected) class TestCountDuplicateSites(object): @@ -743,10 +844,10 @@ def weight_values_test_df(self): columns = ["reference", "val_col_1", "val_col_2", "weight_col"] data = [ [0, 10, 3.5, 1.5], - [1, 14, 4.5, 1.5], - [2, 16, 10, 1.0], - [3, 18.0, np.nan, 2.0], - [4, 6, 0, 3.0], + [111, 14, 4.5, 1.5], + [222, 16, 10, 1.0], + [333, 18.0, np.nan, 2.0], + [444, 6, 0, 3.0], ] return pandasDF(data=data, columns=columns) @@ -764,10 +865,10 @@ def test_weight_values_on_pass(self, weight_values_test_df): data = [ [0, 15.0, 5.25, 1.5], - [1, 21.0, 6.75, 1.5], - [2, 16.0, 10.0, 1.0], - [3, 36.0, np.nan, 2.0], - [4, 18.0, 0.0, 3.0], + [111, 21.0, 6.75, 1.5], + [222, 16.0, 10.0, 1.0], + [333, 36.0, np.nan, 2.0], + [444, 18.0, 0.0, 3.0], ] exp_out = pandasDF(data=data, columns=columns) assert output.equals(exp_out), "weight_values not acting as expected." @@ -881,3 +982,75 @@ def test_create_category_df_drops_duplicates(self, category_df_input): ] expected = pandasDF(data=exp_data, columns=exp_cols) assert output.equals(expected), "Duplicates not dropped by create_category_df." 
+
+
+class TestRunApportionSites(object):
+    """Tests for run_apportion_sites, driven by the create_input_df fixture."""
+
+    def create_exp_output_df(self):
+        """Create the expected run_apportion_sites output: the input fixture's columns plus "itl2"."""
+        exp_output_cols = [
+            "reference",
+            "instance",
+            "period",
+            "200",
+            "201",
+            "pg_numeric",
+            "formtype",
+            "211",
+            "251",
+            "601",
+            "602",
+            "status",
+            "imp_marker",
+            "postcodes_harmonised",
+            "itl",
+            "itl2",
+        ]
+
+        data = [
+            [111, 0, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym", "eng"],
+            [111, 1, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym", "eng"],
+            [111, 2, "2020", "C", "A", 40, "0006", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP10 5XX", "cym", "eng"],
+            [222, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Clear", "R", "NP20 6YY", "cym", "eng"],
+            # the following 4 rows are the ones that have been apportioned: each 222 site (instance 1 and 2) appears once per 200/201/pg_numeric combination
+            [222, 1, "2020", "C", "A", 40, "0001", 60, "yes", "CB1 3NF", 60.0, "Clear", "R", "CB1 3NF", "cym", "eng"],
+            [222, 1, "2020", "D", "ZZ", 80, "0001", 60, "yes", "CB1 3NF", 60.0, "Clear", "R", "CB1 3NF", "cym", "eng"],
+            [222, 2, "2020", "C", "A", 40, "0001", 40, "yes", "BA1 5DA", 40.0, "Clear", "R", "BA1 5DA", "cym", "eng"],
+            [222, 2, "2020", "D", "ZZ", 80, "0001", 40, "yes", "BA1 5DA", 40.0, "Clear", "R", "BA1 5DA", "cym", "eng"],
+            [333, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Check needed", "TMI", "NP30 7ZZ", "cym", "eng"],
+            [333, 1, "2020", "C", "A", 40, "0001", 100, "yes", "DE72 3AU", 100.0, "Check needed", "TMI", "DE72 3AU", "cym", "eng"],
+            [444, 1, "2020", "C", "A", 40, "0001", 100, "yes", "CF10 BZZ", 100.0, "Form sent out", "TMI", "CF10 BZZ", "cym", "eng"],
+            [666, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym", "eng"],
+            [666, 1, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym", "eng"],
+            [666, 2, "2020", "C", "A", 40, "0001", 100, "yes", np.nan, np.nan, "Check needed", "MoR", "NP10 5XX", "cym", "eng"],
+            [777, 0, "2020", "C", "A", 40, "0001", np.nan, np.nan, "NP10 5XX", np.nan, "Form sent out", "MoR", "NP10 5XX", "cym", "eng"],
+            [777, 1, "2020", "C", "A", 40, "0001", 100, "yes", "NP10 5XX", 200.0, "Form sent out", "MoR", "NP10 5XX", "cym", "eng"],
+            [777, 1, "2020", "D", "XX", 70, "0001", 100, "yes", "NP10 5XX", 200.0, "Form sent out", "MoR", "NP10 5XX", "cym", "eng"],
+        ]
+
+        exp_output_df = pandasDF(data=data, columns=exp_output_cols)
+        return exp_output_df
+
+
+    def test_run_apportion_sites(self, create_input_df):
+        """Check the run_apportion_sites result against the expected frame, with consistency_checks mocked."""
+        input_df = create_input_df
+        input_df["itl2"] = "eng"  # geo column named in config["mappers"]["geo_cols"] below
+
+        exp_output_df = self.create_exp_output_df()
+
+        config = {
+            "mappers": {"geo_cols": ["itl2"]},
+            "breakdowns": {"211": []},
+            "imputation": {"sum_cols": ["211"], "impute_cols": ["211"]},
+        }
+        imp_markers_to_keep: list = ["R", "TMI", "CF", "MoR"]  # "No mean found" is not listed; the expected frame has no rows with that marker
+
+        # we will mock the function consistency_checks for simplicity
+        with patch("src.site_apportionment.site_apportionment.consistency_checks") as mock_consistency_checks:
+            mock_consistency_checks.return_value = True
+
+            results_df = run_apportion_sites(input_df, imp_markers_to_keep, config, intram_tot_dict={})
+
+        assert_frame_equal(results_df, exp_output_df)