1084 Handle Epidata FutureWarnings (#1109)
Handle FutureWarnings from the newest pandas version.

Co-authored-by: Anna Wendler <[email protected]>
patricklnz and annawendler authored Oct 28, 2024
1 parent 0962eb0 commit 86eb344
Showing 6 changed files with 47 additions and 42 deletions.
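Most of the changes below address a single pandas 2.2 deprecation: replace() no longer silently downcasts result columns, and pandas 3.0 will drop that behavior entirely. A minimal sketch of the warning the old code triggered (data and names are illustrative, not taken from this commit):

```python
import numpy as np
import pandas as pd

# An object column holding NaN and 'x' markers, similar to the NPI sheets.
df = pd.DataFrame({"flag": [np.nan, "x"]}, dtype=object)

# Replacing both markers by ints leaves an all-int object column, which
# pandas 2.2 downcasts and flags:
# FutureWarning: Downcasting behavior in `replace` is deprecated ...
df = df.replace(np.nan, 0).replace("x", 1)
```

The hunks below silence this either by replacing within one dtype and casting explicitly afterwards, or by calling infer_objects() first.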
@@ -97,7 +97,7 @@ def insert_names_of_states(df, state_id_col=dd.EngEng["idState"]):
    @return dataframe df with column of state names corresponding to state ids
"""
df = modifyDataframeSeries.insert_column_by_map(
-        df, state_id_col, dd.EngEng["state"], get_state_names_and_ids())
+        df, state_id_col, dd.EngEng["state"], get_state_names_and_ids(), str)
return df

# while reporting for Berlin is just different for different sources, Eisenach
@@ -188,7 +188,7 @@ def insert_names_of_counties(
county_id_map = get_county_names_and_ids(
merge_berlin=merge_berlin, merge_eisenach=False)
df = modifyDataframeSeries.insert_column_by_map(
-        df, county_id_col, dd.EngEng["county"], county_id_map)
+        df, county_id_col, dd.EngEng["county"], county_id_map, str)
return df
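The new str argument matters because insert_column_by_map seeds the new column with a copy of the numeric id column and then writes strings into it; since pandas 2.1 such a setitem warns and will eventually raise. A sketch of that warning (column names and values are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"ID_State": [1, 2]})
df["State"] = df["ID_State"]  # new column starts out as int64
# FutureWarning: Setting an item of incompatible dtype is deprecated ...
df.loc[df["ID_State"] == 1, "State"] = "Schleswig-Holstein"
```

The updated helper (see the modifyDataframeSeries hunks below) therefore works on an object column internally and casts to the requested dtype at the end.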


38 changes: 18 additions & 20 deletions pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -619,8 +619,13 @@ def get_npi_data(fine_resolution=2,
df_npis_combinations_pre = df_npis_combinations_pre[[
'Variablenname', 'Massnahmenindex'] + [i for i in range(0, len(columns_used))]]
# replace empty cells by zeros and x-marked cells by ones
-        df_npis_combinations_pre = df_npis_combinations_pre.replace(np.nan, 0)
-        df_npis_combinations_pre = df_npis_combinations_pre.replace('x', 1)
+        # This has to be done by replacing the values with the same dtype and then changing the dtype
+        # Pandas 3.0 will not allow downcasting with replace operations
+        df_npis_combinations_pre = df_npis_combinations_pre.replace(
+            np.nan, '0')
+        df_npis_combinations_pre = df_npis_combinations_pre.replace('x', '1')
+        df_npis_combinations_pre[df_npis_combinations_pre.columns[2:]
+                                 ] = df_npis_combinations_pre[df_npis_combinations_pre.columns[2:]].astype(int)

# extract different NPI groups and store indices of NPIs belonging
# to the different groups
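Taken on its own, the replace-then-cast workaround from the hunk above looks like this (a sketch, assuming pandas >= 2.2):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"flag": [np.nan, "x"]}, dtype=object)
# String-for-string replacements keep the object dtype stable, so no
# downcasting warning; the astype then performs the int conversion on purpose.
df = df.replace(np.nan, "0").replace("x", "1")
df["flag"] = df["flag"].astype(int)
```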
Expand Down Expand Up @@ -905,25 +910,15 @@ def get_npi_data(fine_resolution=2,
max_date + [max(dates_new),
pd.to_datetime(end_date)])

-    # create new data frame for all NPIs given in the columns,
-    # resolved by county and day
-    df_npis = pd.DataFrame(
-        columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] +
-        list(npis_final[dd.EngEng['npiCode']]))
-    # convert NPI data from object to int such that correlations can be
-    # computed
-    df_npis = df_npis.astype(dict(
-        zip(
-            [dd.EngEng['date']] + [dd.EngEng['idCounty']] +
-            list(npis_final[dd.EngEng['npiCode']]), ['str', 'int'] +
-            ['int' for i in npis_final[dd.EngEng['npiCode']]])))

# iterate over countyIDs
counters = np.zeros(4) # time counter for output only
countyidx = 0
# replace -99 ("not used anymore") by 0 ("not used")
# replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned")
-    df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True)
+    # Infer type of columns to be able to use replace with ints without downcasting.
+    df_npis_old = df_npis_old.infer_objects()
+    df_npis_old.replace([-99, 2, 3, 4, 5],
+                        [0, 1, 1, 1, 1], inplace=True)

counter_cases_start = 0

# setup dataframe for each maingroup, same format as df_npi_combinations
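infer_objects() is the other idiom pandas suggests for this warning: if df_npis_old carries object columns that really hold ints, fixing the dtypes up front makes the int-for-int replace dtype-stable. A sketch with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({"code": [-99, 2, 5]}, dtype=object)
df = df.infer_objects()  # object -> int64, values untouched
# int-for-int replacement on an int64 column: nothing to downcast.
df.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True)
```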
Expand Down Expand Up @@ -961,6 +956,9 @@ def get_npi_data(fine_resolution=2,
df_npis_combinations[maincode][0].keys()):
raise gd.DataError('Error. Description and table do not match.')

+    # create new data frame for all NPIs
+    df_npis = pd.DataFrame()

for countyID in counties_considered:
cid = 0
countyidx += 1
Expand Down Expand Up @@ -1004,7 +1002,7 @@ def get_npi_data(fine_resolution=2,

# get number of codes of one NPI (incidence indep. + dep.)
# for fine_resolution=1, inc_codes=1, for fine_res=2, inc_codes=6
-            inc_codes = len(np.where(df_npis.columns.str.contains(
+            inc_codes = len(np.where(npis_final.NPI_code.str.contains(
npis[dd.EngEng['npiCode']][0]))[0])

# Consistency of incidence independent and dependent NPIs:
Expand Down Expand Up @@ -1406,7 +1404,7 @@ def plot_interaction_matrix(filename, directory):

# invert color map elements for tab20c such that subcolors are shown
# from light to dark
-    cmap = copy.copy(mpl.cm.get_cmap('tab20b'))
+    cmap = copy.copy(plt.get_cmap('tab20b'))
colors = [
cmap(i)
for i in np.array(
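This last hunk is a Matplotlib deprecation rather than a pandas one: matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9. A sketch of the replacement (both forms should behave the same on current Matplotlib):

```python
import copy
import matplotlib
import matplotlib.pyplot as plt

cmap = copy.copy(plt.get_cmap('tab20b'))  # form used in this commit
cmap2 = matplotlib.colormaps['tab20b']    # registry lookup alternative
colors = [cmap(i) for i in range(20)]     # tab20b is a 20-color listed map
```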
@@ -288,7 +288,7 @@ def extract_subframe_based_on_dates(df, start_date, end_date):
return df_new


-def insert_column_by_map(df, col_to_map, new_col_name, map):
+def insert_column_by_map(df, col_to_map, new_col_name, map, new_col_dtype='object'):
"""! Adds a column to a given dataframe based on a mapping of values of a given column
    The mapping is defined by a list containing tuples of the form (new_value, old_value)
@@ -298,14 +298,19 @@ def insert_column_by_map(df, col_to_map, new_col_name, map):
@param col_to_map column containing values to be mapped
@param new_col_name name of the new column containing the mapped values
@param map List of tuples of values in the column to be added and values in the given column
+    @param new_col_dtype String of dtype [Default: 'object'] for the newly generated column
    @return dataframe df with column of state names corresponding to state ids
"""
df_new = df[:]
loc_new_col = df_new.columns.get_loc(col_to_map)+1
df_new.insert(loc=loc_new_col, column=new_col_name,
value=df_new[col_to_map])
+    # Set dtype=object on the newly created column to prevent incompatible dtype errors
+    df_new[new_col_name] = df_new[new_col_name].astype('object')
for item in map:
df_new.loc[df_new[col_to_map] == item[1], [new_col_name]] = item[0]
+    # Set dtype of new column
+    df_new[new_col_name] = df_new[new_col_name].astype(new_col_dtype)
return df_new
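A hypothetical call using the new parameter, mapping ids to names and typing the generated column in one step (names and values are illustrative):

```python
# map entries are (new_value, old_value) tuples, as the docstring describes
state_map = [('Schleswig-Holstein', 1), ('Hamburg', 2)]
df = insert_column_by_map(df, 'ID_State', 'State', state_map, str)
```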


Expand Down Expand Up @@ -517,7 +522,7 @@ def fit_age_group_intervals(
age_share[0][1]:age_share[-1][1]+1])
for age in age_share:
new_pop[age[1]] += pop_data[age[1]
-                                        ] / sum_pop * df_age_in.iloc[0][population_indx]
+                                        ] / sum_pop * df_age_in.iloc[0, population_indx]

population_indx += 1
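The df_age_in.iloc[0][population_indx] form chained a positional integer lookup through Series.__getitem__, which pandas 2.1 deprecates; .iloc[0, population_indx] resolves row and column positionally in one step. A sketch:

```python
import pandas as pd

df = pd.DataFrame([[10, 20, 30]], columns=['a', 'b', 'c'])
v = df.iloc[0][1]  # FutureWarning: treating keys as positions is deprecated
v = df.iloc[0, 1]  # single positional lookup, warning-free
```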

@@ -19,6 +19,7 @@
#############################################################################
import json
import os
+import io
import unittest
from datetime import date, datetime
from unittest.mock import patch
Expand Down Expand Up @@ -322,8 +323,8 @@ def test_get_case_data_split_berlin(self, mock_file):
self.assertEqual(df_state.shape[0], 286)

@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file',
-           return_value=pd.read_json(
-               test_string_all_federal_states_and_counties_read).copy())
+           return_value=pd.read_json(io.StringIO(
+               test_string_all_federal_states_and_counties_read)).copy())
def test_get_case_data_moving_average(self, mock_file):

read_data = True
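Background for the io.StringIO wrapping used throughout these tests: since pandas 2.1, passing literal JSON to read_json is deprecated; the string has to arrive through a file-like object. A sketch:

```python
import io
import pandas as pd

json_str = '{"ID_County": {"0": 1001}, "Count": {"0": 5}}'
# pd.read_json(json_str)  # FutureWarning: passing literal json is deprecated
df = pd.read_json(io.StringIO(json_str))  # wrapped: same result, no warning
```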
Expand Down Expand Up @@ -470,8 +471,8 @@ def test_get_case_data_moving_average(self, mock_file):
1.0)

@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file',
-           return_value=pd.read_json(
-               test_string_all_federal_states_and_counties_read).copy())
+           return_value=pd.read_json(io.StringIO(
+               test_string_all_federal_states_and_counties_read)).copy())
def test_get_case_data_impute_dates(self, mock_file):
read_data = True
file_format = 'json_timeasstring'
Expand Down Expand Up @@ -554,8 +555,7 @@ def test_get_case_data_impute_dates(self, mock_file):
25)

@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file',
-           return_value=pd.read_json(
-               test_string_all_federal_states_and_counties_read).copy())
+           return_value=pd.read_json(io.StringIO(test_string_all_federal_states_and_counties_read)).copy())
def test_get_case_data_moving_average_and_split_berlin(self, mock_file):
# test if split_berlin and moving_average = True are working together

Expand Down Expand Up @@ -659,8 +659,8 @@ def test_no_raw(self, mock_file):
directory = os.path.join(out_folder, 'Germany/')
gd.check_dir(directory)

-        mock_file.return_value = pd.read_json(
-            self.test_string_all_federal_states_and_counties_github)
+        mock_file.return_value = pd.read_json(io.StringIO(
+            self.test_string_all_federal_states_and_counties_github))

gcd.get_case_data(
read_data=read_data, file_format=file_format,
Expand Down Expand Up @@ -716,8 +716,8 @@ def test_check_for_completeness(self):
@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file')
def test_rep_date(self, mock_file):

-        mock_file.return_value = pd.read_json(
-            self.test_string_all_federal_states_and_counties_github)
+        mock_file.return_value = pd.read_json(io.StringIO(
+            self.test_string_all_federal_states_and_counties_github))

read_data = False
file_format = 'json_timeasstring'
@@ -22,6 +22,7 @@

import os
import json
+import numpy as np
import pandas as pd
from pyfakefs import fake_filesystem_unittest

Expand Down Expand Up @@ -67,7 +68,7 @@ class TestGetVaccinationData(fake_filesystem_unittest.TestCase):

df_vacc_data = df_vacc_data.astype(
{'LandkreisId_Impfort': 'string', 'Altersgruppe': "string",
-         'Impfschutz': int, 'Anzahl': int})
+         'Impfschutz': int, 'Anzahl': float})

df_vacc_data_altern = pd.DataFrame(columns=col_names_vacc_data)
for i in range(len(counties)):
Expand Down Expand Up @@ -96,7 +97,7 @@ class TestGetVaccinationData(fake_filesystem_unittest.TestCase):

df_vacc_data_altern = df_vacc_data_altern.astype(
{'LandkreisId_Impfort': 'string', 'Altersgruppe': "string",
-         'Impfschutz': int, 'Anzahl': int})
+         'Impfschutz': int, 'Anzahl': float})

filename = os.path.join(
here, 'test_data', 'TestSetPopulationFinal.json')
Expand Down Expand Up @@ -171,7 +172,7 @@ def test_sanity_checks(self, mockv):
"LandkreisId_Impfort": ['05754', '1', '2', '3', '4'],
"Altersgruppe": ["01-59", "01-59", "01-59", "01-59", "01-59"],
"Impfschutz": [1, 1, 2, 3, 1],
"Anzahl": [10000, 1, 2, 3, 4]})
"Anzahl": [10000., 1., 2., 3., 4.]})
gvd.sanity_checks(df_no_errors)
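The int-to-float switches in these fixtures follow the same stricter dtype rules: the vaccination pipeline apparently writes fractional (e.g. redistributed) values back into the Anzahl/vacc_* columns, and storing floats into an int64 column warns under pandas >= 2.1. A sketch with illustrative values:

```python
import pandas as pd

df = pd.DataFrame({"Anzahl": [10000, 1]})  # int64 column
# FutureWarning: Setting an item of incompatible dtype is deprecated ...
df.loc[0, "Anzahl"] = 0.5

df = pd.DataFrame({"Anzahl": [10000., 1.]})  # float64 fixture avoids this
df.loc[0, "Anzahl"] = 0.5                    # lossless, no warning
```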

def test_sanitizing_based_on_regions(self):
@@ -182,8 +183,8 @@ def test_sanitizing_based_on_regions(self):
'ID_State': sorted(4*[1, 1, 2, 6, 6, 6]),
'ID_County': sorted(4*[1001, 1002, 2001, 6000, 6005, 6006]),
'Age_RKI': 6*age_groups,
-            'vacc_1': [0, 0, 0, 0, 2, 5, 7, 9, 2, 4, 6, 8, 4, 4, 4, 4, 1, 6, 1, 2, 0, 0, 5, 17],
-            'vacc_2': [0, 1, 0, 2, 1, 4, 3, 2, 1, 1, 6, 4, 4, 4, 4, 1, 2, 1, 2, 0, 0, 5, 4, 0]
+            'vacc_1': np.array([0, 0, 0, 0, 2, 5, 7, 9, 2, 4, 6, 8, 4, 4, 4, 4, 1, 6, 1, 2, 0, 0, 5, 17], dtype=float),
+            'vacc_2': np.array([0, 1, 0, 2, 1, 4, 3, 2, 1, 1, 6, 4, 4, 4, 4, 1, 2, 1, 2, 0, 0, 5, 4, 0], dtype=float)
})
population = pd.DataFrame({
'ID_County': [1001, 1002, 2001, 6000, 6005, 6006],
@@ -507,18 +507,19 @@ def test_insert_column_by_map(self):

# test with integer mapping
df = mdfs.insert_column_by_map(
-            self.test_df1, 'test_col3', 'inserted_col', self.int_map)
+            self.test_df1, 'test_col3', 'inserted_col', self.int_map, int)
new_cols = df.columns.to_list()
exp_cols = ['Date', 'test_col1', 'test_col2', 'test_col3',
'inserted_col', 'ID']
self.assertEqual(new_cols, exp_cols)
pd.testing.assert_frame_equal(df[old_cols], self.test_df1)
-        exp_new_col = (10*self.test_df1['test_col3']).rename('inserted_col')
+        exp_new_col = (10*self.test_df1['test_col3']
+                       ).rename('inserted_col').astype(int)
pd.testing.assert_series_equal(df['inserted_col'], exp_new_col)

# test with string mapping
df = mdfs.insert_column_by_map(
-            self.test_df1, 'test_col2', 'inserted_col', self.str_map)
+            self.test_df1, 'test_col2', 'inserted_col', self.str_map, str)
new_cols = df.columns.to_list()
exp_cols = ['Date', 'test_col1', 'test_col2', 'inserted_col',
'test_col3', 'ID']
