1084 Handle Epidata FutureWarnings (#1109)
Handle FutureWarnings from the newest pandas version.

Co-authored-by: Anna Wendler <[email protected]>
patricklnz and annawendler authored Oct 28, 2024
1 parent 0962eb0 commit 86eb344
Showing 6 changed files with 47 additions and 42 deletions.
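Most of the changes below address a single pandas 2.2 deprecation: replace() no longer silently downcasts result columns, and pandas 3.0 will drop that behavior entirely. A minimal sketch of the warning the old code triggered (data and names are illustrative, not taken from this commit):

```python
import numpy as np
import pandas as pd

# An object column holding NaN and 'x' markers, similar to the NPI sheets.
df = pd.DataFrame({"flag": [np.nan, "x"]}, dtype=object)

# Replacing both markers by ints leaves an all-int object column, which
# pandas 2.2 downcasts and flags:
# FutureWarning: Downcasting behavior in `replace` is deprecated ...
df = df.replace(np.nan, 0).replace("x", 1)
```

The hunks below silence this either by replacing within one dtype and casting explicitly afterwards, or by calling infer_objects() first.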
@@ -97,7 +97,7 @@ def insert_names_of_states(df, state_id_col=dd.EngEng["idState"]):
    @return dataframe df with column of state names corresponding to state ids
"""
df = modifyDataframeSeries.insert_column_by_map(
-        df, state_id_col, dd.EngEng["state"], get_state_names_and_ids())
+        df, state_id_col, dd.EngEng["state"], get_state_names_and_ids(), str)
return df

# while reporting for Berlin is just different for different sources, Eisenach
@@ -188,7 +188,7 @@ def insert_names_of_counties(
county_id_map = get_county_names_and_ids(
merge_berlin=merge_berlin, merge_eisenach=False)
df = modifyDataframeSeries.insert_column_by_map(
-        df, county_id_col, dd.EngEng["county"], county_id_map)
+        df, county_id_col, dd.EngEng["county"], county_id_map, str)
return df
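The new str argument matters because insert_column_by_map seeds the new column with a copy of the numeric id column and then writes strings into it; since pandas 2.1 such a setitem warns and will eventually raise. A sketch of that warning (column names and values are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"ID_State": [1, 2]})
df["State"] = df["ID_State"]  # new column starts out as int64
# FutureWarning: Setting an item of incompatible dtype is deprecated ...
df.loc[df["ID_State"] == 1, "State"] = "Schleswig-Holstein"
```

The updated helper (see the modifyDataframeSeries hunks below) therefore works on an object column internally and casts to the requested dtype at the end.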


38 changes: 18 additions & 20 deletions pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -619,8 +619,13 @@ def get_npi_data(fine_resolution=2,
df_npis_combinations_pre = df_npis_combinations_pre[[
'Variablenname', 'Massnahmenindex'] + [i for i in range(0, len(columns_used))]]
# replace empty cells by zeros and x-marked cells by ones
-        df_npis_combinations_pre = df_npis_combinations_pre.replace(np.nan, 0)
-        df_npis_combinations_pre = df_npis_combinations_pre.replace('x', 1)
+        # This has to be done by replacing the values with the same dtype and then changing the dtype
+        # Pandas 3.0 will not allow downcasting with replace operations
+        df_npis_combinations_pre = df_npis_combinations_pre.replace(
+            np.nan, '0')
+        df_npis_combinations_pre = df_npis_combinations_pre.replace('x', '1')
+        df_npis_combinations_pre[df_npis_combinations_pre.columns[2:]
+                                 ] = df_npis_combinations_pre[df_npis_combinations_pre.columns[2:]].astype(int)

# extract different NPI groups and store indices of NPIs belonging
# to the different groups
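Taken on its own, the replace-then-cast workaround from the hunk above looks like this (a sketch, assuming pandas >= 2.2):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"flag": [np.nan, "x"]}, dtype=object)
# String-for-string replacements keep the object dtype stable, so no
# downcasting warning; the astype then performs the int conversion on purpose.
df = df.replace(np.nan, "0").replace("x", "1")
df["flag"] = df["flag"].astype(int)
```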
Expand Down Expand Up @@ -905,25 +910,15 @@ def get_npi_data(fine_resolution=2,
max_date + [max(dates_new),
pd.to_datetime(end_date)])

-    # create new data frame for all NPIs given in the columns,
-    # resolved by county and day
-    df_npis = pd.DataFrame(
-        columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] +
-        list(npis_final[dd.EngEng['npiCode']]))
-    # convert NPI data from object to int such that correlations can be
-    # computed
-    df_npis = df_npis.astype(dict(
-        zip(
-            [dd.EngEng['date']] + [dd.EngEng['idCounty']] +
-            list(npis_final[dd.EngEng['npiCode']]), ['str', 'int'] +
-            ['int' for i in npis_final[dd.EngEng['npiCode']]])))

# iterate over countyIDs
counters = np.zeros(4) # time counter for output only
countyidx = 0
# replace -99 ("not used anymore") by 0 ("not used")
# replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned")
-    df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True)
+    # Infer type of columns to be able to use replace with ints without downcasting.
+    df_npis_old = df_npis_old.infer_objects()
+    df_npis_old.replace([-99, 2, 3, 4, 5],
+                        [0, 1, 1, 1, 1], inplace=True)

counter_cases_start = 0

# setup dataframe for each maingroup, same format as df_npi_combinations
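infer_objects() is the other idiom pandas suggests for this warning: if df_npis_old carries object columns that really hold ints, fixing the dtypes up front makes the int-for-int replace dtype-stable. A sketch with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({"code": [-99, 2, 5]}, dtype=object)
df = df.infer_objects()  # object -> int64, values untouched
# int-for-int replacement on an int64 column: nothing to downcast.
df.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True)
```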
Expand Down Expand Up @@ -961,6 +956,9 @@ def get_npi_data(fine_resolution=2,
df_npis_combinations[maincode][0].keys()):
raise gd.DataError('Error. Description and table do not match.')

+    # create new data frame for all NPIs
+    df_npis = pd.DataFrame()

for countyID in counties_considered:
cid = 0
countyidx += 1
Expand Down Expand Up @@ -1004,7 +1002,7 @@ def get_npi_data(fine_resolution=2,

# get number of codes of one NPI (incidence indep. + dep.)
# for fine_resolution=1, inc_codes=1, for fine_res=2, inc_codes=6
-            inc_codes = len(np.where(df_npis.columns.str.contains(
+            inc_codes = len(np.where(npis_final.NPI_code.str.contains(
npis[dd.EngEng['npiCode']][0]))[0])

# Consistency of incidence independent and dependent NPIs:
Expand Down Expand Up @@ -1406,7 +1404,7 @@ def plot_interaction_matrix(filename, directory):

# invert color map elements for tab20c such that subcolors are shown
# from light to dark
-    cmap = copy.copy(mpl.cm.get_cmap('tab20b'))
+    cmap = copy.copy(plt.get_cmap('tab20b'))
colors = [
cmap(i)
for i in np.array(
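This last hunk is a Matplotlib deprecation rather than a pandas one: matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9. A sketch of the replacement (both forms should behave the same on current Matplotlib):

```python
import copy
import matplotlib
import matplotlib.pyplot as plt

cmap = copy.copy(plt.get_cmap('tab20b'))  # form used in this commit
cmap2 = matplotlib.colormaps['tab20b']    # registry lookup alternative
colors = [cmap(i) for i in range(20)]     # tab20b is a 20-color listed map
```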
@@ -288,7 +288,7 @@ def extract_subframe_based_on_dates(df, start_date, end_date):
return df_new


-def insert_column_by_map(df, col_to_map, new_col_name, map):
+def insert_column_by_map(df, col_to_map, new_col_name, map, new_col_dtype='object'):
"""! Adds a column to a given dataframe based on a mapping of values of a given column
    The mapping is defined by a list containing tuples of the form (new_value, old_value)
@@ -298,14 +298,19 @@ def insert_column_by_map(df, col_to_map, new_col_name, map):
@param col_to_map column containing values to be mapped
@param new_col_name name of the new column containing the mapped values
@param map List of tuples of values in the column to be added and values in the given column
+    @param new_col_dtype String of dtype [Default: 'object'] for the newly generated column
    @return dataframe df with column of state names corresponding to state ids
"""
df_new = df[:]
loc_new_col = df_new.columns.get_loc(col_to_map)+1
df_new.insert(loc=loc_new_col, column=new_col_name,
value=df_new[col_to_map])
+    # Set dtype=object on the newly created column to prevent incompatible dtype errors
+    df_new[new_col_name] = df_new[new_col_name].astype('object')
for item in map:
df_new.loc[df_new[col_to_map] == item[1], [new_col_name]] = item[0]
+    # Set dtype of new column
+    df_new[new_col_name] = df_new[new_col_name].astype(new_col_dtype)
return df_new
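A hypothetical call using the new parameter, mapping ids to names and typing the generated column in one step (names and values are illustrative):

```python
# map entries are (new_value, old_value) tuples, as the docstring describes
state_map = [('Schleswig-Holstein', 1), ('Hamburg', 2)]
df = insert_column_by_map(df, 'ID_State', 'State', state_map, str)
```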


Expand Down Expand Up @@ -517,7 +522,7 @@ def fit_age_group_intervals(
age_share[0][1]:age_share[-1][1]+1])
for age in age_share:
new_pop[age[1]] += pop_data[age[1]
-                                        ] / sum_pop * df_age_in.iloc[0][population_indx]
+                                        ] / sum_pop * df_age_in.iloc[0, population_indx]

population_indx += 1
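The df_age_in.iloc[0][population_indx] form chained a positional integer lookup through Series.__getitem__, which pandas 2.1 deprecates; .iloc[0, population_indx] resolves row and column positionally in one step. A sketch:

```python
import pandas as pd

df = pd.DataFrame([[10, 20, 30]], columns=['a', 'b', 'c'])
v = df.iloc[0][1]  # FutureWarning: treating keys as positions is deprecated
v = df.iloc[0, 1]  # single positional lookup, warning-free
```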

@@ -19,6 +19,7 @@
#############################################################################
import json
import os
+import io
import unittest
from datetime import date, datetime
from unittest.mock import patch
Expand Down Expand Up @@ -322,8 +323,8 @@ def test_get_case_data_split_berlin(self, mock_file):
self.assertEqual(df_state.shape[0], 286)

@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file',
-           return_value=pd.read_json(
-               test_string_all_federal_states_and_counties_read).copy())
+           return_value=pd.read_json(io.StringIO(
+               test_string_all_federal_states_and_counties_read)).copy())
def test_get_case_data_moving_average(self, mock_file):

read_data = True
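Background for the io.StringIO wrapping used throughout these tests: since pandas 2.1, passing literal JSON to read_json is deprecated; the string has to arrive through a file-like object. A sketch:

```python
import io
import pandas as pd

json_str = '{"ID_County": {"0": 1001}, "Count": {"0": 5}}'
# pd.read_json(json_str)  # FutureWarning: passing literal json is deprecated
df = pd.read_json(io.StringIO(json_str))  # wrapped: same result, no warning
```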
Expand Down Expand Up @@ -470,8 +471,8 @@ def test_get_case_data_moving_average(self, mock_file):
1.0)

@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file',
-           return_value=pd.read_json(
-               test_string_all_federal_states_and_counties_read).copy())
+           return_value=pd.read_json(io.StringIO(
+               test_string_all_federal_states_and_counties_read)).copy())
def test_get_case_data_impute_dates(self, mock_file):
read_data = True
file_format = 'json_timeasstring'
Expand Down Expand Up @@ -554,8 +555,7 @@ def test_get_case_data_impute_dates(self, mock_file):
25)

@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file',
-           return_value=pd.read_json(
-               test_string_all_federal_states_and_counties_read).copy())
+           return_value=pd.read_json(io.StringIO(test_string_all_federal_states_and_counties_read)).copy())
def test_get_case_data_moving_average_and_split_berlin(self, mock_file):
# test if split_berlin and moving_average = True are working together

Expand Down Expand Up @@ -659,8 +659,8 @@ def test_no_raw(self, mock_file):
directory = os.path.join(out_folder, 'Germany/')
gd.check_dir(directory)

-        mock_file.return_value = pd.read_json(
-            self.test_string_all_federal_states_and_counties_github)
+        mock_file.return_value = pd.read_json(io.StringIO(
+            self.test_string_all_federal_states_and_counties_github))

gcd.get_case_data(
read_data=read_data, file_format=file_format,
Expand Down Expand Up @@ -716,8 +716,8 @@ def test_check_for_completeness(self):
@patch('memilio.epidata.getDataIntoPandasDataFrame.get_file')
def test_rep_date(self, mock_file):

-        mock_file.return_value = pd.read_json(
-            self.test_string_all_federal_states_and_counties_github)
+        mock_file.return_value = pd.read_json(io.StringIO(
+            self.test_string_all_federal_states_and_counties_github))

read_data = False
file_format = 'json_timeasstring'
@@ -22,6 +22,7 @@

import os
import json
+import numpy as np
import pandas as pd
from pyfakefs import fake_filesystem_unittest

Expand Down Expand Up @@ -67,7 +68,7 @@ class TestGetVaccinationData(fake_filesystem_unittest.TestCase):

df_vacc_data = df_vacc_data.astype(
{'LandkreisId_Impfort': 'string', 'Altersgruppe': "string",
-         'Impfschutz': int, 'Anzahl': int})
+         'Impfschutz': int, 'Anzahl': float})

df_vacc_data_altern = pd.DataFrame(columns=col_names_vacc_data)
for i in range(len(counties)):
Expand Down Expand Up @@ -96,7 +97,7 @@ class TestGetVaccinationData(fake_filesystem_unittest.TestCase):

df_vacc_data_altern = df_vacc_data_altern.astype(
{'LandkreisId_Impfort': 'string', 'Altersgruppe': "string",
-         'Impfschutz': int, 'Anzahl': int})
+         'Impfschutz': int, 'Anzahl': float})

filename = os.path.join(
here, 'test_data', 'TestSetPopulationFinal.json')
Expand Down Expand Up @@ -171,7 +172,7 @@ def test_sanity_checks(self, mockv):
"LandkreisId_Impfort": ['05754', '1', '2', '3', '4'],
"Altersgruppe": ["01-59", "01-59", "01-59", "01-59", "01-59"],
"Impfschutz": [1, 1, 2, 3, 1],
"Anzahl": [10000, 1, 2, 3, 4]})
"Anzahl": [10000., 1., 2., 3., 4.]})
gvd.sanity_checks(df_no_errors)
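The int-to-float switches in these fixtures follow the same stricter dtype rules: the vaccination pipeline apparently writes fractional (e.g. redistributed) values back into the Anzahl/vacc_* columns, and storing floats into an int64 column warns under pandas >= 2.1. A sketch with illustrative values:

```python
import pandas as pd

df = pd.DataFrame({"Anzahl": [10000, 1]})  # int64 column
# FutureWarning: Setting an item of incompatible dtype is deprecated ...
df.loc[0, "Anzahl"] = 0.5

df = pd.DataFrame({"Anzahl": [10000., 1.]})  # float64 fixture avoids this
df.loc[0, "Anzahl"] = 0.5                    # lossless, no warning
```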

def test_sanitizing_based_on_regions(self):
@@ -182,8 +183,8 @@ def test_sanitizing_based_on_regions(self):
'ID_State': sorted(4*[1, 1, 2, 6, 6, 6]),
'ID_County': sorted(4*[1001, 1002, 2001, 6000, 6005, 6006]),
'Age_RKI': 6*age_groups,
-            'vacc_1': [0, 0, 0, 0, 2, 5, 7, 9, 2, 4, 6, 8, 4, 4, 4, 4, 1, 6, 1, 2, 0, 0, 5, 17],
-            'vacc_2': [0, 1, 0, 2, 1, 4, 3, 2, 1, 1, 6, 4, 4, 4, 4, 1, 2, 1, 2, 0, 0, 5, 4, 0]
+            'vacc_1': np.array([0, 0, 0, 0, 2, 5, 7, 9, 2, 4, 6, 8, 4, 4, 4, 4, 1, 6, 1, 2, 0, 0, 5, 17], dtype=float),
+            'vacc_2': np.array([0, 1, 0, 2, 1, 4, 3, 2, 1, 1, 6, 4, 4, 4, 4, 1, 2, 1, 2, 0, 0, 5, 4, 0], dtype=float)
})
population = pd.DataFrame({
'ID_County': [1001, 1002, 2001, 6000, 6005, 6006],
@@ -507,18 +507,19 @@ def test_insert_column_by_map(self):

# test with integer mapping
df = mdfs.insert_column_by_map(
-            self.test_df1, 'test_col3', 'inserted_col', self.int_map)
+            self.test_df1, 'test_col3', 'inserted_col', self.int_map, int)
new_cols = df.columns.to_list()
exp_cols = ['Date', 'test_col1', 'test_col2', 'test_col3',
'inserted_col', 'ID']
self.assertEqual(new_cols, exp_cols)
pd.testing.assert_frame_equal(df[old_cols], self.test_df1)
-        exp_new_col = (10*self.test_df1['test_col3']).rename('inserted_col')
+        exp_new_col = (10*self.test_df1['test_col3']
+                       ).rename('inserted_col').astype(int)
pd.testing.assert_series_equal(df['inserted_col'], exp_new_col)

# test with string mapping
df = mdfs.insert_column_by_map(
-            self.test_df1, 'test_col2', 'inserted_col', self.str_map)
+            self.test_df1, 'test_col2', 'inserted_col', self.str_map, str)
new_cols = df.columns.to_list()
exp_cols = ['Date', 'test_col1', 'test_col2', 'inserted_col',
'test_col3', 'ID']
