40
40
from git import Repo
41
41
from git .refs .head import Head
42
42
import pandas as pd
43
+ import numpy as np
43
44
44
45
from .utils import read_params
45
46
from .logger import get_structured_logger
47
+ from .nancodes import Nans
46
48
47
49
Files = List [str ]
48
50
FileDiffMap = Dict [str , Optional [str ]]
@@ -73,8 +75,10 @@ def diff_export_csv(
73
75
changed_df is the pd.DataFrame of common rows from after_csv with changed values.
74
76
added_df is the pd.DataFrame of added rows from after_csv.
75
77
"""
76
- export_csv_dtypes = {"geo_id" : str , "val" : float ,
77
- "se" : float , "sample_size" : float }
78
+ export_csv_dtypes = {
79
+ "geo_id" : str , "val" : float , "se" : float , "sample_size" : float ,
80
+ "missing_val" : int , "missing_se" : int , "missing_sample_size" : int
81
+ }
78
82
79
83
before_df = pd .read_csv (before_csv , dtype = export_csv_dtypes )
80
84
before_df .set_index ("geo_id" , inplace = True )
@@ -89,12 +93,22 @@ def diff_export_csv(
89
93
before_df_cmn = before_df .reindex (common_idx )
90
94
after_df_cmn = after_df .reindex (common_idx )
91
95
92
- # Exact comparisons, treating NA == NA as True
93
- same_mask = before_df_cmn == after_df_cmn
94
- same_mask |= pd .isna (before_df_cmn ) & pd .isna (after_df_cmn )
96
+ # If only one of the two CSVs has a "missing_val" column, treat every common row as changed
97
+ if ("missing_val" in before_df_cmn .columns ) ^ ("missing_val" in after_df_cmn .columns ):
98
+ same_mask = after_df_cmn .copy ()
99
+ same_mask .loc [:] = False
100
+ else :
101
+ # Exact comparisons, treating NA == NA as True
102
+ same_mask = before_df_cmn == after_df_cmn
103
+ same_mask |= pd .isna (before_df_cmn ) & pd .isna (after_df_cmn )
104
+
105
+ # Encode deleted rows as NaN values, flagged with the Nans.DELETED missingness code
106
+ deleted_df = before_df .loc [deleted_idx , :].copy ()
107
+ deleted_df [["val" , "se" , "sample_size" ]] = np .nan
108
+ deleted_df [["missing_val" , "missing_se" , "missing_sample_size" ]] = Nans .DELETED
95
109
96
110
return (
97
- before_df . loc [ deleted_idx , :] ,
111
+ deleted_df ,
98
112
after_df_cmn .loc [~ (same_mask .all (axis = 1 )), :],
99
113
after_df .loc [added_idx , :])
100
114
@@ -227,11 +241,11 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]:
227
241
228
242
deleted_df , changed_df , added_df = diff_export_csv (
229
243
before_file , after_file )
230
- new_issues_df = pd .concat ([changed_df , added_df ], axis = 0 )
244
+ new_issues_df = pd .concat ([deleted_df , changed_df , added_df ], axis = 0 )
231
245
232
246
if len (deleted_df ) > 0 :
233
247
print (
234
- f"Warning, diff has deleted indices in { after_file } that will be ignored " )
248
+ f"Diff has deleted indices in { after_file } that have been coded as nans. " )
235
249
236
250
# Write the diffs to diff_file, if applicable
237
251
if len (new_issues_df ) > 0 :
@@ -240,6 +254,17 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]:
240
254
new_issues_df .to_csv (diff_file , na_rep = "NA" )
241
255
common_diffs [after_file ] = diff_file
242
256
257
+ # Replace deleted files with empty versions, but only if the cached version is not
258
+ # already empty
259
+ for deleted_file in deleted_files :
260
+ deleted_df = pd .read_csv (deleted_file )
261
+ if not deleted_df .empty :
262
+ print (
263
+ f"Diff has deleted { deleted_file } and replaced it with an empty CSV." )
264
+ empty_df = deleted_df [0 :0 ]
265
+ new_deleted_filename = join (self .export_dir , basename (deleted_file ))
266
+ empty_df .to_csv (new_deleted_filename , index = False )
267
+
243
268
return deleted_files , common_diffs , new_files
244
269
245
270
def archive_exports (self , exported_files : Files ) -> Tuple [Files , Files ]:
@@ -266,9 +291,10 @@ def filter_exports(self, common_diffs: FileDiffMap):
266
291
Filter export directory to only contain relevant files.
267
292
268
293
Filters down the export_dir to only contain:
269
- 1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows only.
270
- Should be called after archive_exports() so we archive the raw exports before
271
- potentially modifying them.
294
+ 1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows
295
+ only, and 3) Deleted files replaced with empty CSVs with the same name. Should
296
+ be called after archive_exports() so we archive the raw exports before potentially
297
+ modifying them.
272
298
273
299
Parameters
274
300
----------
@@ -297,9 +323,9 @@ def run(self):
297
323
self .update_cache ()
298
324
299
325
# Diff exports, and make incremental versions
300
- _ , common_diffs , new_files = self .diff_exports ()
326
+ deleted_files , common_diffs , new_files = self .diff_exports ()
301
327
302
- # Archive changed and new files only
328
+ # Archive changed, new, and emptied deleted files
303
329
to_archive = [f for f , diff in common_diffs .items ()
304
330
if diff is not None ]
305
331
to_archive += new_files
0 commit comments