@@ -13,18 +13,35 @@ def get_metadata_input_dataframe(cell_loc: CellLocation) -> pd.DataFrame:
13
13
from a CellLocation object.
14
14
"""
15
15
16
- return (
17
- pd .read_parquet (
18
- cell_loc .metadata_input ,
19
- # set storage options if we have an s3 path
20
- storage_options = { "anon" : True }
21
- if isinstance (cell_loc .metadata_input , str )
22
- and cell_loc . metadata_input . startswith ( "s3://" )
23
- else None ,
16
+ # return a dataframe if it is already a dataframe
17
+ if isinstance ( cell_loc . metadata_input , pd .DataFrame ):
18
+ return cell_loc .metadata_input
19
+
20
+ # try to process a string-based path
21
+ if isinstance (cell_loc .metadata_input , str ):
22
+ storage_opts = (
23
+ { "anon" : True } if cell_loc . metadata_input . startswith ( "s3://" ) else None
24
24
)
25
- if isinstance (cell_loc .metadata_input , str )
26
- else cell_loc .metadata_input
27
- )
25
+ return (
26
+ # read from parquet if we have a parquet object path
27
+ pd .read_parquet (
28
+ path = cell_loc .metadata_input ,
29
+ # set storage options if we have an s3 path
30
+ storage_options = storage_opts ,
31
+ )
32
+ if cell_loc .metadata_input .endswith (".parquet" )
33
+ # read from csv if we have a csv object path
34
+ else (
35
+ pd .read_csv (
36
+ filepath_or_buffer = cell_loc .metadata_input ,
37
+ # set storage options if we have an s3 path
38
+ storage_options = storage_opts ,
39
+ )
40
+ )
41
+ )
42
+ else :
43
+ # otherwise raise an error as we don't have a supported format
44
+ raise ValueError ("Unsupported metadata_input type" )
28
45
29
46
30
47
@pytest .mark .parametrize (
@@ -80,11 +97,15 @@ def test_output_value_correctness(
80
97
cell_loc = cls_cell_loc .add_cell_location ()
81
98
metadata_input_dataframe = get_metadata_input_dataframe (cell_loc = cls_cell_loc )
82
99
100
+ # Cast cell_loc columns to the data types of metadata_input_dataframe columns
101
+ # (observed metadata_site as having different types)
102
+ for col in metadata_input_dataframe .columns :
103
+ cell_loc [col ] = cell_loc [col ].astype (metadata_input_dataframe [col ].dtype )
104
+
83
105
# if we restrict the columns of cell_loc to the ones in metadata_input_dataframe, we should get the same dataframe
84
- assert (
85
- cell_loc [metadata_input_dataframe .columns ]
86
- .reset_index (drop = True )
87
- .equals (metadata_input_dataframe .reset_index (drop = True ))
106
+ pd .testing .assert_frame_equal (
107
+ cell_loc [metadata_input_dataframe .columns ].reset_index (drop = True ),
108
+ metadata_input_dataframe .reset_index (drop = True ),
88
109
)
89
110
90
111
# gather an engine from the cell_loc class
0 commit comments