3 files changed: +200 −14 lines changed
@@ -177,8 +177,11 @@ def _validate_data_value_type(
     return df.astype(np.float64)


-def _read_parquet(file: BinaryIO, value_type: ValueType) -> pd.DataFrame:
-    df = pd.read_parquet(file, use_nullable_dtypes=True)  # pyright: ignore
+def _read_parquet(file, value_type: ValueType) -> pd.DataFrame:
+    # pd.read_parquet() appears to use the pyarrow engine by default. For some reason,
+    # when reading a file with 20k columns, memory usage balloons to > 30GB and would
+    # take down breadbox. Using fastparquet seems to avoid this problem.
+    df = pd.read_parquet(file, engine="fastparquet").convert_dtypes()

     # the first column will be treated as the index. Make sure it's of type string
     df[df.columns[0]] = df[df.columns[0]].astype("string")
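For context, a minimal standalone sketch of what the new code path does. The helper name and file path are hypothetical, and the 20k-column / >30GB figures come from the diff's own comment rather than a fresh measurement:

```python
import pandas as pd

def read_wide_parquet(path: str) -> pd.DataFrame:
    """Hypothetical helper mirroring the diff's _read_parquet() change."""
    # pandas defaults to the pyarrow engine; per the diff's comment, that
    # reportedly ballooned past 30GB of RAM on a ~20k-column file, so the
    # change pins the engine to fastparquet instead.
    df = pd.read_parquet(path, engine="fastparquet")
    # .convert_dtypes() stands in for the removed use_nullable_dtypes=True
    # flag: it casts columns to pandas' nullable extension dtypes
    # (Int64, string, boolean, ...), so missing values survive the round trip.
    return df.convert_dtypes()
```

Note the trade-off: fastparquet avoids the memory blow-up on very wide files here, while pyarrow is generally the better-supported default for other workloads.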