@@ -400,11 +400,16 @@ def profile(
400
400
exact_uniqueness = True ,
401
401
):
402
402
# TODO: add statistics for correlations, histograms and exact_uniqueness
403
+ arrow_schema = pa .Schema .from_pandas (df , preserve_index = False )
403
404
404
405
# parse timestamp columns to string columns
405
- for col , dtype in df .dtypes .items ():
406
- if isinstance (dtype , type (np .dtype (np .datetime64 ))):
407
- df [col ] = df [col ].astype (str )
406
+ for field in arrow_schema :
407
+ if not (
408
+ pa .types .is_list (field .type )
409
+ or pa .types .is_large_list (field .type )
410
+ or pa .types .is_struct (field .type )
411
+ ) and PYARROW_HOPSWORKS_DTYPE_MAPPING [field .type ] in ["timestamp" , "date" ]:
412
+ df [field .name ] = df [field .name ].astype (str )
408
413
409
414
if not relevant_columns :
410
415
stats = df .describe ().to_dict ()
@@ -427,14 +432,21 @@ def profile(
427
432
stat ["completeness" ] = 1
428
433
429
434
# set data type
430
- if isinstance (df .dtypes [col ], type (np .dtype (np .float64 ))):
435
+ arrow_type = arrow_schema .field (col ).type
436
+ if (
437
+ pa .types .is_list (arrow_type )
438
+ or pa .types .is_large_list (arrow_type )
439
+ or pa .types .is_struct (arrow_type )
440
+ or PYARROW_HOPSWORKS_DTYPE_MAPPING [arrow_type ]
441
+ in ["timestamp" , "date" , "binary" , "string" ]
442
+ ):
443
+ stat ["dataType" ] = "String"
444
+ elif PYARROW_HOPSWORKS_DTYPE_MAPPING [arrow_type ] in ["float" , "double" ]:
431
445
stat ["dataType" ] = "Fractional"
432
- elif isinstance ( df . dtypes [ col ], type ( np . dtype ( np . int64 ))) :
446
+ elif PYARROW_HOPSWORKS_DTYPE_MAPPING [ arrow_type ] in [ "int" , "bigint" ] :
433
447
stat ["dataType" ] = "Integral"
434
- elif isinstance ( df . dtypes [ col ], type ( np . dtype ( np . bool_ ))) :
448
+ elif PYARROW_HOPSWORKS_DTYPE_MAPPING [ arrow_type ] == "boolean" :
435
449
stat ["dataType" ] = "Boolean"
436
- elif isinstance (df .dtypes [col ], type (np .dtype (object ))):
437
- stat ["dataType" ] = "String"
438
450
else :
439
451
print (
440
452
"Data type could not be inferred for column '"
0 commit comments