I want to be able to add new JSON implementations to `fastparquet.json`. Currently I am monkey-patching the implementation to add extra options to the JSON codec and to use pandas' `ujson_dumps`:
```python
from io import BytesIO
import logging

import pandas as pd

from fastparquet import json as fp_json

logger = logging.getLogger()


class MyJsonImpl(fp_json.BaseImpl):
    def __init__(self):
        from pandas._libs.json import ujson_dumps
        # For some reason the following lines cause errors, so I have to
        # re-import ujson_dumps inside dumps() instead:
        # from pandas._libs.json import ujson_dumps
        # self.dumps = ujson_dumps

    def dumps(self, data):
        from pandas._libs.json import ujson_dumps
        return ujson_dumps(data, default_handler=str).encode("utf-8")

    def loads(self, s):
        return self.api.loads(s)
```
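The key detail is `default_handler=str`: anything ujson cannot serialize natively gets stringified instead of raising. A quick illustration (my example, using the same private pandas helper):

```python
# default_handler=str turns objects ujson can't serialize (classes,
# lambdas, etc.) into their string repr instead of raising TypeError.
from pandas._libs.json import ujson_dumps

print(ujson_dumps({'f': lambda x: x + 10}, default_handler=str))
# something like: {"f":"<function <lambda> at 0x7f...>"}
```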
```python
def to_parquet(df):
    data: BytesIO = BytesIO()
    # data.close doesn't work in pyodide, so we make close a no-op
    orig_close = data.close
    data.close = lambda: None
    # I don't like this copy; ideally keep the same data with different names
    df2 = df.copy()
    df2['index'] = df2.index
    df2.columns = [str(x) for x in df2.columns]
    obj_columns = df2.select_dtypes([pd.CategoricalDtype(), 'object']).columns.to_list()
    encodings = {k: 'json' for k in obj_columns}
    orig_get_cached_codec = fp_json._get_cached_codec

    def fake_get_cached_codec():
        return MyJsonImpl()

    fp_json._get_cached_codec = fake_get_cached_codec
    try:
        df2.to_parquet(data, engine='fastparquet', object_encoding=encodings)
    except Exception as e:
        logger.error("error serializing to parquet %r", e)
        raise
    finally:
        data.close = orig_close
        fp_json._get_cached_codec = orig_get_cached_codec
    data.seek(0)
    return data.read()
```
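For what it's worth, the swap-and-restore dance could be packaged as a context manager (my restructuring, not part of fastparquet), which keeps the monkey-patch scoped even when `to_parquet` raises:

```python
from contextlib import contextmanager

from fastparquet import json as fp_json


@contextmanager
def override_json_codec(impl):
    # Swap fastparquet's cached-codec lookup for the duration of the block,
    # restoring the original even on error.
    orig = fp_json._get_cached_codec
    fp_json._get_cached_codec = lambda: impl
    try:
        yield
    finally:
        fp_json._get_cached_codec = orig
```

With that, the try/finally above collapses to `with override_json_codec(MyJsonImpl()): df2.to_parquet(...)`.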
This would be cleaner if I could call something like `fastparquet.json.register_impl(MyJsonImpl)`.
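Roughly, I imagine something like the following inside `fastparquet.json` (a sketch of the proposed hook, not an existing API; `register_impl` and `_registered_impl` are names I'm making up, layered over the existing `_get_cached_codec`):

```python
# Sketch only: register_impl does not exist in fastparquet today.
_registered_impl = None


def register_impl(impl_class):
    """Install a user-supplied BaseImpl subclass as the JSON codec."""
    global _registered_impl
    _registered_impl = impl_class()


def _get_cached_codec():
    if _registered_impl is not None:
        return _registered_impl
    ...  # fall through to the current implementation-selection logic
```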
Here is my test case for my `to_parquet` function:
```python
from datetime import date

import pandas as pd


def test_serialize_naive_json():
    d = date(year=1999, month=10, day=3)
    d2 = date(year=1999, month=10, day=3)
    df = pd.DataFrame({'a': [pd.DataFrame, Exception, lambda x: x + 10],
                       'b': [d, d2, None]})
    # just make sure we don't throw an error
    output = to_parquet(df)
    # and make sure output isn't empty; I don't want to hardcode a
    # response here
    assert len(output) > 20
```
For my use case, I really need parquet encoding to always work and produce something. I use fastparquet for my Buckaroo table UI, which can be handed any type of DataFrame from the wild. Fastparquet produces a parquet buffer that is sent to the browser to read. A column that is just strings, or empty, is greatly preferable to an exception or a broken parquet file.
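For completeness, a round-trip read on the Python side (my assumption of a sanity check; Buckaroo itself reads the buffer in the browser, not via pandas) confirms the buffer is valid parquet:

```python
from io import BytesIO

import pandas as pd


def check_round_trip(buf: bytes) -> pd.DataFrame:
    # raises if the buffer is not valid parquet
    return pd.read_parquet(BytesIO(buf), engine='fastparquet')


df_back = check_round_trip(to_parquet(df))
assert len(df_back) == len(df)
```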