Better way to add options or implementations to the json encoders #955

@paddymul

Description

I want to be able to add new JSON implementations to fastparquet.json. I am currently monkeypatching the implementation to add extra options to the JSON encoding and to use pandas' ujson_dumps.

from fastparquet import json as fp_json
from io import BytesIO
import logging
import pandas as pd

logger = logging.getLogger()


class MyJsonImpl(fp_json.BaseImpl):
    def __init__(self):
        from pandas._libs.json import ujson_dumps

        # for some reason the following lines cause errors, so I have to
        # re-import ujson_dumps inside dumps instead
        # from pandas._libs.json import ujson_dumps
        # self.dumps = ujson_dumps

    def dumps(self, data):
        from pandas._libs.json import ujson_dumps
        return ujson_dumps(data, default_handler=str).encode("utf-8")

    def loads(self, s):
        return self.api.loads(s)


def to_parquet(df):
    data: BytesIO = BytesIO()

    # data.close doesn't work in pyodide, so we make close a no-op
    orig_close = data.close
    data.close = lambda: None
    # I don't like this copy; ideally the same data would be kept and only renamed
    df2 = df.copy()
    df2['index'] = df2.index
    df2.columns = [str(x) for x in df2.columns]
    obj_columns = df2.select_dtypes([pd.CategoricalDtype(), 'object']).columns.to_list()
    encodings = {k:'json' for k in obj_columns}

    orig_get_cached_codec = fp_json._get_cached_codec
    def fake_get_cached_codec():
        return MyJsonImpl()

    fp_json._get_cached_codec = fake_get_cached_codec
    try:
        df2.to_parquet(data, engine='fastparquet', object_encoding=encodings)
    except Exception as e:
        logger.error("error serializing to parquet %r", e)
        raise
    finally:
        data.close = orig_close
        fp_json._get_cached_codec = orig_get_cached_codec


    data.seek(0)
    return data.read()

This would be cleaner if I could call something like
fp_json.register_impl(MyJsonImpl)
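A minimal sketch of what such a registration hook could look like. This is a hypothetical API, not something fastparquet exposes today: `register_impl`, `_impl_override`, and the `_get_cached_codec` shown here are assumptions for illustration, and a stdlib-`json` stand-in replaces fastparquet's `BaseImpl` so the snippet runs on its own.

```python
import json


class BaseImpl:
    """Stand-in for fastparquet.json.BaseImpl, backed by the stdlib json module."""

    def dumps(self, data):
        return json.dumps(data).encode("utf-8")

    def loads(self, s):
        return json.loads(s)


_impl_override = None  # module-level slot for a user-registered codec class


def register_impl(impl_class):
    """Hypothetical hook: let callers swap in their own JSON implementation."""
    global _impl_override
    _impl_override = impl_class


def _get_cached_codec():
    # Prefer the user-registered implementation, fall back to the default.
    return (_impl_override or BaseImpl)()


class MyJsonImpl(BaseImpl):
    def dumps(self, data):
        # default=str keeps serialization from failing on arbitrary objects
        return json.dumps(data, default=str).encode("utf-8")


register_impl(MyJsonImpl)
codec = _get_cached_codec()
payload = codec.dumps({"a": object()})  # encodes without raising
```

With a hook like this, the `to_parquet` wrapper above would no longer need to swap `fp_json._get_cached_codec` in a try/finally block.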

Here is my test case for the to_parquet function:

from datetime import date


def test_serialize_naive_json():
    d = date(year=1999, month=10, day=3)
    d2 = date(year=1999, month=10, day=3)
    df = pd.DataFrame({'a': [pd.DataFrame, Exception, lambda x: x+10],
                       'b': [d, d2, None]})

    # just make sure we don't throw an error
    output = to_parquet(df)
    # and make sure the output isn't empty; I don't want to hardcode a
    # response here
    assert len(output) > 20

For my use case, I really need parquet encoding to always work and produce something. I use fastparquet for my Buckaroo table UI, and any kind of DataFrame from the wild can be passed into it. Fastparquet produces a parquet buffer that is sent to the browser to read. A column that is just strings, or empty, is greatly preferable to an exception or a broken parquet file.
