-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Structured data export #48: DocumentModel, FrictionLess Data
- Loading branch information
1 parent
88a60a9
commit dc70960
Showing
9 changed files
with
222 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
from .core import Exporter | ||
from .exporters import Exporter | ||
from .frictionless_data import FrictionlessExporter | ||
from .xlsx import XLSXExporter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,94 @@ | ||
import pandas as pd | ||
import json | ||
import os | ||
import subprocess | ||
import tempfile | ||
from datetime import datetime | ||
|
||
from moonsheep.exporters.core import PandasExporter | ||
from moonsheep.exporters.exporters import PandasExporter | ||
|
||
|
||
class FrictionlessExporter(PandasExporter):
    """
    Frictionless Data exporter

    Frictionless Data (https://frictionlessdata.io/) is basically a folder containing csv data files
    along with some metadata about them. Such folder can be packed in one file (.zip, .tar.gz, etc.)
    """

    # Output suffixes that make `export` produce a single archive file
    _ARCHIVE_SUFFIXES = ('.zip', '.tar.gz')

    # pandas dtype name -> Table Schema field type
    # (http://frictionlessdata.io/specs/table-schema/)
    _FIELD_TYPES = {
        'int64': 'integer',
        'float64': 'number',
        'bool': 'boolean',
        # TODO strings are currently expressed by pandas as `object`
        'object': 'object',
    }

    @staticmethod
    def type_from_pandas(type):
        """
        Translate a pandas dtype into a Table Schema field type.

        :param type: dtype (or dtype name) of a data frame column. The name shadows the builtin,
            but it is kept so that keyword callers keep working.
        :return: type name as defined by http://frictionlessdata.io/specs/table-schema/
            Unmapped dtypes are reported on stdout and fall back to 'object'.
        """
        mapped = FrictionlessExporter._FIELD_TYPES.get(str(type))
        if mapped is None:
            print(f"Warning: Type not mapped: {type}")
            return 'object'

        return mapped

    def export(self, output, **options):
        """
        Write all data frames as a Frictionless Data package.

        :param output: a path. If output ends with .tar.gz or .zip then archive file will be created
            (an already existing archive is overwritten).
            Otherwise output is treated as directory name and no compression will be performed.
        :param options: currently unused
        :return: None
        """
        if not output.endswith(self._ARCHIVE_SUFFIXES):
            os.makedirs(output, exist_ok=True)
            self._write_package(output)
            return

        # Build the package in a scratch directory which is removed even when
        # writing or packing fails, then pack its content into a single file
        with tempfile.TemporaryDirectory() as staging_dir:
            self._write_package(staging_dir)
            self._pack(staging_dir, output)

    def _write_package(self, output_dir):
        """Write one csv file per data frame plus datapackage.json into output_dir."""
        created_at = datetime.now().isoformat()
        datapackage = {
            # NOTE(review): the Data Package spec seems to allow only lower-case
            # alphanumerics and ".", "_", "-" in "name", while the ISO timestamp
            # contains "T" and ":" - confirm before changing the format
            "name": self.app_label + "-" + created_at,
            "version": "1.0.0-rc.2",
            "created": created_at,
            "profile": "tabular-data-package",
            "resources": []
        }

        for slug, data_frame in self.data_frames():
            fname = slug + '.csv'

            data_frame.to_csv(os.path.join(output_dir, fname), index=False)

            datapackage['resources'].append({
                # path is relative to the location of datapackage.json
                "path": fname,
                "profile": "tabular-data-resource",
                "schema": {
                    "fields": [{
                        "name": fld,
                        "type": FrictionlessExporter.type_from_pandas(ftype)
                        # TODO while creating dataframe ask model for specific field type
                        #  (now we have string expressed as object)
                        # TODO description from model
                    } for fld, ftype in data_frame.dtypes.items()]
                    # TODO ask model and add "primaryKey": "id"
                    # TODO is defining relations between objects possible here?
                }
            })

        # write datapackage.json
        with open(os.path.join(output_dir, 'datapackage.json'), 'w') as f:
            json.dump(datapackage, f, indent=2)

    @staticmethod
    def _pack(source_dir, archive_path):
        """Pack every entry of source_dir into archive_path (.zip, otherwise .tar.gz) at the archive root."""
        # Imported locally so that this helper does not rely on module-level imports.
        # Using the standard library instead of shelling out to zip/tar/find avoids
        # quoting problems with paths and the dependency on external GNU tools.
        import tarfile
        import zipfile

        entry_names = sorted(os.listdir(source_dir))

        if archive_path.endswith('.zip'):
            with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive:
                for entry in entry_names:
                    archive.write(os.path.join(source_dir, entry), arcname=entry)
        else:
            with tarfile.open(archive_path, 'w:gz') as archive:
                for entry in entry_names:
                    archive.add(os.path.join(source_dir, entry), arcname=entry)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters