Structured data export #48: DocumentModel, FrictionLess Data

themoonsheep · Nov 19, 2019 · dc70960 · dc70960
1 parent 88a60a9
commit dc70960
Show file tree

Hide file tree

Showing 9 changed files with 222 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -174,4 +174,103 @@ After each transcription it will show "Thank you! Are you ready for a next one?
  ```python
     url(r'^$', TemplateView.as_view(template_name='homepage.html'), name='finish-transcription'),
     url(r'^$', TemplateView.as_view(template_name='homepage.html'), name='home'),
-```
+```
+
+## Exporting data
+
+Moonsheep supports several ways to export structured domain data. These include:
+- JSON:API compliant API
+- XLSX
+- [Frictionless Data](https://frictionlessdata.io/) (packed CSVs)  
+
+Export options are available in the Moonsheep admin on the campaign page and also via the command line on the server.
+
+### Configuration
+
+Structured data export is available out of the box, but can be further refined.
+
+#### Exported queryset
+
+It is possible to define what objects should be returned for each model 
+by defining a custom queryset method `exported()`. 
+
+`DocumentModel` base class uses that mechanism to return by default only those objects
+that have been fully transcribed. You can define it on any of your domain models
+and it will be picked up by the export mechanism.
+
+```python
+class DocumentQuerySet(models.QuerySet):
+    def exported(self) -> models.QuerySet:
+        return self.filter(progress=100)
+
+class DocumentModel(models.Model):
+    objects = DocumentQuerySet.as_manager()
+```  
+
+#### Excluded fields
+
+You might have some metadata/control fields that you don't want to be exported
+in structured data. To control this, add an inner class `Exported` to the model
+and define in it either the full list of fields to export or the fields to exclude.
+
+```python
+class MyModel(models.Model):
+    class Exported:
+        # fields = ['first', 'second'] # to list all fields which should appear
+        exclude = ['progress'] # or exclude a few
+        # if Exported is not specified then by default all fields are exported
+```
+
+#### DocumentModel
+
+`moonsheep.models.DocumentModel` should be used as the default base class
+for a model that represents a document being transcribed. It offers the following features:
+- defines the `url` and `progress` model fields required by Moonsheep
+- excludes the `progress` field from being exported
+- limits exported objects to those fully transcribed (`progress == 100`)
+
+### Exporters
+
+#### API
+
+Implementing an API for your domain models is as simple as adding one URL line:
+```python
+from moonsheep.exporters.api import AppApi
+
+urlpatterns = [
+    path('api/opora/', AppApi('opora').urls, name='api-opora'),
+]
+```
+
+`AppApi` scans for all domain models defined in your app (passed in the param)
+and generates a URL for each using Django REST framework.
+REST framework gives you a nice HTML interface for playing with requests, with
+discovery features (a listing of all endpoints). This is the result at `/api/opora`:
+
+![API Home Screen](docs/images/api-generated.png)
+
+#### XLSX
+
+Exports data, placing each model in a separate sheet of an `xlsx` file.
+
+Can be called from the command line:
+```bash
+python manage.py moonsheep_export [app_label] xlsx -o opora.xlsx
+```
+
+#### Frictionless Data (packed CSV)
+
+Exports data, placing each model in a separate `csv` file
+and packing all of them into a zip file according to the Frictionless Data specification.
+
+Can be called from the command line:
+```bash
+python manage.py moonsheep_export [app_label] frictionless -o opora.zip
+```
+
+#### Guidelines on how to write your own exporter
+
+Exporters should extend the `moonsheep.exporters.Exporter` abstract class and implement
+the `def export(self, output: Union[io.IOBase, str], **options)` method.
+
+`PandasExporter` can be used as a base class, as `pandas` already supports [several output types](http://pandas-docs.github.io/pandas-docs-travis/reference/frame.html#serialization-io-conversion).
diff --git a/docs/images/api-generated.png b/docs/images/api-generated.png
diff --git a/moonsheep/exporters/__init__.py b/moonsheep/exporters/__init__.py
@@ -1,3 +1,3 @@
-from .core import Exporter
+from .exporters import Exporter
 from .frictionless_data import FrictionlessExporter
 from .xlsx import XLSXExporter
diff --git a/moonsheep/exporters/api.py b/moonsheep/exporters/api.py
@@ -3,7 +3,7 @@
 from rest_framework import routers, serializers, viewsets
 from django.apps import apps
 
-from moonsheep.exporters.core import Exporter
+from moonsheep.exporters.exporters import Exporter
 
 
 class AppApi(Exporter):

diff --git a/moonsheep/exporters/core.py → moonsheep/exporters/exporters.py b/moonsheep/exporters/core.py → moonsheep/exporters/exporters.py
@@ -38,9 +38,18 @@ def models(self):
         :return:
         """
         for slug, model_cls in apps.get_app_config(self.app_label).models.items():
+            # Customize exported fields by adding `class Exported` on the model
+            exported = getattr(model_cls, 'Exported', None)
+            exported_fields = getattr(exported, 'fields', None)
+            exported_exclude = getattr(exported, 'exclude', None)
+            if exported_fields is None and exported_exclude is None:
+                # default to having all fields exported
+                exported_fields = '__all__'
+
             class Meta:
                 model = model_cls
-                fields = '__all__'  # TODO by default drop some fields such as progress on Document
+                fields = exported_fields
+                exclude = exported_exclude
 
             serializer_cls = type(model_cls.__name__ + "SeralizerDefault", (serializers.ModelSerializer,), dict(
                 Meta=Meta
@@ -67,4 +76,4 @@ def data_frames(self):
             serializer = serializer_cls(queryset, many=True)
             data = serializer.data
 
-            yield slug, pd.DataFrame(data)
+            yield slug, pd.DataFrame(data) # TODO return as object so it would be easier to extend?
diff --git a/moonsheep/exporters/frictionless_data.py b/moonsheep/exporters/frictionless_data.py
@@ -1,15 +1,94 @@
-import pandas as pd
+import json
+import os
+import subprocess
+import tempfile
+from datetime import datetime
 
-from moonsheep.exporters.core import PandasExporter
+from moonsheep.exporters.exporters import PandasExporter
 
 
 class FrictionlessExporter(PandasExporter):
     """
     Frictionless Data exporter
 
-    Frictionless Data (https://frictionlessdata.io/) is basically a zip file containing csv data files
-    along with some metadata about them.
+    Frictionless Data (https://frictionlessdata.io/) is basically a folder containing csv data files
+    along with some metadata about them. Such folder can be packed in one file (.zip, .tar.gz, etc.)
     """
+
+    @staticmethod
+    def type_from_pandas(type):
+        """
+        :param type:
+        :return: http://frictionlessdata.io/specs/table-schema/
+        """
+        if type == 'int64':
+            return 'integer'
+        if type == 'object':
+            return 'object'
+        if type == 'bool':
+            return 'boolean'
+
+        print(f"Warning: Type not mapped: {type}")
+        return 'object'
+
     def export(self, output, **options):
+        """
+        :param output: a path. If output ends with .tar.gz or .zip then archive file will be created.
+            Otherwise output is treated as directory name and no compression will be performed.
+        :param options:
+        :return:
+        """
+        created_at = datetime.now().isoformat()
+        datapackage = {
+            "name": self.app_label + "-" + created_at,
+            "version": "1.0.0-rc.2",
+            "created": created_at,
+            "profile": "tabular-data-package",
+            "resources": []
+        }
+
+        compression_cmd = None
+        if output.endswith('.zip'):
+            output_dir = tempfile.mkdtemp()
+            output_absolute = os.path.join(os.getcwd(), output)
+            print(output_absolute)
+            compression_cmd = f'(cd {output_dir} && zip {output_absolute} *)'
+
+        elif output.endswith('.tar.gz'):
+            output_dir = tempfile.mkdtemp()
+            compression_cmd = f'find {output_dir} -printf "%P\n" | tar -czf {output} --no-recursion -C {output_dir} -T -'
+
+        else:
+            output_dir = output
+            os.makedirs(output_dir, exist_ok=True)
+
         for slug, data_frame in self.data_frames():
-            data_frame.to_csv(output + "_" + slug + ".csv", index=False)
+            fname = slug + '.csv'
+
+            data_frame.to_csv(os.path.join(output_dir, fname), index=False)
+
+            datapackage['resources'].append({
+                "path": fname,
+                "profile": "tabular-data-resource",
+                "schema": {
+                    "fields": [{
+                        "name": fld,
+                        "type": FrictionlessExporter.type_from_pandas(ftype)
+                        # TODO while creating dataframe ask model for specific field type
+                        #  (now we have string expressed as object)
+                        # TODO description from model
+                    } for fld, ftype in data_frame.dtypes.items()]
+                    # TODO ask model and add "primaryKey": "id"
+                    # TODO is defining relations between objects possible here?
+                }
+            })
+
+        # write datapackage.json
+        with open(os.path.join(output_dir, 'datapackage.json'), 'w') as f:
+            f.write(json.dumps(datapackage, indent=2))
+
+        if compression_cmd is not None:
+            try:
+                subprocess.run(compression_cmd, shell=True, check=True)
+            finally:
+                subprocess.run(["rm", "-rf", output_dir], check=True)
diff --git a/moonsheep/exporters/xlsx.py b/moonsheep/exporters/xlsx.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from moonsheep.exporters.core import PandasExporter
+from moonsheep.exporters.exporters import PandasExporter
 
 
 class XLSXExporter(PandasExporter):

diff --git a/moonsheep/management/commands/moonsheep_export.py b/moonsheep/management/commands/moonsheep_export.py
@@ -15,11 +15,9 @@ def add_arguments(self, parser):
 
     def handle(self, *args, **options):
         app_label = options['app_label']
+
         # TODO default label
-        # TODO test custom manager
-        # TODO excluded fields
-        # TODO document
-        # TODO finish frictionless
+        # TODO discovery and export in the admin
 
         fmt = options['format']
         output = options.get('output', None)

diff --git a/moonsheep/models.py b/moonsheep/models.py
@@ -175,3 +175,25 @@ class Meta:
             models.UniqueConstraint(fields=['task', 'user', 'closed_manually'], name='unique_task_user')
         ]
         verbose_name_plural = "entries"
+
+
+class DocumentQuerySet(models.QuerySet):
+    def exported(self) -> models.QuerySet:
+        return self.filter(progress=100)
+
+
+class DocumentModel(models.Model):
+    """
+    Base fields to be included in project's document model
+    """
+    class Meta:
+        abstract = True
+
+    url = models.URLField(verbose_name=_("URL"), unique=True, max_length=2048)
+    progress = models.DecimalField(decimal_places=3, max_digits=6, default=0,
+                                   validators=[validators.MaxValueValidator(100), validators.MinValueValidator(0)])
+
+    objects = DocumentQuerySet.as_manager()
+
+    class Exported:
+        exclude = ['progress']