diff --git a/regparser/web/jobs/migrations/0003_auto_20160810_0027.py b/regparser/web/jobs/migrations/0003_auto_20160810_0027.py new file mode 100644 index 00000000..6d44d0a3 --- /dev/null +++ b/regparser/web/jobs/migrations/0003_auto_20160810_0027.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-10 00:27 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('jobs', '0002_auto_20160728_2331'), + ] + + operations = [ + migrations.CreateModel( + name='PipelineJob', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', models.DateTimeField(auto_now_add=True)), + ('clear_cache', models.BooleanField(default=False)), + ('destination', models.URLField(default=b'http://localhost:8888/api', max_length=2000)), + ('notification_email', models.EmailField(blank=b'True', max_length=254)), + ('job_id', models.UUIDField(default=None, null=True)), + ('use_uploaded_metadata', models.UUIDField(default=None, null=True)), + ('use_uploaded_regulation', models.UUIDField(default=None, null=True)), + ('parser_errors', models.TextField(blank=True)), + ('regulation_url', models.URLField(blank=True, max_length=2000)), + ('status', models.CharField(choices=[(b'received', b'received'), (b'in_progress', b'in_progress'), (b'failed', b'failed'), (b'complete', b'complete'), (b'complete_with_errors', b'complete_with_errors')], default=b'received', max_length=32)), + ('url', models.URLField(blank=True, max_length=2000)), + ('cfr_title', models.IntegerField()), + ('cfr_part', models.IntegerField()), + ('only_latest', models.BooleanField(default=False)), + ], + options={ + 'abstract': False, + }, + ), + migrations.DeleteModel( + name='ParsingJob', + ), + ] diff --git a/regparser/web/jobs/migrations/0004_uploadedfile.py b/regparser/web/jobs/migrations/0004_uploadedfile.py new file mode 100644 index 
00000000..53541811 --- /dev/null +++ b/regparser/web/jobs/migrations/0004_uploadedfile.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-11 00:19 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('jobs', '0003_auto_20160810_0027'), + ] + + operations = [ + migrations.CreateModel( + name='UploadedFile', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('file', models.FileField(upload_to=b'')), + ], + ), + ] diff --git a/regparser/web/jobs/migrations/0005_auto_20160811_2132.py b/regparser/web/jobs/migrations/0005_auto_20160811_2132.py new file mode 100644 index 00000000..26e73ea0 --- /dev/null +++ b/regparser/web/jobs/migrations/0005_auto_20160811_2132.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-11 21:32 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('jobs', '0004_uploadedfile'), + ] + + operations = [ + migrations.CreateModel( + name='RegulationFile', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('hexhash', models.CharField(default=None, max_length=32, null=True)), + ('filename', models.CharField(default=None, max_length=512, null=True)), + ('contents', models.BinaryField()), + ], + ), + migrations.DeleteModel( + name='UploadedFile', + ), + ] diff --git a/regparser/web/jobs/migrations/0006_regulationfile_file.py b/regparser/web/jobs/migrations/0006_regulationfile_file.py new file mode 100644 index 00000000..5a0ced34 --- /dev/null +++ b/regparser/web/jobs/migrations/0006_regulationfile_file.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-11 22:30 +from __future__ import unicode_literals + +from django.db import 
migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('jobs', '0005_auto_20160811_2132'), + ] + + operations = [ + migrations.AddField( + model_name='regulationfile', + name='file', + field=models.FileField(null=True, upload_to=b''), + ), + ] diff --git a/regparser/web/jobs/migrations/0007_regulationfile_url.py b/regparser/web/jobs/migrations/0007_regulationfile_url.py new file mode 100644 index 00000000..e831dc69 --- /dev/null +++ b/regparser/web/jobs/migrations/0007_regulationfile_url.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-19 04:28 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('jobs', '0006_regulationfile_file'), + ] + + operations = [ + migrations.AddField( + model_name='regulationfile', + name='url', + field=models.URLField(blank=True, max_length=2000), + ), + ] diff --git a/regparser/web/jobs/migrations/0008_proposalpipelinejob.py b/regparser/web/jobs/migrations/0008_proposalpipelinejob.py new file mode 100644 index 00000000..41930470 --- /dev/null +++ b/regparser/web/jobs/migrations/0008_proposalpipelinejob.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-19 04:42 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('jobs', '0007_regulationfile_url'), + ] + + operations = [ + migrations.CreateModel( + name='ProposalPipelineJob', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created', models.DateTimeField(auto_now_add=True)), + ('clear_cache', models.BooleanField(default=False)), + ('destination', models.URLField(default=b'http://localhost:8888/api', max_length=2000)), + ('notification_email', models.EmailField(blank=b'True', max_length=254)), + ('job_id', 
models.UUIDField(default=None, null=True)), + ('use_uploaded_metadata', models.UUIDField(default=None, null=True)), + ('use_uploaded_regulation', models.UUIDField(default=None, null=True)), + ('parser_errors', models.TextField(blank=True)), + ('regulation_url', models.URLField(blank=True, max_length=2000)), + ('status', models.CharField(choices=[(b'received', b'received'), (b'in_progress', b'in_progress'), (b'failed', b'failed'), (b'complete', b'complete'), (b'complete_with_errors', b'complete_with_errors')], default=b'received', max_length=32)), + ('url', models.URLField(blank=True, max_length=2000)), + ('file_hexhash', models.CharField(max_length=32)), + ('only_latest', models.BooleanField(default=True)), + ], + options={ + 'abstract': False, + }, + ), + ] diff --git a/regparser/web/jobs/migrations/0009_auto_20160824_2347.py b/regparser/web/jobs/migrations/0009_auto_20160824_2347.py new file mode 100644 index 00000000..2d1b2159 --- /dev/null +++ b/regparser/web/jobs/migrations/0009_auto_20160824_2347.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.9.7 on 2016-08-24 23:47 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('jobs', '0008_proposalpipelinejob'), + ] + + operations = [ + migrations.AlterField( + model_name='pipelinejob', + name='destination', + field=models.URLField(max_length=2000), + ), + migrations.AlterField( + model_name='proposalpipelinejob', + name='destination', + field=models.URLField(max_length=2000), + ), + ] diff --git a/regparser/web/jobs/models.py b/regparser/web/jobs/models.py index 0dcf0871..fcef4f57 100755 --- a/regparser/web/jobs/models.py +++ b/regparser/web/jobs/models.py @@ -1,27 +1,52 @@ from django.db import models +job_status_pairs = ( + ("complete", "complete"), + ("complete_with_errors", "complete_with_errors"), + ("failed", "failed"), + ("in_progress", "in_progress"), + ("received", "received") +) 
+job_status_values = tuple(j[0] for j in job_status_pairs) + class ParsingJob(models.Model): + class Meta: + abstract = True + created = models.DateTimeField(auto_now_add=True) - cfr_title = models.IntegerField() - cfr_part = models.IntegerField() clear_cache = models.BooleanField(default=False) - destination = models.URLField(default="http://fake-reg-site.gov/api", - max_length=2000) + destination = models.URLField(max_length=2000) notification_email = models.EmailField(blank="True", max_length=254) - only_latest = models.BooleanField(default=False) job_id = models.UUIDField(default=None, null=True) use_uploaded_metadata = models.UUIDField(default=None, null=True) use_uploaded_regulation = models.UUIDField(default=None, null=True) parser_errors = models.TextField(blank=True) regulation_url = models.URLField(blank=True, max_length=2000) - status = models.CharField(max_length=32, choices=( - ("received", "received"), - ("in_progress", "in_progress"), - ("failed", "failed"), - ("complete", "complete"), - ("complete_with_errors", "complete_with_errors") - ), default="received") + status = models.CharField(max_length=32, choices=job_status_pairs, + default="received") + url = models.URLField(blank=True, max_length=2000) + + +class PipelineJob(ParsingJob): + + cfr_title = models.IntegerField() + cfr_part = models.IntegerField() + only_latest = models.BooleanField(default=False) + + +class ProposalPipelineJob(ParsingJob): + + file_hexhash = models.CharField(max_length=32) + only_latest = models.BooleanField(default=True) + + +class RegulationFile(models.Model): + + contents = models.BinaryField() + file = models.FileField(null=True) + filename = models.CharField(default=None, max_length=512, null=True) + hexhash = models.CharField(default=None, max_length=32, null=True) url = models.URLField(blank=True, max_length=2000) diff --git a/regparser/web/jobs/serializers.py b/regparser/web/jobs/serializers.py index 7501c02b..f25c20ca 100755 --- 
a/regparser/web/jobs/serializers.py +++ b/regparser/web/jobs/serializers.py @@ -1,13 +1,17 @@ -from regparser.web.jobs.models import ParsingJob +from regparser.web.jobs.models import ( + ParsingJob, + PipelineJob, + ProposalPipelineJob, + RegulationFile +) from rest_framework import serializers class ParsingJobSerializer(serializers.ModelSerializer): + class Meta: model = ParsingJob fields = ( - "cfr_title", - "cfr_part", "clear_cache", "destination", # Unsure about whether this should accept user # input or be set by the system. @@ -33,3 +37,47 @@ class Meta: def save(self, **kwargs): super(ParsingJobSerializer, self).save(**kwargs) + + +class PipelineJobSerializer(ParsingJobSerializer): + + class Meta(ParsingJobSerializer.Meta): + model = PipelineJob + fields = ParsingJobSerializer.Meta.fields + ( + "cfr_title", + "cfr_part" + ) + + +class ProposalPipelineJobSerializer(ParsingJobSerializer): + + class Meta(ParsingJobSerializer.Meta): + model = ProposalPipelineJob + fields = ParsingJobSerializer.Meta.fields + ( + "file_hexhash", + ) + + # Fields we don't want user input for are listed below. + file_hexhash = serializers.CharField(max_length=32) + + +class FileUploadSerializer(serializers.ModelSerializer): + + class Meta: + model = RegulationFile + fields = ( + "contents", + "file", + "filename", + "hexhash", + "url" + ) + + contents = serializers.SerializerMethodField() + file = serializers.FileField() + filename = serializers.CharField(read_only=True) + hexhash = serializers.CharField(read_only=True) + url = serializers.URLField(read_only=True) + + def get_contents(self, obj): + return "File contents not shown." 
diff --git a/regparser/web/jobs/urls.py b/regparser/web/jobs/urls.py index 1d06ff09..bde2dab5 100755 --- a/regparser/web/jobs/urls.py +++ b/regparser/web/jobs/urls.py @@ -3,8 +3,19 @@ from rest_framework.urlpatterns import format_suffix_patterns urlpatterns = [ - url(r'^job/$', views.JobViewList.as_view()), - url(r'^job/(?P<job_id>[-a-z0-9]+)/$', + url(r'^job(/)$', views.JobViewList.as_view()), + url(r'^job/pipeline(/)$', views.PipelineJobViewList.as_view()), + url(r'^job/pipeline(/)(?P<job_id>[-a-z0-9]+)(/)$', + views.PipelineJobViewInstance.as_view()), + url(r'^job/proposal-pipeline(/)$', + views.ProposalPipelineJobViewList.as_view()), + url(r'^job/proposal-pipeline(/)(?P<job_id>[-a-z0-9]+)(/)$', + views.ProposalPipelineJobViewInstance.as_view()), + url(r'^job/upload/(?P<hexhash>[a-z0-9]{32})(/)$', + views.FileUploadViewInstance.as_view()), + url(r'^job/upload(/)$', + views.FileUploadView.as_view()), + url(r'^job/(?P<job_id>[-a-z0-9]+)(/)$', views.JobViewInstance.as_view()) ] diff --git a/regparser/web/jobs/utils.py b/regparser/web/jobs/utils.py index 2a07c9a5..0955f4b5 100644 --- a/regparser/web/jobs/utils.py +++ b/regparser/web/jobs/utils.py @@ -1,7 +1,11 @@ from regparser.tasks import run_eregs_command +from django.conf import settings as django_settings +from django.core.mail import get_connection, send_mail import django_rq import settings +eregs_site_api_url = getattr(settings, "EREGS_SITE_API_URL") + def queue_eregs_job(args, timeout=60*30, result_ttl=-1): """ @@ -24,6 +28,18 @@ def queue_eregs_job(args, timeout=60*30, result_ttl=-1): result_ttl=result_ttl) +def send_notification_email(email_address, status_url): + backend = django_settings.EMAIL_BACKEND + connection = get_connection(backend=backend) + send_mail(status_url, "Job finished at %s" % status_url, + "notifications@18F.gov", [email_address], connection=connection) + + +def queue_notification_email(job, status_url, email_address): + return django_rq.enqueue(send_notification_email, email_address, + status_url, depends_on=job) + + +def 
delete_eregs_job(job_id): """ Removes a job from the Redis queue. @@ -67,9 +83,9 @@ def add_redis_data_to_job_data(job_data): return job_data -def status_url(job_id): +def get_host(): """ - We want to give users a URL for checking the status of a job. + We want to provide users with status URLs, and so need host:port. While I can't think of an exploit resulting from relying on the host data from the request if the request were spoofed, we'll be cautious and define the canonical host data ourselves. @@ -79,13 +95,56 @@ Impure Pulls information from settings. - :arg uuid4 job_id: The UUID of the job. - :rtype: str - :returns: The URL for checking on the status of the job. + :returns: The URL for host in the form host:port, for example: + http://domain.tld:2323 + + Note that the scheme is not supplied (we assume it's included in the + string provided to settings) and no trailing slash is provided. + + We assume that the port from settings is the bare number, with no + trailing colon, so we add that here. """ hostname = getattr(settings, "CANONICAL_HOSTNAME", "") hostport = getattr(settings, "CANONICAL_PORT", "") - if hostport: + if hostport and hostport != "80": hostport = ":%s" % hostport - return "%s%s/rp/job/%s/" % (hostname, hostport, job_id) + elif hostport == "80": + hostport = "" + return "%s%s" % (hostname, hostport) + + +def status_url(job_id, sub_path=""): + """ + Returns a URL for checking the status of a job. + + Impure + Via get_host(), pulls information from settings. + + :arg uuid4 job_id: The UUID of the job. + :arg str sub_path: The part of the path indicating the type of job. Must + include a trailing slash. + + :rtype: str + :returns: The URL for checking on the status of the job. + """ + if sub_path and not sub_path.endswith("/"): + raise ValueError + host = get_host() + return "%s/rp/job/%s%s/" % (host, sub_path, job_id) + + +def file_url(file_hash): + """ + Returns a URL for retrieving an uploaded file. 
+ + Impure + Via get_host(), pulls information from settings. + + :arg str file_hash: The MD5 hexstring of the file contents. + + :rtype: str + :returns: The URL for checking on the status of the job. + """ + host = get_host() + return "%s/rp/job/upload/%s/" % (host, file_hash) diff --git a/regparser/web/jobs/views.py b/regparser/web/jobs/views.py index 09a3ec1e..8fb73704 100755 --- a/regparser/web/jobs/views.py +++ b/regparser/web/jobs/views.py @@ -1,53 +1,71 @@ -from regparser.web.jobs.models import ParsingJob -from regparser.web.jobs.serializers import ParsingJobSerializer +from django.http import HttpResponse +from regparser.web.jobs.models import ( + PipelineJob, + ProposalPipelineJob, + RegulationFile +) +from regparser.web.jobs.serializers import ( + FileUploadSerializer, + PipelineJobSerializer, + ProposalPipelineJobSerializer +) from regparser.web.jobs.utils import ( add_redis_data_to_job_data, delete_eregs_job, + eregs_site_api_url, + file_url, queue_eregs_job, + queue_notification_email, status_url ) +from rest_framework.parsers import FileUploadParser, MultiPartParser from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer from rest_framework.response import Response from rest_framework import generics from rest_framework import mixins from rest_framework import status +import abc +import hashlib + renderer_classes = ( JSONRenderer, BrowsableAPIRenderer ) -class JobViewList(mixins.ListModelMixin, - mixins.CreateModelMixin, - generics.GenericAPIView): - queryset = ParsingJob.objects.all() - renderer_classes = renderer_classes - serializer_class = ParsingJobSerializer +class BaseViewList(object): + """ + Intended to be subclassed by classes subclassing ``JobViewList``. + Contains the POST-related methods that are relevant to subclasses of + ``JobViewList`` but not to ``JobViewList``. - def filter_queryset(self, request, *args, **kwargs): - """ - Overridden in order to get data from the Redis queue as well as the DB. 
+ Should be in the subclass list before ``JobViewList``. + """ + __metaclass__ = abc.ABCMeta - Impure - Pulls information from the DB and the Redis queue. + @abc.abstractmethod + def build_eregs_args(self, validated_data): + """ + Each type of parser job has its own set of arguments. + The ``create`` method calls this method to construct the argument + string specific to that type of job. - :arg HttpRequest request: the incoming request. + :arg dict validated_data: Incoming data from the POST that's already + been validated by the serializer. - :rtype: list[ParsingJob] - :returns: List of ParsingJob objects. + :rtype: list[str] + :returns: The components of the argument string in list form. """ - queryset = super(JobViewList, self).filter_queryset(request, *args, - **kwargs) - queryset = add_redis_data_to_job_data(queryset) - return queryset + raise NotImplementedError() def create(self, request, *args, **kwargs): """ - Overridden in order to add the new job to the Redis queue. + Overrides the ``create`` method of ``mixins.CreateModelMixin`` in order + to add the new job to the Redis queue. Side effects - Via ``queue_eregs_job`` and ``ParsingJobSerializer.save``, alters + Via ``queue_eregs_job`` and ``PipelineJobSerializer.save``, alters the redis queue and the DB. :arg HttpRequest request: the incoming request. 
@@ -58,23 +76,24 @@ def create(self, request, *args, **kwargs): """ serialized = self.get_serializer(data=request.data) serialized.is_valid(raise_exception=True) - eregs_args = [ - "pipeline", - str(serialized.validated_data["cfr_title"]), - str(serialized.validated_data["cfr_part"]), - "./testing" - ] + + eregs_args = self.build_eregs_args(serialized.validated_data) job = queue_eregs_job(eregs_args, timeout=60*30, result_ttl=-1) # Paranoia--validate the values we provide: job_id = job.id for validator in serialized.get_fields()["job_id"].validators: validator(job_id) - statusurl = status_url(job_id) + statusurl = status_url(job_id, sub_path=self.sub_path) for validator in serialized.get_fields()["url"].validators: validator(statusurl) - serialized.save(job_id=job_id, url=status_url(job_id)) + if serialized.validated_data.get("notification_email"): + queue_notification_email( + job, statusurl, + serialized.validated_data["notification_email"]) + serialized.save(job_id=job_id, url=statusurl, + destination=eregs_site_api_url) headers = self.get_success_headers(serialized.data) """ Adding the Refresh header here so that the browser does the @@ -91,21 +110,48 @@ def create(self, request, *args, **kwargs): return Response(serialized.data, status=status.HTTP_201_CREATED, headers=headers) + +class JobViewList(mixins.ListModelMixin, + mixins.CreateModelMixin, + generics.GenericAPIView): + """ + Handles the list view for jobs of all types. + Should be subclassed along with ``BaseViewList`` for classes handling + specific job types. + """ + queryset = PipelineJob.objects.all() + renderer_classes = renderer_classes + serializer_class = PipelineJobSerializer + + def filter_queryset(self, request, *args, **kwargs): + """ + Overridden in order to get data from the Redis queue as well as the DB. + + Impure + Pulls information from the DB and the Redis queue. + + :arg HttpRequest request: the incoming request. + + :rtype: list[PipelineJob] + :returns: List of PipelineJob objects. 
+ """ + queryset = super(JobViewList, self).filter_queryset(request, *args, + **kwargs) + queryset = add_redis_data_to_job_data(queryset) + return queryset + def get(self, request, *args, **kwargs): return self.list(request, *args, **kwargs) - def post(self, request, *args, **kwargs): - return self.create(request, *args, **kwargs) - class JobViewInstance(mixins.RetrieveModelMixin, mixins.UpdateModelMixin, mixins.DestroyModelMixin, generics.GenericAPIView): - queryset = ParsingJob.objects.all() + queryset = PipelineJob.objects.all() renderer_classes = renderer_classes - serializer_class = ParsingJobSerializer lookup_field = "job_id" + serializer_class = PipelineJobSerializer def delete(self, request, *args, **kwargs): return self.destroy(request, *args, **kwargs) @@ -148,3 +194,172 @@ def retrieve(self, request, *args, **kwargs): instance = add_redis_data_to_job_data([instance])[0] serializer = self.get_serializer(instance) return Response(serializer.data) + + +class PipelineJobViewList(BaseViewList, JobViewList): + queryset = PipelineJob.objects.all() + serializer_class = PipelineJobSerializer + sub_path = "pipeline/" + + def build_eregs_args(self, validated_data): + """ + Overrides the method from ``BaseViewList`` in order to pass the + arguments appropriate for the ``pipeline`` command. + + Side Effects + Runs the ``pipeline`` command. + + :arg dict validated_data: Incoming data from the POST that's already + been validated by the serializer. + + :rtype: list[str] + :returns: The components of the argument string in list form. 
+ """ + return [ + "pipeline", + str(validated_data["cfr_title"]), + str(validated_data["cfr_part"]), + eregs_site_api_url, + ] + + def post(self, request, *args, **kwargs): + return self.create(request, *args, **kwargs) + + +class PipelineJobViewInstance(JobViewInstance): + queryset = PipelineJob.objects.all() + serializer_class = PipelineJobSerializer + sub_path = "pipeline/" + + +class ProposalPipelineJobViewList(BaseViewList, JobViewList): + queryset = ProposalPipelineJob.objects.all() + serializer_class = ProposalPipelineJobSerializer + sub_path = "proposal-pipeline/" + + def build_eregs_args(self, validated_data): + """ + Overrides the method from ``BaseViewList`` in order to pass the + arguments appropriate for the ``proposal_pipeline`` command. + + Impure + Reads the contents of the proposal file from the filesystem (in + future, the DB, but impure either way). + Side Effects + Runs the ``proposal_pipeline`` command. + + :arg dict validated_data: Incoming data from the POST that's already + been validated by the serializer. + + :rtype: list[str] + :returns: The components of the argument string in list form. + """ + reg_file = RegulationFile.objects.get( + hexhash=validated_data["file_hexhash"]) + # TODO: This is a total hack; we should not be storing the contents in + # the DB but reading the file from the filesystem. Only doing this + # temporarily before changing the proposal_pipeline command to work + # differently. 
+ path = reg_file.file.storage.path(reg_file.file.name) + return [ + "proposal_pipeline", + path, + eregs_site_api_url + ] + + def post(self, request, *args, **kwargs): + return self.create(request, *args, **kwargs) + + +class ProposalPipelineJobViewInstance(JobViewInstance): + queryset = ProposalPipelineJob.objects.all() + serializer_class = ProposalPipelineJobSerializer + sub_path = "proposal-pipeline/" + + +class FileUploadView(mixins.ListModelMixin, mixins.CreateModelMixin, + generics.GenericAPIView): + parser_classes = (FileUploadParser, MultiPartParser) + parser_classes = (MultiPartParser,) + serializer_class = FileUploadSerializer + queryset = RegulationFile.objects.all() + lookup_field = "hexhash" + size_limit = 100000000 # Arbitrary 100MB limit. + + def create(self, request, *args, **kwargs): + """ + Overrides the ``create`` method of ``mixins.CreateModelMixin`` in order + to add the file contents to the database. + + Side effects + Alters the DB. + + :arg HttpRequest request: the incoming request. + + :rtype: Response + :returns: JSON or HTML of the information about the file (status 201), + or about why the file couldn't be added (status 400). + """ + serialized = self.get_serializer(data=request.data) + serialized.is_valid(raise_exception=True) + + uploaded_file = request.data["file"] + if uploaded_file.size > self.size_limit: + return Response(dict(error="File too large (%s-byte limit)." 
% + self.size_limit), + status=status.HTTP_400_BAD_REQUEST) + if uploaded_file.multiple_chunks(): + contents = b"".join(chunk for chunk in uploaded_file.chunks()) + else: + contents = uploaded_file.read() + md = hashlib.md5(contents) + hexhash = md.hexdigest() + filename = uploaded_file.name + url = file_url(hexhash) + + if RegulationFile.objects.filter(hexhash=hexhash).exists(): + return Response(dict(error="File already present."), + status=status.HTTP_400_BAD_REQUEST) + else: + serialized.save(contents=contents, file=uploaded_file, + filename=filename, hexhash=hexhash, url=url) + + headers = self.get_success_headers(serialized.data) + return Response(serialized.data, status=status.HTTP_201_CREATED, + headers=headers) + + def post(self, request, *args, **kwargs): + return self.create(request, *args, **kwargs) + + def get(self, request, *args, **kwargs): + return self.list(request, *args, **kwargs) + + +class FileUploadViewInstance(mixins.RetrieveModelMixin, + mixins.UpdateModelMixin, mixins.DestroyModelMixin, + generics.GenericAPIView): + serializer_class = FileUploadSerializer + queryset = RegulationFile.objects.all() + lookup_field = "hexhash" + + def get(self, request, *args, **kwargs): + """ + Overrides the method from ``RetrieveModelMixin`` so that we return the + contents of the file instead of a JSON object representing the file. + + Impure + Reads from the DB. + + :arg HttpRequest request: the incoming request. + + :rtype: Response + :returns: The raw contents of the file. + """ + # Is the next line the best way to kick off a 404 if there's no match? 
+ self.retrieve(request, *args, **kwargs) + + uploaded_file = RegulationFile.objects.get(hexhash=kwargs["hexhash"]) + return HttpResponse(uploaded_file.contents) + + def delete(self, request, *args, **kwargs): + return self.destroy(request, *args, **kwargs) diff --git a/regparser/web/settings/base.py b/regparser/web/settings/base.py index dacd5160..9d5c507c 100644 --- a/regparser/web/settings/base.py +++ b/regparser/web/settings/base.py @@ -138,3 +138,6 @@ 'cache_name': os.path.join(EREGS_INDEX_ROOT, 'http_cache'), 'expire_after': 60*60*24*3 # 3 days } + +FILE_UPLOAD_HANDLERS = [ + "django.core.files.uploadhandler.TemporaryFileUploadHandler"] diff --git a/regparser/web/settings/dev.py b/regparser/web/settings/dev.py index dcff2456..6cb63a03 100644 --- a/regparser/web/settings/dev.py +++ b/regparser/web/settings/dev.py @@ -1 +1,7 @@ from regparser.web.settings.base import * # noqa + +# You need to have an email server running locally for this to work. +# ``python -m smtpd -n -c DebuggingServer localhost:2525`` should be fine. 
+EMAIL_BACKEND = "django.core.mail.backends.smtp.EmailBackend" +EMAIL_HOST = "127.0.0.1" +EMAIL_PORT = 2525 diff --git a/settings.py b/settings.py index 8054970f..ecec15a8 100644 --- a/settings.py +++ b/settings.py @@ -165,6 +165,10 @@ CANONICAL_HOSTNAME = "https://example.com" CANONICAL_PORT = "" +# The URL for the regulations-site API that parser commands invoked from the +# web API/UI should run against: +EREGS_SITE_API_URL = "http://localhost:1234/api/" + try: from local_settings import * except ImportError: diff --git a/tests/test_web_api.py b/tests/test_web_api.py new file mode 100644 index 00000000..ebe08160 --- /dev/null +++ b/tests/test_web_api.py @@ -0,0 +1,368 @@ +from hashlib import md5 +from mock import patch, Mock +from os import path as ospath +from random import choice +from regparser.web.jobs.models import job_status_values +from regparser.web.jobs.utils import ( + eregs_site_api_url, + file_url, + status_url +) +from regparser.web.jobs.views import FileUploadView as PatchedFileUploadView +from rest_framework.test import APITestCase +from string import hexdigits +from tempfile import NamedTemporaryFile +from uuid import uuid4 + +import pytest +import settings + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +fake_pipeline_id = uuid4() + + +def _fake_redis_job(cmd, args, timeout=60*30, result_ttl=-1, depends_on=None): + return type("FakeRedisJob", (object, ), {"id": fake_pipeline_id}) + + +def _fake_redis_queue(): + fq = Mock() + fq.fetch_job = Mock(return_value=None) + return fq + + +@patch("django_rq.enqueue", _fake_redis_job) +@patch("django_rq.get_queue", _fake_redis_queue) +class PipelineJobTestCase(APITestCase): + + def __init__(self, *args, **kwargs): + self.defaults = { + "clear_cache": False, + "destination": eregs_site_api_url, + "use_uploaded_metadata": None, + "use_uploaded_regulation": None, + "regulation_url": "", + "status": "received" + } + super(PipelineJobTestCase, 
self).__init__(*args, **kwargs) + + def _postjson(self, data): + return self.client.post("/rp/job/pipeline/", data, format="json") + + def _stock_response_check(self, expected, actual): + """ + Since we're using a lot of fake values, the tests for them will always + be the same. + """ + for key in expected: + self.assertEqual(expected[key], actual[key]) + self.assertIn(actual["status"], job_status_values) + + def _create_ints(self): + data = { + "cfr_title": 0, + "cfr_part": 0, + "notification_email": "test@example.com" + } + response = self._postjson(data) + return (data, response) + + def test_create_ints(self): + data, response = self._create_ints() + + expected = dict(self.defaults) + expected.update({k: data[k] for k in data}) + expected["url"] = status_url(fake_pipeline_id, sub_path="pipeline/") + self._stock_response_check(expected, response.data) + return expected + + def test_create_strings(self): + data = { + "cfr_title": "0", + "cfr_part": "0", + "notification_email": "test@example.com" + } + response = self._postjson(data) + + expected = dict(self.defaults) + expected.update({k: data[k] for k in data}) + # Even if the input is a str, the return values should be ints: + expected["cfr_title"] = int(expected["cfr_title"]) + expected["cfr_part"] = int(expected["cfr_part"]) + expected["url"] = status_url(fake_pipeline_id, sub_path="pipeline/") + self._stock_response_check(expected, response.data) + + def test_create_with_missing_fields(self): + data = {"cfr_part": "0"} + response = self._postjson(data) + + self.assertEqual(400, response.status_code) + self.assertEqual({"cfr_title": ["This field is required."]}, + response.data) + + data = {"cfr_title": "0"} + response = self._postjson(data) + + self.assertEqual(400, response.status_code) + self.assertEqual({"cfr_part": ["This field is required."]}, + response.data) + + response = self.client.get("/rp/job/pipeline/", format="json") + self.assertEqual(0, len(response.data)) + + def test_create_and_read(self): 
+ expected = self._create_ints()[1].data + + url = urlparse(expected["url"]) + response = self.client.get(url.path, format="json") + self._stock_response_check(expected, response.data) + + response = self.client.get("/rp/job/pipeline/", format="json") + self.assertEqual(1, len(response.data)) + self._stock_response_check(expected, response.data[0]) + + def test_create_delete_and_read(self): + expected = self._create_ints()[1].data + + url = urlparse(expected["url"]) + response = self.client.delete(url.path, format="json") + self.assertEqual(204, response.status_code) + + response = self.client.get(url.path, format="json") + self.assertEqual(404, response.status_code) + + response = self.client.get("/rp/job/pipeline/", format="json") + self.assertEqual(0, len(response.data)) + + +class RegulationFileTestCase(APITestCase): + + def __init__(self, *args, **kwargs): + self.file_contents = "123" + self.hashed_contents = None + super(RegulationFileTestCase, self).__init__(*args, **kwargs) + + def get_hashed_contents(self): + if self.hashed_contents is None: + try: + self.hashed_contents = md5(self.file_contents.encode( + "utf-8")).hexdigest() + except AttributeError: + self.hashed_contents = md5(self.file_contents).hexdigest() + return self.hashed_contents + + def test_create_file(self): + with NamedTemporaryFile(suffix=".xml", delete=True) as tmp: + tmp.write(self.file_contents.encode("utf-8")) + tmp_name = ospath.split(tmp.name)[-1] + tmp.seek(0) + response = self.client.post( + "/rp/job/upload/", {"file": tmp}) + self.assertEquals(201, response.status_code) + data = response.data + self.assertEquals(self.get_hashed_contents(), data["hexhash"]) + self.assertEquals(tmp_name, data["filename"]) + self.assertEquals("File contents not shown.", data["contents"]) + self.assertEquals(file_url(self.get_hashed_contents()), data["url"]) + return response + + def test_reject_duplicates(self): + self.test_create_file() + with NamedTemporaryFile(suffix=".xml", delete=True) as tmp: + 
tmp.write(self.file_contents.encode("utf-8")) + tmp.seek(0) + response = self.client.post( + "/rp/job/upload/", {"file": tmp}) + self.assertEquals(400, response.status_code) + self.assertIn("error", response.data) + self.assertEquals("File already present.", response.data["error"]) + + def test_reject_large(self): + with patch("regparser.web.jobs.views.FileUploadView", + new=PatchedFileUploadView) as p: + p.size_limit = 10 + with NamedTemporaryFile(suffix=".xml", delete=True) as tmp: + tmp.write(self.file_contents.encode("utf-8")) + tmp.seek(0) + response = self.client.post( + "/rp/job/upload/", {"file": tmp}) + self.assertEquals(201, response.status_code) + + with NamedTemporaryFile(suffix=".xml", delete=True) as tmp: + contents = "123" * 11 + tmp.write(contents.encode("utf-8")) + tmp.seek(0) + response = self.client.post( + "/rp/job/upload/", {"file": tmp}) + self.assertEquals(400, response.status_code) + self.assertEquals("File too large (10-byte limit).", + response.data["error"]) + + def test_create_and_read_and_delete(self): + expected = self.test_create_file().data + url = urlparse(expected["url"]) + response = self.client.get(url.path) + contents = response.content.decode("utf-8") + self.assertEquals(self.file_contents, contents) + + response = self.client.get("/rp/job/upload/", format="json") + self.assertEquals(1, len(response.data)) + data = response.data[0] + self.assertEquals("File contents not shown.", data["contents"]) + self.assertEquals(expected["file"], data["file"]) + self.assertEquals(expected["filename"], data["filename"]) + self.assertEquals(self.get_hashed_contents(), data["hexhash"]) + self.assertEquals(url.path, urlparse(data["url"]).path) + + response = self.client.delete(url.path) + self.assertEqual(204, response.status_code) + + response = self.client.get(url.path) + self.assertEqual(404, response.status_code) + + response = self.client.get("/rp/job/upload/", format="json") + data = response.data + self.assertEquals(0, len(data)) + + 
@patch("django_rq.enqueue", _fake_redis_job)
@patch("django_rq.get_queue", _fake_redis_queue)
class ProposalPipelineTestCase(APITestCase):
    """Tests for the proposal-pipeline job endpoint.

    Redis queueing is patched out at the class level (see decorators above),
    so no worker or Redis instance is required to run these tests.
    """

    def __init__(self, *args, **kwargs):
        # Field values the API is expected to echo back for a new job.
        self.defaults = {
            "clear_cache": False,
            "destination": eregs_site_api_url,
            "only_latest": True,
            "use_uploaded_metadata": None,
            "use_uploaded_regulation": None,
            "regulation_url": "",
            "status": "received"
        }
        self.file_contents = "456"
        super(ProposalPipelineTestCase, self).__init__(*args, **kwargs)

    def _create_file(self):
        """Upload a small XML file and return the response payload."""
        with NamedTemporaryFile(suffix=".xml") as tmp:
            tmp.write(self.file_contents.encode("utf-8"))
            tmp.seek(0)
            response = self.client.post("/rp/job/upload/", {"file": tmp})
        return response.data

    def _postjson(self, data):
        """POST ``data`` as JSON to the proposal-pipeline endpoint."""
        return self.client.post("/rp/job/proposal-pipeline/", data,
                                format="json")

    def _stock_response_check(self, expected, actual):
        """
        Since we're using a lot of fake values, the tests for them will always
        be the same.
        """
        for key in expected:
            self.assertEqual(expected[key], actual[key])
        self.assertIn(actual["status"], job_status_values)

    def test_create(self):
        """Creating a job from an uploaded file echoes the expected fields."""
        file_data = self._create_file()
        data = {
            "file_hexhash": file_data["hexhash"],
            "notification_email": "test@example.com"
        }
        response = self._postjson(data)

        expected = dict(self.defaults)
        expected.update({k: data[k] for k in data})
        # The pipeline job id is faked, so the status URL is predictable.
        expected["url"] = status_url(fake_pipeline_id,
                                     sub_path="proposal-pipeline/")
        self._stock_response_check(expected, response.data)
        return expected

    def test_create_with_missing_fields(self):
        """Omitting the required ``file_hexhash`` field yields a 400."""
        data = {"notification_email": "test@example.com"}
        response = self._postjson(data)

        self.assertEqual(400, response.status_code)
        self.assertEqual({"file_hexhash": ["This field is required."]},
                         response.data)

    def test_create_and_read_and_delete(self):
        """Round-trip: create a job, read it, delete it, then expect 404."""
        expected = self.test_create()

        url = urlparse(expected["url"])
        response = self.client.get(url.path, format="json")
        self._stock_response_check(expected, response.data)

        response = self.client.get("/rp/job/proposal-pipeline/", format="json")
        self.assertEqual(1, len(response.data))
        self._stock_response_check(expected, response.data[0])

        response = self.client.delete(url.path, format="json")
        self.assertEqual(204, response.status_code)

        response = self.client.get(url.path, format="json")
        self.assertEqual(404, response.status_code)

        response = self.client.get("/rp/job/proposal-pipeline/", format="json")
        self.assertEqual(0, len(response.data))


@patch.object(settings, "CANONICAL_HOSTNAME", "http://domain.tld")
def test_status_url():
    """``status_url`` should honor the configured hostname and port."""
    domain = "http://domain.tld"
    urlpath = "/rp/job/"
    # Six random 32-character hex strings stand in for job ids.
    hexes = ["".join([choice(hexdigits) for i in range(32)]) for j in range(6)]

    def _check(port=None):
        # When ``port`` is None the generated URL must carry no explicit port.
        for hx in hexes:
            url = urlparse(status_url(hx))
            assert domain == "%s://%s" % (url.scheme, url.hostname)
            if port is None:
                assert url.port is port
            else:
                assert url.port == port
            assert "%s%s/" % (urlpath, hx) == url.path

            # Same checks with an explicit sub_path prepended to the id.
            url = urlparse(status_url(hx, sub_path="%s/" % hx[:10]))
            assert domain == "%s://%s" % (url.scheme, url.hostname)
            if port is None:
                assert url.port is port
            else:
                assert url.port == port
            assert "%s%s%s/" % (urlpath, "%s/" % hx[:10], hx) == url.path

    with patch.object(settings, "CANONICAL_PORT", "2323"):
        _check(port=2323)

    # Port "80" and the empty string are both treated as the default and
    # omitted from the URL.
    with patch.object(settings, "CANONICAL_PORT", "80"):
        _check()

    with patch.object(settings, "CANONICAL_PORT", ""):
        _check()

    # A sub_path without a trailing slash is rejected with ValueError.
    with pytest.raises(ValueError) as err:
        status_url("something", "something-without-a-slash")

    assert isinstance(err.value, ValueError)


@patch.object(settings, "CANONICAL_HOSTNAME", "http://domain.tld")
def test_file_url():
    """``file_url`` should honor the configured hostname and port."""
    urlpath = "/rp/job/upload/"
    domain = "http://domain.tld"
    hexes = ["".join([choice(hexdigits) for i in range(32)]) for j in range(6)]

    # Non-default port appears explicitly in the URL.
    with patch.object(settings, "CANONICAL_PORT", "2323"):
        for hx in hexes:
            assert file_url(hx) == "%s:2323%s%s/" % (domain, urlpath, hx)

    # Port "80" and the empty string are both omitted.
    with patch.object(settings, "CANONICAL_PORT", "80"):
        for hx in hexes:
            assert file_url(hx) == "%s%s%s/" % (domain, urlpath, hx)

    with patch.object(settings, "CANONICAL_PORT", ""):
        for hx in hexes:
            assert file_url(hx) == "%s%s%s/" % (domain, urlpath, hx)