Skip to content

Commit

Permalink
Http Importer #136 WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
KrzysztofMadejski committed Nov 2, 2019
1 parent 106e0d9 commit 1f16809
Show file tree
Hide file tree
Showing 10 changed files with 2,315 additions and 5 deletions.
46 changes: 45 additions & 1 deletion moonsheep/importers/http.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from typing import Sequence
import re
import urllib
from typing import Sequence, List, Pattern

import requests
from django.core.management import BaseCommand
from django.http import QueryDict

from moonsheep.importers.core import IDocumentImporter, DocumentSaver
Expand Down Expand Up @@ -27,6 +31,46 @@ def __str__(self):
def __init__(self, name):
    """
    Remembers the given name on the instance

    :param name: value stored as ``self.name``
    """
    self.name = name

@staticmethod
def listdir(html_contents):
    """
    Returns list of entries (files & dirs) linked from a given html index listing

    Entries are the raw ``href`` values in document order; they are not resolved
    against any base url. Only anchors written as ``<a href="...">label<`` are
    recognised (``href`` must be the sole, double-quoted attribute).

    Skipped are:
      - links to the parent directory, recognised by a label or a target starting with ``..``
      - links that only re-sort the listing (target starting with ``?``)
      - in-page anchors (target starting with ``#``)

    :param html_contents: html source of the index page
    :return: list of href strings
    """
    regexp = r'<a\s+href="([^"]+)"\s*>([^<]+)<'

    entries = []
    for href, label in re.findall(regexp, html_contents):
        # Some servers label the parent link "Parent Directory" while pointing it at "../",
        # so the target has to be checked as well as the label.
        if label.startswith('..') or href.startswith('..'):
            continue
        # Query-only targets (e.g. "?C=N;O=D") are column-sort links, not directory content.
        if href.startswith(('?', '#')):
            continue
        entries.append(href)
    return entries

@staticmethod
def find_urls(host: str, pattern: str, paths: List[str], log=None) -> List[str]:
    """
    Finds urls of files available under given paths, walking down http index listings

    This is a generator: urls are yielded one by one as they are discovered.
    Paths ending with ``/`` are treated as directories whose listing is downloaded
    and scanned; every other path is treated as a file.

    :param host: base url prepended to every path; if empty or None the paths are
        expected to be complete urls already
    :param pattern: optional ``*``-wildcarded pattern; every other character is matched
        literally. It is matched from the beginning of the whole url (not only the file
        name) and is not anchored at its end. If empty or None all files are returned.
    :param paths: paths (relative to host) or complete urls to start from
    :param log: optional callable taking one message string
    :return: iterator over urls of the files found
    """
    # Imported locally: a bare `import urllib` does not guarantee the submodule is loaded
    from urllib.parse import urljoin

    if host:
        # Avoid doubled slashes when host ends with (or a path starts with) a slash
        base = host.rstrip('/')
        path_queue = [base + '/' + path.lstrip('/') for path in paths]
    else:
        path_queue = list(paths)

    pattern_re = None
    if pattern:
        # Escape everything first so that only '*' has a special meaning
        pattern_re = re.compile(re.escape(pattern).replace('\\*', '.*'))

    # Listings already downloaded; protects against link cycles between directories
    visited = set()

    while path_queue:
        path = path_queue.pop()

        # file
        if not path.endswith('/'):
            if pattern_re is None or pattern_re.match(path):
                yield path
            continue

        # directory
        if path in visited:
            continue
        visited.add(path)

        if log:
            log(f"Downloading {path}")

        response = requests.get(path)
        if not response.ok:
            # An error page is not a listing, there is nothing to extract from it
            if log:
                log(f"Skipping {path}: HTTP {response.status_code}")
            continue

        for entry in HttpDocumentImporter.listdir(response.text):
            # Links re-sorting the listing or jumping within the page are not content
            if entry.startswith(('?', '#')):
                continue

            # Handles relative, host-absolute ("/a/b") and complete urls alike
            entry = urljoin(path, entry)

            if entry.endswith('/'):  # dir
                # Never walk back to the current directory or any of its ancestors
                if not path.startswith(entry):
                    path_queue.append(entry)

            elif pattern_re is None or pattern_re.match(entry):  # file
                yield entry


# Initialize and activate
# TODO name should be taken from config
Expand Down
Empty file.
1,054 changes: 1,054 additions & 0 deletions moonsheep/importers/tests/http_listings/index_dirs.html

Large diffs are not rendered by default.

1,108 changes: 1,108 additions & 0 deletions moonsheep/importers/tests/http_listings/index_files.html

Large diffs are not rendered by default.

63 changes: 63 additions & 0 deletions moonsheep/importers/tests/test_http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os
import unittest

from moonsheep.importers import HttpDocumentImporter
from django.core import management

class TestHttpImporter(unittest.TestCase):
    """Checks HttpDocumentImporter.listdir against index pages saved in http_listings/."""

    @staticmethod
    def load_file(path: str) -> str:
        """
        Reads a fixture file

        :param path: path relative to the directory of this test module
        :return: contents of the file as text
        """
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
        with open(path, 'r', encoding='utf-8') as content_file:
            return content_file.read()

    def test_load_dirs(self):
        """A listing of directories yields the subdirectories, but not the parent or itself."""
        content = self.load_file('http_listings/index_dirs.html')
        entries = HttpDocumentImporter.listdir(content)

        self.assertIn("http://debian.mirror.ac.za/debian/pool/main/t/t-code/", entries)
        self.assertIn("http://debian.mirror.ac.za/debian/pool/main/t/tzsetup/", entries)
        self.assertIn("http://debian.mirror.ac.za/debian/pool/main/t/tryton-modules-stock-supply-production/", entries)

        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main/", entries, "Parent dir should not be returned")
        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main", entries, "Parent dir should not be returned")
        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main/t/", entries,
                         "Current dir should not be returned")
        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main/t", entries, "Current dir should not be returned")

    # Named test_* so that the test runner actually collects it
    def test_load_files(self):
        """A listing of files yields the files, but not the parent dir or itself."""
        content = self.load_file('http_listings/index_files.html')
        entries = HttpDocumentImporter.listdir(content)

        # Encoded url
        self.assertIn(
            "http://debian.mirror.ac.za/debian/pool/main/t/tasksel/task-albanian-desktop_3.31%2Bdeb8u1_all.deb",
            entries)
        self.assertIn("http://debian.mirror.ac.za/debian/pool/main/t/tasksel/tasksel_3.56_all.deb", entries)

        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main/t/", entries, "Parent dir should not be returned")
        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main/t", entries, "Parent dir should not be returned")
        # Both spellings of the current dir: with and without the trailing slash
        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main/t/tasksel/", entries,
                         "Current dir should not be returned")
        self.assertNotIn("http://debian.mirror.ac.za/debian/pool/main/t/tasksel", entries,
                         "Current dir should not be returned")


# class TestHttpImporterCommand(unittest.TestCase):
# # TODO @patch and assert HttpDocumentImporter.find_urls
# def host_with_multiple_paths(self):
# management.call_command('moonsheep_import_http', '-W -h http://user@host/root dir1 dir2/file1')
# # TODO then what?
#
#
# def one_path(self):
# management.call_command('moonsheep_import_http', 'http://user@host/root/dir1')
#
#
# def file_pattern(self):
# management.call_command('moonsheep_import_http', 'http://user@host/root/dir1 -f *.pdf')



27 changes: 27 additions & 0 deletions moonsheep/management/commands/moonsheep_import_http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from django.core.management.base import BaseCommand, CommandError

from moonsheep.importers.http import HttpDocumentImporter


class Command(BaseCommand):
    """Management command listing documents found on an http server via its index listings."""

    help = 'Imports documents published on http server with Index List enabled'

    def add_arguments(self, parser):
        """Declares command line arguments: paths, --host, -f and --dry-run."""
        parser.add_argument('paths', type=str, nargs='+', metavar='path', help='Paths to be imported')
        # Not supported yet; kept commented out as a whole so the module stays importable
        # parser.add_argument('-W', dest='ask_for_password', type=bool, nargs='?', default=False, const=True,
        #                     help='Ask for password instead of specifying it on the command line')
        parser.add_argument('--host', dest='host', type=str, help="Host to be used if multiple paths are provided")
        parser.add_argument('-f', dest='pattern', type=str,
                            help="*-wildcarded pattern of the file names to be included, ie. -f *.pdf")
        # A plain flag: with nargs='?' it would consume the path following it as its own value
        parser.add_argument('--dry-run', dest='dry_run', action='store_true',
                            help='Dry run to see what would get imported without actually importing it')

    def handle(self, *args, **options):
        """Writes every url found under the requested paths to the command's stdout."""
        host = options['host']
        # TODO if not host
        paths = options['paths']
        # TODO dry_run

        for path in HttpDocumentImporter.find_urls(host=host, paths=paths, pattern=options['pattern']):
            # Command's own stdout wrapper (instead of print) so output can be redirected
            self.stdout.write(f"\t{path}")
            # TODO actually import them into Moonsheep unless dry_run
2 changes: 2 additions & 0 deletions moonsheep/tests/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

INSTALLED_APPS = [
"moonsheep",
'django.contrib.auth',
'django.contrib.contenttypes',
]

DATABASES = {
Expand Down
5 changes: 2 additions & 3 deletions moonsheep/tests/tests.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from requests.exceptions import ConnectionError
# from requests.exceptions import ConnectionError

from django.core.exceptions import ImproperlyConfigured, ValidationError
from django.db import models
Expand All @@ -13,10 +13,9 @@
from moonsheep.exceptions import PresenterNotDefined, TaskMustSetTemplate, NoTasksLeft
from moonsheep.forms import NewTaskForm, MultipleRangeField
from moonsheep.mapper import ModelMapper
from moonsheep.register import base_task, initial_task
from moonsheep.tasks import AbstractTask
from moonsheep.verifiers import equals, OrderedListVerifier
from moonsheep.views import unpack_post, TaskView, NewTaskFormView, WebhookTaskRunView
from moonsheep.views import unpack_post, TaskView, NewTaskFormView


# TODO: FIXME
Expand Down
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
DJANGO_SETTINGS_MODULE = moonsheep.tests.test_settings
python_files = tests.py test_*.py *_tests.py
12 changes: 11 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as readme:
README = readme.read()

# Packages for running the tests and linting; handed to setup() as tests_require
TEST_REQUIREMENTS = [
    'pytest',
    'pytest-django',
    'pylint',
    'pylint_django',
    'git-pylint-commit-hook',
]

setup(
name='django-moonsheep',
version='0.3.0',
Expand All @@ -27,8 +35,10 @@
'djangorestframework~=3.10',
'djangorestframework-jsonapi~=2.8',
'django-filter~=2.2',
'PyUtilib~=5.7'
'PyUtilib~=5.7',
'requests~=2.22',
],
tests_require=TEST_REQUIREMENTS,
classifiers=[
'Environment :: Web Environment',
'Framework :: Django',
Expand Down

0 comments on commit 1f16809

Please sign in to comment.