Skip to content
This repository was archived by the owner on May 1, 2024. It is now read-only.

Commit 2ce0ee9

Browse files
authored
Merge pull request #484 from edx/brian/delete-manifest
Delete manifest file on overwrite.
2 parents 592acf1 + 45b1863 commit 2ce0ee9

File tree

4 files changed

+63
-8
lines changed

4 files changed

+63
-8
lines changed

edx/analytics/tasks/common/mapreduce.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import luigi.task
1616
from luigi import configuration
1717

18-
from edx.analytics.tasks.util.manifest import convert_to_manifest_input_if_necessary
18+
from edx.analytics.tasks.util.manifest import convert_to_manifest_input_if_necessary, remove_manifest_target_if_exists
1919
from edx.analytics.tasks.util.url import get_target_from_url, url_path_join
2020

2121
log = logging.getLogger(__name__)
@@ -126,9 +126,15 @@ def _get_engine_parameters_from_targets(self):
126126
'input_format': input_format,
127127
}
128128

129+
@property
130+
def manifest_id(self):
131+
return str(hash(self)).replace('-', 'n')
132+
129133
def input_hadoop(self):
130-
manifest_id = str(hash(self)).replace('-', 'n')
131-
return convert_to_manifest_input_if_necessary(manifest_id, super(MapReduceJobTask, self).input_hadoop())
134+
return convert_to_manifest_input_if_necessary(self.manifest_id, super(MapReduceJobTask, self).input_hadoop())
135+
136+
def remove_manifest_target_if_exists(self):
137+
return remove_manifest_target_if_exists(self.manifest_id)
132138

133139

134140
class MapReduceJobRunner(luigi.contrib.hadoop.HadoopJobRunner):

edx/analytics/tasks/insights/module_engagement.py

+2
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ def run(self):
208208
output_target = self.output()
209209
if not self.complete() and output_target.exists():
210210
output_target.remove()
211+
if self.overwrite:
212+
self.remove_manifest_target_if_exists()
211213
return super(ModuleEngagementDataTask, self).run()
212214

213215

edx/analytics/tasks/util/manifest.py

+20-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import luigi.task
66
from luigi import configuration
77

8-
from edx.analytics.tasks.util.url import get_target_class_from_url, url_path_join
8+
from edx.analytics.tasks.util.url import get_target_class_from_url, get_target_from_url, url_path_join
99

1010
CONFIG_SECTION = 'manifest'
1111

@@ -27,6 +27,13 @@ def convert_to_manifest_input_if_necessary(manifest_id, targets):
2727
return targets
2828

2929

30+
def get_manifest_file_path(manifest_id):
31+
# Construct the manifest file URL from the manifest_id and the configuration
32+
base_url = configuration.get_config().get(CONFIG_SECTION, 'path')
33+
manifest_file_path = url_path_join(base_url, manifest_id + '.manifest')
34+
return manifest_file_path
35+
36+
3037
def create_manifest_target(manifest_id, targets):
3138
# If we are running locally, we need our manifest file to be a local file target, however, if we are running on
3239
# a real Hadoop cluster, it has to be an HDFS file so that the input format can read it. Luigi makes it a little
@@ -35,8 +42,7 @@ def create_manifest_target(manifest_id, targets):
3542
# base class at runtime based on the URL of the manifest file.
3643

3744
# Construct the manifest file URL from the manifest_id and the configuration
38-
base_url = configuration.get_config().get(CONFIG_SECTION, 'path')
39-
manifest_file_path = url_path_join(base_url, manifest_id + '.manifest')
45+
manifest_file_path = get_manifest_file_path(manifest_id)
4046

4147
# Figure out the type of target that should be used to write/read the file.
4248
manifest_file_target_class, init_args, init_kwargs = get_target_class_from_url(manifest_file_path)
@@ -49,6 +55,16 @@ class ManifestInputTarget(ManifestInputTargetMixin, manifest_file_target_class):
4955
return ManifestInputTarget.from_existing_targets(targets, *init_args, **init_kwargs)
5056

5157

58+
def remove_manifest_target_if_exists(manifest_id):
59+
"""Given an id and configuration, construct a target that can check and remove a manifest file."""
60+
manifest_file_path = get_manifest_file_path(manifest_id)
61+
# we don't need the mixin in order to check for existence or to remove the manifest file.
62+
manifest_target = get_target_from_url(manifest_file_path)
63+
if manifest_target.exists():
64+
log.info('Removing existing manifest found at %s', manifest_target.path)
65+
manifest_target.remove()
66+
67+
5268
class ManifestInputTargetMixin(object):
5369

5470
def __init__(self, *args, **kwargs):
@@ -66,7 +82,7 @@ def __init__(self, *args, **kwargs):
6682
def from_existing_targets(cls, other_targets, *init_args, **init_kwargs):
6783
manifest_target = cls(*init_args, **init_kwargs)
6884
if not manifest_target.exists():
69-
log.debug('Writing manifest file %s', manifest_target.path)
85+
log.info('Writing manifest file %s', manifest_target.path)
7086
with manifest_target.open('w') as manifest_file:
7187
for target in other_targets:
7288
manifest_file.write(target.path)

edx/analytics/tasks/util/tests/test_manifest.py

+32-1
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
"""Ensure manifest files are created appropriately."""
22

3+
import os
4+
import shutil
5+
import tempfile
36
from unittest import TestCase
47

58
import luigi
69
from luigi.contrib.hdfs.target import HdfsTarget
710
from mock import patch
811

912
from edx.analytics.tasks.util.manifest import (
10-
ManifestInputTargetMixin, convert_to_manifest_input_if_necessary, create_manifest_target
13+
ManifestInputTargetMixin, convert_to_manifest_input_if_necessary, create_manifest_target,
14+
remove_manifest_target_if_exists
1115
)
1216
from edx.analytics.tasks.util.tests.config import OPTION_REMOVED, with_luigi_config
1317
from edx.analytics.tasks.util.tests.target import FakeTarget
@@ -81,3 +85,30 @@ def test_negative_threshold(self):
8185
@with_luigi_config('manifest', 'threshold', OPTION_REMOVED)
8286
def test_threshold_not_set(self):
8387
self.assert_no_conversion()
88+
89+
90+
temp_rootdir = None
91+
92+
93+
class ManifestInputTargetDeletionTest(TestCase):
94+
"""Ensure manifest files are deleted appropriately."""
95+
96+
MANIFEST_ID = 'test'
97+
98+
def setUp(self):
99+
temp_rootdir = tempfile.mkdtemp()
100+
self.addCleanup(self.cleanup, temp_rootdir)
101+
102+
def cleanup(self, dirname):
103+
"""Remove the temp directory only if it exists."""
104+
if os.path.exists(dirname):
105+
shutil.rmtree(dirname)
106+
107+
@with_luigi_config(
108+
('manifest', 'path', temp_rootdir),
109+
)
110+
def test_manifest_file_deletion(self):
111+
target = create_manifest_target(self.MANIFEST_ID, [HdfsTarget('s3://foo/bar')])
112+
self.assertTrue(target.exists())
113+
remove_manifest_target_if_exists(self.MANIFEST_ID)
114+
self.assertFalse(target.exists())

0 commit comments

Comments
 (0)