
Commit 2f1ec9c

test
1 parent a9d5d7b commit 2f1ec9c

File tree: 2 files changed, +48 -38 lines

edx/analytics/tasks/common/spark.py (+40 -31)
@@ -14,7 +14,7 @@
     ManifestInputTargetMixin, convert_to_manifest_input_if_necessary, remove_manifest_target_if_exists
 )
 from edx.analytics.tasks.util.overwrite import OverwriteOutputMixin
-from edx.analytics.tasks.util.url import get_target_from_url, url_path_join
+from edx.analytics.tasks.util.url import UncheckedExternalURL, get_target_from_url, url_path_join
 
 _file_path_to_package_meta_path = {}
 
@@ -163,31 +163,42 @@ class PathSelectionTaskSpark(EventLogSelectionDownstreamMixin, luigi.WrapperTask
     """
     Path selection task with manifest feature for spark
     """
-    requirements = None
+    targets = None
     manifest_id = luigi.Parameter(
         description='File name for manifest'
     )
+    manifest_dir = luigi.Parameter(
+        description='Directory for manifest files'
+    )
+    pyspark_logger = luigi.Parameter(
+        description='Pyspark logger',
+        default=None
+    )
 
     def requires(self):
-        yield PathSelectionByDateIntervalTask(
+        if not self.targets:
+            if self.pyspark_logger:
+                self.pyspark_logger.warn("PathSelectionTaskSpark=> targets not found, refreshing!")
+            self.targets = self._get_targets()
+        else:
+            if self.pyspark_logger:
+                self.pyspark_logger.warn("PathSelectionTaskSpark=> targets already exist")
+        return self.targets
+
+    def _get_targets(self):
+        input = PathSelectionByDateIntervalTask(
             source=self.source,
             interval=self.interval,
             pattern=self.pattern,
             date_pattern=self.date_pattern
+        ).output()
+        targets = luigi.task.flatten(
+            convert_to_manifest_input_if_necessary(self.manifest_id, input, self.manifest_dir)
         )
-
-    def get_target_paths(self):
-        log.warn("PathSelectionTaskSpark: checking requirements {}".format(self.manifest_id))
-        if not self.requirements:
-            log.warn("PathSelectionTaskSpark: requirements not found, refreshing!!")
-            targets = luigi.task.flatten(
-                convert_to_manifest_input_if_necessary(self.manifest_id, self.input())
-            )
-            self.requirements = targets
-        return self.requirements
+        return [UncheckedExternalURL(target.path) for target in targets]
 
     def output(self):
-        return self.get_target_paths()
+        return [target.output() for target in self.requires()]
 
 
 class EventLogSelectionMixinSpark(EventLogSelectionDownstreamMixin):
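Since output() now delegates to requires(), and requires() memoizes into self.targets, repeated calls compute the path selection only once. A hedged usage sketch; the source, interval, pattern, and directory values below are illustrative, not from the commit:

    # Illustrative only: all parameter values here are made up.
    from luigi import date_interval

    task = PathSelectionTaskSpark(
        source=['s3://bucket/logs/'],
        interval=date_interval.Month.parse('2017-01'),
        pattern=['.*tracking.log.*'],
        date_pattern='%Y%m%d',
        manifest_id='abc123',
        manifest_dir='hdfs:///tmp/manifests',
    )
    targets = task.output()        # first call builds and caches the target list
    targets_again = task.output()  # reuses the cached self.targets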
@@ -240,19 +251,26 @@ def get_log_schema(self):
         return event_log_schema
 
     def get_input_rdd(self, *args):
-        manifest_target = self.get_manifest_path(*args)
-        self.log.warn("PYSPARK LOGGER : Getting input rdd ---> target : {}".format(manifest_target.path))
-        if manifest_target.exists():
+        manifest_path = self.get_config_from_args('manifest_path', *args, default_value='')
+        targets = PathSelectionTaskSpark(
+            source=self.source,
+            interval=self.interval,
+            pattern=self.pattern,
+            date_pattern=self.date_pattern,
+            manifest_id=self.manifest_id,
+            manifest_dir=manifest_path,
+            pyspark_logger=self.log
+        ).output()
+        if len(targets) and 'manifest' in targets[0].path:
             # Reading manifest as rdd with spark is alot faster as compared to hadoop.
             # Currently, we're getting only 1 manifest file per request, so we will create a single rdd from it.
             # If there are multiple manifest files, each file can be read as rdd and then union it with other manifest rdds
-            self.log.warn("PYSPARK LOGGER: Reading manifest file :: {} ".format(manifest_target.path))
-            source_rdd = self._spark.sparkContext.textFile(manifest_target.path)
+            self.log.warn("PYSPARK LOGGER: Reading manifest file :: {} ".format(targets[0].path))
+            source_rdd = self._spark.sparkContext.textFile(targets[0].path, 1)
         else:
             # maybe we only need to broadcast it ( on cluster ) and not create rdd. lets see
             self.log.warn("PYSPARK LOGGER: Reading normal targets")
-            input_targets = luigi.task.flatten(self.input())
-            source_rdd = self._spark.sparkContext.parallelize([target.path for target in input_targets])
+            source_rdd = self._spark.sparkContext.parallelize([target.path for target in targets])
         return source_rdd
 
     def get_event_log_dataframe(self, spark, *args, **kwargs):
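The branch above either reads the single manifest file (one input path per line) as a one-partition RDD, or parallelizes the raw target paths directly. A self-contained sketch of the same two branches, assuming a local SparkSession; the paths are illustrative:

    # Self-contained sketch of the two branches in get_input_rdd() above.
    # The SparkSession and paths are illustrative, not part of the job.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master('local[*]').appName('sketch').getOrCreate()
    paths = ['hdfs:///tmp/manifests/abc123.manifest']

    if len(paths) and 'manifest' in paths[0]:
        # Manifest branch: the file lists one input path per line;
        # minPartitions=1 keeps the small file in a single partition.
        source_rdd = spark.sparkContext.textFile(paths[0], 1)
    else:
        # No manifest: ship the target paths themselves as an RDD.
        source_rdd = spark.sparkContext.parallelize(paths)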
@@ -309,7 +327,7 @@ def manifest_id(self):
             'interval': self.interval,
             'pattern': self.pattern,
             'date_pattern': self.date_pattern,
-            'spark':'for_some_difference_with_hadoop_manifest'
+            'spark': 'for_some_difference_with_hadoop_manifest'
         }
         return str(hash(frozenset(params.items()))).replace('-', 'n')
 
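For context, the replace('-', 'n') in manifest_id above keeps a negative hash from putting a leading minus sign into the manifest file name. A quick illustration with made-up parameter values (note that Python 3's hash randomization makes the value differ between interpreter runs):

    # Illustration of the manifest_id derivation; all values are made up.
    params = {
        'source': 's3://bucket/logs/',
        'interval': '2017-01',
        'pattern': '.*tracking.log.*',
        'date_pattern': '%Y%m%d',
        'spark': 'for_some_difference_with_hadoop_manifest',
    }
    manifest_id = str(hash(frozenset(params.items()))).replace('-', 'n')
    # e.g. '70561234...' or 'n81239876...'; a negative hash becomes an 'n'-prefixed id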
@@ -322,15 +340,6 @@ def get_manifest_path(self, *args):
             )
         )
 
-    def requires(self):
-        yield PathSelectionTaskSpark(
-            source=self.source,
-            interval=self.interval,
-            pattern=self.pattern,
-            date_pattern=self.date_pattern,
-            manifest_id=self.manifest_id
-        )
-
     def spark_job(self):
         """
         Spark code for the job

edx/analytics/tasks/util/manifest.py (+8 -7)
@@ -12,37 +12,38 @@
 log = logging.getLogger(__name__)
 
 
-def convert_to_manifest_input_if_necessary(manifest_id, targets):
+def convert_to_manifest_input_if_necessary(manifest_id, targets, manifest_dir=None):
     targets = luigi.task.flatten(targets)
     threshold = configuration.get_config().getint(CONFIG_SECTION, 'threshold', -1)
     if threshold > 0 and len(targets) >= threshold:
         log.debug(
             'Using manifest since %d inputs are greater than or equal to the threshold %d', len(targets), threshold
         )
-        return [create_manifest_target(manifest_id, targets)]
+        return [create_manifest_target(manifest_id, targets, manifest_dir)]
     else:
         log.debug(
             'Directly processing files since %d inputs are less than the threshold %d', len(targets), threshold
         )
         return targets
 
 
-def get_manifest_file_path(manifest_id):
+def get_manifest_file_path(manifest_id, manifest_dir=None):
     # Construct the manifest file URL from the manifest_id and the configuration
-    base_url = configuration.get_config().get(CONFIG_SECTION, 'path')
-    manifest_file_path = url_path_join(base_url, manifest_id + '.manifest')
+    if manifest_dir is None:
+        manifest_dir = configuration.get_config().get(CONFIG_SECTION, 'path')
+    manifest_file_path = url_path_join(manifest_dir, manifest_id + '.manifest')
     return manifest_file_path
 
 
-def create_manifest_target(manifest_id, targets):
+def create_manifest_target(manifest_id, targets, manifest_dir=None):
     # If we are running locally, we need our manifest file to be a local file target, however, if we are running on
     # a real Hadoop cluster, it has to be an HDFS file so that the input format can read it. Luigi makes it a little
     # difficult for us to construct a target that can be one or the other of those types of targets at runtime since
     # it relies on inheritance to signify the difference. We hack the inheritance here, by dynamically choosing the
     # base class at runtime based on the URL of the manifest file.
 
     # Construct the manifest file URL from the manifest_id and the configuration
-    manifest_file_path = get_manifest_file_path(manifest_id)
+    manifest_file_path = get_manifest_file_path(manifest_id, manifest_dir)
 
     # Figure out the type of target that should be used to write/read the file.
     manifest_file_target_class, init_args, init_kwargs = get_target_class_from_url(manifest_file_path)
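The net effect of threading manifest_dir through these helpers is that an explicitly passed directory wins and the configured path is only a fallback. A minimal sketch of that fallback behavior; the directory URLs are illustrative, and the inline string join stands in for url_path_join and the real configuration lookup:

    # Sketch of the fallback added above; values are illustrative and the
    # string join stands in for url_path_join / configuration.get_config().
    def manifest_path_sketch(manifest_id, manifest_dir=None):
        if manifest_dir is None:
            manifest_dir = 'hdfs:///edx/manifests'  # stand-in for the configured path
        return manifest_dir.rstrip('/') + '/' + manifest_id + '.manifest'

    assert manifest_path_sketch('abc') == 'hdfs:///edx/manifests/abc.manifest'
    assert manifest_path_sketch('abc', 's3://bucket/tmp') == 's3://bucket/tmp/abc.manifest'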
