Commit 91ff398
committed Jan 24, 2018

convert TotalEventsDailyTask task to spark #486

1 parent 4d9d9ae

2 files changed: +76 -1 lines changed
edx/analytics/tasks/common/pathutil.py (+19)

@@ -309,3 +309,22 @@ def get_map_input_file(self):
                 log.warn('mapreduce_map_input_file not defined in os.environ, unable to determine input file path')
                 self.incr_counter('Event', 'Missing map_input_file', 1)
                 return ''
+
+
+class EventLogSelectionMixinSpark(EventLogSelectionDownstreamMixin):
+    """
+    Extract events corresponding to a specified time interval.
+    """
+    path_targets = None
+
+    def __init__(self, *args, **kwargs):
+        super(EventLogSelectionMixinSpark, self).__init__(*args, **kwargs)
+        self.lower_bound_date_string = self.interval.date_a.strftime('%Y-%m-%d')  # pylint: disable=no-member
+        self.upper_bound_date_string = self.interval.date_b.strftime('%Y-%m-%d')  # pylint: disable=no-member
+        path_targets = PathSelectionByDateIntervalTask(  # resolve the concrete log paths covered by the interval
+            source=self.source,
+            interval=self.interval,
+            pattern=self.pattern,
+            date_pattern=self.date_pattern,
+        ).output()
+        self.path_targets = [task.path for task in path_targets]
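
For orientation, here is what the mixin's work product looks like. This is illustrative only: the interval, bucket, and log layout below are made up, not part of the commit.

    # Hypothetical values for a task constructed with interval=2018-01-22-2018-01-24:
    task.lower_bound_date_string   # '2018-01-22'
    task.upper_bound_date_string   # '2018-01-24'
    task.path_targets
    # ['s3://bucket/logs/host1/tracking.log-20180122.gz',
    #  's3://bucket/logs/host1/tracking.log-20180123.gz',
    #  ...]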

edx/analytics/tasks/monitor/overall_events.py (+57 -1)

@@ -5,8 +5,9 @@
 import luigi
 
 from edx.analytics.tasks.common.mapreduce import MapReduceJobTask
-from edx.analytics.tasks.common.pathutil import EventLogSelectionMixin
+from edx.analytics.tasks.common.pathutil import EventLogSelectionMixin, EventLogSelectionMixinSpark
 from edx.analytics.tasks.util.url import get_target_from_url
+from luigi.contrib.spark import PySparkTask
 
 log = logging.getLogger(__name__)
 
@@ -34,3 +35,58 @@ def reducer(self, key, values):
 
     def output(self):
         return get_target_from_url(self.output_root)
+
+
+class SparkTotalEventsDailyTask(EventLogSelectionMixinSpark, PySparkTask):
+    """Produce a dataset for total events within a given time period."""
+
+    driver_memory = '2g'
+    executor_memory = '3g'
+
+    output_root = luigi.Parameter()
+
+    def __init__(self, *args, **kwargs):
+        super(SparkTotalEventsDailyTask, self).__init__(*args, **kwargs)
+
+    # TODO: rename this method to output after testing is complete
+    def output_dir(self):
+        return get_target_from_url(self.output_root)
+
+    def main(self, sc, *args):
+        from pyspark.sql import SparkSession
+        from pyspark.sql.types import StringType, StructType
+        from pyspark.sql.functions import date_format, to_date
+        spark = SparkSession.builder.getOrCreate()
+        event_schema = StructType().add("POST", StringType(), True).add("GET", StringType(), True)  # nested 'event' payload
+        module_schema = StructType().add("display_name", StringType(), True) \
+            .add("original_usage_key", StringType(), True) \
+            .add("original_usage_version", StringType(), True) \
+            .add("usage_key", StringType(), True)
+        context_schema = StructType().add("command", StringType(), True) \
+            .add("course_id", StringType(), True) \
+            .add("module", module_schema) \
+            .add("org_id", StringType(), True) \
+            .add("path", StringType(), True) \
+            .add("user_id", StringType(), True)
+
+        event_log_schema = StructType() \
+            .add("username", StringType(), True) \
+            .add("event_type", StringType(), True) \
+            .add("ip", StringType(), True) \
+            .add("agent", StringType(), True) \
+            .add("host", StringType(), True) \
+            .add("referer", StringType(), True) \
+            .add("accept_language", StringType(), True) \
+            .add("event", event_schema) \
+            .add("event_source", StringType(), True) \
+            .add("context", context_schema) \
+            .add("time", StringType(), True) \
+            .add("name", StringType(), True) \
+            .add("page", StringType(), True) \
+            .add("session", StringType(), True)
+
+        df = spark.read.format('json').load(self.path_targets, schema=event_log_schema)  # paths resolved by the mixin
+        df = df.withColumn('event_date', date_format(to_date(df['time']), 'yyyy-MM-dd'))  # timestamp -> date string
+        df = df.filter(df['event_date'] == self.lower_bound_date_string).groupBy('event_date').count()  # events on the interval's start date
+        df.repartition(1).write.csv(self.output_dir().path, mode='overwrite', sep='\t')  # single TSV part file
+        # df.repartition(1).rdd.map(lambda row: '\t'.join(map(str, row))).saveAsTextFile(self.output_dir().path)
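
To smoke-test the new task, a run along these lines should work. This is a sketch only: the output path is a placeholder, source/pattern are assumed to come from configuration, and Spark must be available to luigi's PySparkTask runner.

    import luigi
    from luigi.date_interval import Date
    from edx.analytics.tasks.monitor.overall_events import SparkTotalEventsDailyTask

    # Count events for a single day and write the result as one TSV row.
    luigi.build(
        [SparkTotalEventsDailyTask(
            interval=Date(2018, 1, 22),
            output_root='s3://bucket/output/total_events/',  # placeholder path
        )],
        local_scheduler=True,
    )

Because main() filters on lower_bound_date_string and groups by event_date, each run should produce a single row such as "2018-01-22<TAB>12345".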
