@@ -5,8 +5,9 @@
 import luigi

 from edx.analytics.tasks.common.mapreduce import MapReduceJobTask
-from edx.analytics.tasks.common.pathutil import EventLogSelectionMixin
+from edx.analytics.tasks.common.pathutil import EventLogSelectionMixin, EventLogSelectionMixinSpark
 from edx.analytics.tasks.util.url import get_target_from_url
+from luigi.contrib.spark import PySparkTask

 log = logging.getLogger(__name__)

@@ -34,3 +35,61 @@ def reducer(self, key, values):

     def output(self):
         return get_target_from_url(self.output_root)
+
+
+class SparkTotalEventsDailyTask(EventLogSelectionMixinSpark, PySparkTask):
+    """Produce a dataset for total events within a given time period."""
+
+    driver_memory = '2g'
+    executor_memory = '3g'
+
+    output_root = luigi.Parameter()
+
+    def __init__(self, *args, **kwargs):
+        super(SparkTotalEventsDailyTask, self).__init__(*args, **kwargs)
+
+    # TODO: rename this method to output after testing is complete
+    def output_dir(self):
+        return get_target_from_url(self.output_root)
+
+    def main(self, sc, *args):
+        from pyspark.sql import SparkSession
+        from pyspark.sql.types import StructType, StringType
+        from pyspark.sql.functions import to_date, date_format
+        spark = SparkSession.builder.getOrCreate()
+        # Nested schemas for the `event` and `context` fields of a tracking-log line.
+        event_schema = StructType().add("POST", StringType(), True).add("GET", StringType(), True)
+        module_schema = StructType().add("display_name", StringType(), True) \
+            .add("original_usage_key", StringType(), True) \
+            .add("original_usage_version", StringType(), True) \
+            .add("usage_key", StringType(), True)
+        context_schema = StructType().add("command", StringType(), True) \
+            .add("course_id", StringType(), True) \
+            .add("module", module_schema) \
+            .add("org_id", StringType(), True) \
+            .add("path", StringType(), True) \
+            .add("user_id", StringType(), True)
+
+        event_log_schema = StructType() \
+            .add("username", StringType(), True) \
+            .add("event_type", StringType(), True) \
+            .add("ip", StringType(), True) \
+            .add("agent", StringType(), True) \
+            .add("host", StringType(), True) \
+            .add("referer", StringType(), True) \
+            .add("accept_language", StringType(), True) \
+            .add("event", event_schema) \
+            .add("event_source", StringType(), True) \
+            .add("context", context_schema) \
+            .add("time", StringType(), True) \
+            .add("name", StringType(), True) \
+            .add("page", StringType(), True) \
+            .add("session", StringType(), True)
+
+        df = spark.read.format('json').load(self.path_targets, schema=event_log_schema)
+        # Truncate each ISO-8601 timestamp to its calendar date, then count events per day.
+        df = df.withColumn('event_date', date_format(to_date(df['time']), 'yyyy-MM-dd'))
+        # NOTE: this filter keeps only the first day of the requested interval.
+        df = df.filter(df['event_date'] == self.lower_bound_date_string).groupBy('event_date').count()
+        df.repartition(1).write.csv(self.output_dir().path, mode='overwrite', sep='\t')
+        # df.repartition(1).rdd.map(lambda row: '\t'.join(map(str, row))).saveAsTextFile(self.output_dir().path)
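
For quick verification outside the pipeline, the aggregation at the heart of main() can be reproduced with a local Spark session. This is a minimal sketch, assuming a local[1] master, an app name, and made-up timestamps for illustration; it is not part of the change above.

    # Minimal local sketch of the to_date/groupBy/count aggregation used in main().
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import to_date, date_format

    spark = SparkSession.builder.master('local[1]').appName('total-events-sketch').getOrCreate()

    # Three fake tracking-log timestamps: two on 2017-06-01, one on 2017-06-02.
    events = spark.createDataFrame(
        [('2017-06-01T10:00:00',), ('2017-06-01T11:30:00',), ('2017-06-02T09:15:00',)],
        ['time'],
    )

    daily = (
        events
        .withColumn('event_date', date_format(to_date(events['time']), 'yyyy-MM-dd'))
        .groupBy('event_date')
        .count()
    )
    daily.show()  # expect a count of 2 for 2017-06-01 and 1 for 2017-06-02
    spark.stop()

The same to_date/date_format round trip is what derives event_date in the task, so the per-day counts here mirror what the job writes out, before its single-day filter on lower_bound_date_string is applied.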