user location spark initial task

rao-abdul-mannan · rao-abdul-mannan · commit 2ab865d5286d · 2018-05-28T16:38:46.000+05:00
diff --git a/edx/analytics/tasks/insights/location_per_course.py b/edx/analytics/tasks/insights/location_per_course.py
@@ -15,6 +15,7 @@
 from edx.analytics.tasks.common.pathutil import (
     EventLogSelectionDownstreamMixin, EventLogSelectionMixin, PathSelectionByDateIntervalTask
 )
+from edx.analytics.tasks.common.spark import EventLogSelectionMixinSpark, SparkJobTask
 from edx.analytics.tasks.insights.database_imports import ImportStudentCourseEnrollmentTask
 from edx.analytics.tasks.util import eventlog
 from edx.analytics.tasks.util.decorators import workflow_entry_point
@@ -163,6 +164,98 @@ def run(self):
                 target.open("w").close()  # touch the file
 
 
+class LastDailyIpAddressOfUserTaskSpark(EventLogSelectionMixinSpark, WarehouseMixin, SparkJobTask):
+    """
+    Spark alternate of LastDailyIpAddressOfUserTask.
+
+    Keeps the newest event per (date, user, course) and writes it as TSV partitioned by `dt`.
+    """
+
+    output_parent_dir = 'last_ip_of_user_id'
+    marker = luigi.Parameter(
+        config_path={'section': 'map-reduce', 'name': 'marker'},
+        significant=False,
+        description='A URL location to a directory where a marker file will be written on task completion.',
+    )
+
+    def output_dir(self):
+        """Target for the directory under the warehouse that receives the Spark output."""
+        return get_target_from_url(url_path_join(self.warehouse_path, self.output_parent_dir))
+
+    def output(self):
+        """Marker target; its name is derived from hash(self) under the `marker` URL."""
+        marker_url = url_path_join(self.marker, str(hash(self)))
+        return get_target_from_url(marker_url, marker=True)
+
+    def output_paths(self):
+        """List of targets, one per date in the interval, for that date's output partition."""
+        return [
+            get_target_from_url(
+                url_path_join(self.hive_partition_path(self.output_parent_dir, day.isoformat()))
+            )
+            for day in self.interval
+        ]
+
+    def on_success(self):  # pragma: no cover
+        """Touch the marker target returned by output()."""
+        self.output().touch_marker()
+
+    def run(self):
+        """Clear existing partitions for the interval, then run the Spark job."""
+        self.remove_output_on_overwrite()
+        # spark_job() writes with mode='append'; existing partitions are removed
+        # first, presumably so that a re-run does not duplicate rows.
+        for target in self.output_paths():
+            if target.exists():
+                target.remove()
+        super(LastDailyIpAddressOfUserTaskSpark, self).run()
+
+    def spark_job(self, *args):
+        """Select the latest event per (date, user, course) and write it partitioned by date."""
+        from edx.analytics.tasks.util.spark_util import get_course_id, get_event_time_string
+        from pyspark.sql.functions import udf
+        from pyspark.sql.types import StringType
+        df = self.get_event_log_dataframe(self._spark)
+        # register udfs
+        get_event_time = udf(get_event_time_string, StringType())
+        get_courseid = udf(get_course_id, StringType())
+        # Skip task-generated events, enrollment events and events without a user id.
+        df = df.filter(
+            (df['event_source'] != 'task') &
+            ~ df['event_type'].startswith('edx.course.enrollment.') &
+            (df['context.user_id'] != '')
+        )
+        df = df.withColumn('course_id', get_courseid(df['context'])) \
+            .withColumn('timestamp', get_event_time(df['time']))
+        # NOTE(review): these predicates are OR-ed, so a row passes when any one of
+        # them is true; confirm whether AND (all fields required) was intended.
+        df = df.filter("course_id != '' or timestamp != '' or ip != ''")
+        df.createOrReplaceTempView('location')
+        # rank 1 is the row with the greatest timestamp in each (event_date, user_id, course_id) group.
+        query = """
+                SELECT
+                    timestamp,
+                    ip,
+                    user_id,
+                    course_id,
+                    dt
+                FROM (
+                    SELECT
+                        event_date as dt,
+                        context.user_id as user_id,
+                        course_id,
+                        timestamp,
+                        ip,
+                        ROW_NUMBER() over ( PARTITION BY event_date, context.user_id, course_id ORDER BY timestamp desc) as rank
+                        FROM location
+                ) user_location
+                WHERE rank <= 1
+                ORDER BY user_id
+                """
+        result = self._spark.sql(query)
+        result.coalesce(1).write.partitionBy('dt').csv(self.output_dir().path, mode='append', sep='\t')
+
+
 class LastCountryOfUserDownstreamMixin(
         WarehouseMixin,
         OverwriteOutputMixin,
diff --git a/edx/analytics/tasks/util/spark_util.py b/edx/analytics/tasks/util/spark_util.py
@@ -41,6 +41,18 @@ def get_key_value_from_event(event, key, default_value=None):
     return default_value
 
 
+def get_event_time_string(event_time):
+    """Return `event_time` cut at its first '+', padded with zero microseconds; None on failure."""
+    try:
+        # Everything before the first '+' is kept; the UTC offset after it is dropped.
+        stamp, _, _ = event_time.partition('+')
+        # Append zero microseconds only when no fractional part is present.
+        return stamp if '.' in stamp else '{datetime}.000000'.format(datetime=stamp)
+    except Exception:  # pylint: disable=broad-except
+        # Any failure (e.g. a non-string value) is reported as a missing timestamp.
+        return None
+
+
 def get_course_id(event_context, from_url=False):
     """
     Gets course_id from event's data.