Skip to content
This repository has been archived by the owner on May 1, 2024. It is now read-only.

Commit

Permalink
user location spark initial task
Browse files Browse the repository at this point in the history
  • Loading branch information
rao-abdul-mannan committed May 28, 2018
1 parent 0b8b937 commit 2ab865d
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 0 deletions.
93 changes: 93 additions & 0 deletions edx/analytics/tasks/insights/location_per_course.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from edx.analytics.tasks.common.pathutil import (
EventLogSelectionDownstreamMixin, EventLogSelectionMixin, PathSelectionByDateIntervalTask
)
from edx.analytics.tasks.common.spark import EventLogSelectionMixinSpark, SparkJobTask
from edx.analytics.tasks.insights.database_imports import ImportStudentCourseEnrollmentTask
from edx.analytics.tasks.util import eventlog
from edx.analytics.tasks.util.decorators import workflow_entry_point
Expand Down Expand Up @@ -163,6 +164,98 @@ def run(self):
target.open("w").close() # touch the file


class LastDailyIpAddressOfUserTaskSpark(EventLogSelectionMixinSpark, WarehouseMixin, SparkJobTask):
    """
    Spark alternate of LastDailyIpAddressOfUserTask.

    For each (event date, user, course) combination found in the selected event logs, the
    most recent event is kept and its timestamp, ip, user_id and course_id are written as
    tab-separated files, partitioned by ``dt``, under ``output_parent_dir`` in the warehouse.
    Task completion is tracked through a marker file rather than the data files themselves.
    """

    # Directory (relative to the warehouse path) that receives the partitioned output.
    output_parent_dir = 'last_ip_of_user_id'
    marker = luigi.Parameter(
        config_path={'section': 'map-reduce', 'name': 'marker'},
        significant=False,
        description='A URL location to a directory where a marker file will be written on task completion.',
    )

    def output_dir(self):
        """Return the target for the root directory the Spark job writes into."""
        return get_target_from_url(
            url_path_join(
                self.warehouse_path,
                self.output_parent_dir
            )
        )

    def output(self):
        """Return the marker target that signals completion of this task."""
        marker_url = url_path_join(self.marker, str(hash(self)))
        return get_target_from_url(marker_url, marker=True)

    def output_paths(self):
        """Return a list with one partition target for every date in the task interval."""
        return [
            get_target_from_url(
                url_path_join(
                    self.hive_partition_path(self.output_parent_dir, date.isoformat())
                )
            )
            for date in self.interval
        ]

    def on_success(self):  # pragma: no cover
        """Write the marker file once the job has completed successfully."""
        self.output().touch_marker()

    def run(self):
        """Clear previously written partitions for the interval, then run the Spark job."""
        self.remove_output_on_overwrite()
        # spark_job() writes with mode='append', so partitions left over from an earlier
        # run have to be removed first; otherwise their rows would be kept alongside the
        # newly written ones.
        for target in self.output_paths():
            if target.exists():
                target.remove()
        super(LastDailyIpAddressOfUserTaskSpark, self).run()

    def spark_job(self, *args):
        """
        Compute the last event per (date, user, course) and write it to the output directory.

        Positional arguments are accepted for compatibility with the caller and are not used.
        """
        # Function-scope imports, as in the original -- presumably because these modules
        # are only needed (and available) where the Spark job actually runs.
        from edx.analytics.tasks.util.spark_util import get_course_id, get_event_time_string
        from pyspark.sql.functions import udf
        from pyspark.sql.types import StringType

        df = self.get_event_log_dataframe(self._spark)

        # register udfs
        get_event_time = udf(get_event_time_string, StringType())
        get_courseid = udf(get_course_id, StringType())

        # Skip task-sourced events, enrollment events and events without a user id.
        df = df.filter(
            (df['event_source'] != 'task') &
            ~ df['event_type'].startswith('edx.course.enrollment.') &
            (df['context.user_id'] != '')
        )
        df = df.withColumn('course_id', get_courseid(df['context'])) \
            .withColumn('timestamp', get_event_time(df['time']))
        # NOTE(review): the conditions below are OR-ed, so a row is kept as soon as any one
        # of the three columns is non-empty.  Confirm whether requiring all of them (AND)
        # was intended; left unchanged here to preserve the current output.
        df = df.filter("course_id != '' or timestamp != '' or ip != ''")
        df.createOrReplaceTempView('location')

        # ROW_NUMBER() ordered by timestamp descending puts the latest event of each
        # (event_date, user_id, course_id) group first; only that row is selected.
        query = """
            SELECT
                timestamp,
                ip,
                user_id,
                course_id,
                dt
            FROM (
                SELECT
                    event_date as dt,
                    context.user_id as user_id,
                    course_id,
                    timestamp,
                    ip,
                    ROW_NUMBER() over ( PARTITION BY event_date, context.user_id, course_id ORDER BY timestamp desc) as rank
                FROM location
            ) user_location
            WHERE rank <= 1
            ORDER BY user_id
        """
        result = self._spark.sql(query)
        # coalesce(1) collapses the result into a single Spark partition before it is
        # written out, partitioned by dt, as tab-separated text.
        result.coalesce(1).write.partitionBy('dt').csv(self.output_dir().path, mode='append', sep='\t')


class LastCountryOfUserDownstreamMixin(
WarehouseMixin,
OverwriteOutputMixin,
Expand Down
12 changes: 12 additions & 0 deletions edx/analytics/tasks/util/spark_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ def get_key_value_from_event(event, key, default_value=None):
return default_value


def get_event_time_string(event_time):
    """
    Normalize an event's time value into a timestamp string with a fractional part.

    Everything from the first '+' onwards (the time zone offset) is discarded.  When the
    remaining text contains no '.', a '.000000' suffix is appended so every result carries
    a fractional-seconds part.  Any failure while processing the value (for example a
    ``None`` or non-string input) yields ``None`` instead of raising.

    Note that an empty string is not treated as a failure: it comes back as '.000000'.
    """
    try:
        # Drop the time zone offset; microseconds, when present, are preserved.
        naive_part = event_time.split('+', 1)[0]
        if '.' in naive_part:
            return naive_part
        return '{datetime}.000000'.format(datetime=naive_part)
    except Exception:  # pylint: disable=broad-except
        return None


def get_course_id(event_context, from_url=False):
"""
Gets course_id from event's data.
Expand Down

0 comments on commit 2ab865d

Please sign in to comment.