 from edx.analytics.tasks.common.pathutil import (
     EventLogSelectionDownstreamMixin, EventLogSelectionMixin, PathSelectionByDateIntervalTask
 )
+from edx.analytics.tasks.common.spark import EventLogSelectionMixinSpark, SparkJobTask
 from edx.analytics.tasks.insights.database_imports import ImportStudentCourseEnrollmentTask
 from edx.analytics.tasks.util import eventlog
 from edx.analytics.tasks.util.decorators import workflow_entry_point
@@ -163,6 +164,98 @@ def run(self):
         target.open("w").close() # touch the file


+class LastDailyIpAddressOfUserTaskSpark(EventLogSelectionMixinSpark, WarehouseMixin, SparkJobTask):
+    """Spark alternative to LastDailyIpAddressOfUserTask."""
+
+    output_parent_dir = 'last_ip_of_user_id'
+    marker = luigi.Parameter(
+        config_path={'section': 'map-reduce', 'name': 'marker'},
+        significant=False,
+        description='A URL location to a directory where a marker file will be written on task completion.',
+    )
+
+    def output_dir(self):
+        """
+        Output directory for the Spark task.
+        """
+        return get_target_from_url(
+            url_path_join(
+                self.warehouse_path,
+                self.output_parent_dir
+            )
+        )
+
+    def output(self):
+        """
+        Marker output path.
+        """
+        marker_url = url_path_join(self.marker, str(hash(self)))
+        return get_target_from_url(marker_url, marker=True)
+
+    def output_paths(self):
+        """
+        Output partition paths.
+        """
+        return map(
+            lambda date: get_target_from_url(
+                url_path_join(
+                    self.hive_partition_path(self.output_parent_dir, date.isoformat())
+                )
+            ),
+            self.interval
+        )
+
+    def on_success(self):  # pragma: no cover
+        # touch the marker file to record that the task completed successfully
+        self.output().touch_marker()
+
+    def run(self):
+        self.remove_output_on_overwrite()
+        removed_partitions = [target.remove() for target in self.output_paths() if target.exists()]
+        super(LastDailyIpAddressOfUserTaskSpark, self).run()
+
+    def spark_job(self, *args):
+        from edx.analytics.tasks.util.spark_util import get_event_predicate_labels, get_course_id, get_event_time_string
+        from pyspark.sql.functions import udf, struct, split, explode, lit, col
+        from pyspark.sql.window import Window
+        from pyspark.sql.types import ArrayType, StringType
+        df = self.get_event_log_dataframe(self._spark)
+        # register udfs
+        get_event_time = udf(get_event_time_string, StringType())
+        get_courseid = udf(get_course_id, StringType())
+        df = df.filter(
+            (df['event_source'] != 'task') &
+            ~ df['event_type'].startswith('edx.course.enrollment.') &
+            (df['context.user_id'] != '')
+        )
+        df = df.withColumn('course_id', get_courseid(df['context'])) \
+            .withColumn('timestamp', get_event_time(df['time']))
+        df = df.filter("course_id != '' or timestamp != '' or ip != ''")
+        df.createOrReplaceTempView('location')
+        query = """
+            SELECT
+                timestamp,
+                ip,
+                user_id,
+                course_id,
+                dt
+            FROM (
+                SELECT
+                    event_date as dt,
+                    context.user_id as user_id,
+                    course_id,
+                    timestamp,
+                    ip,
+                    ROW_NUMBER() over (PARTITION BY event_date, context.user_id, course_id ORDER BY timestamp desc) as rank
+                FROM location
+            ) user_location
+            WHERE rank <= 1
+            ORDER BY user_id
+        """
+        result = self._spark.sql(query)
+        result.coalesce(1).write.partitionBy('dt').csv(self.output_dir().path, mode='append', sep='\t')
+
+
 class LastCountryOfUserDownstreamMixin(
         WarehouseMixin,
         OverwriteOutputMixin,
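The `ROW_NUMBER()` query in `spark_job` keeps, for each (event date, user, course) combination, only the row with the latest timestamp, i.e. the user's last IP address seen on that day, and the final write partitions the result by `dt`. A minimal, self-contained sketch of the same dedup expressed with the DataFrame window API, on made-up data and a local SparkSession (illustrative only, not part of this patch):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

spark = SparkSession.builder.master('local[2]').appName('last-ip-sketch').getOrCreate()

# Toy rows mirroring the columns spark_job produces before the SQL step.
events = spark.createDataFrame(
    [
        ('2017-01-02', '10', 'course-v1:edX+DemoX+T1', '2017-01-02T08:00:00', '10.0.0.1'),
        ('2017-01-02', '10', 'course-v1:edX+DemoX+T1', '2017-01-02T17:30:00', '10.0.0.2'),
        ('2017-01-02', '11', 'course-v1:edX+DemoX+T1', '2017-01-02T09:15:00', '10.0.0.3'),
    ],
    ['dt', 'user_id', 'course_id', 'timestamp', 'ip'],
)

# Rank events within each (dt, user_id, course_id) partition, newest first,
# then keep only the top-ranked row: the user's last seen IP for that day.
window = Window.partitionBy('dt', 'user_id', 'course_id').orderBy(col('timestamp').desc())
last_ip = (
    events
    .withColumn('rank', row_number().over(window))
    .where(col('rank') == 1)
    .select('timestamp', 'ip', 'user_id', 'course_id', 'dt')
)
last_ip.show(truncate=False)
```

Because the write uses `partitionBy('dt')`, each day's rows land under a `dt=YYYY-MM-DD/` subdirectory of `last_ip_of_user_id`, which should match the `dt=` partition directories that `output_paths()` clears when the task is re-run with overwrite.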
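Completion is tracked through the `marker` parameter rather than the data files: `output()` points at a marker URL derived from the task's hash, and `on_success()` touches it once the Spark job finishes. A generic Luigi sketch of that marker pattern, with invented names (`output_root`, `marker_root`) and a plain `LocalTarget` standing in for the URL-based targets used in the pipeline:

```python
import os

import luigi


class MarkerCompletedTask(luigi.Task):
    """Writes data files, but signals completion only through a separate marker file."""

    output_root = luigi.Parameter()  # where the real data goes (illustrative name)
    marker_root = luigi.Parameter()  # where the completion marker goes (illustrative name)

    def output(self):
        # Luigi checks this target to decide whether the task is complete, so a
        # partially written data directory never counts as a finished run.
        return luigi.LocalTarget(os.path.join(self.marker_root, self.task_id))

    def run(self):
        if not os.path.exists(self.output_root):
            os.makedirs(self.output_root)
        with open(os.path.join(self.output_root, 'part-00000.tsv'), 'w') as data_file:
            data_file.write('2017-01-02T17:30:00\t10.0.0.2\t10\tcourse-v1:edX+DemoX+T1\t2017-01-02\n')
        # Touch the marker last, once all of the data has been written.
        with self.output().open('w'):
            pass
```

The task in the diff performs the equivalent touch in `on_success()` via `touch_marker()`, leaving the Spark write path itself unchanged.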