This repository was archived by the owner on May 1, 2024. It is now read-only.

Commit b89c71d

convert user activity task to spark

1 parent: 91ff398

File tree

9 files changed: +446, -115 lines

edx/analytics/tasks/common/pathutil.py (-19 lines)
@@ -309,22 +309,3 @@ def get_map_input_file(self):
                 log.warn('mapreduce_map_input_file not defined in os.environ, unable to determine input file path')
                 self.incr_counter('Event', 'Missing map_input_file', 1)
                 return ''
-
-
-class EventLogSelectionMixinSpark(EventLogSelectionDownstreamMixin):
-    """
-    Extract events corresponding to a specified time interval.
-    """
-    path_targets = None
-
-    def __init__(self, *args, **kwargs):
-        super(EventLogSelectionDownstreamMixin, self).__init__(*args, **kwargs)
-        self.lower_bound_date_string = self.interval.date_a.strftime('%Y-%m-%d')  # pylint: disable=no-member
-        self.upper_bound_date_string = self.interval.date_b.strftime('%Y-%m-%d')  # pylint: disable=no-member
-        path_targets = PathSelectionByDateIntervalTask(
-            source=self.source,
-            interval=self.interval,
-            pattern=self.pattern,
-            date_pattern=self.date_pattern,
-        ).output()
-        self.path_targets = [task.path for task in path_targets]

edx/analytics/tasks/common/spark.py (new file, +276 lines)
@@ -0,0 +1,276 @@
+import json
+import os
+import tempfile
+import zipfile
+
+import luigi.configuration
+from luigi.contrib.spark import PySparkTask
+
+from edx.analytics.tasks.common.pathutil import EventLogSelectionDownstreamMixin, PathSelectionByDateIntervalTask
+from edx.analytics.tasks.util.overwrite import OverwriteOutputMixin
+
+_file_path_to_package_meta_path = {}
+
+
+def get_package_metadata_paths():
+    """
+    Map each installed file path to its package metadata directory, so the
+    metadata can be shipped to the EMR cluster alongside the code.
+    """
+    from distlib.database import DistributionPath
+
+    if len(_file_path_to_package_meta_path) > 0:
+        return _file_path_to_package_meta_path
+
+    dist_path = DistributionPath(include_egg=True)
+    for distribution in dist_path.get_distributions():
+        metadata_path = distribution.path
+        for installed_file_path, _hash, _size in distribution.list_installed_files():
+            absolute_installed_file_path = installed_file_path
+            if not os.path.isabs(installed_file_path):
+                absolute_installed_file_path = os.path.join(os.path.dirname(metadata_path), installed_file_path)
+            normalized_file_path = os.path.realpath(absolute_installed_file_path)
+            _file_path_to_package_meta_path[normalized_file_path] = metadata_path
+
+    return _file_path_to_package_meta_path
+
+
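For orientation, the distlib lookup above can be exercised on its own; a minimal sketch, assuming distlib is available in the active environment:

from distlib.database import DistributionPath

# Enumerate installed distributions exactly as get_package_metadata_paths() does;
# distribution.path is the .dist-info/.egg-info metadata directory.
dist_path = DistributionPath(include_egg=True)
for distribution in dist_path.get_distributions():
    print(distribution.path)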
+def dereference(f):
+    if os.path.islink(f):
+        # by joining with the dirname we are certain to get the absolute path
+        return dereference(os.path.join(os.path.dirname(f), os.readlink(f)))
+    else:
+        return f
+
+
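A quick self-contained check of dereference() resolving a relative symlink (POSIX-only; the paths are created in a scratch directory):

import os
import tempfile

tmp = tempfile.mkdtemp()
real_file = os.path.join(tmp, 'real.py')
open(real_file, 'w').close()
# readlink() on a relative symlink returns just 'real.py'; joining with the
# dirname is what makes the resolved path absolute.
os.symlink('real.py', os.path.join(tmp, 'link.py'))
assert dereference(os.path.join(tmp, 'link.py')) == real_file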
+def create_packages_archive(packages, archive_dir_path):
+    """
+    Create a zip archive of all the packages listed in `packages` and return the list of archive locations.
+    """
+    archives_list = []
+    package_metadata_paths = get_package_metadata_paths()
+    metadata_to_add = dict()
+
+    package_zip_path = os.path.join(archive_dir_path, 'packages.zip')
+    package_zip = zipfile.ZipFile(package_zip_path, "w", compression=zipfile.ZIP_DEFLATED)
+    archives_list.append(package_zip_path)
+
+    def add(src, dst, package_name):
+        # Ensure any entry points and other egg-info metadata is also transmitted along with
+        # this file. If it is associated with any egg-info directories, ship them too.
+        metadata_path = package_metadata_paths.get(os.path.realpath(src))
+        if metadata_path:
+            metadata_to_add[package_name] = metadata_path
+
+        package_zip.write(src, dst)
+
+    def add_files_for_package(sub_package_path, root_package_path, root_package_name, package_name):
+        for root, dirs, files in os.walk(sub_package_path):
+            if '.svn' in dirs:
+                dirs.remove('.svn')
+            for f in files:
+                if not f.endswith(".pyc") and not f.startswith("."):
+                    add(dereference(root + "/" + f),
+                        root.replace(root_package_path, root_package_name) + "/" + f,
+                        package_name)
+
+    for package in packages:
+        # Archive each package
+        if not getattr(package, "__path__", None) and '.' in package.__name__:
+            package = __import__(package.__name__.rpartition('.')[0], None, None, 'non_empty')
+
+        n = package.__name__.replace(".", "/")
+
+        # Check length of path, because the attribute may exist and be an empty list.
+        if len(getattr(package, "__path__", [])) > 0:
+            # TODO: (BUG) picking only the first path does not
+            # properly deal with namespaced packages in different
+            # directories
+            p = package.__path__[0]
+
+            if p.endswith('.egg') and os.path.isfile(p):
+                raise RuntimeError('Not going to archive egg files!!!')
+                # Add the entire egg file
+                # p = p[:p.find('.egg') + 4]
+                # add(dereference(p), os.path.basename(p))
+
+            else:
+                # include __init__ files from parent projects
+                root = []
+                for parent in package.__name__.split('.')[0:-1]:
+                    root.append(parent)
+                    module_name = '.'.join(root)
+                    directory = '/'.join(root)
+
+                    add(dereference(__import__(module_name, None, None, 'non_empty').__path__[0] + "/__init__.py"),
+                        directory + "/__init__.py",
+                        package.__name__)
+
+                add_files_for_package(p, p, n, package.__name__)
+
+        else:
+            f = package.__file__
+            if f.endswith("pyc"):
+                f = f[:-3] + "py"
+            if n.find(".") == -1:
+                add(dereference(f), os.path.basename(f), package.__name__)
+            else:
+                add(dereference(f), n + ".py", package.__name__)
+
+        # include metadata in the same zip file
+        metadata_path = metadata_to_add.get(package.__name__)
+        if metadata_path is not None:
+            add_files_for_package(metadata_path, metadata_path, os.path.basename(metadata_path), package.__name__)
+
+    # Close the archive so its central directory is flushed to disk.
+    package_zip.close()
+    return archives_list
+
+
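As a usage sketch, archiving a single package into a scratch directory (the package choice here is illustrative):

import tempfile

import luigi

scratch_dir = tempfile.mkdtemp()
# Writes <scratch_dir>/packages.zip containing luigi's sources plus any
# associated egg-info/dist-info metadata, and returns the archive paths.
archives = create_packages_archive([luigi], scratch_dir)
print(archives)  # ['<scratch_dir>/packages.zip']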
+class EventLogSelectionMixinSpark(EventLogSelectionDownstreamMixin):
+    """
+    Extract events corresponding to a specified time interval.
+    """
+    path_targets = None
+
+    def __init__(self, *args, **kwargs):
+        """
+        Call the path selection task to get the list of log files matching the pattern.
+        """
+        super(EventLogSelectionMixinSpark, self).__init__(*args, **kwargs)
+        self.lower_bound_date_string = self.interval.date_a.strftime('%Y-%m-%d')  # pylint: disable=no-member
+        self.upper_bound_date_string = self.interval.date_b.strftime('%Y-%m-%d')  # pylint: disable=no-member
+        path_targets = PathSelectionByDateIntervalTask(
+            source=self.source,
+            interval=self.interval,
+            pattern=self.pattern,
+            date_pattern=self.date_pattern,
+        ).output()
+        self.path_targets = [task.path for task in path_targets]
+
+    def get_log_schema(self):
+        """
+        Get Spark-based schema for processing event logs.
+        :return: Spark schema
+        """
+        from pyspark.sql.types import StructType, StringType
+        event_schema = StructType().add("POST", StringType(), True).add("GET", StringType(), True)
+        module_schema = StructType().add("display_name", StringType(), True) \
+            .add("original_usage_key", StringType(), True) \
+            .add("original_usage_version", StringType(), True) \
+            .add("usage_key", StringType(), True)
+        context_schema = StructType().add("command", StringType(), True) \
+            .add("course_id", StringType(), True) \
+            .add("module", module_schema) \
+            .add("org_id", StringType(), True) \
+            .add("path", StringType(), True) \
+            .add("user_id", StringType(), True)
+
+        event_log_schema = StructType() \
+            .add("username", StringType(), True) \
+            .add("event_type", StringType(), True) \
+            .add("ip", StringType(), True) \
+            .add("agent", StringType(), True) \
+            .add("host", StringType(), True) \
+            .add("referer", StringType(), True) \
+            .add("accept_language", StringType(), True) \
+            .add("event", event_schema) \
+            .add("event_source", StringType(), True) \
+            .add("context", context_schema) \
+            .add("time", StringType(), True) \
+            .add("name", StringType(), True) \
+            .add("page", StringType(), True) \
+            .add("session", StringType(), True)
+
+        return event_log_schema
+
+    def get_event_log_dataframe(self, spark, *args, **kwargs):
+        from pyspark.sql.functions import to_date, date_format
+        dataframe = spark.read.format('json').load(self.path_targets, schema=self.get_log_schema())
+        dataframe = dataframe.filter(dataframe['time'].isNotNull()) \
+            .withColumn('event_date', date_format(to_date(dataframe['time']), 'yyyy-MM-dd'))
+        dataframe = dataframe.filter(dataframe['event_date'] == self.lower_bound_date_string)
+        return dataframe
+
+
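Roughly how the mixin is meant to be consumed (a sketch; `spark` stands for a live SparkSession and `task` for an instance of a class using this mixin):

# The returned dataframe is already restricted to events whose event_date
# equals the interval's lower-bound date.
df = task.get_event_log_dataframe(spark)
df.select('username', 'event_type', 'event_date').show(5)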
+class SparkJobTask(OverwriteOutputMixin, PySparkTask):
+    """
+    Wrapper for a Spark task.
+    """
+
+    _spark = None
+    _spark_context = None
+    _sql_context = None
+    _hive_context = None
+    _tmp_dir = None
+
+    driver_memory = '2g'
+    executor_memory = '3g'
+    always_log_stderr = False  # log stderr if spark fails, True for verbose log
+
+    def init_spark(self, sc):
+        """
+        Initialize spark, sql and hive context.
+        :param sc: Spark context
+        """
+        from pyspark.sql import SparkSession, SQLContext, HiveContext
+        self._sql_context = SQLContext(sc)
+        self._spark_context = sc
+        self._spark = SparkSession.builder.getOrCreate()
+        self._hive_context = HiveContext(sc)
+
+    def spark_job(self):
+        """
+        Spark code for the job.
+        """
+        raise NotImplementedError
+
+    def _load_internal_dependency_on_cluster(self):
+        """
+        Create a zip of the packages below and load it onto the spark worker nodes.
+
+        Loading via luigi configuration does not work, as it creates a tar file whereas spark does not load tar files.
+        """
+
+        # import packages to be loaded on cluster
+        import edx
+        import luigi
+        import opaque_keys
+        import stevedore
+        import bson
+        import ccx_keys
+        import cjson
+        import boto
+        import filechunkio
+        import ciso8601
+        import chardet
+        import urllib3
+        import certifi
+        import idna
+        import requests
+
+        dependencies_list = []
+        egg_files = luigi.configuration.get_config().get('spark', 'edx_egg_files', None)
+        if isinstance(egg_files, basestring):
+            dependencies_list = json.loads(egg_files)
+        packages = [edx, luigi, opaque_keys, stevedore, bson, ccx_keys, cjson, boto, filechunkio, ciso8601, chardet,
+                    urllib3, certifi, idna, requests]
+        self._tmp_dir = tempfile.mkdtemp()
+        dependencies_list += create_packages_archive(packages, self._tmp_dir)
+        # dependencies_list.append('s3://edx-analytics-scratch/egg_files/edx_opaque_keys-0.4-py2.7.egg')
+        if len(dependencies_list) > 0:
+            for file_path in dependencies_list:
+                self._spark_context.addPyFile(file_path)
+
+    def run(self):
+        self.remove_output_on_overwrite()
+        super(SparkJobTask, self).run()
+
+    def _clean(self):
+        """Do any cleanup after job here"""
+        import shutil
+        shutil.rmtree(self._tmp_dir)
+
+    def main(self, sc, *args):
+        self.init_spark(sc)
+        self._load_internal_dependency_on_cluster()  # load packages on EMR cluster for spark worker nodes
+        self.spark_job()
+        self._clean()
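_load_internal_dependency_on_cluster reads extra egg paths from luigi configuration as a JSON-encoded list. A hypothetical luigi.cfg entry (bucket and file names invented for illustration):

[spark]
edx_egg_files = ["s3://my-bucket/egg_files/edx_opaque_keys-0.4-py2.7.egg"]

A concrete job then subclasses SparkJobTask and implements spark_job(). A minimal sketch (the task name and output handling are hypothetical; get_target_from_url is the pipeline's usual URL-to-target helper) that also mixes in EventLogSelectionMixinSpark to read event logs:

import luigi

from edx.analytics.tasks.util.url import get_target_from_url


class EventCountBySourceTask(EventLogSelectionMixinSpark, SparkJobTask):
    """Hypothetical example: count one day's events per event_source."""

    output_root = luigi.Parameter()

    def output(self):
        return get_target_from_url(self.output_root)

    def spark_job(self):
        df = self.get_event_log_dataframe(self._spark)
        df.groupBy('event_source').count() \
            .write.csv(self.output_root, mode='overwrite')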
