import json
import os
import tempfile
import zipfile

import luigi.configuration
from luigi.contrib.spark import PySparkTask

from edx.analytics.tasks.common.pathutil import EventLogSelectionDownstreamMixin, PathSelectionByDateIntervalTask
from edx.analytics.tasks.util.overwrite import OverwriteOutputMixin

_file_path_to_package_meta_path = {}


def get_package_metadata_paths():
    """
    Build a mapping from each installed file path to the path of the package metadata that owns it.

    The result is cached at module level so the distribution scan only happens once; it is used to
    ship package metadata along with the package files to the EMR cluster.
    """
    from distlib.database import DistributionPath

    if len(_file_path_to_package_meta_path) > 0:
        return _file_path_to_package_meta_path

    dist_path = DistributionPath(include_egg=True)
    for distribution in dist_path.get_distributions():
        metadata_path = distribution.path
        for installed_file_path, _hash, _size in distribution.list_installed_files():
            absolute_installed_file_path = installed_file_path
            if not os.path.isabs(installed_file_path):
                absolute_installed_file_path = os.path.join(os.path.dirname(metadata_path), installed_file_path)
            normalized_file_path = os.path.realpath(absolute_installed_file_path)
            _file_path_to_package_meta_path[normalized_file_path] = metadata_path

    return _file_path_to_package_meta_path
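
# Illustration (hypothetical paths only): after the scan, the cache holds entries such as
#     '/.../site-packages/opaque_keys/__init__.py' -> '/.../site-packages/opaque_keys.egg-info'
# mapping each installed file to the metadata directory of the distribution that owns it.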


def dereference(f):
    """Follow symlinks until reaching the actual file and return its path."""
    if os.path.islink(f):
        # By joining with the dirname we are certain to get the absolute path.
        return dereference(os.path.join(os.path.dirname(f), os.readlink(f)))
    else:
        return f
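
# For example (hypothetical paths): if '/usr/local/bin/python' links to 'python2', which in turn
# links to '/usr/bin/python2.7', then dereference('/usr/local/bin/python') returns '/usr/bin/python2.7'.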


def create_packages_archive(packages, archive_dir_path):
    """
    Create a zip archive of all the packages listed in `packages` and return the list of archive file locations.
    """
    archives_list = []
    package_metadata_paths = get_package_metadata_paths()
    metadata_to_add = dict()

    package_zip_path = os.path.join(archive_dir_path, 'packages.zip')
    package_zip = zipfile.ZipFile(package_zip_path, "w", compression=zipfile.ZIP_DEFLATED)
    archives_list.append(package_zip_path)

    def add(src, dst, package_name):
        # Ensure any entry points and other egg-info metadata is also transmitted along with
        # this file. If it is associated with any egg-info directories, ship them too.
        metadata_path = package_metadata_paths.get(os.path.realpath(src))
        if metadata_path:
            metadata_to_add[package_name] = metadata_path

        package_zip.write(src, dst)

    def add_files_for_package(sub_package_path, root_package_path, root_package_name, package_name):
        for root, dirs, files in os.walk(sub_package_path):
            if '.svn' in dirs:
                dirs.remove('.svn')
            for f in files:
                if not f.endswith(".pyc") and not f.startswith("."):
                    add(dereference(root + "/" + f),
                        root.replace(root_package_path, root_package_name) + "/" + f,
                        package_name)

    for package in packages:
        # Archive each package.
        if not getattr(package, "__path__", None) and '.' in package.__name__:
            package = __import__(package.__name__.rpartition('.')[0], None, None, 'non_empty')

        n = package.__name__.replace(".", "/")

        # Check the length of the path, because the attribute may exist and be an empty list.
        if len(getattr(package, "__path__", [])) > 0:
            # TODO: (BUG) picking only the first path does not
            # properly deal with namespaced packages in different
            # directories
            p = package.__path__[0]

            if p.endswith('.egg') and os.path.isfile(p):
                raise RuntimeError('Not going to archive egg files!!!')
                # Add the entire egg file
                # p = p[:p.find('.egg') + 4]
                # add(dereference(p), os.path.basename(p))

            else:
                # Include __init__ files from parent packages.
                root = []
                for parent in package.__name__.split('.')[0:-1]:
                    root.append(parent)
                    module_name = '.'.join(root)
                    directory = '/'.join(root)

                    add(dereference(__import__(module_name, None, None, 'non_empty').__path__[0] + "/__init__.py"),
                        directory + "/__init__.py",
                        package.__name__)

                add_files_for_package(p, p, n, package.__name__)

        else:
            f = package.__file__
            if f.endswith("pyc"):
                f = f[:-3] + "py"
            if n.find(".") == -1:
                add(dereference(f), os.path.basename(f), package.__name__)
            else:
                add(dereference(f), n + ".py", package.__name__)

        # Include metadata in the same zip file.
        metadata_path = metadata_to_add.get(package.__name__)
        if metadata_path is not None:
            add_files_for_package(metadata_path, metadata_path, os.path.basename(metadata_path), package.__name__)

    return archives_list
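
# Usage sketch (assumed call pattern, mirroring _load_internal_dependency_on_cluster below):
#
#     import tempfile
#     import luigi
#     archives = create_packages_archive([luigi], tempfile.mkdtemp())
#     # archives == ['<temp_dir>/packages.zip'], ready to pass to SparkContext.addPyFile()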


class EventLogSelectionMixinSpark(EventLogSelectionDownstreamMixin):
    """
    Extract events corresponding to a specified time interval.
    """
    path_targets = None

    def __init__(self, *args, **kwargs):
        """
        Call the path selection task to get the list of log files matching the pattern.
        """
        super(EventLogSelectionMixinSpark, self).__init__(*args, **kwargs)
        self.lower_bound_date_string = self.interval.date_a.strftime('%Y-%m-%d')  # pylint: disable=no-member
        self.upper_bound_date_string = self.interval.date_b.strftime('%Y-%m-%d')  # pylint: disable=no-member
        path_targets = PathSelectionByDateIntervalTask(
            source=self.source,
            interval=self.interval,
            pattern=self.pattern,
            date_pattern=self.date_pattern,
        ).output()
        self.path_targets = [task.path for task in path_targets]

    def get_log_schema(self):
        """
        Get the Spark schema used to parse event logs.
        :return: Spark schema
        """
        from pyspark.sql.types import StructType, StringType
        event_schema = StructType().add("POST", StringType(), True).add("GET", StringType(), True)
        module_schema = StructType().add("display_name", StringType(), True) \
            .add("original_usage_key", StringType(), True) \
            .add("original_usage_version", StringType(), True) \
            .add("usage_key", StringType(), True)
        context_schema = StructType().add("command", StringType(), True) \
            .add("course_id", StringType(), True) \
            .add("module", module_schema) \
            .add("org_id", StringType(), True) \
            .add("path", StringType(), True) \
            .add("user_id", StringType(), True)

        event_log_schema = StructType() \
            .add("username", StringType(), True) \
            .add("event_type", StringType(), True) \
            .add("ip", StringType(), True) \
            .add("agent", StringType(), True) \
            .add("host", StringType(), True) \
            .add("referer", StringType(), True) \
            .add("accept_language", StringType(), True) \
            .add("event", event_schema) \
            .add("event_source", StringType(), True) \
            .add("context", context_schema) \
            .add("time", StringType(), True) \
            .add("name", StringType(), True) \
            .add("page", StringType(), True) \
            .add("session", StringType(), True)

        return event_log_schema
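
    # Illustrative event record matching the schema above (field values are made up):
    #
    #     {"username": "staff", "event_type": "play_video", "event_source": "browser",
    #      "event": {"POST": null, "GET": null},
    #      "context": {"course_id": "course-v1:edX+DemoX+Demo", "org_id": "edX", "user_id": "42"},
    #      "time": "2017-06-01T00:00:00+00:00", "ip": "10.0.0.1", "agent": "Mozilla/5.0",
    #      "page": null, "session": "abc123"}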

    def get_event_log_dataframe(self, spark, *args, **kwargs):
        """
        Load the selected event logs into a dataframe, keeping only events dated on the interval's start date.
        """
        from pyspark.sql.functions import to_date, date_format
        dataframe = spark.read.format('json').load(self.path_targets, schema=self.get_log_schema())
        dataframe = dataframe.filter(dataframe['time'].isNotNull()) \
            .withColumn('event_date', date_format(to_date(dataframe['time']), 'yyyy-MM-dd'))
        dataframe = dataframe.filter(dataframe['event_date'] == self.lower_bound_date_string)
        return dataframe
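
    # Usage sketch (hypothetical task that mixes this class into SparkJobTask, defined below):
    #
    #     def spark_job(self):
    #         events = self.get_event_log_dataframe(self._spark)
    #         events.groupBy('event_type').count().show()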


class SparkJobTask(OverwriteOutputMixin, PySparkTask):
    """
    Wrapper task for running Spark jobs.
    """

    _spark = None
    _spark_context = None
    _sql_context = None
    _hive_context = None
    _tmp_dir = None

    driver_memory = '2g'
    executor_memory = '3g'
    always_log_stderr = False  # log stderr if the Spark job fails; set to True for verbose logging

    def init_spark(self, sc):
        """
        Initialize the Spark session and the SQL and Hive contexts.
        :param sc: Spark context
        """
        from pyspark.sql import SparkSession, SQLContext, HiveContext
        self._sql_context = SQLContext(sc)
        self._spark_context = sc
        self._spark = SparkSession.builder.getOrCreate()
        self._hive_context = HiveContext(sc)

    def spark_job(self):
        """
        Spark code for the job; subclasses must override this.
        """
        raise NotImplementedError

    def _load_internal_dependency_on_cluster(self):
        """
        Create a zip of the required packages and load it onto the Spark worker nodes.

        Loading packages via the luigi configuration does not work: luigi creates a tar file, and
        Spark does not load tar files.
        """

        # Import the packages that need to be shipped to the cluster.
        import edx
        import luigi
        import opaque_keys
        import stevedore
        import bson
        import ccx_keys
        import cjson
        import boto
        import filechunkio
        import ciso8601
        import chardet
        import urllib3
        import certifi
        import idna
        import requests

        dependencies_list = []
        egg_files = luigi.configuration.get_config().get('spark', 'edx_egg_files', None)
        if isinstance(egg_files, basestring):
            dependencies_list = json.loads(egg_files)
        packages = [edx, luigi, opaque_keys, stevedore, bson, ccx_keys, cjson, boto, filechunkio, ciso8601, chardet,
                    urllib3, certifi, idna, requests]
        self._tmp_dir = tempfile.mkdtemp()
        dependencies_list += create_packages_archive(packages, self._tmp_dir)
        # dependencies_list.append('s3://edx-analytics-scratch/egg_files/edx_opaque_keys-0.4-py2.7.egg')
        if len(dependencies_list) > 0:
            for file_path in dependencies_list:
                self._spark_context.addPyFile(file_path)
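
    # The `edx_egg_files` option is read above as a JSON-encoded list of additional egg paths,
    # for example (hypothetical luigi.cfg entry):
    #
    #     [spark]
    #     edx_egg_files = ["s3://<bucket>/egg_files/edx_opaque_keys-0.4-py2.7.egg"]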

    def run(self):
        self.remove_output_on_overwrite()
        super(SparkJobTask, self).run()

    def _clean(self):
        """Do any cleanup after the job here."""
        import shutil
        shutil.rmtree(self._tmp_dir)

    def main(self, sc, *args):
        self.init_spark(sc)
        self._load_internal_dependency_on_cluster()  # load packages onto the EMR cluster for the Spark workers
        self.spark_job()
        self._clean()
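

# Minimal sketch of a concrete task built on these classes (hypothetical example, not part of this
# module; assumes an `output_root` parameter and the pipeline's get_target_from_url helper):
#
#     class CountEventsTask(EventLogSelectionMixinSpark, SparkJobTask):
#
#         output_root = luigi.Parameter()
#
#         def output(self):
#             return get_target_from_url(self.output_root)
#
#         def spark_job(self):
#             events = self.get_event_log_dataframe(self._spark)
#             events.groupBy('event_type').count().write.csv(self.output().path)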