Skip to content
This repository was archived by the owner on May 1, 2024. It is now read-only.

Commit 744684e

Browse files
Enable logging in spark tasks
1 parent b863f58 commit 744684e

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

edx/analytics/tasks/common/spark.py

+4
Original file line numberDiff line numberDiff line change
@@ -255,9 +255,11 @@ def get_input_source(self, *args):
255255
# Reading the manifest as an RDD with Spark is a lot faster compared to Hadoop.
256256
# Currently, we're getting only 1 manifest file per request, so we will create a single rdd from it.
257257
# If there are multiple manifest files, each file can be read as an RDD and then unioned with the other manifest RDDs
258+
self.log.warn("PYSPARK LOGGER: Reading manifest file :: {} ".format(targets[0].path))
258259
source_rdd = self._spark.sparkContext.textFile(targets[0].path)
259260
broadcast_value = self._spark.sparkContext.broadcast(source_rdd.collect())
260261
else:
262+
self.log.warn("PYSPARK LOGGER: Reading normal targets")
261263
broadcast_value = self._spark.sparkContext.broadcast([target.path for target in targets])
262264
return broadcast_value
263265

@@ -298,6 +300,8 @@ def init_spark(self, sc):
298300
self._spark_context = sc
299301
self._spark = SparkSession.builder.getOrCreate()
300302
self._hive_context = HiveContext(sc)
303+
log4jLogger = sc._jvm.org.apache.log4j # using spark logger
304+
self.log = log4jLogger.LogManager.getLogger(__name__)
301305

302306
@property
303307
def conf(self):

0 commit comments

Comments
 (0)