Hello All,
I am using spark-submit to run a job that modifies a nested struct column by adding a new field with withField().
Following is the spark-submit command:
spark-submit --master local --packages com.github.fqaiser94:mse_2.11:0.2.4 --py-files mse.zip write_data.py
I have zipped mse since I don't want to install it globally, and I am also going to run this on AWS EMR.
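For reference, I created the archive roughly like this, running from the directory that contains the mse package folder (the exact invocation is my best recollection):
zip -r mse.zip mse/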
I am getting the following error:
Traceback (most recent call last):
  File "/Users/fkj/DataLakeManagement/archive/write_data.py", line 3, in <module>
    from mse import *
  File "/Users/fkj/DataLakeManagement/archive/mse.zip/mse/__init__.py", line 1, in <module>
  File "/Users/fkj/DataLakeManagement/archive/mse.zip/mse/methods.py", line 5
    def __withField(self: Column, fieldName: str, fieldValue: Column):
                        ^
SyntaxError: invalid syntax
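Possibly relevant: the caret points at the type annotation in the function signature, and annotated signatures are Python 3-only syntax, so I suspect spark-submit may be running the script under Python 2. A minimal sketch of how I would pin the interpreter (assuming the standard PYSPARK_PYTHON environment variable; python3 here is a placeholder for the actual interpreter path):
export PYSPARK_PYTHON=python3
spark-submit --master local --packages com.github.fqaiser94:mse_2.11:0.2.4 --py-files mse.zip write_data.py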
Following is my mse.zip:
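mse.zip
└── mse/
    ├── __init__.py
    └── methods.py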
write_data.py:
from datetime import datetime
from mse import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StringType, TimestampType, IntegerType
def spark():
    """
    This function is invoked to create the Spark Session.
    :return: the spark session
    """
    spark_session = (SparkSession
                     .builder
                     .appName("Data_Experimentation_Framework")
                     .getOrCreate())
    spark_session.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MILLIS")
    return spark_session
address = StructType().add("state", StringType())
schema = StructType().add("_id", IntegerType()) \
    .add("employer", StringType()) \
    .add("created_at", TimestampType()) \
    .add("name", StringType())\
    .add("address", address)
employees = [{'_id': 1,
              'employer': 'Microsoft',
              'created_at': datetime.now(),
              'name': 'Noel',
              'address': {
                  "state": "Pennsylvania"
              }
              },
             {'_id': 2,
              'employer': 'Apple',
              'created_at': datetime.now(),
              'name': 'Steve',
              'address': {
                  "state": "New York"
              }
              }
             ]
df = spark().createDataFrame(employees, schema=schema)
# withColumn returns a new DataFrame, so the result must be reassigned
df = df.withColumn("address", f.col("address").withField("country", f.lit("USA")))
df.printSchema()
df.write \
    .format("parquet") \
    .mode("append") \
    .save("/Users/felixkizhakkeljose/Downloads/test12")
Could you help me identify what I am doing wrong?