Concurrent Data Writing in Databricks with PySpark and Delta-rs Without Losing Data #2696
premtiwari14f asked this question in Q&A
- Q1: Why does delta-rs not create the .crc checksum file in the _delta_log the way Spark does?
- Q2: How can concurrent writes to the same Delta table from PySpark and delta-rs in Databricks be handled without losing data?
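For context on Q1: a Spark write typically leaves a `<version>.crc` checksum file next to each `<version>.json` commit in the `_delta_log`, while delta-rs commits usually produce only the JSON entry. A minimal sketch to see this difference yourself, assuming the mounted table path `/mnt/path` used in the code below and a Databricks notebook where `dbutils` is available:

```python
# Minimal sketch: list the Delta transaction log to see which writer
# produced .crc checksum files next to the commit JSON files.
# Assumes the table path from the examples below ("/mnt/path") and a
# Databricks notebook where `dbutils` is available.
log_entries = dbutils.fs.ls("/mnt/path/_delta_log")

for entry in sorted(log_entries, key=lambda e: e.name):
    # Spark commits show pairs like 00000000000000000003.json / .crc;
    # delta-rs commits usually show only the .json file.
    print(entry.name)
```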
Spark code
```python
import random
import time
from datetime import datetime, timedelta

import pandas as pd
from pyspark.sql import SparkSession

# On Databricks the `spark` session already exists; getOrCreate() reuses it
spark = SparkSession.builder.getOrCreate()

# Function to generate a random date within the last 10 days
def random_date():
    return datetime.now().date() - timedelta(days=random.randint(0, 10))

# Function to generate a fixed date (yesterday) so every row lands in one partition
def fixed_date():
    return datetime.now().date() - timedelta(days=1)

# Function to write a DataFrame to the Delta table with retries on commit conflicts
def write_with_retries(df, path, max_retries=10, initial_delay=0.5):
    attempt = 0
    while attempt < max_retries:
        try:
            (df.write
               .format("delta")
               .mode("append")
               .partitionBy("date")
               .save(path))
            print("Data written to Delta Lake table successfully.")
            return  # Exit the function if the write succeeds
        except Exception as e:
            if "version 0 already exists" in str(e):
                attempt += 1
                delay = initial_delay * (2 ** (attempt - 1))  # Exponential backoff
                delay += random.uniform(0, 0.1)  # Add jitter to avoid collisions
                print(f"Attempt {attempt} failed with error: {e}. Retrying in {delay:.2f} seconds.")
                time.sleep(delay)
            else:
                raise  # Re-raise other exceptions immediately
    raise RuntimeError(f"Write did not succeed after {max_retries} attempts.")

# Main loop to generate and write data
delta_table_path = "/mnt/path"  # Replace with your S3 Delta Lake table path

for i in range(20):
    num_rows = 50000
    data = {
        'id': range(1, num_rows + 1),
        'name': [random.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eve']) for _ in range(num_rows)],
        'age': [random.choice([25, 30, 35, 40, 45]) for _ in range(num_rows)],
        'date': [fixed_date() for _ in range(num_rows)]
    }
    # Build a Spark DataFrame from the generated rows and append it to the table
    df = spark.createDataFrame(pd.DataFrame(data))
    write_with_retries(df, delta_table_path)
```
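For reference, one way to check what the Spark side actually committed once the loop finishes; a sketch assuming the same `delta_table_path` and that the delta-spark Python bindings (`delta.tables`) are available on the cluster:

```python
# Sketch: verify the appends landed and inspect the commit history.
# Assumes the same table path as above and that the delta-spark
# Python bindings (delta.tables) are available on the cluster.
from delta.tables import DeltaTable

df_check = spark.read.format("delta").load(delta_table_path)
print("Row count:", df_check.count())  # Expect 20 * 50000 = 1,000,000 rows

history = DeltaTable.forPath(spark, delta_table_path).history()
# Each successful append shows up as one version in the table history
history.select("version", "operation", "timestamp").show(truncate=False)
```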
delta-rs code
```python
import os
import random
import time
from datetime import datetime, timedelta

import pandas as pd
from deltalake import DeltaTable, write_deltalake

# Use the DynamoDB locking provider so concurrent S3 writers can commit safely
storage_options = {
    'AWS_S3_LOCKING_PROVIDER': 'dynamodb',
    'DELTA_DYNAMO_TABLE_NAME': 'delta_log2'
}

s3_bucket = 'aaaaaaaa'
s3_delta_table_path = f's3://{s3_bucket}/mnt/path'

os.environ['AWS_ACCESS_KEY_ID'] = 'aaaaaaaaaaaaa'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
os.environ['AWS_REGION'] = 'eu-west-1'
os.environ['AWS_S3_ALLOW_UNSAFE_RENAME'] = 'true'

# Function to generate a random date within the last 10 days
def random_date():
    return datetime.now().date() - timedelta(days=random.randint(0, 10))

# Function to generate a fixed date (yesterday) so every row lands in one partition
def fixed_date():
    return datetime.now().date() - timedelta(days=1)

# Function to write a DataFrame to the Delta table with retries on commit conflicts
def write_with_retries(df, max_retries=10, initial_delay=0.5):
    attempt = 0
    while attempt < max_retries:
        try:
            write_deltalake(s3_delta_table_path, df, mode="append",
                            partition_by=["date"], storage_options=storage_options)
            print("Data written to Delta Lake table successfully.")
            return  # Exit the function if the write succeeds
        except Exception as e:
            if "version 0 already exists" in str(e):
                attempt += 1
                delay = initial_delay * (2 ** (attempt - 1))  # Exponential backoff
                delay += random.uniform(0, 0.1)  # Add jitter to avoid collisions
                print(f"Attempt {attempt} failed with error: {e}. Retrying in {delay:.2f} seconds.")
                time.sleep(delay)
            else:
                raise  # Re-raise other exceptions immediately
    raise RuntimeError(f"Write did not succeed after {max_retries} attempts.")

# Main loop to generate and write data
for i in range(20):
    num_rows = 50000
    data = {
        'id': range(1, num_rows + 1),
        'name': [random.choice(['Alice', 'Bob', 'Charlie', 'David', 'Eve']) for _ in range(num_rows)],
        'age': [random.choice([25, 30, 35, 40, 45]) for _ in range(num_rows)],
        'date': [fixed_date() for _ in range(num_rows)]
    }
    # Build a pandas DataFrame from the generated rows and append it to the table
    df = pd.DataFrame(data)
    write_with_retries(df)
```
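For completeness, a small sketch of how the delta-rs side can be inspected after the loop, reusing `s3_delta_table_path` and `storage_options` from the block above:

```python
# Sketch: inspect the table written by delta-rs.
# Reuses s3_delta_table_path and storage_options from the block above.
from deltalake import DeltaTable

dt = DeltaTable(s3_delta_table_path, storage_options=storage_options)
print("Current table version:", dt.version())

# Each commit (from Spark or delta-rs) appears as one entry in the history
for entry in dt.history():
    print(entry.get("version"), entry.get("operation"))
```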