
Commit a4b7c10

Commit message: refactoring
1 parent: cf57d27

5 files changed: +164 -42 lines

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDB.scala

+12
@@ -1013,6 +1013,18 @@ class RocksDB(
     }
   }
 
+  /**
+   * Releases resources associated with this RocksDB instance without rolling back changes.
+   *
+   * This method is used in the read-then-write pattern where:
+   * 1. A read-only store is opened to retrieve existing state
+   * 2. The same store is converted to a writable store
+   * 3. After the write store commits, we need to release resources without rolling back
+   *    the changes that were just committed
+   *
+   * Unlike abort() which rolls back uncommitted changes, release() simply releases
+   * resources and locks without affecting the state data.
+   */
   def release(): Unit = {
     release(LoadStore)
   }
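
The new release() exists for the read-then-write flow described in the scaladoc above: read the previous version, convert the same store to a writable one, commit, then free resources without undoing the commit. A minimal sketch of that lifecycle, assuming a hypothetical SimpleStore stand-in rather than the real RocksDB class (only the commit/abort/release semantics mirror the diff):

// Read-then-write lifecycle sketch. SimpleStore is a hypothetical stand-in for
// the RocksDB-backed store; it only mirrors the semantics described above.
object ReadThenWriteSketch {
  final class SimpleStore(val version: Long) {
    private val data = scala.collection.mutable.Map.empty[String, String]
    def get(key: String): Option[String] = data.get(key)
    def put(key: String, value: String): Unit = data(key) = value
    def commit(): Long = { println(s"committed v${version + 1}"); version + 1 }
    def abort(): Unit = { data.clear(); println("aborted: uncommitted changes dropped") }
    def release(): Unit = println("released: locks freed, data untouched")
  }

  def main(args: Array[String]): Unit = {
    val readStore = new SimpleStore(version = 10)  // 1. open a read-only view of v10
    val previous = readStore.get("count")          //    read the existing state
    val writeStore = readStore                     // 2. "getWriteStore": same instance, now written to
    writeStore.put("count", (previous.map(_.toInt).getOrElse(0) + 1).toString)
    writeStore.commit()                            // 3. commit the update
    readStore.release()                            // 4. release, NOT abort: the commit must survive
  }
}

Calling abort() in step 4 would be wrong by definition, since abort() rolls back; release() only gives up the instance's locks and resources.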

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/RocksDBStateStoreProvider.scala

+48 -26

@@ -44,7 +44,20 @@ private[sql] class RocksDBStateStoreProvider
   import RocksDBStateStoreProvider._
 
   class RocksDBStateStore(lastVersion: Long) extends StateStore {
-    /** Trait and classes representing the internal state of the store */
+    /**
+     * Trait and classes representing the internal state of the store
+     *
+     * State transitions:
+     * - Initial state: UPDATING
+     * - UPDATING -> COMMITTED: After successful commit()
+     * - UPDATING -> ABORTED: After abort() or failed commit()
+     * - UPDATING -> RELEASED: After release() without committing changes
+     * - COMMITTED -> RELEASED: After release() following a successful commit
+     * - ABORTED -> RELEASED: After release() following an abort
+     *
+     * The RELEASED state is terminal and indicates that resources have been released
+     * without affecting the underlying data (unlike ABORTED which rolls back changes).
+     */
     trait STATE
     case object UPDATING extends STATE
     case object COMMITTED extends STATE
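
The transition list in the new scaladoc reads as a small state machine. A hedged sketch that encodes exactly those transitions, with StoreState and Event as illustrative names rather than the provider's actual STATE objects:

// State-machine sketch of the documented store lifecycle.
object StoreLifecycleSketch {
  sealed trait StoreState
  case object Updating  extends StoreState
  case object Committed extends StoreState
  case object Aborted   extends StoreState
  case object Released  extends StoreState // terminal: resources freed, data untouched

  sealed trait Event
  case object Commit  extends Event
  case object Abort   extends Event
  case object Release extends Event

  def transition(state: StoreState, event: Event): StoreState = (state, event) match {
    case (Updating, Commit)   => Committed // successful commit()
    case (Updating, Abort)    => Aborted   // abort() or failed commit()
    case (Updating, Release)  => Released  // release() without committing changes
    case (Committed, Release) => Released  // release() after a successful commit
    case (Aborted, Release)   => Released  // release() after an abort
    case (Released, Release)  => Released  // release() is documented as safe to repeat
    case (s, e)               => throw new IllegalStateException(s"illegal transition: $s on $e")
  }

  def main(args: Array[String]): Unit =
    println(transition(transition(Updating, Commit), Release)) // Released
}
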
@@ -454,6 +467,15 @@ private[sql] class RocksDBStateStoreProvider
 
   override def stateStoreId: StateStoreId = stateStoreId_
 
+  /**
+   * Creates and returns a state store with the specified parameters.
+   *
+   * @param version The version of the state store to load
+   * @param uniqueId Optional unique identifier for checkpoint
+   * @param readOnly Whether to open the store in read-only mode
+   * @param existingStore Optional existing store to reuse instead of creating a new one
+   * @return The loaded state store
+   */
   /**
    * Creates and returns a state store with the specified parameters.
    *
@@ -468,32 +490,30 @@ private[sql] class RocksDBStateStoreProvider
       uniqueId: Option[String],
       readOnly: Boolean,
       existingStore: Option[ReadStateStore] = None): StateStore = {
+    if (version < 0) {
+      throw QueryExecutionErrors.unexpectedStateStoreVersion(version)
+    }
+
     try {
-      if (version < 0) {
-        throw QueryExecutionErrors.unexpectedStateStoreVersion(version)
-      }
-      try {
-        // Load RocksDB store
-        rocksDB.load(
-          version,
-          stateStoreCkptId = if (storeConf.enableStateStoreCheckpointIds) uniqueId else None,
-          readOnly = readOnly)
-
-        // Return appropriate store instance
-        existingStore match {
-          case Some(stateStore: RocksDBStateStore) =>
-            // Reuse existing store for getWriteStore case
-            stateStore
-          case Some(_) =>
-            throw new IllegalArgumentException("Existing store must be a RocksDBStateStore")
-          case None =>
-            // Create new store instance for getStore/getReadStore cases
-            new RocksDBStateStore(version)
-        }
-      } catch {
-        case e: Throwable =>
-          throw e
+      // Load RocksDB store
+      rocksDB.load(
+        version,
+        stateStoreCkptId = if (storeConf.enableStateStoreCheckpointIds) uniqueId else None,
+        readOnly = readOnly)
+
+      // Return appropriate store instance
+      val stateStore = existingStore match {
+        case Some(stateStore: RocksDBStateStore) =>
+          // Reuse existing store for getWriteStore case
+          stateStore
+        case Some(_) =>
+          throw new IllegalArgumentException("Existing store must be a RocksDBStateStore")
+        case None =>
+          // Create new store instance for getStore/getReadStore cases
+          new RocksDBStateStore(version)
       }
+
+      stateStore
     } catch {
       case e: SparkException
         if Option(e.getCondition).exists(_.contains("CANNOT_LOAD_STATE_STORE")) =>
@@ -503,7 +523,9 @@ private[sql] class RocksDBStateStoreProvider
           stateStoreId.toString,
           "ROCKSDB_STORE_PROVIDER",
           e)
-      case e: Throwable => throw QueryExecutionErrors.cannotLoadStore(e)
+      case e: Throwable =>
+        logError(s"Failed to load state store version $version with uniqueId $uniqueId", e)
+        throw QueryExecutionErrors.cannotLoadStore(e)
     }
   }
   override def getStore(version: Long, uniqueId: Option[String] = None): StateStore = {
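
Besides the added scaladoc, loadStateStore changed in two ways: the version check moved ahead of the try, and the nested try with a rethrowing catch-all was collapsed into a single block whose catch-all logs before wrapping the failure in cannotLoadStore. A minimal sketch of that validate-early, log-and-wrap shape, where loadInternal and StoreLoadException are hypothetical stand-ins for rocksDB.load and the Spark error class:

// Sketch of the refactored error-handling shape; only the control flow mirrors the diff.
object LoadStoreSketch {
  final class StoreLoadException(message: String, cause: Throwable)
    extends RuntimeException(message, cause)

  private def loadInternal(version: Long): String = s"store@v$version"

  def loadStore(version: Long): String = {
    // Validate before the try, so bad arguments are not misreported as load failures.
    require(version >= 0, s"unexpected state store version: $version")
    try {
      loadInternal(version)
    } catch {
      case e: Throwable =>
        // Log with enough context to debug, then wrap once in a typed error.
        System.err.println(s"Failed to load state store version $version: $e")
        throw new StoreLoadException(s"cannot load store at version $version", e)
    }
  }

  def main(args: Array[String]): Unit = println(loadStore(7))
}
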

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala

+36 -2

@@ -118,6 +118,23 @@ trait ReadStateStore {
    */
   def abort(): Unit
 
+  /**
+   * Release resources associated with this state store without rolling back changes.
+   *
+   * Unlike `abort()` which rolls back uncommitted changes, `release()` simply releases
+   * resources and locks without affecting the state data. This is particularly important
+   * in the read-then-write pattern where:
+   *
+   * 1. A read-only store is opened to retrieve existing state
+   * 2. The same store is converted to a writable store using `getWriteStore()`
+   * 3. After the write store commits, we need to release resources without rolling back
+   *    the changes that were just committed
+   *
+   * Implementations should ensure that:
+   * 1. Any locks or resources held by this store are released
+   * 2. No uncommitted changes are rolled back (unlike `abort()`)
+   * 3. The method is idempotent and safe to call multiple times
+   */
   def release(): Unit
 }

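The third requirement, idempotency, is the easiest to get wrong. A small sketch of a release() that frees a lock exactly once and never touches data, using a hypothetical LockBackedStore (not the Spark implementation):

import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.locks.ReentrantLock

// Illustrates the three contract points above: free resources, never roll back, stay idempotent.
final class LockBackedStore(lock: ReentrantLock) {
  private val released = new AtomicBoolean(false)

  def release(): Unit = {
    // Idempotent: only the first call actually unlocks.
    if (released.compareAndSet(false, true)) {
      if (lock.isHeldByCurrentThread) lock.unlock() // free the resource
      // No data is touched here, committed or not (that is abort()'s job).
    }
  }
}

object ReleaseContractSketch {
  def main(args: Array[String]): Unit = {
    val lock = new ReentrantLock()
    lock.lock()
    val store = new LockBackedStore(lock)
    store.release()
    store.release() // safe: the second call is a no-op
    println(s"still locked after release: ${lock.isLocked}") // false
  }
}
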
@@ -236,7 +253,7 @@ class WrappedReadStateStore(store: StateStore) extends ReadStateStore {
 
   override def abort(): Unit = store.abort()
 
-  override def release(): Unit = {}
+  override def release(): Unit = store.release()
 
   override def prefixScan(prefixKey: UnsafeRow,
       colFamilyName: String = StateStore.DEFAULT_COL_FAMILY_NAME): Iterator[UnsafeRowPair] =
@@ -569,6 +586,24 @@ trait StateStoreProvider {
       version: Long,
       stateStoreCkptId: Option[String] = None): StateStore
 
+  /**
+   * Converts a read-only state store to a writable state store.
+   *
+   * This method is a key part of the read-then-write pattern optimization that avoids
+   * lock contention issues when the same state store needs to be accessed for both
+   * reading and writing. Instead of opening two separate connections (which would block
+   * with lock hardening), this method reuses the existing read store connection.
+   *
+   * Implementations should ensure that:
+   * 1. The returned store has the same version as the input read store
+   * 2. The returned store has access to all state that was visible to the read store
+   * 3. The returned store can be used for both reading and writing operations
+   *
+   * @param readStore The read-only state store to convert to a writable store
+   * @param version The version of the state store (should match readStore.version)
+   * @param uniqueId Optional unique identifier for checkpoint
+   * @return A writable state store that reuses the same underlying connection
+   */
   def getWriteStore(
       readStore: ReadStateStore,
       version: Long,
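
One way a provider can satisfy this contract is to funnel getWriteStore through the same load path as getStore and hand the read store back in, which is what the existingStore parameter added to the RocksDB provider's loadStateStore enables. A simplified sketch with stand-in types (ReadStore, WriteStore and Provider here are illustrative, not the Spark traits):

// Sketch of getWriteStore reusing the already-loaded read store.
object GetWriteStoreSketch {
  trait ReadStore { def version: Long }
  trait WriteStore extends ReadStore { def commit(): Long }

  final class Store(val version: Long) extends WriteStore {
    def commit(): Long = version + 1
  }

  class Provider {
    // Mirrors loadStateStore(version, uniqueId, readOnly, existingStore): reuse the
    // supplied instance when there is one, otherwise create a new store.
    private def loadStore(version: Long, existing: Option[ReadStore]): WriteStore =
      existing match {
        case Some(s: Store) => s                  // getWriteStore case: same connection
        case Some(_)        => throw new IllegalArgumentException("unexpected store type")
        case None           => new Store(version) // getStore / getReadStore cases
      }

    def getReadStore(version: Long): ReadStore = loadStore(version, None)

    def getWriteStore(readStore: ReadStore, version: Long): WriteStore = {
      require(readStore.version == version, "version must match the read store")
      loadStore(version, Some(readStore))
    }
  }

  def main(args: Array[String]): Unit = {
    val p = new Provider
    val rs = p.getReadStore(5)
    val ws = p.getWriteStore(rs, 5)
    println(ws eq rs) // true: the same underlying store instance is reused
  }
}
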
@@ -1258,4 +1293,3 @@ object StateStore extends Logging {
     }
   }
 }
-

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala

+65 -14

@@ -40,6 +40,22 @@ import org.apache.spark.util.SerializableConfiguration
  * 2. The same state store is then converted to read-write mode for updates
  * 3. This avoids having two separate open connections to the same state store
  *    which would cause blocking or contention issues
+ *
+ * This pattern is particularly important for stateful aggregations where:
+ * - StateStoreRestoreExec first reads previous state using a read-only store
+ * - StateStoreSaveExec then updates the state using a writable store
+ *
+ * Without this optimization, the following pattern would cause contention:
+ *   readStore.acquire()
+ *   writeStore.acquire()  // This would block with lock hardening changes
+ *   writeStore.commit()
+ *   readStore.abort()
+ *
+ * With this optimization, the pattern becomes:
+ *   readStore = getReadStore()
+ *   writeStore = getWriteStore(readStore)  // Reuses the same store connection
+ *   writeStore.commit()
+ *   // No need to abort/release readStore as it's the same underlying store
  */
 trait StateStoreRDDProvider {
   /**
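
At the RDD level the handoff works because the read-side RDD exposes the store it opened for each partition (getStateStoreForPartition) and the write-side RDD looks that store up instead of opening a second connection. A hedged sketch of the per-partition handoff, with PartitionStoreRegistry as a hypothetical stand-in for StateStoreRDDProvider / ReadStateStoreRDD:

// Per-partition read-store handoff sketch; all names here are illustrative.
object PartitionHandoffSketch {
  final class ReadStore(val partitionId: Int, val version: Long)

  // Plays the role of ReadStateStoreRDD: remembers the store it opened per partition.
  final class PartitionStoreRegistry {
    private val stores = new java.util.concurrent.ConcurrentHashMap[Int, ReadStore]()
    def register(store: ReadStore): Unit = stores.put(store.partitionId, store)
    def getStateStoreForPartition(partitionId: Int): Option[ReadStore] =
      Option(stores.get(partitionId))
  }

  // Plays the role of StateStoreRDD.compute: reuse the upstream read store when one
  // exists for this partition, otherwise fall back to opening a fresh store.
  def storeForWrite(registry: PartitionStoreRegistry, partitionId: Int, version: Long): ReadStore =
    registry.getStateStoreForPartition(partitionId)
      .getOrElse(new ReadStore(partitionId, version))

  def main(args: Array[String]): Unit = {
    val registry = new PartitionStoreRegistry
    val readStore = new ReadStore(partitionId = 0, version = 3)
    registry.register(readStore)
    println(storeForWrite(registry, 0, 3) eq readStore) // true: same store reused
  }
}
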
@@ -108,9 +124,24 @@ class ReadStateStoreRDD[T: ClassTag, U: ClassTag](
   extends BaseStateStoreRDD[T, U](dataRDD, checkpointLocation, queryRunId, operatorId,
     sessionState, storeCoordinator, extraOptions) with StateStoreRDDProvider {
 
-  // Using a ConcurrentHashMap to track state stores by partition ID
-  @transient private lazy val partitionStores =
-    new java.util.concurrent.ConcurrentHashMap[Int, ReadStateStore]()
+  // Using a bounded concurrent map to track state stores by partition ID
+  // This prevents memory leaks for long-running tasks by limiting the maximum size
+  @transient private lazy val partitionStores = {
+    val maxSize = 100 // Maximum number of state stores to cache
+    java.util.Collections.synchronizedMap(
+      new java.util.LinkedHashMap[Int, ReadStateStore](16, 0.75f, true) {
+        override def removeEldestEntry(
+            eldest: java.util.Map.Entry[Int, ReadStateStore]): Boolean = {
+          val tooMany = size() > maxSize
+          if (tooMany) {
+            // Release resources for the state store being evicted
+            eldest.getValue.release()
+          }
+          tooMany
+        }
+      }
+    )
+  }
 
   override def getStateStoreForPartition(partitionId: Int): Option[ReadStateStore] = {
     Option(partitionStores.get(partitionId))
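
The bounded map above relies on java.util.LinkedHashMap's access-order mode plus the removeEldestEntry hook, which gives the cache a chance to release an evicted store. A standalone demonstration of that eviction behavior, with a FakeStore whose release() just prints and a maxSize of 2 chosen only to trigger eviction quickly:

// LRU eviction demo for the bounded partition-store map.
object BoundedStoreMapSketch {
  final class FakeStore(val id: Int) { def release(): Unit = println(s"released store $id") }

  def main(args: Array[String]): Unit = {
    val maxSize = 2
    val stores = java.util.Collections.synchronizedMap(
      new java.util.LinkedHashMap[Int, FakeStore](16, 0.75f, true) {
        override def removeEldestEntry(eldest: java.util.Map.Entry[Int, FakeStore]): Boolean = {
          val tooMany = size() > maxSize
          if (tooMany) eldest.getValue.release() // free the evicted store's resources
          tooMany
        }
      })

    stores.put(1, new FakeStore(1))
    stores.put(2, new FakeStore(2))
    stores.get(1)                   // touch 1 so that 2 becomes the eldest entry
    stores.put(3, new FakeStore(3)) // prints "released store 2"
    println(stores.keySet())        // [1, 3]
  }
}
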
@@ -182,22 +213,42 @@ class StateStoreRDD[T: ClassTag, U: ClassTag](
    * This is particularly important for stateful aggregations where StateStoreRestoreExec
    * first reads previous state and StateStoreSaveExec then updates it.
    *
-   * The method performs a depth-first search through the RDD dependency graph.
+   * The method performs an optimized depth-first search through the RDD dependency graph,
+   * prioritizing paths that are more likely to contain state store providers.
    *
    * @param rdd The starting RDD to search from
    * @return Some(provider) if a StateStoreRDDProvider is found in the lineage, None otherwise
    */
   private def findStateStoreProvider(rdd: RDD[_]): Option[StateStoreRDDProvider] = {
-    rdd match {
-      case null => None
-      case provider: StateStoreRDDProvider => Some(provider)
-      case _ if rdd.dependencies.isEmpty => None
-      case _ =>
-        // Search all dependencies
-        rdd.dependencies.view
-          .map(dep => findStateStoreProvider(dep.rdd))
-          .find(_.isDefined)
-          .flatten
+    // Early termination conditions
+    if (rdd == null) return None
+
+    // Check if the current RDD is a provider
+    if (rdd.isInstanceOf[StateStoreRDDProvider]) {
+      return Some(rdd.asInstanceOf[StateStoreRDDProvider])
+    }
+
+    // If no dependencies, we can't find a provider
+    if (rdd.dependencies.isEmpty) return None
+
+    // Prioritize narrow dependencies over wide dependencies
+    // Narrow dependencies are more likely to preserve the state store provider lineage
+    val (narrowDeps, wideDeps) = rdd.dependencies.partition(_.isInstanceOf[NarrowDependency[_]])
+
+    // First search through narrow dependencies
+    val narrowResult = narrowDeps.view
+      .map(dep => findStateStoreProvider(dep.rdd))
+      .find(_.isDefined)
+      .flatten
+
+    if (narrowResult.isDefined) {
+      narrowResult
+    } else {
+      // If not found in narrow dependencies, try wide dependencies
+      wideDeps.view
+        .map(dep => findStateStoreProvider(dep.rdd))
+        .find(_.isDefined)
+        .flatten
     }
   }

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala

+3
@@ -116,6 +116,9 @@ package object state {
     })
     taskContext.addTaskFailureListener(new TaskFailureListener {
      override def onTaskFailure(context: TaskContext, error: Throwable): Unit = {
+       // On task failure, abort to roll back any uncommitted changes.
+       // We do not call release() here: release() skips the rollback and could leave
+       // partially applied updates behind; abort() guarantees proper cleanup.
        store.abort()
      }
    })
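
The failure listener deliberately calls abort() rather than release(): after a failed task the uncommitted changes must be rolled back, not merely unlocked. A standalone sketch of that cleanup policy (FakeStore and runTask are illustrative; the real hook is the TaskFailureListener registration shown above):

// Cleanup-policy sketch: abort on failure, commit-then-release on success.
object FailureCleanupSketch {
  final class FakeStore {
    var state = "UPDATING"
    def commit(): Unit  = { state = "COMMITTED" }
    def abort(): Unit   = { state = "ABORTED" }  // rolls back uncommitted changes
    def release(): Unit = { state = "RELEASED" } // frees resources, keeps data
  }

  // Mimics a task body guarded by a failure listener: abort on any error so a
  // half-written batch is never left behind.
  def runTask(store: FakeStore)(body: => Unit): Unit =
    try {
      body
      store.commit()
      store.release()
    } catch {
      case e: Throwable =>
        store.abort() // NOT release(): we must roll back, not just unlock
        throw e
    }

  def main(args: Array[String]): Unit = {
    val ok = new FakeStore
    runTask(ok) { () }
    println(ok.state) // RELEASED

    val failed = new FakeStore
    try runTask(failed) { throw new RuntimeException("boom") } catch { case _: Throwable => () }
    println(failed.state) // ABORTED
  }
}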
