
Commit 48d4901

Add a flag to force stats collection during query optimizations

1 parent 2a57836 · commit 48d4901

File tree: 3 files changed (+179 / -4 lines)

spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala

Lines changed: 10 additions & 0 deletions
@@ -392,6 +392,16 @@ trait DeltaSQLConfBase extends DeltaSQLConfUtils {
       .booleanConf
       .createWithDefault(true)
 
+  val DELTA_ALWAYS_COLLECT_STATS =
+    buildConf("alwaysCollectStats.enabled")
+      .internal()
+      .doc("When true, row counts are collected from file statistics even when there are no " +
+        "data filters. This is useful for ensuring PreparedDeltaFileIndex always has row count " +
+        "information available. Note: this may have a small performance overhead as it requires " +
+        "summing numRecords from all files.")
+      .booleanConf
+      .createWithDefault(false)
+
   val DELTA_LIMIT_PUSHDOWN_ENABLED =
     buildConf("stats.limitPushdown.enabled")
       .internal()
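A minimal sketch of enabling the new flag for a session, assuming the standard "spark.databricks.delta." prefix that DeltaSQLConf.buildConf prepends to key names (so the full key would be spark.databricks.delta.alwaysCollectStats.enabled); the session setup and table path below are illustrative assumptions, not part of this commit:

// Hedged sketch: enable forced row-count collection for one session.
// Assumes the "spark.databricks.delta." key prefix and a hypothetical table path.
import org.apache.spark.sql.SparkSession

object AlwaysCollectStatsDemo extends App {
  val spark = SparkSession.builder()
    .appName("always-collect-stats-demo")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog",
      "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()

  // The conf is internal, so it is set directly by key.
  spark.conf.set("spark.databricks.delta.alwaysCollectStats.enabled", "true")

  // Reads planned after this point take the modified DataSkippingReader paths below,
  // so PreparedDeltaFileIndex carries a row count even when there are no data filters.
  val df = spark.read.format("delta").load("/tmp/delta/events")
  df.count()
}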

spark/src/main/scala/org/apache/spark/sql/delta/stats/DataSkippingReader.scala

Lines changed: 36 additions & 4 deletions
@@ -1238,7 +1238,10 @@ trait DataSkippingReaderBase
       partitionFilters: Seq[Expression],
       keepNumRecords: Boolean): (Seq[AddFile], DataSize) = recordFrameProfile(
       "Delta", "DataSkippingReader.filterOnPartitions") {
-    val df = if (keepNumRecords) {
+    val forceCollectRowCount =
+      spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_ALWAYS_COLLECT_STATS)
+    val shouldCollectStats = keepNumRecords || forceCollectRowCount
+    val df = if (shouldCollectStats) {
       // use withStats instead of allFiles so the `stats` column is already parsed
       val filteredFiles =
         DeltaLog.filterFileList(metadata.partitionSchema, withStats, partitionFilters)
@@ -1253,7 +1256,26 @@ trait DataSkippingReaderBase
     }
     val files = convertDataFrameToAddFiles(df)
     val sizeInBytesByPartitionFilters = files.map(_.size).sum
-    files.toSeq -> DataSize(Some(sizeInBytesByPartitionFilters), None, Some(files.size))
+    // Compute row count if we have stats available and forceCollectRowCount is enabled
+    val rowCount = if (forceCollectRowCount) {
+      sumRowCounts(files)
+    } else {
+      None
+    }
+    files.toSeq -> DataSize(Some(sizeInBytesByPartitionFilters), rowCount, Some(files.size))
+  }
+
+  /**
+   * Sums up the numPhysicalRecords from the given AddFile objects.
+   * Returns None if any file is missing stats (to indicate incomplete row count).
+   */
+  private def sumRowCounts(files: Seq[AddFile]): Option[Long] = {
+    files.foldLeft(Option(0L)) { (accOpt, file) =>
+      for {
+        acc <- accOpt
+        count <- file.numPhysicalRecords
+      } yield acc + count
+    }
   }
 
   /**
@@ -1310,13 +1332,23 @@ trait DataSkippingReaderBase
     if (filters == Seq(TrueLiteral) || filters.isEmpty || schema.isEmpty) {
       recordDeltaOperation(deltaLog, "delta.skipping.none") {
         // When there are no filters we can just return allFiles with no extra processing
+        val forceCollectRowCount =
+          spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_ALWAYS_COLLECT_STATS)
+        val shouldCollectStats = keepNumRecords || forceCollectRowCount
+        val files = getAllFiles(shouldCollectStats)
+        // Compute row count if forceCollectRowCount is enabled
+        val rowCount = if (forceCollectRowCount) {
+          sumRowCounts(files)
+        } else {
+          None
+        }
         val dataSize = DataSize(
           bytesCompressed = sizeInBytesIfKnown,
-          rows = None,
+          rows = rowCount,
           files = numOfFilesIfKnown)
         return DeltaScan(
           version = version,
-          files = getAllFiles(keepNumRecords),
+          files = files,
           total = dataSize,
           partition = dataSize,
           scanned = dataSize)(
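The new sumRowCounts helper uses an all-or-nothing Option fold: as soon as one AddFile is missing numPhysicalRecords, the whole sum collapses to None rather than reporting a misleading partial count. A standalone sketch of the same pattern, with a hypothetical FileStats case class standing in for AddFile:

// Standalone illustration of the Option-folding pattern used by sumRowCounts.
// FileStats is a hypothetical stand-in for AddFile; only numPhysicalRecords matters here.
object SumRowCountsSketch extends App {
  final case class FileStats(path: String, numPhysicalRecords: Option[Long])

  def sumRowCounts(files: Seq[FileStats]): Option[Long] =
    files.foldLeft(Option(0L)) { (accOpt, file) =>
      for {
        acc <- accOpt                    // once the accumulator is None, it stays None
        count <- file.numPhysicalRecords // a file without stats turns the result into None
      } yield acc + count
    }

  val allWithStats = Seq(FileStats("a.parquet", Some(40L)), FileStats("b.parquet", Some(60L)))
  val oneMissing = allWithStats :+ FileStats("c.parquet", None)

  assert(sumRowCounts(allWithStats).contains(100L)) // every file has stats: exact row count
  assert(sumRowCounts(oneMissing).isEmpty)          // any missing stats: row count is unknown
}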

PreparedDeltaFileIndexRowCountSuite.scala

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+/*
+ * Copyright (2021) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.delta.stats
+
+// scalastyle:off import.ordering.noEmptyLine
+import org.apache.spark.sql.delta.{DeltaLog, DeltaTable}
+import org.apache.spark.sql.delta.sources.DeltaSQLConf
+import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
+
+import org.apache.spark.sql.{DataFrame, QueryTest}
+import org.apache.spark.sql.functions._
+
+/**
+ * Test suite to verify when preparedScan.scanned.rows is populated in PreparedDeltaFileIndex,
+ * and the behavior of the DELTA_ALWAYS_COLLECT_STATS flag.
+ */
+class PreparedDeltaFileIndexRowCountSuite
+  extends QueryTest
+  with DeltaSQLCommandTest {
+
+  import testImplicits._
+
+  private def getDeltaScan(df: DataFrame): DeltaScan = {
+    val scans = df.queryExecution.optimizedPlan.collect {
+      case DeltaTable(prepared: PreparedDeltaFileIndex) => prepared.preparedScan
+    }
+    assert(scans.size == 1, s"Expected 1 DeltaScan, found ${scans.size}")
+    scans.head
+  }
+
+  /**
+   * Test utility that creates a partitioned Delta table and verifies scanned.rows behavior.
+   *
+   * @param alwaysCollectStats value of the DELTA_ALWAYS_COLLECT_STATS flag
+   * @param queryTransform function to transform the base DataFrame (apply filters)
+   * @param expectedRowsDefined whether scanned.rows should be defined
+   * @param expectedRowCount expected row count if defined (None to skip validation)
+   */
+  private def testRowCountBehavior(
+      alwaysCollectStats: Boolean,
+      queryTransform: DataFrame => DataFrame,
+      expectedRowsDefined: Boolean,
+      expectedRowCount: Option[Long] = None): Unit = {
+    withTempDir { dir =>
+      withSQLConf(DeltaSQLConf.DELTA_COLLECT_STATS.key -> "true") {
+        spark.range(100).toDF("id")
+          .withColumn("part", $"id" % 4)
+          .repartition(4)
+          .write.format("delta").partitionBy("part").save(dir.getAbsolutePath)
+      }
+
+      DeltaLog.clearCache()
+
+      withSQLConf(DeltaSQLConf.DELTA_ALWAYS_COLLECT_STATS.key -> alwaysCollectStats.toString) {
+        val df = spark.read.format("delta").load(dir.getAbsolutePath)
+        val scan = getDeltaScan(queryTransform(df))
+
+        if (expectedRowsDefined) {
+          assert(scan.scanned.rows.isDefined, "scanned.rows should be defined")
+          expectedRowCount.foreach { expected =>
+            assert(scan.scanned.rows.get == expected,
+              s"Expected $expected rows, got ${scan.scanned.rows.get}")
+          }
+        } else {
+          assert(scan.scanned.rows.isEmpty, "scanned.rows should be None")
+        }
+      }
+    }
+  }
+
+  // Define query cases: (name, transform function, always collects rows)
+  // Note: In the Edge code path, DataSkippingReaderEdge.filterOnPartitions always collects
+  // row counts for partition filter cases (see keepNumRecords = true in that method).
+  // So only "no filter" and "TrueLiteral filter" depend on the alwaysCollectStats flag.
+  private val queryCases: Seq[(String, DataFrame => DataFrame, Boolean)] = Seq(
+    ("no filter", identity[DataFrame], false),
+    ("TrueLiteral filter", _.where(lit(true)), false),
+    ("partition filter only", _.where($"part" === 1), false),
+    ("data filter", _.where($"id" === 50), true),
+    ("partition + data filter", _.where($"part" === 1).where($"id" === 50), true)
+  )
+
+  // Grid test: all query cases x flag values
+  for {
+    (caseName, queryTransform, alwaysCollectsRows) <- queryCases
+    alwaysCollectStats <- Seq(false, true)
+  } {
+    val flagDesc = s"alwaysCollectStats=$alwaysCollectStats"
+    // If the query type always collects rows, rows is always defined; otherwise depends on flag
+    val expectedRowsDefined = alwaysCollectsRows || alwaysCollectStats
+
+    test(s"$caseName - $flagDesc") {
+      testRowCountBehavior(
+        alwaysCollectStats = alwaysCollectStats,
+        queryTransform = queryTransform,
+        expectedRowsDefined = expectedRowsDefined
+      )
+    }
+  }
+
+  test("alwaysCollectStats with missing stats returns None") {
+    withTempDir { dir =>
+      // Create table without stats
+      withSQLConf(DeltaSQLConf.DELTA_COLLECT_STATS.key -> "false") {
+        spark.range(100).toDF("id")
+          .write.format("delta").save(dir.getAbsolutePath)
+      }
+
+      DeltaLog.clearCache()
+
+      withSQLConf(DeltaSQLConf.DELTA_ALWAYS_COLLECT_STATS.key -> "true") {
+        val df = spark.read.format("delta").load(dir.getAbsolutePath)
+        val scan = getDeltaScan(df)
+        assert(scan.scanned.rows.isEmpty, "scanned.rows should be None when stats are missing")
+      }
+    }
+  }
+}
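For context, the row count these tests assert on is what a downstream consumer would read off the prepared scan. A hedged sketch of that consumption, reusing the DeltaTable / PreparedDeltaFileIndex plan-collection pattern from getDeltaScan above; the active SparkSession named spark and the table path are assumptions for illustration:

// Hedged sketch: reading the planner-visible row count from a Delta query,
// using the same plan-collection pattern as getDeltaScan in the suite above.
// Assumes an active SparkSession named `spark` and a hypothetical table path.
import org.apache.spark.sql.delta.DeltaTable
import org.apache.spark.sql.delta.stats.PreparedDeltaFileIndex

val df = spark.read.format("delta").load("/tmp/delta/events")
val rowsIfKnown: Option[Long] = df.queryExecution.optimizedPlan.collect {
  case DeltaTable(index: PreparedDeltaFileIndex) => index.preparedScan.scanned.rows
}.headOption.flatten

// With alwaysCollectStats.enabled set to true (and stats collected at write time),
// rowsIfKnown is expected to be defined even for an unfiltered read.
println(s"Planner-visible row count: $rowsIfKnown")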
