Revert "Skip getFileStatus call during iceberg to delta clone" (#3855)

This reverts commit a2ba9e9. The approach needs to be revisited and prepared in a separate PR later.

#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

## How was this patch tested?

## Does this PR introduce _any_ user-facing changes?
delta-io · Nov 8, 2024 · f4dbc9b · f4dbc9b
1 parent 6257799
commit f4dbc9b
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 28 deletions.
diff --git a/iceberg/src/main/scala/org/apache/spark/sql/delta/IcebergFileManifest.scala b/iceberg/src/main/scala/org/apache/spark/sql/delta/IcebergFileManifest.scala
@@ -109,16 +109,12 @@ class IcebergFileManifest(
     spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_CONVERT_ICEBERG_UNSAFE_MOR_TABLE_ENABLE)
 
     var numFiles = 0L
-    val skipGetFileStatus = spark.sessionState.conf.getConf(
-      DeltaSQLConf.DELTA_CLONE_ICEBERG_SKIP_GETFILESTATUS)
-    val snapshotTimestamp: Option[Long] = Option(table.currentSnapshot()).map(_.timestampMillis())
-
     val res = table.newScan().planFiles().iterator().asScala.grouped(schemaBatchSize).map { batch =>
       logInfo(log"Getting file statuses for a batch of " +
         log"${MDC(DeltaLogKeys.BATCH_SIZE, batch.size)} of files; " +
         log"finished ${MDC(DeltaLogKeys.NUM_FILES, numFiles)} files so far")
       numFiles += batch.length
-      val filePathWithPartValuesAndSize = batch.map { fileScanTask =>
+      val filePathWithPartValues = batch.map { fileScanTask =>
         val filePath = fileScanTask.file().path().toString
         // If an Iceberg table has deletion file associated with the data file (Supported in
         // Iceberg V2, either position deletes or equality deletes), we could not convert directly.
@@ -133,23 +129,18 @@ class IcebergFileManifest(
           Some(convertIcebergPartitionToPartitionValues(
             fileScanTask.file().partition()))
         } else None
-        (filePath, partitionValues, fileScanTask.file.fileSizeInBytes())
+        (filePath, partitionValues)
       }
-      val numParallelism = Math.min(Math.max(filePathWithPartValuesAndSize.size, 1),
+      val numParallelism = Math.min(Math.max(filePathWithPartValues.size, 1),
         spark.sparkContext.defaultParallelism)
 
-      val rdd = spark.sparkContext.parallelize(filePathWithPartValuesAndSize, numParallelism)
+      val rdd = spark.sparkContext.parallelize(filePathWithPartValues, numParallelism)
         .mapPartitions { iterator =>
-          iterator.map { case (filePath, partValues, size) =>
-            val serializableFileStatus = (skipGetFileStatus, snapshotTimestamp) match {
-              case (true, Some(ts)) =>
-                SerializableFileStatus(filePath, size, isDir = false, ts)
-              case _ =>
-                val path = new Path(filePath)
-                val fs = path.getFileSystem(conf.value.value)
-                SerializableFileStatus.fromStatus(fs.getFileStatus(path))
-            }
-            ConvertTargetFile(serializableFileStatus, partValues)
+          iterator.map { case (filePath, partValues) =>
+            val path = new Path(filePath)
+            val fs = path.getFileSystem(conf.value.value)
+            val fileStatus = fs.getFileStatus(path)
+            ConvertTargetFile(SerializableFileStatus.fromStatus(fileStatus), partValues)
           }
         }
       spark.createDataset(rdd)

diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/CloneTableCommand.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/CloneTableCommand.scala
@@ -243,7 +243,7 @@ abstract class CloneConvertedSource(spark: SparkSession) extends CloneSource {
         val basePath = new Path(baseDir)
         val fs = basePath.getFileSystem(conf.value.value)
         targetFile.map(ConvertUtils.createAddFile(
-          _, basePath, fs, SQLConf.get, Some(partitionSchema), useAbsolutePath = true))
+          _, basePath, fs, SQLConf.get, Some(partitionSchema)))
       }
     }
   }

diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala b/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala
@@ -1717,15 +1717,6 @@ trait DeltaSQLConfBase {
       .booleanConf
       .createWithDefault(true)
 
-  val DELTA_CLONE_ICEBERG_SKIP_GETFILESTATUS = {
-    buildConf("clone.IcebergSkipGetFileStatus")
-      .internal()
-      .doc("If clone with Iceberg source can skip getFileStatus and " +
-        "use snapshot timestamp as the modificationTime for Delta AddFile")
-      .booleanConf
-      .createWithDefault(true)
-  }
-
   val DELTA_OPTIMIZE_METADATA_QUERY_ENABLED =
     buildConf("optimizeMetadataQuery.enabled")
       .internal()