
Commit 99eb524

WIP

1 parent 5b5cdd6 commit 99eb524

File tree

9 files changed: +222 -25 lines changed

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala

+5 -6

@@ -56,7 +56,6 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
  def recordExtractor(startingRecordNumber: Long,
                      dataStream: SimpleStream,
                      headerStream: SimpleStream,
-                     copybook: Copybook
                     ): Option[RawRecordExtractor] = {
    val rdwParams = RecordHeaderParameters(readerProperties.isRdwBigEndian, readerProperties.rdwAdjustment)

@@ -66,7 +65,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
    val bdwParamsOpt = bdwOpt.map(bdw => RecordHeaderParameters(bdw.isBigEndian, bdw.adjustment))
    val bdwDecoderOpt = bdwParamsOpt.map(bdwParams => new RecordHeaderDecoderBdw(bdwParams))

-   val reParams = RawRecordContext(startingRecordNumber, dataStream, headerStream, copybook, rdwDecoder, bdwDecoderOpt.getOrElse(rdwDecoder), readerProperties.reAdditionalInfo)
+   val reParams = RawRecordContext(startingRecordNumber, dataStream, headerStream, cobolSchema.copybook, rdwDecoder, bdwDecoderOpt.getOrElse(rdwDecoder), readerProperties.reAdditionalInfo)

    readerProperties.recordExtractor match {
      case Some(recordExtractorClass) =>

@@ -113,7 +112,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

@@ -123,7 +122,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

@@ -178,7 +177,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties.fileStartOffset,
      recordHeaderParser,
-     recordExtractor(0L, dataStream, headerStream, copybook),
+     recordExtractor(0L, dataStream, headerStream),
      inputSplitSizeRecords,
      inputSplitSizeMB,
      Some(copybook),

@@ -189,7 +188,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties.fileStartOffset,
      recordHeaderParser,
-     recordExtractor(0L, dataStream, headerStream, copybook),
+     recordExtractor(0L, dataStream, headerStream),
      inputSplitSizeRecords,
      inputSplitSizeMB,
      None,
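
In short, callers no longer pass the copybook to recordExtractor; it is taken from cobolSchema internally. A minimal sketch of the resulting call shape, as implied by the hunks above:

  // Signature after this change (the copybook parameter is gone):
  def recordExtractor(startingRecordNumber: Long,
                      dataStream: SimpleStream,
                      headerStream: SimpleStream): Option[RawRecordExtractor]

  // Call sites now pass only the streams, e.g.:
  // recordExtractor(startingRecordIndex, dataStream, headerStream)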

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala

+1

@@ -117,6 +117,7 @@ case class ReaderParameters(
    decodeBinaryAsHex: Boolean = false,
    dropGroupFillers: Boolean = false,
    dropValueFillers: Boolean = true,
+   enableSelfChecks: Boolean = true,
    fillerNamingPolicy: FillerNamingPolicy = FillerNamingPolicy.SequenceNumbers,
    nonTerminals: Seq[String] = Nil,
    occursMappings: Map[String, Map[String, Int]] = Map(),

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala

+1

@@ -418,6 +418,7 @@ object CobolParametersParser extends Logging {
      parameters.decodeBinaryAsHex,
      parameters.dropGroupFillers,
      parameters.dropValueFillers,
+     parameters.enableSelfChecks,
      parameters.fillerNamingPolicy,
      parameters.nonTerminals,
      parameters.occursMappings,
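
For reference, a minimal sketch of toggling this from the Spark side. It assumes the option key "enable_self_checks" used in the tests below maps to ReaderParameters.enableSelfChecks (default true); the paths and extractor class are placeholders:

  val df = spark.read
    .format("cobol")
    .option("copybook", "/path/to/copybook.cpy")            // placeholder path
    .option("record_extractor", "com.example.MyExtractor")  // hypothetical custom extractor
    .option("enable_self_checks", "false")                  // opt out of the index self-check
    .load("/path/to/data")                                  // placeholder path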

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/reader/VarLenNestedReader.scala

+2 -2

@@ -60,7 +60,7 @@ final class VarLenNestedReader(copybookContents: Seq[String],
      dataStream,
      getReaderProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

@@ -72,7 +72,7 @@ final class VarLenNestedReader(copybookContents: Seq[String],
      dataStream,
      getReaderProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala

+100 -15

@@ -25,6 +25,8 @@ import org.apache.spark.sql.SQLContext
import za.co.absa.cobrix.cobol.internal.Logging
import za.co.absa.cobrix.cobol.reader.common.Constants
import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
+import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
+import za.co.absa.cobrix.cobol.reader.{VarLenNestedReader => ReaderVarLenNestedReader}
import za.co.absa.cobrix.spark.cobol.reader.{Reader, VarLenReader}
import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
import za.co.absa.cobrix.spark.cobol.source.parameters.LocalityParameters

@@ -111,10 +113,15 @@ private[source] object IndexBuilder extends Logging {
  private[cobol] def buildIndexForVarLenReader(filesList: Array[FileWithOrder],
                                               reader: VarLenReader,
                                               sqlContext: SQLContext): RDD[SparseIndexEntry] = {
-   val filesRDD = sqlContext.sparkContext.parallelize(filesList, filesList.length)
    val conf = sqlContext.sparkContext.hadoopConfiguration
    val sconf = new SerializableConfiguration(conf)

+   if (reader.getReaderProperties.enableSelfChecks && filesList.nonEmpty) {
+     selfCheckForIndexCompatibility(reader, filesList.head.filePath, conf)
+   }
+
+   val filesRDD = sqlContext.sparkContext.parallelize(filesList, filesList.length)
+
    val indexRDD = filesRDD.mapPartitions(
      partition => {
        partition.flatMap(row => {

@@ -149,36 +156,114 @@ private[source] object IndexBuilder extends Logging {
                                        config: Configuration,
                                        reader: VarLenReader): ArrayBuffer[SparseIndexEntry] = {
    val filePath = fileWithOrder.filePath
-   val path = new Path(filePath)
    val fileOrder = fileWithOrder.order
+   val startOffset = reader.getReaderProperties.fileStartOffset
+   val endOffset = reader.getReaderProperties.fileEndOffset
+
+   logger.info(s"Going to generate index for the file: $filePath")
+
+   val (inputStream, headerStream, maximumBytes) = getStreams(filePath, startOffset, endOffset, config)
+   val index = reader.generateIndex(inputStream, headerStream, fileOrder, reader.isRdwBigEndian)
+
+   val indexWithEndOffset = if (maximumBytes > 0 ){
+     index.map(entry => if (entry.offsetTo == -1) entry.copy(offsetTo = startOffset + maximumBytes) else entry)
+   } else {
+     index
+   }
+
+   indexWithEndOffset
+ }
+
+ private[cobol] def getStreams(filePath: String,
+                               fileStartOffset: Long,
+                               fileEndOffset: Long,
+                               config: Configuration
+                              ): (SimpleStream, SimpleStream, Long) = {
+   val path = new Path(filePath)
    val fileSystem = path.getFileSystem(config)

-   val startOffset = reader.getReaderProperties.fileStartOffset
-   val maximumBytes = if (reader.getReaderProperties.fileEndOffset == 0) {
+   val startOffset = fileStartOffset
+   val maximumBytes = if (fileEndOffset == 0) {
      0
    } else {
-     val bytesToRead = fileSystem.getContentSummary(path).getLength - reader.getReaderProperties.fileEndOffset - startOffset
+     val bytesToRead = fileSystem.getContentSummary(path).getLength - fileEndOffset - startOffset
      if (bytesToRead < 0)
        0
      else
        bytesToRead
    }

-   logger.info(s"Going to generate index for the file: $filePath")
    val inputStream = new FileStreamer(filePath, fileSystem, startOffset, maximumBytes)
    val headerStream = new FileStreamer(filePath, fileSystem)
-   val index = reader.generateIndex(inputStream, headerStream,
-     fileOrder, reader.isRdwBigEndian)

-   val indexWithEndOffset = if (maximumBytes > 0 ){
-     index.map(entry => if (entry.offsetTo == -1) entry.copy(offsetTo = startOffset + maximumBytes) else entry)
-   } else {
-     index
-   }
-
-   indexWithEndOffset
+   (inputStream, headerStream, maximumBytes)
  }

+ private[cobol] def selfCheckForIndexCompatibility(reader: VarLenReader, filePath: String, config: Configuration): Unit = {
+   if (!reader.isInstanceOf[ReaderVarLenNestedReader[_]])
+     return
+
+   val readerProperties = reader.getReaderProperties
+
+   val startOffset = readerProperties.fileStartOffset
+   val endOffset = readerProperties.fileEndOffset
+
+   readerProperties.recordExtractor.foreach { recordExtractorClass =>
+     val (dataStream, headerStream, _) = getStreams(filePath, startOffset, endOffset, config)
+
+     val extractorOpt = reader.asInstanceOf[ReaderVarLenNestedReader[_]].recordExtractor(0, dataStream, headerStream)
+
+     var offset = -1L
+     var record = Array[Byte]()
+
+     extractorOpt.foreach { extractor =>
+       if (extractor.hasNext) {
+         // Getting the first record, if available
+         extractor.next()
+         offset = extractor.offset // Saving offset to jump to later
+
+         if (extractor.hasNext) {
+           // Getting the second record, if available
+           record = extractor.next() // Saving the record to check later
+
+           dataStream.close()
+           headerStream.close()
+
+           // Getting new streams and a record extractor that points directly to the second record
+           val (dataStream2, headerStream2, _) = getStreams(filePath, offset, endOffset, config)
+           val extractorOpt2 = reader.asInstanceOf[ReaderVarLenNestedReader[_]].recordExtractor(1, dataStream2, headerStream2)
+
+           extractorOpt2.foreach { extractor2 =>
+             if (!extractor2.hasNext) {
+               // If the extractor refuses to return the second record, it is faulty in terms of indexing support.
+               throw new RuntimeException(
+                 s"Record extractor self-check failed. When reading from a non-zero offset the extractor returned hasNext()=false. " +
+                   "Please, use 'enable_indexes = false'. " +
+                   s"File: $filePath, offset: $offset"
+               )
+             }
+
+             // Getting the second record from the extractor pointing to the second record offset at the start.
+             val expectedRecord = extractor2.next()
+
+             if (!expectedRecord.sameElements(record)) {
+               // Records should match. If they don't, the record extractor is faulty in terms of indexing support.
+               throw new RuntimeException(
+                 s"Record extractor self-check failed. The record extractor returned wrong record when started from non-zero offset. " +
+                   "Please, use 'enable_indexes = false'. " +
+                   s"File: $filePath, offset: $offset"
+               )
+             } else {
+               logger.info(s"Record extractor self-check passed. File: $filePath, offset: $offset")
+             }
+             dataStream2.close()
+             headerStream2.close()
+           }
+         }
+       }
+     }
+   }
+ }

  private[cobol] def getBlockLengthByIndexEntry(entry: SparseIndexEntry): Long = {
    val indexedLength = if (entry.offsetTo - entry.offsetFrom > 0)
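
In effect, the self-check simulates an index split: it reads the first two records sequentially, remembers the offset at which the second one starts, reopens the file at that offset, and verifies that a fresh extractor returns the same second record. As the error messages suggest, an extractor that fails the check can still be used with sparse indexing turned off; a minimal, assumed sketch of that workaround:

  spark.read
    .format("cobol")
    .option("copybook", "/path/to/copybook.cpy")            // placeholder path
    .option("record_extractor", "com.example.MyExtractor")  // hypothetical custom extractor
    .option("enable_indexes", "false")                      // no sparse index, so no self-check
    .load("/path/to/data")                                  // placeholder path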

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/mocks/FixedRecordExtractor.scala

+16

@@ -1,3 +1,19 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package za.co.absa.cobrix.spark.cobol.mocks

 import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}
spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/mocks/FixedRecordExtractorBroken.scala

+47

@@ -0,0 +1,47 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.mocks
+
+import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}
+
+/**
+  * This record extractor returns hasNext=false when started with a non-zero offset.
+  */
+class FixedRecordExtractorBroken(ctx: RawRecordContext) extends Serializable with RawRecordExtractor {
+  ctx.headerStream.close()
+
+  private var recordNumber = ctx.startingRecordNumber
+
+  private val startingOffset = ctx.inputStream.offset
+
+  override def offset: Long = ctx.inputStream.offset
+
+  override def hasNext: Boolean = startingOffset == 0 && !ctx.inputStream.isEndOfStream
+
+  @throws[NoSuchElementException]
+  override def next(): Array[Byte] = {
+    if (!hasNext) {
+      throw new NoSuchElementException
+    }
+
+    val rawRecord = ctx.inputStream.next(2)
+
+    recordNumber += 1
+
+    rawRecord
+  }
+}
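
For contrast, an index-compatible counterpart would not require the stream to start at offset zero, so it can resume from wherever an index entry points. A minimal sketch, assumed for illustration and not part of this commit:

  class FixedRecordExtractorCompliant(ctx: RawRecordContext) extends Serializable with RawRecordExtractor {
    ctx.headerStream.close()

    override def offset: Long = ctx.inputStream.offset

    // No dependency on the starting offset: the extractor works from any record boundary.
    override def hasNext: Boolean = !ctx.inputStream.isEndOfStream

    @throws[NoSuchElementException]
    override def next(): Array[Byte] = {
      if (!hasNext) {
        throw new NoSuchElementException
      }
      ctx.inputStream.next(2) // fixed 2-byte records, as in the mocks
    }
  }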

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/mocks/FixedRecordExtractorNoIndex.scala

+16

@@ -1,3 +1,19 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package za.co.absa.cobrix.spark.cobol.mocks

 import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test39RecordExtractorSelfCheck.scala

+34 -2

@@ -35,6 +35,7 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with

      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractor",
          "input_split_records" -> "2")
        )

@@ -47,9 +48,10 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with
    }

  "Record extractor not supporting indexes" should {
-   "should fail self checks" ignore /* Not implemented yet */ {
+   "should fail self checks when offsets are not properly handled" in {
      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorNoIndex",
          "input_split_records" -> "2")
        )

@@ -59,7 +61,26 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with
          df.count()
        }

-       assert(ex.getMessage == "")
+       assert(ex.getMessage.contains("Record extractor self-check failed. The record extractor returned wrong record when started from non-zero offset"))
+       assert(ex.getMessage.contains("offset: 4"))
+     }
+   }
+
+   "should fail self checks when the extractor returns hasNext=false unexpectedly" in {
+     withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
+       val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
+         "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorBroken",
+         "input_split_records" -> "2")
+       )
+
+       val ex = intercept[RuntimeException] {
+         df.show(false)
+         df.count()
+       }
+
+       assert(ex.getMessage.contains("Record extractor self-check failed. When reading from a non-zero offset the extractor returned hasNext()=false"))
+       assert(ex.getMessage.contains("offset: 2"))
      }
    }

@@ -76,6 +97,17 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with
      }
    }

+   "should still work if there is just one record" in {
+     withTempBinFile("custom_re", ".dat", "AA".getBytes) { tmpFileName =>
+       val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
+         "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorNoIndex")
+       )
+
+       assert(df.count() == 1)
+     }
+   }
+
    "should still work if indexes are disabled" in {
      val expected = """[{"A":"AA"},{"A":"BB"},{"A":"CC"},{"A":"DD"},{"A":"EE"},{"A":"FF"}]"""