Skip to content

Commit 5b5cdd6

Browse files
committed
#759 Add tests for self-checks.
1 parent f417d42 commit 5b5cdd6

File tree

3 files changed

+183
-0
lines changed

3 files changed

+183
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package za.co.absa.cobrix.spark.cobol.mocks
2+
3+
import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}
4+
5+
/**
  * A test mock of a raw record extractor that treats the input as a stream of
  * fixed-size records, 2 bytes each.
  *
  * The header stream is not needed by this extractor, so it is closed immediately.
  */
class FixedRecordExtractor(ctx: RawRecordContext) extends Serializable with RawRecordExtractor {
  ctx.headerStream.close()

  // Number of records produced so far, counted from the starting record
  // number the context was created with.
  private var recordNumber = ctx.startingRecordNumber

  override def offset: Long = ctx.inputStream.offset

  override def hasNext: Boolean = !ctx.inputStream.isEndOfStream

  @throws[NoSuchElementException]
  override def next(): Array[Byte] = {
    if (!hasNext) throw new NoSuchElementException

    val record = ctx.inputStream.next(2)

    recordNumber += 1
    record
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package za.co.absa.cobrix.spark.cobol.mocks
2+
3+
import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}
4+
5+
/**
  * A test mock of a raw record extractor producing fixed-size 2-byte records
  * that is deliberately NOT index compatible.
  *
  * Records are fetched eagerly, so `offset` reports the offset of the record
  * that follows the current one instead of the current record's start offset.
  */
class FixedRecordExtractorNoIndex(ctx: RawRecordContext) extends Serializable with RawRecordExtractor {
  ctx.headerStream.close()

  private var currentOffset = ctx.inputStream.offset
  private var recordNumber = ctx.startingRecordNumber

  // The pre-read next record; None once the input stream is exhausted.
  private var currentRecord = fetchRecord()

  // Because of the eager pre-read above, this offset does not point to the
  // beginning of the current record, which is what breaks index support.
  override def offset: Long = currentOffset

  override def hasNext: Boolean = currentRecord.nonEmpty

  @throws[NoSuchElementException]
  override def next(): Array[Byte] = {
    if (!hasNext) throw new NoSuchElementException

    val record = currentRecord.get

    // To make this extractor index compatible, the next 2 lines would need
    // to be swapped (update the offset before pre-reading the next record).
    currentRecord = fetchRecord()
    currentOffset = ctx.inputStream.offset

    recordNumber += 1
    record
  }

  // Reads the next 2-byte record, or returns None at end of stream.
  def fetchRecord(): Option[Array[Byte]] = {
    if (ctx.inputStream.isEndOfStream)
      None
    else
      Option(ctx.inputStream.next(2))
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
* Copyright 2018 ABSA Group Limited
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package za.co.absa.cobrix.spark.cobol.source.integration
18+
19+
import org.apache.spark.sql.DataFrame
20+
import org.scalatest.wordspec.AnyWordSpec
21+
import za.co.absa.cobrix.spark.cobol.mocks.CustomRecordExtractorMock
22+
import za.co.absa.cobrix.spark.cobol.source.base.SparkTestBase
23+
import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture
24+
25+
/**
  * Integration tests for the self-checks applied to custom raw record extractors.
  *
  * Uses two mock extractors: one whose reported offsets are index compatible
  * and one whose offsets are not.
  */
class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with BinaryFileFixture {
  // A copybook with a single 2-byte field, matching the mock extractors'
  // fixed record size.
  private val copybook =
    """      01  R.
                03 A        PIC X(2).
    """

  // Six 2-byte records.
  private val data = "AABBCCDDEEFF"

  "Record extractor supporting indexes" should {
    "should work with indexes" in {
      val expected = """[{"A":"AA"},{"A":"BB"},{"A":"CC"},{"A":"DD"},{"A":"EE"},{"A":"FF"}]"""

      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractor",
          "input_split_records" -> "2")
        )

        val actual = df.toJSON.collect().mkString("[", ",", "]")

        assert(actual == expected)
      }
    }
  }

  "Record extractor not supporting indexes" should {
    "should fail self checks" ignore /* Not implemented yet */ {
      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorNoIndex",
          "input_split_records" -> "2")
        )

        val ex = intercept[RuntimeException] {
          df.show(false)
          df.count()
        }

        assert(ex.getMessage == "")
      }
    }

    "should still work if self checks is turned off" in {
      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
          "enable_self_checks" -> "false",
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorNoIndex",
          "input_split_records" -> "2")
        )

        // No guarantees regarding the correct record count at this point
        assert(df.count() > 4)
      }
    }

    "should still work if indexes are disabled" in {
      val expected = """[{"A":"AA"},{"A":"BB"},{"A":"CC"},{"A":"DD"},{"A":"EE"},{"A":"FF"}]"""

      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorNoIndex",
          "enable_indexes" -> "false")
        )

        val actual = df.toJSON.collect().mkString("[", ",", "]")

        assert(actual == expected)
      }
    }
  }

  // Reads the given binary file with the `cobol` data source using the test
  // copybook, ASCII encoding, and any extra options supplied by the test.
  private def getDataFrame(inputPath: String, extraOptions: Map[String, String] = Map.empty[String, String]): DataFrame = {
    spark
      .read
      .format("cobol")
      .option("copybook_contents", copybook)
      .option("encoding", "ascii")
      .options(extraOptions)
      .load(inputPath)
  }
}

0 commit comments

Comments
 (0)