
Commit 99eb524

WIP

1 parent 5b5cdd6 commit 99eb524

File tree

9 files changed: +222 -25 lines changed

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/VarLenNestedReader.scala

+5 -6

@@ -56,7 +56,6 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
  def recordExtractor(startingRecordNumber: Long,
                      dataStream: SimpleStream,
                      headerStream: SimpleStream,
-                     copybook: Copybook
                     ): Option[RawRecordExtractor] = {
    val rdwParams = RecordHeaderParameters(readerProperties.isRdwBigEndian, readerProperties.rdwAdjustment)

@@ -66,7 +65,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
    val bdwParamsOpt = bdwOpt.map(bdw => RecordHeaderParameters(bdw.isBigEndian, bdw.adjustment))
    val bdwDecoderOpt = bdwParamsOpt.map(bdwParams => new RecordHeaderDecoderBdw(bdwParams))

-   val reParams = RawRecordContext(startingRecordNumber, dataStream, headerStream, copybook, rdwDecoder, bdwDecoderOpt.getOrElse(rdwDecoder), readerProperties.reAdditionalInfo)
+   val reParams = RawRecordContext(startingRecordNumber, dataStream, headerStream, cobolSchema.copybook, rdwDecoder, bdwDecoderOpt.getOrElse(rdwDecoder), readerProperties.reAdditionalInfo)

    readerProperties.recordExtractor match {
      case Some(recordExtractorClass) =>

@@ -113,7 +112,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

@@ -123,7 +122,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

@@ -178,7 +177,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties.fileStartOffset,
      recordHeaderParser,
-     recordExtractor(0L, dataStream, headerStream, copybook),
+     recordExtractor(0L, dataStream, headerStream),
      inputSplitSizeRecords,
      inputSplitSizeMB,
      Some(copybook),

@@ -189,7 +188,7 @@ class VarLenNestedReader[T: ClassTag](copybookContents: Seq[String],
      dataStream,
      readerProperties.fileStartOffset,
      recordHeaderParser,
-     recordExtractor(0L, dataStream, headerStream, copybook),
+     recordExtractor(0L, dataStream, headerStream),
      inputSplitSizeRecords,
      inputSplitSizeMB,
      None,
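
In short, callers no longer pass the copybook to recordExtractor; it is taken from cobolSchema internally. A minimal sketch of the resulting call shape, as implied by the hunks above:

  // Signature after this change (the copybook parameter is gone):
  def recordExtractor(startingRecordNumber: Long,
                      dataStream: SimpleStream,
                      headerStream: SimpleStream): Option[RawRecordExtractor]

  // Call sites now pass only the streams, e.g.:
  // recordExtractor(startingRecordIndex, dataStream, headerStream)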

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala

+1

@@ -117,6 +117,7 @@ case class ReaderParameters(
    decodeBinaryAsHex: Boolean = false,
    dropGroupFillers: Boolean = false,
    dropValueFillers: Boolean = true,
+   enableSelfChecks: Boolean = true,
    fillerNamingPolicy: FillerNamingPolicy = FillerNamingPolicy.SequenceNumbers,
    nonTerminals: Seq[String] = Nil,
    occursMappings: Map[String, Map[String, Int]] = Map(),

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/parameters/CobolParametersParser.scala

+1

@@ -418,6 +418,7 @@ object CobolParametersParser extends Logging {
      parameters.decodeBinaryAsHex,
      parameters.dropGroupFillers,
      parameters.dropValueFillers,
+     parameters.enableSelfChecks,
      parameters.fillerNamingPolicy,
      parameters.nonTerminals,
      parameters.occursMappings,
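
For reference, a minimal sketch of toggling this from the Spark side. It assumes the option key "enable_self_checks" used in the tests below maps to ReaderParameters.enableSelfChecks (default true); the paths and extractor class are placeholders:

  val df = spark.read
    .format("cobol")
    .option("copybook", "/path/to/copybook.cpy")            // placeholder path
    .option("record_extractor", "com.example.MyExtractor")  // hypothetical custom extractor
    .option("enable_self_checks", "false")                  // opt out of the index self-check
    .load("/path/to/data")                                  // placeholder path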

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/reader/VarLenNestedReader.scala

+2 -2

@@ -60,7 +60,7 @@ final class VarLenNestedReader(copybookContents: Seq[String],
      dataStream,
      getReaderProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

@@ -72,7 +72,7 @@ final class VarLenNestedReader(copybookContents: Seq[String],
      dataStream,
      getReaderProperties,
      recordHeaderParser,
-     recordExtractor(startingRecordIndex, dataStream, headerStream, cobolSchema.copybook),
+     recordExtractor(startingRecordIndex, dataStream, headerStream),
      fileNumber,
      startingRecordIndex,
      startingFileOffset,

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/source/index/IndexBuilder.scala

+100 -15

@@ -25,6 +25,8 @@ import org.apache.spark.sql.SQLContext
import za.co.absa.cobrix.cobol.internal.Logging
import za.co.absa.cobrix.cobol.reader.common.Constants
import za.co.absa.cobrix.cobol.reader.index.entry.SparseIndexEntry
+import za.co.absa.cobrix.cobol.reader.stream.SimpleStream
+import za.co.absa.cobrix.cobol.reader.{VarLenNestedReader => ReaderVarLenNestedReader}
import za.co.absa.cobrix.spark.cobol.reader.{Reader, VarLenReader}
import za.co.absa.cobrix.spark.cobol.source.SerializableConfiguration
import za.co.absa.cobrix.spark.cobol.source.parameters.LocalityParameters

@@ -111,10 +113,15 @@ private[source] object IndexBuilder extends Logging {
  private[cobol] def buildIndexForVarLenReader(filesList: Array[FileWithOrder],
                                               reader: VarLenReader,
                                               sqlContext: SQLContext): RDD[SparseIndexEntry] = {
-   val filesRDD = sqlContext.sparkContext.parallelize(filesList, filesList.length)
    val conf = sqlContext.sparkContext.hadoopConfiguration
    val sconf = new SerializableConfiguration(conf)

+   if (reader.getReaderProperties.enableSelfChecks && filesList.nonEmpty) {
+     selfCheckForIndexCompatibility(reader, filesList.head.filePath, conf)
+   }
+
+   val filesRDD = sqlContext.sparkContext.parallelize(filesList, filesList.length)
+
    val indexRDD = filesRDD.mapPartitions(
      partition => {
        partition.flatMap(row => {

@@ -149,36 +156,114 @@ private[source] object IndexBuilder extends Logging {
                                        config: Configuration,
                                        reader: VarLenReader): ArrayBuffer[SparseIndexEntry] = {
    val filePath = fileWithOrder.filePath
-   val path = new Path(filePath)
    val fileOrder = fileWithOrder.order
+   val startOffset = reader.getReaderProperties.fileStartOffset
+   val endOffset = reader.getReaderProperties.fileEndOffset
+
+   logger.info(s"Going to generate index for the file: $filePath")
+
+   val (inputStream, headerStream, maximumBytes) = getStreams(filePath, startOffset, endOffset, config)
+   val index = reader.generateIndex(inputStream, headerStream, fileOrder, reader.isRdwBigEndian)
+
+   val indexWithEndOffset = if (maximumBytes > 0 ){
+     index.map(entry => if (entry.offsetTo == -1) entry.copy(offsetTo = startOffset + maximumBytes) else entry)
+   } else {
+     index
+   }
+
+   indexWithEndOffset
+ }
+
+ private[cobol] def getStreams(filePath: String,
+                               fileStartOffset: Long,
+                               fileEndOffset: Long,
+                               config: Configuration
+                              ): (SimpleStream, SimpleStream, Long) = {
+   val path = new Path(filePath)
    val fileSystem = path.getFileSystem(config)

-   val startOffset = reader.getReaderProperties.fileStartOffset
-   val maximumBytes = if (reader.getReaderProperties.fileEndOffset == 0) {
+   val startOffset = fileStartOffset
+   val maximumBytes = if (fileEndOffset == 0) {
      0
    } else {
-     val bytesToRead = fileSystem.getContentSummary(path).getLength - reader.getReaderProperties.fileEndOffset - startOffset
+     val bytesToRead = fileSystem.getContentSummary(path).getLength - fileEndOffset - startOffset
      if (bytesToRead < 0)
        0
      else
        bytesToRead
    }

-   logger.info(s"Going to generate index for the file: $filePath")
    val inputStream = new FileStreamer(filePath, fileSystem, startOffset, maximumBytes)
    val headerStream = new FileStreamer(filePath, fileSystem)
-   val index = reader.generateIndex(inputStream, headerStream,
-     fileOrder, reader.isRdwBigEndian)

-   val indexWithEndOffset = if (maximumBytes > 0 ){
-     index.map(entry => if (entry.offsetTo == -1) entry.copy(offsetTo = startOffset + maximumBytes) else entry)
-   } else {
-     index
-   }
-
-   indexWithEndOffset
+   (inputStream, headerStream, maximumBytes)
  }

+ private[cobol] def selfCheckForIndexCompatibility(reader: VarLenReader, filePath: String, config: Configuration): Unit = {
+   if (!reader.isInstanceOf[ReaderVarLenNestedReader[_]])
+     return
+
+   val readerProperties = reader.getReaderProperties
+
+   val startOffset = readerProperties.fileStartOffset
+   val endOffset = readerProperties.fileEndOffset
+
+   readerProperties.recordExtractor.foreach { recordExtractorClass =>
+     val (dataStream, headerStream, _) = getStreams(filePath, startOffset, endOffset, config)
+
+     val extractorOpt = reader.asInstanceOf[ReaderVarLenNestedReader[_]].recordExtractor(0, dataStream, headerStream)
+
+     var offset = -1L
+     var record = Array[Byte]()
+
+     extractorOpt.foreach { extractor =>
+       if (extractor.hasNext) {
+         // Getting the first record, if available
+         extractor.next()
+         offset = extractor.offset // Saving offset to jump to later
+
+         if (extractor.hasNext) {
+           // Getting the second record, if available
+           record = extractor.next() // Saving the record to check later
+
+           dataStream.close()
+           headerStream.close()
+
+           // Getting new streams and a record extractor that points directly to the second record
+           val (dataStream2, headerStream2, _) = getStreams(filePath, offset, endOffset, config)
+           val extractorOpt2 = reader.asInstanceOf[ReaderVarLenNestedReader[_]].recordExtractor(1, dataStream2, headerStream2)
+
+           extractorOpt2.foreach { extractor2 =>
+             if (!extractor2.hasNext) {
+               // If the extractor refuses to return the second record, it is faulty in terms of indexing support.
+               throw new RuntimeException(
+                 s"Record extractor self-check failed. When reading from a non-zero offset the extractor returned hasNext()=false. " +
+                   "Please, use 'enable_indexes = false'. " +
+                   s"File: $filePath, offset: $offset"
+               )
+             }
+
+             // Getting the second record from the extractor pointing to the second record offset at the start.
+             val expectedRecord = extractor2.next()
+
+             if (!expectedRecord.sameElements(record)) {
+               // Records should match. If they don't, the record extractor is faulty in terms of indexing support.
+               throw new RuntimeException(
+                 s"Record extractor self-check failed. The record extractor returned wrong record when started from non-zero offset. " +
+                   "Please, use 'enable_indexes = false'. " +
+                   s"File: $filePath, offset: $offset"
+               )
+             } else {
+               logger.info(s"Record extractor self-check passed. File: $filePath, offset: $offset")
+             }
+             dataStream2.close()
+             headerStream2.close()
+           }
+         }
+       }
+     }
+   }
+ }

  private[cobol] def getBlockLengthByIndexEntry(entry: SparseIndexEntry): Long = {
    val indexedLength = if (entry.offsetTo - entry.offsetFrom > 0)
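
In effect, the self-check simulates an index split: it reads the first two records sequentially, remembers the offset at which the second one starts, reopens the file at that offset, and verifies that a fresh extractor returns the same second record. As the error messages suggest, an extractor that fails the check can still be used with sparse indexing turned off; a minimal, assumed sketch of that workaround:

  spark.read
    .format("cobol")
    .option("copybook", "/path/to/copybook.cpy")            // placeholder path
    .option("record_extractor", "com.example.MyExtractor")  // hypothetical custom extractor
    .option("enable_indexes", "false")                      // no sparse index, so no self-check
    .load("/path/to/data")                                  // placeholder path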

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/mocks/FixedRecordExtractor.scala

+16

@@ -1,3 +1,19 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package za.co.absa.cobrix.spark.cobol.mocks

 import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}
spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/mocks/FixedRecordExtractorBroken.scala

+47

@@ -0,0 +1,47 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.cobrix.spark.cobol.mocks
+
+import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}
+
+/**
+  * This record extractor returns hasNext=false when started with a non-zero offset.
+  */
+class FixedRecordExtractorBroken(ctx: RawRecordContext) extends Serializable with RawRecordExtractor {
+  ctx.headerStream.close()
+
+  private var recordNumber = ctx.startingRecordNumber
+
+  private val startingOffset = ctx.inputStream.offset
+
+  override def offset: Long = ctx.inputStream.offset
+
+  override def hasNext: Boolean = startingOffset == 0 && !ctx.inputStream.isEndOfStream
+
+  @throws[NoSuchElementException]
+  override def next(): Array[Byte] = {
+    if (!hasNext) {
+      throw new NoSuchElementException
+    }
+
+    val rawRecord = ctx.inputStream.next(2)
+
+    recordNumber += 1
+
+    rawRecord
+  }
+}
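
For contrast, an index-compatible counterpart would not require the stream to start at offset zero, so it can resume from wherever an index entry points. A minimal sketch, assumed for illustration and not part of this commit:

  class FixedRecordExtractorCompliant(ctx: RawRecordContext) extends Serializable with RawRecordExtractor {
    ctx.headerStream.close()

    override def offset: Long = ctx.inputStream.offset

    // No dependency on the starting offset: the extractor works from any record boundary.
    override def hasNext: Boolean = !ctx.inputStream.isEndOfStream

    @throws[NoSuchElementException]
    override def next(): Array[Byte] = {
      if (!hasNext) {
        throw new NoSuchElementException
      }
      ctx.inputStream.next(2) // fixed 2-byte records, as in the mocks
    }
  }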

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/mocks/FixedRecordExtractorNoIndex.scala

+16

@@ -1,3 +1,19 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package za.co.absa.cobrix.spark.cobol.mocks

 import za.co.absa.cobrix.cobol.reader.extractors.raw.{RawRecordContext, RawRecordExtractor}

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/integration/Test39RecordExtractorSelfCheck.scala

+34 -2

@@ -35,6 +35,7 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with

      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractor",
          "input_split_records" -> "2")
        )

@@ -47,9 +48,10 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with
    }

  "Record extractor not supporting indexes" should {
-   "should fail self checks" ignore /* Not implemented yet */ {
+   "should fail self checks when offsets are not properly handled" in {
      withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
        val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
          "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorNoIndex",
          "input_split_records" -> "2")
        )

@@ -59,7 +61,26 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with
          df.count()
        }

-       assert(ex.getMessage == "")
+       assert(ex.getMessage.contains("Record extractor self-check failed. The record extractor returned wrong record when started from non-zero offset"))
+       assert(ex.getMessage.contains("offset: 4"))
+     }
+   }
+
+   "should fail self checks when the extractor returns hasNext=false unexpectedly" in {
+     withTempBinFile("custom_re", ".dat", data.getBytes) { tmpFileName =>
+       val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
+         "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorBroken",
+         "input_split_records" -> "2")
+       )
+
+       val ex = intercept[RuntimeException] {
+         df.show(false)
+         df.count()
+       }
+
+       assert(ex.getMessage.contains("Record extractor self-check failed. When reading from a non-zero offset the extractor returned hasNext()=false"))
+       assert(ex.getMessage.contains("offset: 2"))
      }
    }

@@ -76,6 +97,17 @@ class Test39RecordExtractorSelfCheck extends AnyWordSpec with SparkTestBase with
      }
    }

+   "should still work if there is just one record" in {
+     withTempBinFile("custom_re", ".dat", "AA".getBytes) { tmpFileName =>
+       val df = getDataFrame(tmpFileName, Map(
+         "enable_self_checks" -> "true",
+         "record_extractor" -> "za.co.absa.cobrix.spark.cobol.mocks.FixedRecordExtractorNoIndex")
+       )
+
+       assert(df.count() == 1)
+     }
+   }
+
    "should still work if indexes are disabled" in {
      val expected = """[{"A":"AA"},{"A":"BB"},{"A":"CC"},{"A":"DD"},{"A":"EE"},{"A":"FF"}]"""