Skip to content

Commit 888d212

Browse files
committed
#740 Make sure unsigned binary fields can fit data types.
1 parent 6d48657 commit 888d212

File tree

12 files changed

+174
-108
lines changed

12 files changed

+174
-108
lines changed

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryNumberDecoders.scala

+30
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,14 @@ object BinaryNumberDecoders {
8282
if (v<0) null else v
8383
}
8484

85+
def decodeBinaryUnsignedIntBigEndianAsLong(bytes: Array[Byte]): java.lang.Long = {
86+
if (bytes.length < 4) {
87+
return null
88+
}
89+
val v: Long = ((bytes(0) & 255L) << 24L) | ((bytes(1) & 255L) << 16L) | ((bytes(2) & 255L) << 8L) | (bytes(3) & 255L)
90+
if (v<0) null else v
91+
}
92+
8593
def decodeBinaryUnsignedIntLittleEndian(bytes: Array[Byte]): Integer = {
8694
if (bytes.length < 4) {
8795
return null
@@ -90,6 +98,14 @@ object BinaryNumberDecoders {
9098
if (v<0) null else v
9199
}
92100

101+
def decodeBinaryUnsignedIntLittleEndianAsLong(bytes: Array[Byte]): java.lang.Long = {
102+
if (bytes.length < 4) {
103+
return null
104+
}
105+
val v: Long = ((bytes(3) & 255L) << 24L) | ((bytes(2) & 255L) << 16L) | ((bytes(1) & 255L) << 8L) | (bytes(0) & 255L)
106+
if (v<0) null else v
107+
}
108+
93109
def decodeBinarySignedLongBigEndian(bytes: Array[Byte]): java.lang.Long = {
94110
if (bytes.length < 8) {
95111
return null
@@ -112,6 +128,13 @@ object BinaryNumberDecoders {
112128
if (v < 0L) null else v
113129
}
114130

131+
def decodeBinaryUnsignedLongBigEndianAsDecimal(bytes: Array[Byte]): BigDecimal = {
132+
if (bytes.length < 8) {
133+
return null
134+
}
135+
BigDecimal(BigInt(1, bytes).toString())
136+
}
137+
115138
def decodeBinaryUnsignedLongLittleEndian(bytes: Array[Byte]): java.lang.Long = {
116139
if (bytes.length < 8) {
117140
return null
@@ -120,6 +143,13 @@ object BinaryNumberDecoders {
120143
if (v < 0L) null else v
121144
}
122145

146+
def decodeBinaryUnsignedLongLittleEndianAsDecimal(bytes: Array[Byte]): BigDecimal = {
147+
if (bytes.length < 8) {
148+
return null
149+
}
150+
BigDecimal(BigInt(1, bytes.reverse).toString())
151+
}
152+
123153
def decodeBinaryAribtraryPrecision(bytes: Array[Byte], isBigEndian: Boolean, isSigned: Boolean): BigDecimal = {
124154
if (bytes.length == 0) {
125155
return null

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala

+24-17
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package za.co.absa.cobrix.cobol.parser.decoders
1919
import java.nio.charset.{Charset, StandardCharsets}
2020
import za.co.absa.cobrix.cobol.parser.ast.datatype._
2121
import za.co.absa.cobrix.cobol.parser.common.Constants
22+
import za.co.absa.cobrix.cobol.parser.common.Constants.{maxIntegerPrecision, maxLongPrecision}
2223
import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
2324
import za.co.absa.cobrix.cobol.parser.encoding._
2425
import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
@@ -255,26 +256,32 @@ object DecoderSelector {
255256
val isSigned = signPosition.nonEmpty
256257

257258
val numOfBytes = BinaryUtils.getBytesCount(compact, precision, isSigned, isExplicitDecimalPt = false, isSignSeparate = false)
259+
val isMaxUnsignedPrecision = precision == maxIntegerPrecision || precision == maxLongPrecision
260+
258261
val decoder = if (strictIntegralPrecision) {
259262
(a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
260263
} else {
261-
(isSigned, isBigEndian, numOfBytes) match {
262-
case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _
263-
case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
264-
case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
265-
case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
266-
case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _
267-
case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
268-
case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
269-
case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
270-
case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _
271-
case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
272-
case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
273-
case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
274-
case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _
275-
case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
276-
case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
277-
case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
264+
(isSigned, isBigEndian, isMaxUnsignedPrecision, numOfBytes) match {
265+
case (true, true, _, 1) => BinaryNumberDecoders.decodeSignedByte _
266+
case (true, true, _, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
267+
case (true, true, _, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
268+
case (true, true, _, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
269+
case (true, false, _, 1) => BinaryNumberDecoders.decodeSignedByte _
270+
case (true, false, _, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
271+
case (true, false, _, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
272+
case (true, false, _, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
273+
case (false, true, _, 1) => BinaryNumberDecoders.decodeUnsignedByte _
274+
case (false, true, _, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
275+
case (false, true, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
276+
case (false, true, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndianAsLong _
277+
case (false, true, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
278+
case (false, true, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndianAsDecimal _
279+
case (false, false, _, 1) => BinaryNumberDecoders.decodeUnsignedByte _
280+
case (false, false, _, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
281+
case (false, false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
282+
case (false, false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndianAsLong _
283+
case (false, false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
284+
case (false, false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndianAsDecimal _
278285
case _ =>
279286
(a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
280287
}

cobol-parser/src/test/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryDecoderSpec.scala

+20
Original file line numberDiff line numberDiff line change
@@ -460,10 +460,14 @@ class BinaryDecoderSpec extends AnyFunSuite {
460460
val decoderUnsignedShort = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 3, compact = Some(COMP5()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
461461
val decoderSignedInt = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 8, compact = Some(COMP4())), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
462462
val decoderUnsignedIntBe = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 8, compact = Some(COMP5()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
463+
val decoderUnsignedIntBeAsLong = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 9, compact = Some(COMP5()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
463464
val decoderUnsignedIntLe = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 8, compact = Some(COMP9()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
465+
val decoderUnsignedIntLeAsLong = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 9, compact = Some(COMP9()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
464466
val decoderSignedLong = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 15, compact = Some(COMP4())), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
465467
val decoderUnsignedLongBe = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 15, compact = Some(COMP5()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
468+
val decoderUnsignedLongBeAsBig = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 18, compact = Some(COMP5()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
466469
val decoderUnsignedLongLe = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 15, compact = Some(COMP9()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
470+
val decoderUnsignedLongLeAsBig = DecoderSelector.getIntegralDecoder(integralType.copy(precision = 18, compact = Some(COMP9()), signPosition = None), strictSignOverpunch = false, improvedNullDetection = false, strictIntegralPrecision = false)
467471

468472
val num1 = decoderSignedByte(Array(0x10).map(_.toByte))
469473
assert(num1.isInstanceOf[Integer])
@@ -501,10 +505,18 @@ class BinaryDecoderSpec extends AnyFunSuite {
501505
assert(num9.isInstanceOf[Integer])
502506
assert(num9.asInstanceOf[Integer] == 9437184)
503507

508+
val num9a = decoderUnsignedIntBeAsLong(Array(0x00, 0x90, 0x00, 0x00).map(_.toByte))
509+
assert(num9a.isInstanceOf[java.lang.Long])
510+
assert(num9a.asInstanceOf[java.lang.Long] == 9437184L)
511+
504512
val num10 = decoderUnsignedIntLe(Array(0x00, 0x00, 0x90, 0x00).map(_.toByte))
505513
assert(num10.isInstanceOf[Integer])
506514
assert(num10.asInstanceOf[Integer] == 9437184)
507515

516+
val num10a = decoderUnsignedIntLeAsLong(Array(0x00, 0x00, 0x90, 0x00).map(_.toByte))
517+
assert(num10a.isInstanceOf[java.lang.Long])
518+
assert(num10a.asInstanceOf[java.lang.Long] == 9437184L)
519+
508520
val num11 = decoderSignedLong(Array(0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00).map(_.toByte))
509521
assert(num11.isInstanceOf[Long])
510522
assert(num11.asInstanceOf[Long] == 72057594037927936L)
@@ -517,9 +529,17 @@ class BinaryDecoderSpec extends AnyFunSuite {
517529
assert(num13.isInstanceOf[Long])
518530
assert(num13.asInstanceOf[Long] == 40532396646334464L)
519531

532+
val num13a = decoderUnsignedLongBeAsBig(Array(0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00).map(_.toByte))
533+
assert(num13a.isInstanceOf[BigDecimal])
534+
assert(num13a.asInstanceOf[BigDecimal] == BigDecimal("40532396646334464"))
535+
520536
val num14 = decoderUnsignedLongLe(Array(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x00).map(_.toByte))
521537
assert(num14.isInstanceOf[Long])
522538
assert(num14.asInstanceOf[Long] == 40532396646334464L)
539+
540+
val num14a = decoderUnsignedLongLeAsBig(Array(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, 0x00).map(_.toByte))
541+
assert(num14a.isInstanceOf[BigDecimal])
542+
assert(num14a.asInstanceOf[BigDecimal] == BigDecimal("40532396646334464"))
523543
}
524544

525545
test("Test Binary strict integral precision numbers") {

data/test17_expected/test17a_schema.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
}
4141
}, {
4242
"name" : "TAXPAYER",
43-
"type" : "integer",
43+
"type" : "long",
4444
"nullable" : true,
4545
"metadata" : { }
4646
} ]

data/test17_expected/test17b_schema.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
}
6262
}, {
6363
"name" : "TAXPAYER",
64-
"type" : "integer",
64+
"type" : "long",
6565
"nullable" : true,
6666
"metadata" : { }
6767
} ]

data/test17_expected/test17c_schema.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
}
4141
}, {
4242
"name" : "TAXPAYER",
43-
"type" : "integer",
43+
"type" : "long",
4444
"nullable" : true,
4545
"metadata" : { }
4646
}, {

data/test18 special_char_expected/test18a_schema.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
}
4141
}, {
4242
"name" : "TAXPAYER",
43-
"type" : "integer",
43+
"type" : "long",
4444
"nullable" : true,
4545
"metadata" : { }
4646
} ]

data/test24_expected/test24_schema.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,7 @@
712712
}
713713
}, {
714714
"name" : "NUM_BIN_INT07",
715-
"type" : "integer",
715+
"type" : "long",
716716
"nullable" : true,
717717
"metadata" : { }
718718
}, {
@@ -760,7 +760,7 @@
760760
}
761761
}, {
762762
"name" : "NUM_BIN_INT11",
763-
"type" : "long",
763+
"type" : "decimal(20,0)",
764764
"nullable" : true,
765765
"metadata" : { }
766766
}, {

data/test24_expected/test24b_schema.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,7 @@
712712
}
713713
}, {
714714
"name" : "NUM_BIN_INT07",
715-
"type" : "integer",
715+
"type" : "long",
716716
"nullable" : true,
717717
"metadata" : { }
718718
}, {
@@ -760,7 +760,7 @@
760760
}
761761
}, {
762762
"name" : "NUM_BIN_INT11",
763-
"type" : "long",
763+
"type" : "decimal(20,0)",
764764
"nullable" : true,
765765
"metadata" : { }
766766
}, {

data/test6_expected/test6_schema.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@
299299
"metadata" : { }
300300
}, {
301301
"name" : "NUM_BIN_INT07",
302-
"type" : "integer",
302+
"type" : "long",
303303
"nullable" : true,
304304
"metadata" : { }
305305
}, {
@@ -319,7 +319,7 @@
319319
"metadata" : { }
320320
}, {
321321
"name" : "NUM_BIN_INT11",
322-
"type" : "long",
322+
"type" : "decimal(20,0)",
323323
"nullable" : true,
324324
"metadata" : { }
325325
}, {

spark-cobol/src/main/scala/za/co/absa/cobrix/spark/cobol/schema/CobolSchema.scala

+7-63
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import org.apache.spark.sql.types._
2020
import za.co.absa.cobrix.cobol.internal.Logging
2121
import za.co.absa.cobrix.cobol.parser.Copybook
2222
import za.co.absa.cobrix.cobol.parser.ast._
23-
import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, COMP1, COMP2, Decimal, Integral}
23+
import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, COMP1, COMP2, COMP4, COMP5, COMP9, Decimal, Integral}
2424
import za.co.absa.cobrix.cobol.parser.common.Constants
2525
import za.co.absa.cobrix.cobol.parser.encoding.RAW
2626
import za.co.absa.cobrix.cobol.parser.policies.MetadataPolicy
@@ -66,23 +66,10 @@ class CobolSchema(copybook: Copybook,
6666
@throws(classOf[IllegalStateException])
6767
private[this] lazy val sparkSchema = createSparkSchema()
6868

69-
@throws(classOf[IllegalStateException])
70-
private[this] lazy val sparkFlatSchema = {
71-
val arraySchema = copybook.ast.children.toArray
72-
val records = arraySchema.flatMap(record => {
73-
parseGroupFlat(record.asInstanceOf[Group], s"${record.name}_")
74-
})
75-
StructType(records)
76-
}
77-
7869
def getSparkSchema: StructType = {
7970
sparkSchema
8071
}
8172

82-
def getSparkFlatSchema: StructType = {
83-
sparkFlatSchema
84-
}
85-
8673
@throws(classOf[IllegalStateException])
8774
private def createSparkSchema(): StructType = {
8875
val records = for (record <- copybook.getRootRecords) yield {
@@ -200,12 +187,16 @@ class CobolSchema(copybook: Copybook,
200187
case dt: Integral if strictIntegralPrecision =>
201188
DecimalType(precision = dt.precision, scale = 0)
202189
case dt: Integral =>
190+
val isBinary = dt.compact.exists(c => c == COMP4() || c == COMP5() || c == COMP9())
203191
if (dt.precision > Constants.maxLongPrecision) {
204192
DecimalType(precision = dt.precision, scale = 0)
193+
} else if (dt.precision == Constants.maxLongPrecision && isBinary && dt.signPosition.isEmpty) { // promoting unsigned int to long to be able to fit any value
194+
DecimalType(precision = dt.precision + 2, scale = 0)
205195
} else if (dt.precision > Constants.maxIntegerPrecision) {
206196
LongType
207-
}
208-
else {
197+
} else if (dt.precision == Constants.maxIntegerPrecision && isBinary && dt.signPosition.isEmpty) { // promoting unsigned long to decimal(20) to be able to fit any value
198+
LongType
199+
} else {
209200
IntegerType
210201
}
211202
case _ => throw new IllegalStateException("Unknown AST object")
@@ -290,53 +281,6 @@ class CobolSchema(copybook: Copybook,
290281
})
291282
childSegments
292283
}
293-
294-
@throws(classOf[IllegalStateException])
295-
private def parseGroupFlat(group: Group, structPath: String = ""): ArrayBuffer[StructField] = {
296-
val fields = new ArrayBuffer[StructField]()
297-
for (field <- group.children if !field.isFiller) {
298-
field match {
299-
case group: Group =>
300-
if (group.isArray) {
301-
for (i <- Range(1, group.arrayMaxSize + 1)) {
302-
val path = s"$structPath${group.name}_${i}_"
303-
fields ++= parseGroupFlat(group, path)
304-
}
305-
} else {
306-
val path = s"$structPath${group.name}_"
307-
fields ++= parseGroupFlat(group, path)
308-
}
309-
case s: Primitive =>
310-
val dataType: DataType = s.dataType match {
311-
case d: Decimal =>
312-
DecimalType(d.getEffectivePrecision, d.getEffectiveScale)
313-
case a: AlphaNumeric =>
314-
a.enc match {
315-
case Some(RAW) => BinaryType
316-
case _ => StringType
317-
}
318-
case dt: Integral =>
319-
if (dt.precision > Constants.maxIntegerPrecision) {
320-
LongType
321-
}
322-
else {
323-
IntegerType
324-
}
325-
case _ => throw new IllegalStateException("Unknown AST object")
326-
}
327-
val path = s"$structPath" //${group.name}_"
328-
if (s.isArray) {
329-
for (i <- Range(1, s.arrayMaxSize + 1)) {
330-
fields += StructField(s"$path{s.name}_$i", ArrayType(dataType), nullable = true)
331-
}
332-
} else {
333-
fields += StructField(s"$path${s.name}", dataType, nullable = true)
334-
}
335-
}
336-
}
337-
338-
fields
339-
}
340284
}
341285

342286
object CobolSchema {

0 commit comments

Comments
 (0)