Commit 6f27ea7

#25 Generate string Spark schema when display_pic_always_string=true.
1 parent 7b21b21 commit 6f27ea7

19 files changed (+290 −48 lines)

README.md

Lines changed: 1 addition & 0 deletions
@@ -1532,6 +1532,7 @@ The output looks like this:
  | Option (usage example) | Description |
  |-----------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
  | .option("string_trimming_policy", "both") | Specifies if and how string fields should be trimmed. Available options: `both` (default), `none`, `left`, `right`, `keep_all`. `keep_all` - keeps control characters when decoding ASCII text files |
+ | .option("display_pic_always_string", "false") | If `true`, fields that have `DISPLAY` format will always be converted to `string` type, even if such fields contain numbers, retaining leading and trailing zeros. |
  | .option("ebcdic_code_page", "common") | Specifies a code page for EBCDIC encoding. Currently supported values: `common` (default), `common_extended`, `cp037`, `cp037_extended`, and others (see the "Currently supported EBCDIC code pages" section). |
  | .option("ebcdic_code_page_class", "full.class.specifier") | Specifies a user provided class for a custom code page to UNICODE conversion. |
  | .option("field_code_page:cp825", "field1, field2") | Specifies the code page for selected fields. You can add more than one such option for multiple code page overrides. |

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala

Lines changed: 21 additions & 14 deletions
@@ -107,24 +107,25 @@ object CopybookParser extends Logging {
     * Tokenizes a Cobol Copybook contents and returns the AST.
     *
     * @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
-    * @param copyBookContents A string containing all lines of a copybook
-    * @param dropGroupFillers Drop groups marked as fillers from the output AST
-    * @param dropValueFillers Drop primitive fields marked as fillers from the output AST
-    * @param fillerNamingPolicy Specifies a naming policy for fillers
+    * @param copyBookContents A string containing all lines of a copybook.
+    * @param dropGroupFillers Drop groups marked as fillers from the output AST.
+    * @param dropValueFillers Drop primitive fields marked as fillers from the output AST.
+    * @param fillerNamingPolicy Specifies a naming policy for fillers.
     * @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
     *                         resolving segment redefines.
-    * @param fieldParentMap A segment fields parent mapping
-    * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
-    * @param strictSignOverpunch If true sign overpunching is not allowed for unsigned numbers
+    * @param fieldParentMap A segment fields parent mapping.
+    * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed.
+    * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers.
+    * @param strictSignOverpunch If true sign overpunching is not allowed for unsigned numbers.
     * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
-    * @param commentPolicy Specifies a policy for comments truncation inside a copybook
-    * @param ebcdicCodePage A code page for EBCDIC encoded data
-    * @param asciiCharset A charset for ASCII encoded data
+    * @param commentPolicy Specifies a policy for comments truncation inside a copybook.
+    * @param ebcdicCodePage A code page for EBCDIC encoded data.
+    * @param asciiCharset A charset for ASCII encoded data.
     * @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
-    * @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
-    * @param nonTerminals A list of non-terminals that should be extracted as strings
+    * @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754).
+    * @param nonTerminals A list of non-terminals that should be extracted as strings.
     * @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
-    * @return Seq[Group] where a group is a record inside the copybook
+    * @return Seq[Group] where a group is a record inside the copybook.
     */
   def parse(copyBookContents: String,
             dataEncoding: Encoding = EBCDIC,
@@ -134,6 +135,7 @@ object CopybookParser extends Logging {
             segmentRedefines: Seq[String] = Nil,
             fieldParentMap: Map[String, String] = HashMap[String, String](),
             stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
+            isDisplayAlwaysString: Boolean = false,
             commentPolicy: CommentPolicy = CommentPolicy(),
             strictSignOverpunch: Boolean = true,
             improvedNullDetection: Boolean = false,
@@ -155,6 +157,7 @@ object CopybookParser extends Logging {
       segmentRedefines,
       fieldParentMap,
       stringTrimmingPolicy,
+      isDisplayAlwaysString,
       commentPolicy,
       strictSignOverpunch,
       improvedNullDetection,
@@ -180,6 +183,7 @@ object CopybookParser extends Logging {
     * @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
     * @param fieldParentMap A segment fields parent mapping
     * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
+    * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers
     * @param commentPolicy Specifies a policy for comments truncation inside a copybook
     * @param strictSignOverpunch If true sign overpunching is not allowed for unsigned numbers
     * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
@@ -198,6 +202,7 @@ object CopybookParser extends Logging {
             segmentRedefines: Seq[String] = Nil,
             fieldParentMap: Map[String, String] = HashMap[String, String](),
             stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
+            isDisplayAlwaysString: Boolean = false,
             commentPolicy: CommentPolicy = CommentPolicy(),
             strictSignOverpunch: Boolean = true,
             improvedNullDetection: Boolean = false,
@@ -219,6 +224,7 @@ object CopybookParser extends Logging {
       segmentRedefines,
       fieldParentMap,
       stringTrimmingPolicy,
+      isDisplayAlwaysString,
       commentPolicy,
       strictSignOverpunch,
       improvedNullDetection,
@@ -265,6 +271,7 @@ object CopybookParser extends Logging {
             segmentRedefines: Seq[String],
             fieldParentMap: Map[String, String],
             stringTrimmingPolicy: StringTrimmingPolicy,
+            isDisplayAlwaysString: Boolean,
             commentPolicy: CommentPolicy,
             strictSignOverpunch: Boolean,
             improvedNullDetection: Boolean,
@@ -279,7 +286,7 @@ object CopybookParser extends Logging {
             debugFieldsPolicy: DebugFieldsPolicy,
             fieldCodePageMap: Map[String, String]): Copybook = {

-    val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)
+    val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, isDisplayAlwaysString, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)

     val nonTerms: Set[String] = (for (id <- nonTerminals)
       yield transformIdentifier(id)
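A hedged sketch of driving the same flag through the parser API directly; the parameter name and its default come from the signature above, while the one-field copybook is made up for illustration:

```scala
import za.co.absa.cobrix.cobol.parser.CopybookParser

// A made-up copybook with a single DISPLAY numeric field.
val copybookContents =
  """        01  RECORD.
    |           05  NUM-FIELD   PIC 9(4).
    |""".stripMargin

// With isDisplayAlwaysString = true the parser wires string decoders
// into DISPLAY numeric fields instead of integral decoders.
val copybook = CopybookParser.parse(copybookContents, isDisplayAlwaysString = true)
```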

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala

Lines changed: 2 additions & 1 deletion
@@ -54,6 +54,7 @@ object ANTLRParser extends Logging {
   def parse(copyBookContents: String,
             enc: Encoding,
             stringTrimmingPolicy: StringTrimmingPolicy,
+            isDisplayAlwaysString: Boolean,
             commentPolicy: CommentPolicy,
             strictSignOverpunch: Boolean,
             improvedNullDetection: Boolean,
@@ -64,7 +65,7 @@
             isUtf16BigEndian: Boolean,
             floatingPointFormat: FloatingPointFormat,
             fieldCodePageMap: Map[String, String]): CopybookAST = {
-    val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap)
+    val visitor = new ParserVisitor(enc, stringTrimmingPolicy, isDisplayAlwaysString, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap)

     val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
       line =>

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala

Lines changed: 2 additions & 1 deletion
@@ -41,6 +41,7 @@ sealed trait Expr

 class ParserVisitor(enc: Encoding,
                     stringTrimmingPolicy: StringTrimmingPolicy,
+                    isDisplayAlwaysString: Boolean,
                     ebcdicCodePage: CodePage,
                     asciiCharset: Charset,
                     isUtf16BigEndian: Boolean,
@@ -854,7 +855,7 @@ class ParserVisitor(enc: Encoding,
         Map(),
         isDependee = false,
         identifier.toUpperCase() == Constants.FILLER,
-        DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
+        DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, isDisplayAlwaysString, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection, strictIntegralPrecision = strictIntegralPrecision)
       ) (Some(parent))

       parent.children.append(prim)

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/asttransform/NonTerminalsAdder.scala

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ class NonTerminalsAdder(
       )
       val sz = g.binaryProperties.actualSize
       val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc))
-      val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
+      val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, isDisplayAlwaysString = false, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
       val newName = getNonTerminalName(g.name, g.parent.get)
       newChildren.append(
         Primitive(

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala

Lines changed: 25 additions & 0 deletions
@@ -56,6 +56,7 @@ object DecoderSelector {
     */
   def getDecoder(dataType: CobolType,
                  stringTrimmingPolicy: StringTrimmingPolicy = TrimBoth,
+                 isDisplayAlwaysString: Boolean = false,
                  ebcdicCodePage: CodePage = new CodePageCommon,
                  asciiCharset: Charset = StandardCharsets.US_ASCII,
                  isUtf16BigEndian: Boolean = true,
@@ -66,6 +67,7 @@ object DecoderSelector {
     val decoder = dataType match {
       case alphaNumeric: AlphaNumeric => getStringDecoder(alphaNumeric.enc.getOrElse(EBCDIC), stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, improvedNullDetection)
       case decimalType: Decimal => getDecimalDecoder(decimalType, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
+      case integralType: Integral if isDisplayAlwaysString => getDisplayDecoderAsString(integralType, improvedNullDetection, strictIntegralPrecision)
       case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
       case _ => throw new IllegalStateException("Unknown AST object")
     }
@@ -251,6 +253,29 @@ object DecoderSelector {
     }
   }

+  private[parser] def getDisplayDecoderAsString(integralType: Integral,
+                                                improvedNullDetection: Boolean,
+                                                strictSignOverpunch: Boolean): Decoder = {
+    val encoding = integralType.enc.getOrElse(EBCDIC)
+    val isSigned = integralType.signPosition.isDefined
+    val allowedSignOverpunch = isSigned || !strictSignOverpunch
+
+    val isEbcdic = encoding match {
+      case EBCDIC => true
+      case _ => false
+    }
+
+    if (isEbcdic) {
+      bytes: Array[Byte] => {
+        StringDecoders.decodeEbcdicNumber(bytes, !isSigned, allowedSignOverpunch, improvedNullDetection)
+      }
+    } else {
+      bytes: Array[Byte] => {
+        StringDecoders.decodeAsciiNumber(bytes, !isSigned, allowedSignOverpunch, improvedNullDetection)
+      }
+    }
+  }
+
   /** Gets a decoder function for a binary encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
   private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean, strictIntegralPrecision: Boolean): Decoder = {
     val isSigned = signPosition.nonEmpty
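To illustrate what the new branch produces, here is a sketch of the underlying call for an unsigned EBCDIC field; it assumes `StringDecoders.decodeEbcdicNumber` is accessible from the calling code, and the positional arguments mirror the call site in the diff above:

```scala
import za.co.absa.cobrix.cobol.parser.decoders.StringDecoders

// "0042" in EBCDIC: digits are encoded as 0xF0..0xF9.
val bytes = Array(0xF0, 0xF0, 0xF4, 0xF2).map(_.toByte)

// Arguments mirror the call above: (bytes, !isSigned, allowedSignOverpunch, improvedNullDetection).
// The expected result is the string "0042" with leading zeros kept,
// whereas the integral decoder path would produce the number 42.
val value = StringDecoders.decodeEbcdicNumber(bytes, true, false, false)
```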

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/CobolParameters.scala

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
   * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record
   * @param schemaRetentionPolicy A copybook usually has a root group struct element that acts like a rowtag in XML. This can be retained in Spark schema or can be collapsed
   * @param stringTrimmingPolicy Specify if and how strings should be trimmed when parsed
+  * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers
   * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example)
   * @param multisegmentParams Parameters for reading multisegment mainframe files
   * @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
@@ -87,6 +88,7 @@ case class CobolParameters(
                            generateRecordBytes: Boolean,
                            schemaRetentionPolicy: SchemaRetentionPolicy,
                            stringTrimmingPolicy: StringTrimmingPolicy,
+                           isDisplayAlwaysString: Boolean,
                            allowPartialRecords: Boolean,
                            multisegmentParams: Option[MultisegmentParameters],
                            commentPolicy: CommentPolicy,

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/parameters/ReaderParameters.scala

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,7 @@ import za.co.absa.cobrix.cobol.reader.policies.SchemaRetentionPolicy.SchemaReten
   * @param generateRecordBytes Generate 'record_bytes' field containing raw bytes of the original record
   * @param schemaPolicy Specifies a policy to transform the input schema. The default policy is to keep the schema exactly as it is in the copybook.
   * @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed.
+  * @param isDisplayAlwaysString If true, all fields having DISPLAY format will remain strings and won't be converted to numbers.
   * @param allowPartialRecords If true, partial ASCII records can be parsed (in cases when LF character is missing for example)
   * @param multisegment Parameters specific to reading multisegment files
   * @param commentPolicy A comment truncation policy
@@ -108,6 +109,7 @@ case class ReaderParameters(
                             generateRecordBytes: Boolean = false,
                             schemaPolicy: SchemaRetentionPolicy = SchemaRetentionPolicy.CollapseRoot,
                             stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
+                            isDisplayAlwaysString: Boolean = false,
                             allowPartialRecords: Boolean = false,
                             multisegment: Option[MultisegmentParameters] = None,
                             commentPolicy: CommentPolicy = CommentPolicy(),
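A minimal sketch of setting the flag programmatically via ReaderParameters; it assumes, as the diff suggests, that the remaining fields of the case class also have defaults:

```scala
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters

// Only the new flag is overridden; every other field keeps its default value.
val readerParams = ReaderParameters(isDisplayAlwaysString = true)
```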
