Skip to content

Commit 258e0fe

Browse files
authored
[Java] Add recoverWithNull to JSONOptions and pass to Table.readJSON (#14078)
This PR exposes the recently added `json_reader_options_builder::recovery_mode` option in the JNI layer. Closes #14073. Authors: - Andy Grove (https://github.com/andygrove) - Nghia Truong (https://github.com/ttnghia) Approvers: - Gera Shegalov (https://github.com/gerashegalov) - Robert (Bobby) Evans (https://github.com/revans2) - Raza Jafri (https://github.com/razajafri) - Nghia Truong (https://github.com/ttnghia) URL: #14078
1 parent 72c9583 commit 258e0fe

File tree

5 files changed

+83
-10
lines changed

5 files changed

+83
-10
lines changed

java/src/main/java/ai/rapids/cudf/JSONOptions.java

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
*
3-
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
3+
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
66
* you may not use this file except in compliance with the License.
@@ -29,11 +29,13 @@ public final class JSONOptions extends ColumnFilterOptions {
2929

3030
private final boolean dayFirst;
3131
private final boolean lines;
32+
private final boolean recoverWithNull;
3233

3334
private JSONOptions(Builder builder) {
3435
super(builder);
3536
dayFirst = builder.dayFirst;
3637
lines = builder.lines;
38+
recoverWithNull = builder.recoverWithNull;
3739
}
3840

3941
public boolean isDayFirst() {
@@ -44,6 +46,11 @@ public boolean isLines() {
4446
return lines;
4547
}
4648

49+
/** Return the value of the recoverWithNull option */
50+
public boolean isRecoverWithNull() {
51+
return recoverWithNull;
52+
}
53+
4754
@Override
4855
String[] getIncludeColumnNames() {
4956
throw new UnsupportedOperationException("JSON reader didn't support column prune");
@@ -57,6 +64,8 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
5764
private boolean dayFirst = false;
5865
private boolean lines = true;
5966

67+
private boolean recoverWithNull = false;
68+
6069
/**
6170
* Whether to parse dates as DD/MM versus MM/DD
6271
* @param dayFirst true: DD/MM, false: MM/DD
@@ -78,6 +87,20 @@ public Builder withLines(boolean perLine) {
7887
return this;
7988
}
8089

90+
/**
91+
* Specify how to handle invalid lines when parsing JSON. Setting
92+
* recoverWithNull to true will cause null values to be returned
93+
* for invalid lines. Setting recoverWithNull to false will cause
94+
* the parsing to fail with an exception.
95+
*
96+
* @param recoverWithNull true: return nulls, false: throw exception
97+
* @return builder for chaining
98+
*/
99+
public Builder withRecoverWithNull(boolean recoverWithNull) {
100+
this.recoverWithNull = recoverWithNull;
101+
return this;
102+
}
103+
81104
@Override
82105
public Builder includeColumn(String... names) {
83106
throw new UnsupportedOperationException("JSON reader didn't support column prune");

java/src/main/java/ai/rapids/cudf/Table.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,11 @@ private static native long[] readCSV(String[] columnNames,
241241
private static native long readJSON(String[] columnNames,
242242
int[] dTypeIds, int[] dTypeScales,
243243
String filePath, long address, long length,
244-
boolean dayFirst, boolean lines) throws CudfException;
244+
boolean dayFirst, boolean lines,
245+
boolean recoverWithNulls) throws CudfException;
245246

246247
private static native long readAndInferJSON(long address, long length,
247-
boolean dayFirst, boolean lines) throws CudfException;
248+
boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException;
248249

249250
/**
250251
* Read in Parquet formatted data.
@@ -1047,7 +1048,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
10471048
readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
10481049
path.getAbsolutePath(),
10491050
0, 0,
1050-
opts.isDayFirst(), opts.isLines()))) {
1051+
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()))) {
10511052

10521053
return gatherJSONColumns(schema, twm);
10531054
}
@@ -1099,7 +1100,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
10991100
assert len <= buffer.length - offset;
11001101
assert offset >= 0 && offset < buffer.length;
11011102
return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
1102-
opts.isDayFirst(), opts.isLines()));
1103+
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()));
11031104
}
11041105

11051106
/**
@@ -1121,7 +1122,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
11211122
assert offset >= 0 && offset < buffer.length;
11221123
try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(),
11231124
schema.getTypeIds(), schema.getTypeScales(), null,
1124-
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) {
1125+
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
1126+
opts.isRecoverWithNull()))) {
11251127
return gatherJSONColumns(schema, twm);
11261128
}
11271129
}

java/src/main/native/src/TableJni.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,7 +1331,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
13311331
}
13321332

13331333
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
1334-
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
1334+
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
1335+
jboolean recover_with_null) {
13351336

13361337
JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
13371338
if (buffer_length <= 0) {
@@ -1344,9 +1345,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
13441345
auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer),
13451346
static_cast<std::size_t>(buffer_length)};
13461347

1348+
auto const recovery_mode = recover_with_null ?
1349+
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
1350+
cudf::io::json_recovery_mode_t::FAIL;
13471351
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
13481352
.dayfirst(static_cast<bool>(day_first))
1349-
.lines(static_cast<bool>(lines));
1353+
.lines(static_cast<bool>(lines))
1354+
.recovery_mode(recovery_mode);
13501355

13511356
auto result =
13521357
std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1404,7 +1409,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
14041409

14051410
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
14061411
JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
1407-
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
1412+
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
1413+
jboolean recover_with_null) {
14081414

14091415
bool read_buffer = true;
14101416
if (buffer == 0) {
@@ -1448,9 +1454,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
14481454
static_cast<std::size_t>(buffer_length)} :
14491455
cudf::io::source_info{filename.get()};
14501456

1457+
cudf::io::json_recovery_mode_t recovery_mode =
1458+
recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
1459+
cudf::io::json_recovery_mode_t::FAIL;
14511460
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
14521461
.dayfirst(static_cast<bool>(day_first))
1453-
.lines(static_cast<bool>(lines));
1462+
.lines(static_cast<bool>(lines))
1463+
.recovery_mode(recovery_mode);
14541464

14551465
if (!n_col_names.is_null() && data_types.size() > 0) {
14561466
if (n_col_names.size() != n_types.size()) {

java/src/test/java/ai/rapids/cudf/TableTest.java

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ public class TableTest extends CudfTestBase {
8686
private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro");
8787
private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv");
8888
private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
89+
private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");
8990

9091
private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder()
9192
.column(DType.INT32, "A")
@@ -326,6 +327,39 @@ void testReadJSONFile() {
326327
}
327328
}
328329

330+
@Test
331+
void testReadJSONFileWithInvalidLines() {
332+
Schema schema = Schema.builder()
333+
.column(DType.STRING, "name")
334+
.column(DType.INT32, "age")
335+
.build();
336+
337+
// test with recoverWithNull=true
338+
{
339+
JSONOptions opts = JSONOptions.builder()
340+
.withLines(true)
341+
.withRecoverWithNull(true)
342+
.build();
343+
try (Table expected = new Table.TestBuilder()
344+
.column("Michael", "Andy", null, "Justin")
345+
.column(null, 30, null, 19)
346+
.build();
347+
Table table = Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE)) {
348+
assertTablesAreEqual(expected, table);
349+
}
350+
}
351+
352+
// test with recoverWithNull=false
353+
{
354+
JSONOptions opts = JSONOptions.builder()
355+
.withLines(true)
356+
.withRecoverWithNull(false)
357+
.build();
358+
assertThrows(CudfException.class, () ->
359+
Table.readJSON(schema, opts, TEST_JSON_ERROR_FILE));
360+
}
361+
}
362+
329363
@Test
330364
void testReadJSONFileWithDifferentColumnOrder() {
331365
Schema schema = Schema.builder()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{"name":"Michael"}
2+
{"name":"Andy", "age":30}
3+
this_line_is_not_valid
4+
{"name":"Justin", "age":19}

0 commit comments

Comments
 (0)