Skip to content

Commit db393ef

Browse files
committed
apacheGH-37910: [Java][Integration] Implement C Data Interface integration testing
1 parent a2561e3 commit db393ef

File tree

14 files changed

+281
-31
lines changed

14 files changed

+281
-31
lines changed

ci/scripts/integration_arrow.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ arrow_dir=${1}
2323
gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration
2424

2525
pip install -e $arrow_dir/dev/archery[integration]
26-
# For C# C Data Interface testing
27-
pip install pythonnet
26+
# For C Data Interface testing
27+
pip install jpype1 pythonnet
2828

2929
# Get more detailed context on crashes
3030
export PYTHONFAULTHANDLER=1

dev/archery/archery/integration/datagen.py

-1
Original file line numberDiff line numberDiff line change
@@ -1722,7 +1722,6 @@ def generate_dictionary_unsigned_case():
17221722

17231723
# TODO: JavaScript does not support uint64 dictionary indices, so disabled
17241724
# for now
1725-
17261725
# dict3 = Dictionary(3, StringField('dictionary3'), size=5, name='DICT3')
17271726
fields = [
17281727
DictionaryField('f0', get_field('', 'uint8'), dict0),

dev/archery/archery/integration/tester_csharp.py

+9
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from contextlib import contextmanager
1919
import gc
2020
import os
21+
import weakref
2122

2223
from . import cdata
2324
from .tester import Tester, CDataExporter, CDataImporter
@@ -72,6 +73,14 @@ def __init__(self, debug, args):
7273
self.ffi = cdata.ffi()
7374
_load_clr()
7475

76+
def _finalize():
77+
# Collect GC handles so as to call release functions from other
78+
# exporters before it gets too late.
79+
# TODO make this a run_gc() function?
80+
from Apache.Arrow.IntegrationTest import CDataInterface
81+
CDataInterface.RunGC()
82+
weakref.finalize(self, _finalize)
83+
7584
def _pointer_to_int(self, c_ptr):
7685
return int(self.ffi.cast('uintptr_t', c_ptr))
7786

dev/archery/archery/integration/tester_java.py

+169-8
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
# under the License.
1717

1818
import contextlib
19+
import functools
1920
import os
2021
import subprocess
2122

22-
from .tester import Tester
23+
from . import cdata
24+
from .tester import Tester, CDataExporter, CDataImporter
2325
from .util import run_cmd, log
2426
from ..utils.source import ARROW_ROOT_DEFAULT
2527

@@ -42,18 +44,25 @@ def load_version_from_pom():
4244
"ARROW_JAVA_INTEGRATION_JAR",
4345
os.path.join(
4446
ARROW_ROOT_DEFAULT,
45-
"java/tools/target/arrow-tools-{}-"
46-
"jar-with-dependencies.jar".format(_arrow_version),
47-
),
47+
"java/tools/target",
48+
f"arrow-tools-{_arrow_version}-jar-with-dependencies.jar"
49+
)
50+
)
51+
_ARROW_C_DATA_JAR = os.environ.get(
52+
"ARROW_C_DATA_JAVA_INTEGRATION_JAR",
53+
os.path.join(
54+
ARROW_ROOT_DEFAULT,
55+
"java/c/target",
56+
f"arrow-c-data-{_arrow_version}.jar"
57+
)
4858
)
4959
_ARROW_FLIGHT_JAR = os.environ.get(
5060
"ARROW_FLIGHT_JAVA_INTEGRATION_JAR",
5161
os.path.join(
5262
ARROW_ROOT_DEFAULT,
53-
"java/flight/flight-integration-tests/target/"
54-
"flight-integration-tests-{}-jar-with-dependencies.jar".format(
55-
_arrow_version),
56-
),
63+
"java/flight/flight-integration-tests/target",
64+
f"flight-integration-tests-{_arrow_version}-jar-with-dependencies.jar"
65+
)
5766
)
5867
_ARROW_FLIGHT_SERVER = (
5968
"org.apache.arrow.flight.integration.tests.IntegrationTestServer"
@@ -63,11 +72,157 @@ def load_version_from_pom():
6372
)
6473

6574

@functools.lru_cache
def setup_jpype():
    """
    Start the in-process JVM used by the Java C Data Interface helpers.

    The JVM is launched with the Arrow tools JAR and the Arrow C Data JAR
    on its class path, followed by the options in ``_JAVA_OPTS``.

    The function is wrapped in ``functools.lru_cache``: once a call has
    returned, later calls are served from the cache and do not start the
    JVM again.
    """
    # Local import: JPype is only needed once this function is called.
    import jpype
    # Join the class path entries with the platform separator
    # (':' on POSIX, ';' on Windows) instead of a hard-coded ':'.
    jar_path = os.pathsep.join([_ARROW_TOOLS_JAR, _ARROW_C_DATA_JAR])
    # XXX Didn't manage to tone down the logging level here (DEBUG -> INFO)
    jpype.startJVM(jpype.getDefaultJVMPath(),
                   "-Djava.class.path=" + jar_path, *_JAVA_OPTS)
82+
83+
class _CDataBase:
    """
    State and helpers shared by the Java C Data Interface exporter and
    importer.

    Each instance keeps the object returned by ``cdata.ffi()``, JPype
    handles for the ``java.io`` and ``org.apache.arrow`` packages, and its
    own Java ``RootAllocator``.
    """

    def __init__(self, debug, args):
        """Store ``debug``/``args``, start the JVM and create an allocator."""
        import jpype
        self.debug, self.args = debug, args
        self.ffi = cdata.ffi()
        setup_jpype()
        # JPype pointers to java.io, org.apache.arrow...
        java_root = jpype.JPackage("java")
        self.java_io = java_root.io
        org_root = jpype.JPackage("org")
        self.java_arrow = org_root.apache.arrow
        self.java_allocator = self._make_java_allocator()

    def _pointer_to_int(self, c_ptr):
        """Return the address held by ``c_ptr`` as a Python int."""
        as_uintptr = self.ffi.cast('uintptr_t', c_ptr)
        return int(as_uintptr)

    def _wrap_c_schema_ptr(self, c_schema_ptr):
        """Wrap a C schema pointer with ``org.apache.arrow.c.ArrowSchema``."""
        wrap = self.java_arrow.c.ArrowSchema.wrap
        return wrap(self._pointer_to_int(c_schema_ptr))

    def _wrap_c_array_ptr(self, c_array_ptr):
        """Wrap a C array pointer with ``org.apache.arrow.c.ArrowArray``."""
        wrap = self.java_arrow.c.ArrowArray.wrap
        return wrap(self._pointer_to_int(c_array_ptr))

    def _make_java_allocator(self):
        """Return a new Java ``RootAllocator``."""
        allocator_class = self.java_arrow.memory.RootAllocator
        return allocator_class()

    def _assert_schemas_equal(self, expected, actual):
        """Compare two schemas with the Java ``Validator``."""
        # XXX This is fragile for dictionaries, as Schema.equals compares
        # dictionary ids.
        validator = self.java_arrow.vector.util.Validator
        validator.compareSchemas(expected, actual)

    def _assert_batches_equal(self, expected, actual):
        """Compare two vector schema roots with the Java ``Validator``."""
        validator = self.java_arrow.vector.util.Validator
        validator.compareVectorSchemaRoot(expected, actual)

    def _assert_dict_providers_equal(self, expected, actual):
        """Compare two dictionary providers with the Java ``Validator``."""
        validator = self.java_arrow.vector.util.Validator
        validator.compareDictionaryProviders(expected, actual)
125+
126+
class JavaCDataExporter(CDataExporter, _CDataBase):
    """
    C Data Interface exporter backed by the Java implementation.

    Schemas and record batches are read from an integration JSON file with
    the Java ``JsonFileReader`` and exported into caller-provided C
    structures through ``org.apache.arrow.c.Data``.
    """

    def export_schema_from_json(self, json_path, c_schema_ptr):
        """Export the schema of the JSON file into ``c_schema_ptr``."""
        source = self.java_io.File(json_path)
        reader_class = self.java_arrow.vector.ipc.JsonFileReader
        with reader_class(source, self.java_allocator) as reader:
            schema = reader.start()
            # The reader itself serves as the dictionary provider.
            export = self.java_arrow.c.Data.exportSchema
            export(self.java_allocator, schema, reader,
                   self._wrap_c_schema_ptr(c_schema_ptr))

    def export_batch_from_json(self, json_path, num_batch, c_array_ptr):
        """Export batch number ``num_batch`` of the JSON file into
        ``c_array_ptr``."""
        source = self.java_io.File(json_path)
        reader_class = self.java_arrow.vector.ipc.JsonFileReader
        with reader_class(source, self.java_allocator) as reader:
            reader.start()
            if num_batch > 0:
                # Skip the batches preceding the requested one.
                actually_skipped = reader.skip(num_batch)
                assert actually_skipped == num_batch
            with reader.read() as batch:
                # The reader itself serves as the dictionary provider.
                export = self.java_arrow.c.Data.exportVectorSchemaRoot
                export(self.java_allocator, batch, reader,
                       self._wrap_c_array_ptr(c_array_ptr))

    @property
    def supports_releasing_memory(self):
        """Always true for this exporter."""
        return True

    def record_allocation_state(self):
        """Return the allocator's current allocated memory figure."""
        return self.java_allocator.getAllocatedMemory()

    def compare_allocation_state(self, recorded, gc_until):
        """Return ``gc_until`` applied to a predicate that is true when the
        allocator's allocated memory equals ``recorded``."""
        return gc_until(
            lambda: self.java_allocator.getAllocatedMemory() == recorded)
166+
167+
class JavaCDataImporter(CDataImporter, _CDataBase):
    """
    C Data Interface importer backed by the Java implementation.

    Data is imported from caller-provided C structures through
    ``org.apache.arrow.c.Data`` and compared against what the Java
    ``JsonFileReader`` reads from the reference JSON file.
    """

    def import_schema_and_compare_to_json(self, json_path, c_schema_ptr):
        """Import the schema behind ``c_schema_ptr`` and compare it to the
        schema of the JSON file at ``json_path``."""
        json_file = self.java_io.File(json_path)
        with self.java_arrow.vector.ipc.JsonFileReader(
                json_file, self.java_allocator) as json_reader:
            json_schema = json_reader.start()
            with self.java_arrow.c.CDataDictionaryProvider() as dict_provider:
                imported_schema = self.java_arrow.c.Data.importSchema(
                    self.java_allocator,
                    self._wrap_c_schema_ptr(c_schema_ptr),
                    dict_provider)
            self._assert_schemas_equal(json_schema, imported_schema)

    def import_batch_and_compare_to_json(self, json_path, num_batch,
                                         c_array_ptr):
        """Import the batch behind ``c_array_ptr`` and compare it to batch
        number ``num_batch`` of the JSON file at ``json_path``."""
        json_file = self.java_io.File(json_path)
        with self.java_arrow.vector.ipc.JsonFileReader(
                json_file, self.java_allocator) as json_reader:
            schema = json_reader.start()
            if num_batch > 0:
                # Skip the batches preceding the requested one.
                actually_skipped = json_reader.skip(num_batch)
                assert actually_skipped == num_batch
            # Nested ``with`` statements rather than the parenthesized
            # multi-item form, which is only official syntax from
            # Python 3.10 on.  Exit order is the same: innermost first.
            with json_reader.read() as batch:
                with self.java_arrow.vector.VectorSchemaRoot.create(
                        schema, self.java_allocator) as imported_batch:
                    # We need to pass a dict provider primed with dictionary
                    # ids matching those in the schema, hence an empty
                    # CDataDictionaryProvider would not work here!
                    dict_provider = (self.java_arrow.vector.dictionary
                                     .DictionaryProvider
                                     .MapDictionaryProvider())
                    with dict_provider:
                        # Populate the provider inside the ``with`` so that
                        # vectors created before a failure are still closed.
                        dict_provider.copyStructureFrom(
                            json_reader, self.java_allocator)
                        self.java_arrow.c.Data.importIntoVectorSchemaRoot(
                            self.java_allocator,
                            self._wrap_c_array_ptr(c_array_ptr),
                            imported_batch, dict_provider)
                        self._assert_batches_equal(batch, imported_batch)
                        self._assert_dict_providers_equal(
                            json_reader, dict_provider)

    @property
    def supports_releasing_memory(self):
        """Always true for this importer."""
        return True

    def gc_until(self, predicate):
        """Evaluate ``predicate`` once and return its result."""
        # No need to call the Java GC thanks to AutoCloseable (?)
        return predicate()
215+
216+
66217
class JavaTester(Tester):
67218
PRODUCER = True
68219
CONSUMER = True
69220
FLIGHT_SERVER = True
70221
FLIGHT_CLIENT = True
222+
C_DATA_SCHEMA_EXPORTER = True
223+
C_DATA_SCHEMA_IMPORTER = True
224+
C_DATA_ARRAY_EXPORTER = True
225+
C_DATA_ARRAY_IMPORTER = True
71226

72227
name = 'Java'
73228

@@ -186,3 +341,9 @@ def flight_server(self, scenario_name=None):
186341
finally:
187342
server.kill()
188343
server.wait(5)
344+
def make_c_data_exporter(self):
    """Return a new ``JavaCDataExporter`` built from this tester's
    ``debug`` and ``args``."""
    exporter = JavaCDataExporter(self.debug, self.args)
    return exporter
347+
348+
def make_c_data_importer(self):
349+
return JavaCDataImporter(self.debug, self.args)

docker-compose.yml

+8-3
Original file line numberDiff line numberDiff line change
@@ -1730,16 +1730,21 @@ services:
17301730
volumes: *conda-volumes
17311731
environment:
17321732
<<: [*common, *ccache]
1733-
# tell archery where the arrow binaries are located
1733+
ARCHERY_INTEGRATION_WITH_RUST: 0
1734+
# Tell Archery where the arrow C++ binaries are located
17341735
ARROW_CPP_EXE_PATH: /build/cpp/debug
17351736
ARROW_GO_INTEGRATION: 1
1736-
ARCHERY_INTEGRATION_WITH_RUST: 0
1737+
ARROW_JAVA_CDATA: "ON"
1738+
JAVA_JNI_CMAKE_ARGS: >-
1739+
-DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF
1740+
-DARROW_JAVA_JNI_ENABLE_C=ON
17371741
command:
17381742
["/arrow/ci/scripts/rust_build.sh /arrow /build &&
17391743
/arrow/ci/scripts/cpp_build.sh /arrow /build &&
17401744
/arrow/ci/scripts/csharp_build.sh /arrow /build &&
17411745
/arrow/ci/scripts/go_build.sh /arrow &&
1742-
/arrow/ci/scripts/java_build.sh /arrow /build &&
1746+
/arrow/ci/scripts/java_jni_build.sh /arrow $${ARROW_HOME} /build /tmp/dist/java/$$(arch) &&
1747+
/arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java &&
17431748
/arrow/ci/scripts/js_build.sh /arrow /build &&
17441749
/arrow/ci/scripts/integration_arrow.sh /arrow /build"]
17451750

java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,9 @@ public List<ArrowBuf> visit(ArrowType.Union type) {
165165
return Collections.singletonList(importFixedBytes(type, 0, UnionVector.TYPE_WIDTH));
166166
case Dense:
167167
return Arrays.asList(importFixedBytes(type, 0, DenseUnionVector.TYPE_WIDTH),
168-
importFixedBytes(type, 0, DenseUnionVector.OFFSET_WIDTH));
168+
importFixedBytes(type, 1, DenseUnionVector.OFFSET_WIDTH));
169169
default:
170-
throw new UnsupportedOperationException("Importing buffers for type: " + type);
170+
throw new UnsupportedOperationException("Importing buffers for union type: " + type);
171171
}
172172
}
173173

java/c/src/main/java/org/apache/arrow/c/Format.java

+4
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ static String asString(ArrowType arrowType) {
138138
return "tiD";
139139
case YEAR_MONTH:
140140
return "tiM";
141+
case MONTH_DAY_NANO:
142+
return "tin";
141143
default:
142144
throw new UnsupportedOperationException(
143145
String.format("Interval type with unit %s is unsupported", type.getUnit()));
@@ -277,6 +279,8 @@ static ArrowType asType(String format, long flags)
277279
return new ArrowType.Interval(IntervalUnit.YEAR_MONTH);
278280
case "tiD":
279281
return new ArrowType.Interval(IntervalUnit.DAY_TIME);
282+
case "tin":
283+
return new ArrowType.Interval(IntervalUnit.MONTH_DAY_NANO);
280284
case "+l":
281285
return new ArrowType.List();
282286
case "+L":

java/c/src/main/java/org/apache/arrow/c/SchemaImporter.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ final class SchemaImporter {
4444
private static final Logger logger = LoggerFactory.getLogger(SchemaImporter.class);
4545

4646
private static final int MAX_IMPORT_RECURSION_LEVEL = 64;
47-
private long nextDictionaryID = 1L;
47+
private long nextDictionaryID = 0L;
4848

4949
private final BufferAllocator allocator;
5050

java/vector/src/main/java/org/apache/arrow/vector/NullVector.java

+1
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ public List<FieldVector> getChildrenFromFields() {
192192
@Override
193193
public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
194194
Preconditions.checkArgument(ownBuffers.isEmpty(), "Null vector has no buffers");
195+
valueCount = fieldNode.getLength();
195196
}
196197

197198
@Override

java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java

+4-2
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,11 @@ public boolean rangeEquals(Range range) {
121121
"rightStart %s must be non negative.", range.getRightStart());
122122

123123
Preconditions.checkArgument(range.getRightStart() + range.getLength() <= right.getValueCount(),
124-
"(rightStart + length) %s out of range[0, %s].", 0, right.getValueCount());
124+
"(rightStart + length) %s out of range[0, %s].",
125+
range.getRightStart() + range.getLength(), right.getValueCount());
125126
Preconditions.checkArgument(range.getLeftStart() + range.getLength() <= left.getValueCount(),
126-
"(leftStart + length) %s out of range[0, %s].", 0, left.getValueCount());
127+
"(leftStart + length) %s out of range[0, %s].",
128+
range.getLeftStart() + range.getLength(), left.getValueCount());
127129

128130
return left.accept(this, range);
129131
}

java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryProvider.java

+25-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import java.util.Map;
2222
import java.util.Set;
2323

24+
import org.apache.arrow.memory.BufferAllocator;
25+
2426
/**
2527
* A manager for association of dictionary IDs to their corresponding {@link Dictionary}.
2628
*/
@@ -35,7 +37,7 @@ public interface DictionaryProvider {
3537
/**
3638
* Implementation of {@link DictionaryProvider} that is backed by a hash-map.
3739
*/
38-
class MapDictionaryProvider implements DictionaryProvider {
40+
class MapDictionaryProvider implements AutoCloseable, DictionaryProvider {
3941

4042
private final Map<Long, Dictionary> map;
4143

@@ -49,6 +51,21 @@ public MapDictionaryProvider(Dictionary... dictionaries) {
4951
}
5052
}
5153

54+
/**
55+
* Initialize the map structure from another provider, but with empty vectors.
56+
*
57+
* @param other the {@link DictionaryProvider} to copy the ids and fields from
58+
* @param allocator allocator to create the empty vectors
59+
*/
60+
public void copyStructureFrom(DictionaryProvider other, BufferAllocator allocator) {
61+
for (Long id : other.getDictionaryIds()) {
62+
Dictionary otherDict = other.lookup(id);
63+
Dictionary newDict = new Dictionary(otherDict.getVector().getField().createVector(allocator),
64+
otherDict.getEncoding());
65+
put(newDict);
66+
}
67+
}
68+
5269
public void put(Dictionary dictionary) {
5370
map.put(dictionary.getEncoding().getId(), dictionary);
5471
}
@@ -62,5 +79,12 @@ public final Set<Long> getDictionaryIds() {
6279
public Dictionary lookup(long id) {
6380
return map.get(id);
6481
}
82+
83+
@Override
84+
public void close() {
85+
for (Dictionary dictionary : map.values()) {
86+
dictionary.getVector().close();
87+
}
88+
}
6589
}
6690
}

0 commit comments

Comments
 (0)