Skip to content

Commit e952b63

Browse files
authored
feat(java): jit support for chunk based map serialization (#2027)
## What does this PR do? This PR added jit support for chunk based map serialization, it supports all kinds of map serializaiton by generated code: - final map key and value field type - polymorphic map key and value field type - nested map key and value type This PR also removed the old map serialization protocol code. The new chunk based protocol improve serialized size by **2.3X** at most. data: ``` stringMap: {"k1": "v1", "k2": "v2, ..., "k10": "v10" } intMap: {1:2, 2:4, 3: 6, ..., 10: 20} ``` new protocol: ``` stringMapBytes 68 stringKVStructBytes 69 intMapBytes 28 intKVStructBytes 29 ``` old protocol: ``` stringMapBytes 104 stringKVStructBytes 87 intMapBytes 64 intKVStructBytes 47 ``` And improve performance by 20% ## Related issues Closes #925 #2025 ## Does this PR introduce any user-facing change? <!-- If any user-facing interface changes, please [open an issue](https://github.com/apache/fury/issues/new/choose) describing the need to do so and update the document if necessary. --> - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark [chunk-jmh-result.csv](https://github.com/user-attachments/files/18575900/chunk-jmh-result.csv) [nochunk-jmh-result.csv](https://github.com/user-attachments/files/18575901/nochunk-jmh-result.csv) ![image](https://github.com/user-attachments/assets/754f8e48-b45e-489b-adf5-cca1c5d03f1e)
1 parent 1e63705 commit e952b63

File tree

19 files changed

+1508
-760
lines changed

19 files changed

+1508
-760
lines changed
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
import pandas as pd
19+
import matplotlib.pyplot as plt
20+
import numpy as np
21+
22+
# Load the CSV data
23+
no_chunk_file = "nochunk-jmh-result.csv"
24+
chunk_file = "chunk-jmh-result.csv"
25+
# Read the CSV files
26+
no_chunk_df = pd.read_csv(no_chunk_file)
27+
chunk_df = pd.read_csv(chunk_file)
28+
29+
30+
# Function to plot the figures
31+
def plot_benchmark(ax, data1, data2, operation, struct, datatype, title):
32+
# Filter data
33+
filtered_data1 = data1[
34+
(data1["Benchmark"] == (operation))
35+
& (data1["struct"] == struct)
36+
& (data1["datatype"] == datatype)
37+
]
38+
filtered_data2 = data2[
39+
(data2["Benchmark"] == (operation))
40+
& (data2["struct"] == struct)
41+
& (data2["datatype"] == datatype)
42+
]
43+
44+
# Sort data according to 'mapSize'
45+
filtered_data1 = filtered_data1.sort_values("mapSize")
46+
filtered_data2 = filtered_data2.sort_values("mapSize")
47+
48+
# Plotting
49+
x_labels = filtered_data1["mapSize"].astype(str).tolist()
50+
x = np.arange(len(x_labels))
51+
width = 0.35
52+
53+
ax.bar(
54+
x - width / 2,
55+
filtered_data1["Score"],
56+
width,
57+
yerr=filtered_data1["ScoreError"],
58+
label="No Chunk",
59+
)
60+
ax.bar(
61+
x + width / 2,
62+
filtered_data2["Score"],
63+
width,
64+
yerr=filtered_data2["ScoreError"],
65+
label="Chunk",
66+
)
67+
ax.set_xlabel("Map Size")
68+
ax.set_ylabel("Score (ops/s)")
69+
ax.set_title(title)
70+
ax.set_xticks(x)
71+
ax.set_xticklabels(x_labels)
72+
ax.legend()
73+
74+
75+
# Create the subplots for datatype "int"
76+
fig1, axs1 = plt.subplots(2, 2, figsize=(10, 8))
77+
plot_benchmark(
78+
axs1[0, 0],
79+
no_chunk_df,
80+
chunk_df,
81+
"serialize",
82+
True,
83+
"int",
84+
"Serialize | Datatype: Int",
85+
)
86+
plot_benchmark(
87+
axs1[0, 1],
88+
no_chunk_df,
89+
chunk_df,
90+
"serialize",
91+
True,
92+
"string",
93+
"Serialize | Datatype: String",
94+
)
95+
plot_benchmark(
96+
axs1[1, 0],
97+
no_chunk_df,
98+
chunk_df,
99+
"deserialize",
100+
True,
101+
"int",
102+
"Deserialize | Datatype: Int",
103+
)
104+
plot_benchmark(
105+
axs1[1, 1],
106+
no_chunk_df,
107+
chunk_df,
108+
"deserialize",
109+
True,
110+
"string",
111+
"Deserialize | Datatype: String",
112+
)
113+
plt.tight_layout()
114+
plt.suptitle("Benchmarks for codegen", y=1.05)
115+
116+
117+
# Create the subplots for datatype "string"
118+
fig2, axs2 = plt.subplots(2, 2, figsize=(10, 8))
119+
plot_benchmark(
120+
axs2[0, 0],
121+
no_chunk_df,
122+
chunk_df,
123+
"serialize",
124+
False,
125+
"int",
126+
"Serialize | Datatype: Int",
127+
)
128+
plot_benchmark(
129+
axs2[0, 1],
130+
no_chunk_df,
131+
chunk_df,
132+
"serialize",
133+
False,
134+
"string",
135+
"Serialize | Datatype: String",
136+
)
137+
plot_benchmark(
138+
axs2[1, 0],
139+
no_chunk_df,
140+
chunk_df,
141+
"deserialize",
142+
False,
143+
"int",
144+
"Deserialize | Datatype: Int",
145+
)
146+
plot_benchmark(
147+
axs2[1, 1],
148+
no_chunk_df,
149+
chunk_df,
150+
"deserialize",
151+
False,
152+
"string",
153+
"Deserialize | Datatype: String",
154+
)
155+
plt.tight_layout()
156+
plt.suptitle("Benchmarks for no codegen", y=1.05)
157+
158+
plt.show()

java/benchmark/src/main/java/org/apache/fury/benchmark/MapSerializationSuite.java

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
import java.util.HashMap;
2424
import java.util.Map;
2525
import org.apache.fury.Fury;
26-
import org.apache.fury.serializer.Serializer;
27-
import org.apache.fury.serializer.collection.AbstractMapSerializer;
2826
import org.openjdk.jmh.Main;
2927
import org.openjdk.jmh.annotations.Benchmark;
3028
import org.openjdk.jmh.annotations.BenchmarkMode;
@@ -49,53 +47,70 @@ public static void main(String[] args) throws IOException {
4947
Main.main(args);
5048
}
5149

50+
public static class StringKVMapStruct {
51+
Map<String, String> map;
52+
}
53+
54+
public static class IntKVMapStruct {
55+
Map<Integer, Integer> map;
56+
}
57+
5258
@State(Scope.Thread)
5359
public static class MapState {
54-
@Param({"5", "20", "50", "100", "200"})
60+
@Param({"50"})
5561
public int mapSize;
5662

5763
@Param({"false", "true"})
58-
public boolean enableChunkEncoding;
64+
public boolean struct;
65+
66+
@Param({"int", "string"})
67+
public String datatype;
5968

60-
private Map<String, String> stringMap;
61-
private Map<Integer, Integer> integerMap;
62-
private byte[] stringMapBytes;
63-
private byte[] integerMapBytes;
69+
private Object object;
70+
private byte[] bytes;
6471
private Fury fury;
6572

6673
@Setup(Level.Trial)
6774
public void setup() {
6875
fury = Fury.builder().build();
69-
Serializer<HashMap> serializer = fury.getSerializer(HashMap.class);
70-
((AbstractMapSerializer) serializer).setUseChunkSerialize(enableChunkEncoding);
71-
stringMap = new HashMap<>(mapSize);
72-
integerMap = new HashMap<>(mapSize);
76+
fury.register(StringKVMapStruct.class);
77+
fury.register(IntKVMapStruct.class);
78+
Map<String, String> stringMap = new HashMap<>(mapSize);
79+
Map<Integer, Integer> intMap = new HashMap<>(mapSize);
7380
for (int i = 0; i < mapSize; i++) {
7481
stringMap.put("k" + i, "v" + i);
75-
integerMap.put(i, i * 2);
82+
intMap.put(i, i * 2);
83+
}
84+
StringKVMapStruct stringKVMapStruct = new StringKVMapStruct();
85+
stringKVMapStruct.map = stringMap;
86+
IntKVMapStruct intKVMapStruct = new IntKVMapStruct();
87+
intKVMapStruct.map = intMap;
88+
byte[] stringMapBytes = fury.serialize(stringMap);
89+
byte[] intMapBytes = fury.serialize(intMap);
90+
byte[] stringKVStructBytes = fury.serialize(stringKVMapStruct);
91+
byte[] intKVStructBytes = fury.serialize(intKVMapStruct);
92+
switch (datatype) {
93+
case "int":
94+
object = struct ? intKVMapStruct : intMap;
95+
bytes = struct ? intKVStructBytes : intMapBytes;
96+
break;
97+
case "string":
98+
object = struct ? stringKVMapStruct : stringMap;
99+
bytes = struct ? stringKVStructBytes : stringMapBytes;
100+
break;
101+
default:
102+
throw new UnsupportedOperationException();
76103
}
77-
stringMapBytes = fury.serialize(stringMap);
78-
integerMapBytes = fury.serialize(integerMap);
79104
}
80105
}
81106

82107
@Benchmark
83-
public Object serializeStringMap(MapState state) {
84-
return state.fury.serialize(state.stringMap);
85-
}
86-
87-
@Benchmark
88-
public Object serializeIntMap(MapState state) {
89-
return state.fury.serialize(state.integerMap);
90-
}
91-
92-
@Benchmark
93-
public Object deserializeStringMap(MapState state) {
94-
return state.fury.deserialize(state.stringMapBytes);
108+
public Object serialize(MapState state) {
109+
return state.fury.serialize(state.object);
95110
}
96111

97112
@Benchmark
98-
public Object deserializeIntMap(MapState state) {
99-
return state.fury.deserialize(state.integerMapBytes);
113+
public Object deserialize(MapState state) {
114+
return state.fury.deserialize(state.bytes);
100115
}
101116
}

0 commit comments

Comments
 (0)