Skip to content

Commit 394e76c

Browse files
authored
Add schema-based parsing (#43)
1 parent 84736d5 commit 394e76c

File tree

111 files changed

+10911
-1086
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

111 files changed

+10911
-1086
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
build
44
profilers
55
testdata
6+
hotspot_*.log

build.gradle

+24-12
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import me.champeau.jmh.JmhBytecodeGeneratorTask
2-
import org.gradle.internal.os.OperatingSystem
32
import org.ajoberstar.grgit.Grgit
3+
import org.gradle.internal.os.OperatingSystem
4+
45
import java.time.Duration
56

67
plugins {
@@ -42,20 +43,20 @@ java {
4243
}
4344

4445
ext {
45-
junitVersion = '5.9.1'
46-
jsoniterScalaVersion = '2.24.4'
46+
junitVersion = '5.10.2'
47+
jsoniterScalaVersion = '2.28.4'
4748
}
4849

4950
dependencies {
50-
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.16.0'
51-
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.42'
52-
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'
51+
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.17.0'
52+
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.49'
5353
jmhImplementation group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-core_2.13', version: jsoniterScalaVersion
5454
jmhImplementation group: 'com.google.guava', name: 'guava', version: '32.1.2-jre'
5555
compileOnly group: 'com.github.plokhotnyuk.jsoniter-scala', name: 'jsoniter-scala-macros_2.13', version: jsoniterScalaVersion
5656

5757
testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
5858
testImplementation group: 'org.apache.commons', name: 'commons-text', version: '1.10.0'
59+
testImplementation group: 'org.junit-pioneer', name: 'junit-pioneer', version: '2.2.0'
5960
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
6061
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
6162
testRuntimeOnly group: 'org.junit.jupiter', name: 'junit-jupiter-engine', version: junitVersion
@@ -136,15 +137,21 @@ jmh {
136137
'--add-modules=jdk.incubator.vector'
137138
]
138139
if (getBooleanProperty('jmh.profilersEnabled', false)) {
140+
createDirIfDoesNotExist('./profilers')
139141
if (OperatingSystem.current().isLinux()) {
140-
profilers = [
141-
'perf',
142-
'perfasm:intelSyntax=true',
143-
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('LD_LIBRARY_PATH')
142+
def profilerList = [
143+
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('LD_LIBRARY_PATH')
144144
]
145+
if (getBooleanProperty('jmh.jitLogEnabled', false)) {
146+
createDirIfDoesNotExist('./profilers/perfasm')
147+
profilerList += [
148+
'perfasm:intelSyntax=true;saveLog=true;saveLogTo=./profilers/perfasm'
149+
]
150+
}
151+
profilers = profilerList
145152
} else if (OperatingSystem.current().isMacOsX()) {
146153
profilers = [
147-
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('DYLD_LIBRARY_PATH')
154+
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getLibPath('DYLD_LIBRARY_PATH')
148155
]
149156
}
150157
}
@@ -218,6 +225,11 @@ def getBooleanProperty(String name, boolean defaultValue) {
218225
Boolean.valueOf((project.findProperty(name) ?: defaultValue) as String)
219226
}
220227

221-
static def getAsyncProfilerLibPath(String envVarName) {
228+
static def getLibPath(String envVarName) {
222229
System.getenv(envVarName) ?: System.getProperty('java.library.path')
223230
}
231+
232+
static createDirIfDoesNotExist(String dir) {
233+
File file = new File(dir)
234+
file.mkdirs()
235+
}

src/jmh/java/org/simdjson/NumberParserBenchmark.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
public class NumberParserBenchmark {
2222

2323
private final Tape tape = new Tape(100);
24-
private final NumberParser numberParser = new NumberParser(tape);
24+
private final NumberParser numberParser = new NumberParser();
2525

2626
@Param({
2727
"2.2250738585072013e-308", // fast path
@@ -43,7 +43,7 @@ public double baseline() {
4343
@Benchmark
4444
public double simdjson() {
4545
tape.reset();
46-
numberParser.parseNumber(numberUtf8Bytes, 0);
46+
numberParser.parseNumber(numberUtf8Bytes, 0, tape);
4747
return tape.getDouble(0);
4848
}
4949
}

src/jmh/java/org/simdjson/ParseAndSelectBenchmark.java

+1-30
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,6 @@
44
import com.alibaba.fastjson2.JSONObject;
55
import com.fasterxml.jackson.databind.JsonNode;
66
import com.fasterxml.jackson.databind.ObjectMapper;
7-
import com.github.plokhotnyuk.jsoniter_scala.core.ReaderConfig$;
8-
import com.github.plokhotnyuk.jsoniter_scala.core.package$;
9-
import com.jsoniter.JsonIterator;
10-
import com.jsoniter.any.Any;
117
import org.openjdk.jmh.annotations.Benchmark;
128
import org.openjdk.jmh.annotations.BenchmarkMode;
139
import org.openjdk.jmh.annotations.Level;
@@ -43,19 +39,7 @@ public void setup() throws IOException {
4339
buffer = is.readAllBytes();
4440
bufferPadded = padded(buffer);
4541
}
46-
}
47-
48-
@Benchmark
49-
public int countUniqueUsersWithDefaultProfile_jsoniter_scala() throws IOException {
50-
Twitter twitter = package$.MODULE$.readFromArray(buffer, ReaderConfig$.MODULE$, Twitter$.MODULE$.codec());
51-
Set<String> defaultUsers = new HashSet<>();
52-
for (Status tweet: twitter.statuses()) {
53-
User user = tweet.user();
54-
if (user.default_profile()) {
55-
defaultUsers.add(user.screen_name());
56-
}
57-
}
58-
return defaultUsers.size();
42+
System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES);
5943
}
6044

6145
@Benchmark
@@ -88,19 +72,6 @@ public int countUniqueUsersWithDefaultProfile_fastjson() {
8872
return defaultUsers.size();
8973
}
9074

91-
@Benchmark
92-
public int countUniqueUsersWithDefaultProfile_jsoniter() {
93-
Any json = JsonIterator.deserialize(buffer);
94-
Set<String> defaultUsers = new HashSet<>();
95-
for (Any tweet : json.get("statuses")) {
96-
Any user = tweet.get("user");
97-
if (user.get("default_profile").toBoolean()) {
98-
defaultUsers.add(user.get("screen_name").toString());
99-
}
100-
}
101-
return defaultUsers.size();
102-
}
103-
10475
@Benchmark
10576
public int countUniqueUsersWithDefaultProfile_simdjson() {
10677
JsonValue simdJsonValue = simdJsonParser.parse(buffer, buffer.length);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package org.simdjson;
2+
3+
import com.alibaba.fastjson2.JSON;
4+
import com.fasterxml.jackson.databind.DeserializationFeature;
5+
import com.fasterxml.jackson.databind.ObjectMapper;
6+
import com.github.plokhotnyuk.jsoniter_scala.core.ReaderConfig$;
7+
import com.github.plokhotnyuk.jsoniter_scala.core.package$;
8+
import org.openjdk.jmh.annotations.Benchmark;
9+
import org.openjdk.jmh.annotations.BenchmarkMode;
10+
import org.openjdk.jmh.annotations.Level;
11+
import org.openjdk.jmh.annotations.Mode;
12+
import org.openjdk.jmh.annotations.OutputTimeUnit;
13+
import org.openjdk.jmh.annotations.Scope;
14+
import org.openjdk.jmh.annotations.Setup;
15+
import org.openjdk.jmh.annotations.State;
16+
17+
import java.io.IOException;
18+
import java.io.InputStream;
19+
import java.util.HashSet;
20+
import java.util.List;
21+
import java.util.Set;
22+
import java.util.concurrent.TimeUnit;
23+
24+
import static org.simdjson.SimdJsonPaddingUtil.padded;
25+
26+
@State(Scope.Benchmark)
27+
@BenchmarkMode(Mode.Throughput)
28+
@OutputTimeUnit(TimeUnit.SECONDS)
29+
public class SchemaBasedParseAndSelectBenchmark {
30+
31+
private final SimdJsonParser simdJsonParser = new SimdJsonParser();
32+
private final ObjectMapper objectMapper = new ObjectMapper()
33+
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
34+
35+
private byte[] buffer;
36+
private byte[] bufferPadded;
37+
38+
@Setup(Level.Trial)
39+
public void setup() throws IOException {
40+
try (InputStream is = ParseBenchmark.class.getResourceAsStream("/twitter.json")) {
41+
buffer = is.readAllBytes();
42+
bufferPadded = padded(buffer);
43+
}
44+
System.out.println("VectorSpecies = " + StructuralIndexer.BYTE_SPECIES);
45+
}
46+
47+
@Benchmark
48+
public int countUniqueUsersWithDefaultProfile_simdjson() {
49+
Set<String> defaultUsers = new HashSet<>();
50+
SimdJsonTwitter twitter = simdJsonParser.parse(buffer, buffer.length, SimdJsonTwitter.class);
51+
for (SimdJsonStatus status : twitter.statuses()) {
52+
SimdJsonUser user = status.user();
53+
if (user.default_profile()) {
54+
defaultUsers.add(user.screen_name());
55+
}
56+
}
57+
return defaultUsers.size();
58+
}
59+
60+
@Benchmark
61+
public int countUniqueUsersWithDefaultProfile_simdjsonPadded() {
62+
Set<String> defaultUsers = new HashSet<>();
63+
SimdJsonTwitter twitter = simdJsonParser.parse(bufferPadded, buffer.length, SimdJsonTwitter.class);
64+
for (SimdJsonStatus status : twitter.statuses()) {
65+
SimdJsonUser user = status.user();
66+
if (user.default_profile()) {
67+
defaultUsers.add(user.screen_name());
68+
}
69+
}
70+
return defaultUsers.size();
71+
}
72+
73+
@Benchmark
74+
public int countUniqueUsersWithDefaultProfile_jackson() throws IOException {
75+
Set<String> defaultUsers = new HashSet<>();
76+
SimdJsonTwitter twitter = objectMapper.readValue(buffer, SimdJsonTwitter.class);
77+
for (SimdJsonStatus status : twitter.statuses()) {
78+
SimdJsonUser user = status.user();
79+
if (user.default_profile()) {
80+
defaultUsers.add(user.screen_name());
81+
}
82+
}
83+
return defaultUsers.size();
84+
}
85+
86+
@Benchmark
87+
public int countUniqueUsersWithDefaultProfile_jsoniter_scala() {
88+
Twitter twitter = package$.MODULE$.readFromArray(buffer, ReaderConfig$.MODULE$, Twitter$.MODULE$.codec());
89+
Set<String> defaultUsers = new HashSet<>();
90+
for (Status tweet: twitter.statuses()) {
91+
User user = tweet.user();
92+
if (user.default_profile()) {
93+
defaultUsers.add(user.screen_name());
94+
}
95+
}
96+
return defaultUsers.size();
97+
}
98+
99+
@Benchmark
100+
public int countUniqueUsersWithDefaultProfile_fastjson() {
101+
Set<String> defaultUsers = new HashSet<>();
102+
SimdJsonTwitter twitter = JSON.parseObject(buffer, SimdJsonTwitter.class);
103+
for (SimdJsonStatus status : twitter.statuses()) {
104+
SimdJsonUser user = status.user();
105+
if (user.default_profile()) {
106+
defaultUsers.add(user.screen_name());
107+
}
108+
}
109+
return defaultUsers.size();
110+
}
111+
112+
record SimdJsonUser(boolean default_profile, String screen_name) {
113+
114+
}
115+
116+
record SimdJsonStatus(SimdJsonUser user) {
117+
118+
}
119+
120+
record SimdJsonTwitter(List<SimdJsonStatus> statuses) {
121+
122+
}
123+
}

src/main/java/org/simdjson/BitIndexes.java

+36-1
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,26 @@ private long clearLowestBit(long bits) {
4444
return bits & (bits - 1);
4545
}
4646

47-
int advance() {
47+
void advance() {
48+
readIdx++;
49+
}
50+
51+
int getAndAdvance() {
52+
assert readIdx <= writeIdx;
4853
return indexes[readIdx++];
4954
}
5055

56+
int getLast() {
57+
return indexes[writeIdx - 1];
58+
}
59+
60+
int advanceAndGet() {
61+
assert readIdx + 1 <= writeIdx;
62+
return indexes[++readIdx];
63+
}
64+
5165
int peek() {
66+
assert readIdx <= writeIdx;
5267
return indexes[readIdx];
5368
}
5469

@@ -60,6 +75,26 @@ boolean isEnd() {
6075
return writeIdx == readIdx;
6176
}
6277

78+
boolean isPastEnd() {
79+
return readIdx > writeIdx;
80+
}
81+
82+
void finish() {
83+
// If we go past the end of the detected structural indexes, it means we are dealing with invalid JSON.
84+
// Thus, we need to stop processing immediately and throw an exception. To avoid checking after every increment
85+
// of readIdx whether this has happened, we jump to the first structural element. This should produce the
86+
// desired outcome, i.e., an iterator should detect invalid JSON. To understand how this works, let's first
87+
// exclude primitive values (numbers, strings, booleans, nulls) from the scope of possible JSON documents. We
88+
// can do this because, when these values are parsed, the length of the input buffer is verified, ensuring we
89+
// never go past its end. Therefore, we can focus solely on objects and arrays. Since we always check that if
90+
// the first character is '{', the last one must be '}', and if the first character is '[', the last one must
91+
// be ']', we know that if we've reached beyond the buffer without crashing, the input is either '{...}' or '[...]'.
92+
// Thus, if we jump to the first structural element, we will generate either '{...}{' or '[...]['. Both of these
93+
// are invalid sequences and will be detected by the iterator, which will then stop processing and throw an
94+
// exception reporting the invalid JSON.
95+
indexes[writeIdx] = 0;
96+
}
97+
6398
void reset() {
6499
writeIdx = 0;
65100
readIdx = 0;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package org.simdjson;
2+
3+
import java.lang.reflect.Type;
4+
import java.util.HashMap;
5+
import java.util.Map;
6+
7+
class ClassResolver {
8+
9+
private final Map<Type, ResolvedClass> classCache = new HashMap<>();
10+
11+
ResolvedClass resolveClass(Type type) {
12+
ResolvedClass resolvedClass = classCache.get(type);
13+
if (resolvedClass != null) {
14+
return resolvedClass;
15+
}
16+
resolvedClass = new ResolvedClass(type, this);
17+
classCache.put(type, resolvedClass);
18+
return resolvedClass;
19+
}
20+
21+
void reset() {
22+
classCache.clear();
23+
}
24+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
package org.simdjson;
2+
3+
record ConstructorArgument(int idx, ResolvedClass resolvedClass) {
4+
}

0 commit comments

Comments
 (0)