Skip to content

Commit 3294542

Browse files
thisisArjitWill-Lo
authored and committed
[GOBBLIN-2223] Optimise writing of serialised Work Unit to File system (#4133)
* Optimise writing of serialised Work Unit to File system * Optimise writing of serialised Work Unit to File system
1 parent 510c401 commit 3294542

File tree

2 files changed

+34
-8
lines changed

2 files changed

+34
-8
lines changed

gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import java.io.DataInput;
2020
import java.io.DataOutput;
2121
import java.io.IOException;
22-
import java.nio.charset.StandardCharsets;
2322

2423

2524
/**
@@ -31,20 +30,27 @@ public class TextSerializer {
3130
* Serialize a String using the same logic as a Hadoop Text object
3231
*/
3332
public static void writeStringAsText(DataOutput stream, String str) throws IOException {
34-
byte[] utf8Encoded = str.getBytes(StandardCharsets.UTF_8);
35-
writeVLong(stream, utf8Encoded.length);
36-
stream.write(utf8Encoded);
33+
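// TODO: Only ASCII is supported for now. Supporting Unicode needs a UTF-8 byte encoding with a byte-count length prefix; NOTE(review): writeChars emits two bytes per char, which presumably would not match the Hadoop Text format - confirm before using it.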
34+
for (int i = 0; i < str.length(); i++) {
35+
if (str.charAt(i) > 0x7F) {
36+
throw new IllegalArgumentException("Non-ASCII character detected.");
37+
}
38+
}
39+
writeVLong(stream, str.length());
40+
stream.writeBytes(str);
3741
}
3842

3943
/**
4044
* Deserialize a Hadoop Text object into a String
4145
*/
4246
public static String readTextAsString(DataInput in) throws IOException {
43-
int bufLen = (int)readVLong(in);
44-
byte[] buf = new byte[bufLen];
45-
in.readFully(buf);
47+
int bufLen = (int) readVLong(in);
48+
StringBuilder sb = new StringBuilder();
4649

47-
return new String(buf, StandardCharsets.UTF_8);
50+
for (int i = 0; i < bufLen; i++) {
51+
sb.append((char) in.readByte());
52+
}
53+
return sb.toString();
4854
}
4955

5056
/**

gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,26 @@
3333

3434
public class TextSerializerTest {
3535
private static final String[] textsToSerialize = new String[]{"abracadabra", Strings.repeat("longString", 128000)};
36+
private static final String[] serializationErrorText = new String[]{".߸´ˇ", Strings.repeat("ˀ.¸¯.", 128000)};
37+
38+
@Test
39+
public void testSerializeError() throws IOException {
40+
// Use our serializer on non-ASCII input and verify it is rejected with an IllegalArgumentException
41+
for (String textToSerialize : serializationErrorText) {
42+
ByteArrayOutputStream bOs = new ByteArrayOutputStream();
43+
DataOutputStream dataOutputStream = new DataOutputStream(bOs);
44+
45+
try {
46+
TextSerializer.writeStringAsText(dataOutputStream, textToSerialize);
47+
Assert.fail("Expected IOException not thrown");
48+
} catch (Exception e) {
49+
Assert.assertTrue(e instanceof IllegalArgumentException);
50+
// Expected exception
51+
} finally {
52+
dataOutputStream.close();
53+
}
54+
}
55+
}
3656

3757
@Test
3858
public void testSerialize()

0 commit comments

Comments
 (0)