Use List<Token> instead of TokenStreams in NameTokenisationEncoder
yash-puligundla committed Oct 23, 2023
1 parent 2d836e4 commit da46b66
Showing 1 changed file with 18 additions and 32 deletions.
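
The change collapses the two-dimensional TokenStreams container, which was indexed by both token position and token type, into a flat List<Token> that is rebuilt for each token position and indexed directly by the token-type constant. Below is a minimal sketch of the before/after lookup; it assumes, as the diff suggests, that Token is a thin wrapper around a ByteBuffer exposed via getByteBuffer(), and the variables (numNames, maxLength, TOTAL_TOKEN_TYPES, TOKEN_DIGITS) come from the surrounding compress() method rather than this fragment.

// Before: one shared container, indexed by (tokenPosition, tokenType).
// ByteBuffer digits = tokenStreams.getTokenStreamByteBuffer(tokenPosition, TOKEN_DIGITS);

// After (sketch): a fresh per-position list, indexed by token type only.
final List<Token> tokenStream = new ArrayList<>(TOTAL_TOKEN_TYPES);
for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
    // one LITTLE_ENDIAN buffer per token type, sized for the worst case
    tokenStream.add(new Token(ByteBuffer.allocate(numNames * maxLength).order(ByteOrder.LITTLE_ENDIAN)));
}
final ByteBuffer digits = tokenStream.get(TOKEN_DIGITS).getByteBuffer();

Because the list only lives for one iteration of the tokenPosition loop, serializeByteStreams no longer needs a tokenPosition argument to locate the right buffers; fillByteStreams still takes it, but only to pull the EncodeToken for each name.
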
@@ -77,26 +77,13 @@ public ByteBuffer compress(final ByteBuffer inBuffer, final int useArith){
 for(int nameIndex = 0; nameIndex < numNames; nameIndex++) {
 tokeniseName(tokensList, nameIndexMap, tokenFrequencies, names.get(nameIndex), nameIndex);
 }
-TokenStreams tokenStreams = new TokenStreams();
-
-// TODO: Reuse tokenStream instead of creating an array of tokenStreams
-// List<Token> tokenStream = ArrayList(TOTAL_TOKEN_TYPES);
-// tokenStream.getTokenStreamByteBuffer(tokenPosition,TOKEN_TYPE) will be the same as
-// tokenStream.get(TOKEN_TYPE)
-
 for (int tokenPosition = 0; tokenPosition < maxToken; tokenPosition++) {
-
-// In tokenStreams, for every token, for the given position add a ByteBuffer of length = names.len * max_len
+List<Token> tokenStream = new ArrayList(TOTAL_TOKEN_TYPES);
 for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
-final List<Token> currTokenStream = tokenStreams.getTokenStreamByType(i);
-currTokenStream.add(new Token(ByteBuffer.allocate(numNames* maxLength).order(ByteOrder.LITTLE_ENDIAN)));
+tokenStream.add(new Token(ByteBuffer.allocate(numNames* maxLength).order(ByteOrder.LITTLE_ENDIAN)));
 }
-fillByteStreams( tokenStreams,tokensList,tokenPosition,numNames);
-for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
-final ByteBuffer currTokenStream = tokenStreams.getTokenStreamByteBuffer(tokenPosition,i);
-currTokenStream.flip();
-}
-serializeByteStreams( tokenStreams,tokenPosition,useArith,outBuffer);
+fillByteStreams( tokenStream,tokensList,tokenPosition,numNames);
+serializeByteStreams( tokenStream,useArith,outBuffer);
 }

 // sets limit to current position and position to '0'
@@ -190,7 +177,7 @@ private void tokeniseName(final List<List<EncodeToken>> tokensList,
 }

 public void fillByteStreams(
-final TokenStreams tokenStreams,
+final List<Token> tokenStream,
 final List<List<EncodeToken>> tokensList,
 final int tokenPosition,
 final int numNames) {
@@ -205,39 +192,39 @@ public void fillByteStreams(
 }
 EncodeToken encodeToken = tokensList.get(nameIndex).get(tokenPosition);
 byte type = encodeToken.getTokenType();
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_TYPE).put(type);
+tokenStream.get(TOKEN_TYPE).getByteBuffer().put(type);
 switch (type) {
 case TOKEN_DIFF:
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_DIFF).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
+tokenStream.get(TOKEN_DIFF).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
 break;

 case TOKEN_DUP:
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_DUP).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
+tokenStream.get(TOKEN_DUP).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
 break;

 case TOKEN_STRING:
-writeString(tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_STRING),encodeToken.getRelativeTokenValue());
+writeString(tokenStream.get(TOKEN_STRING).getByteBuffer(),encodeToken.getRelativeTokenValue());
 break;

 case TOKEN_CHAR:
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_CHAR).put(encodeToken.getRelativeTokenValue().getBytes()[0]);
+tokenStream.get(TOKEN_CHAR).getByteBuffer().put(encodeToken.getRelativeTokenValue().getBytes()[0]);
 break;

 case TOKEN_DIGITS:
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_DIGITS).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
+tokenStream.get(TOKEN_DIGITS).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
 break;

 case TOKEN_DIGITS0:
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_DIGITS0).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_DZLEN).put((byte) encodeToken.getRelativeTokenValue().length());
+tokenStream.get(TOKEN_DIGITS0).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
+tokenStream.get(TOKEN_DZLEN).getByteBuffer().put((byte) encodeToken.getRelativeTokenValue().length());
 break;

 case TOKEN_DELTA:
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_DELTA).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
+tokenStream.get(TOKEN_DELTA).getByteBuffer().put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
 break;

 case TOKEN_DELTA0:
-tokenStreams.getTokenStreamByteBuffer(tokenPosition,TOKEN_DELTA0).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
+tokenStream.get(TOKEN_DELTA0).getByteBuffer().put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
 break;
 }
 }
@@ -286,16 +273,15 @@ public static ByteBuffer tryCompress(final ByteBuffer src, final int useArith) {
 }

 protected void serializeByteStreams(
-final TokenStreams tokenStreams,
-final int tokenPosition,
+final List<Token> tokenStream,
 final int useArith,
 final ByteBuffer outBuffer) {

 // Compress and serialise tokenStreams
 for (int tokenType = 0; tokenType <= TOKEN_END; tokenType++) {
-if (tokenStreams.getTokenStreamByteBuffer(tokenPosition, tokenType).remaining() > 0) {
+if (tokenStream.get(tokenType).getByteBuffer().remaining() > 0) {
 outBuffer.put((byte) (tokenType + ((tokenType == 0) ? 128 : 0)));
-ByteBuffer tempOutByteBuffer = tryCompress(tokenStreams.getTokenStreamByteBuffer(tokenPosition, tokenType), useArith);
+ByteBuffer tempOutByteBuffer = tryCompress(tokenStream.get(tokenType).getByteBuffer(), useArith);
 writeUint7(tempOutByteBuffer.limit(),outBuffer);
 outBuffer.put(tempOutByteBuffer);
 }
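The refactored fillByteStreams and serializeByteStreams rely on Token being little more than a holder for a single ByteBuffer handed back via getByteBuffer(). The stand-in below is only illustrative (the real class lives elsewhere in the codec and may carry extra state), but it shows the minimal shape the new List<Token> indexing requires.

import java.nio.ByteBuffer;

// Illustrative stand-in for the Token wrapper assumed by the diff; not the real codec class.
final class Token {
    private final ByteBuffer byteBuffer;

    Token(final ByteBuffer byteBuffer) {
        this.byteBuffer = byteBuffer;
    }

    ByteBuffer getByteBuffer() {
        return byteBuffer;
    }
}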
