Skip to content

Commit

Permalink
Addressing feedback from Dec 5 - part 1
Browse files Browse the repository at this point in the history
  • Loading branch information
yash-puligundla committed Jan 30, 2024
1 parent da46b66 commit 7d2d0c5
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 88 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package htsjdk.samtools.cram.compression.nametokenisation;

import htsjdk.samtools.cram.compression.nametokenisation.tokens.EncodeToken;
import htsjdk.samtools.cram.compression.nametokenisation.tokens.Token;
import htsjdk.samtools.cram.compression.range.RangeEncode;
import htsjdk.samtools.cram.compression.range.RangeParams;
import htsjdk.samtools.cram.compression.rans.RANSEncode;
Expand All @@ -17,22 +16,8 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_CHAR;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DELTA0;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIFF;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DIGITS0;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DUP;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_DZLEN;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_END;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_MATCH;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_STRING;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOKEN_TYPE;
import static htsjdk.samtools.cram.compression.nametokenisation.TokenStreams.TOTAL_TOKEN_TYPES;
import static htsjdk.samtools.cram.compression.rans.Utils.writeUint7;


public class NameTokenisationEncode {

private int maxToken;
Expand Down Expand Up @@ -78,12 +63,12 @@ public ByteBuffer compress(final ByteBuffer inBuffer, final int useArith){
tokeniseName(tokensList, nameIndexMap, tokenFrequencies, names.get(nameIndex), nameIndex);
}
for (int tokenPosition = 0; tokenPosition < maxToken; tokenPosition++) {
List<Token> tokenStream = new ArrayList(TOTAL_TOKEN_TYPES);
for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
tokenStream.add(new Token(ByteBuffer.allocate(numNames* maxLength).order(ByteOrder.LITTLE_ENDIAN)));
List<ByteBuffer> tokenStream = new ArrayList(TokenStreams.TOTAL_TOKEN_TYPES);
for (int i = 0; i < TokenStreams.TOTAL_TOKEN_TYPES; i++) {
tokenStream.add(ByteBuffer.allocate(numNames* maxLength).order(ByteOrder.LITTLE_ENDIAN));
}
fillByteStreams( tokenStream,tokensList,tokenPosition,numNames);
serializeByteStreams( tokenStream,useArith,outBuffer);
fillByteStreams(tokenStream,tokensList,tokenPosition,numNames);
serializeByteStreams(tokenStream,useArith,outBuffer);
}

// sets limit to current position and position to '0'
Expand All @@ -103,9 +88,9 @@ private void tokeniseName(final List<List<EncodeToken>> tokensList,
tokensList.add(new ArrayList<>());
if (nameIndexMap.containsKey(name)) {
// TODO: Add Test to cover this code
tokensList.get(currentNameIndex).add(new EncodeToken(String.valueOf(currentNameIndex - nameIndexMap.get(name)), String.valueOf(currentNameIndex - nameIndexMap.get(name)),TOKEN_DUP));
tokensList.get(currentNameIndex).add(new EncodeToken(String.valueOf(currentNameIndex - nameIndexMap.get(name)), String.valueOf(currentNameIndex - nameIndexMap.get(name)),TokenStreams.TOKEN_DUP));
} else {
tokensList.get(currentNameIndex).add(new EncodeToken(String.valueOf(currentNameIndex == 0 ? 0 : 1),String.valueOf(currentNameIndex == 0 ? 0 : 1),TOKEN_DIFF));
tokensList.get(currentNameIndex).add(new EncodeToken(String.valueOf(currentNameIndex == 0 ? 0 : 1),String.valueOf(currentNameIndex == 0 ? 0 : 1),TokenStreams.TOKEN_DIFF));
}
// Get the list of tokens `tok` for the current name
nameIndexMap.put(name, currentNameIndex);
Expand All @@ -121,40 +106,40 @@ private void tokeniseName(final List<List<EncodeToken>> tokensList,
// because at position "0", we have a token that provides info if the name is a DIFF or DUP
// token 0 = DIFF vs DUP
int tokenIndex = i + 1;
byte type = TOKEN_STRING;
byte type = TokenStreams.TOKEN_STRING;
String str = tok.get(i); // absolute value of the token
String val = tok.get(i); // relative value of the token (comparing to prevname's token at the same token position)
if (tok.get(i).matches("^0+[0-9]*$")) {
type = TOKEN_DIGITS0;
type = TokenStreams.TOKEN_DIGITS0;
} else if (tok.get(i).matches("^[0-9]+$")) {
type = TOKEN_DIGITS;
type = TokenStreams.TOKEN_DIGITS;
} else if (tok.get(i).length() == 1) {
type = TOKEN_CHAR;
type = TokenStreams.TOKEN_CHAR;
}

// compare the current token with token from the previous name at the current token's index
// if there exists a previous name and a token at the corresponding index of the previous name
if (prevNameIndex >=0 && tokensList.get(prevNameIndex).size() > tokenIndex) {
EncodeToken prevToken = tokensList.get(prevNameIndex).get(tokenIndex);
if (prevToken.getActualTokenValue().equals(tok.get(i))) {
type = TOKEN_MATCH;
type = TokenStreams.TOKEN_MATCH;
val = "";
} else if (type==TOKEN_DIGITS
&& (prevToken.getTokenType() == TOKEN_DIGITS || prevToken.getTokenType() == TOKEN_DELTA)) {
} else if (type==TokenStreams.TOKEN_DIGITS
&& (prevToken.getTokenType() == TokenStreams.TOKEN_DIGITS || prevToken.getTokenType() == TokenStreams.TOKEN_DELTA)) {
int v = Integer.parseInt(val);
int s = Integer.parseInt(prevToken.getActualTokenValue());
int d = v - s;
tokenFrequencies[tokenIndex]++;
if (d >= 0 && d < 256 && tokenFrequencies[tokenIndex] > currentNameIndex / 2) {
type = TOKEN_DELTA;
type = TokenStreams.TOKEN_DELTA;
val = String.valueOf(d);
}
} else if (type==TOKEN_DIGITS0 && prevToken.getActualTokenValue().length() == val.length()
&& (prevToken.getTokenType() == TOKEN_DIGITS0 || prevToken.getTokenType() == TOKEN_DELTA0)) {
} else if (type==TokenStreams.TOKEN_DIGITS0 && prevToken.getActualTokenValue().length() == val.length()
&& (prevToken.getTokenType() == TokenStreams.TOKEN_DIGITS0 || prevToken.getTokenType() == TokenStreams.TOKEN_DELTA0)) {
int d = Integer.parseInt(val) - Integer.parseInt(prevToken.getActualTokenValue());
tokenFrequencies[tokenIndex]++;
if (d >= 0 && d < 256 && tokenFrequencies[tokenIndex] > currentNameIndex / 2) {
type = TOKEN_DELTA0;
type = TokenStreams.TOKEN_DELTA0;
val = String.valueOf(d);
}
}
Expand All @@ -168,7 +153,7 @@ private void tokeniseName(final List<List<EncodeToken>> tokensList,
}
}

tokensList.get(currentNameIndex).add(new EncodeToken("","",TOKEN_END));
tokensList.get(currentNameIndex).add(new EncodeToken("","",TokenStreams.TOKEN_END));
final int currMaxToken = tokensList.get(currentNameIndex).size();
if (maxToken < currMaxToken)
maxToken = currMaxToken;
Expand All @@ -177,54 +162,54 @@ private void tokeniseName(final List<List<EncodeToken>> tokensList,
}

public void fillByteStreams(
final List<Token> tokenStream,
final List<ByteBuffer> tokenStream,
final List<List<EncodeToken>> tokensList,
final int tokenPosition,
final int numNames) {

// Fill tokenStreams object using tokensList
for (int nameIndex = 0; nameIndex < numNames; nameIndex++) {
if (tokenPosition > 0 && tokensList.get(nameIndex).get(0).getTokenType() == TOKEN_DUP) {
if (tokenPosition > 0 && tokensList.get(nameIndex).get(0).getTokenType() == TokenStreams.TOKEN_DUP) {
continue;
}
if (tokensList.get(nameIndex).size() <= tokenPosition) {
continue;
}
EncodeToken encodeToken = tokensList.get(nameIndex).get(tokenPosition);
byte type = encodeToken.getTokenType();
tokenStream.get(TOKEN_TYPE).getByteBuffer().put(type);
tokenStream.get(TokenStreams.TOKEN_TYPE).put(type);
switch (type) {
case TOKEN_DIFF:
tokenStream.get(TOKEN_DIFF).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
case TokenStreams.TOKEN_DIFF:
tokenStream.get(TokenStreams.TOKEN_DIFF).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
break;

case TOKEN_DUP:
tokenStream.get(TOKEN_DUP).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
case TokenStreams.TOKEN_DUP:
tokenStream.get(TokenStreams.TOKEN_DUP).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
break;

case TOKEN_STRING:
writeString(tokenStream.get(TOKEN_STRING).getByteBuffer(),encodeToken.getRelativeTokenValue());
case TokenStreams.TOKEN_STRING:
writeString(tokenStream.get(TokenStreams.TOKEN_STRING),encodeToken.getRelativeTokenValue());
break;

case TOKEN_CHAR:
tokenStream.get(TOKEN_CHAR).getByteBuffer().put(encodeToken.getRelativeTokenValue().getBytes()[0]);
case TokenStreams.TOKEN_CHAR:
tokenStream.get(TokenStreams.TOKEN_CHAR).put(encodeToken.getRelativeTokenValue().getBytes()[0]);
break;

case TOKEN_DIGITS:
tokenStream.get(TOKEN_DIGITS).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
case TokenStreams.TOKEN_DIGITS:
tokenStream.get(TokenStreams.TOKEN_DIGITS).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
break;

case TOKEN_DIGITS0:
tokenStream.get(TOKEN_DIGITS0).getByteBuffer().putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
tokenStream.get(TOKEN_DZLEN).getByteBuffer().put((byte) encodeToken.getRelativeTokenValue().length());
case TokenStreams.TOKEN_DIGITS0:
tokenStream.get(TokenStreams.TOKEN_DIGITS0).putInt(Integer.parseInt(encodeToken.getRelativeTokenValue()));
tokenStream.get(TokenStreams.TOKEN_DZLEN).put((byte) encodeToken.getRelativeTokenValue().length());
break;

case TOKEN_DELTA:
tokenStream.get(TOKEN_DELTA).getByteBuffer().put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
case TokenStreams.TOKEN_DELTA:
tokenStream.get(TokenStreams.TOKEN_DELTA).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
break;

case TOKEN_DELTA0:
tokenStream.get(TOKEN_DELTA0).getByteBuffer().put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
case TokenStreams.TOKEN_DELTA0:
tokenStream.get(TokenStreams.TOKEN_DELTA0).put((byte)Integer.parseInt(encodeToken.getRelativeTokenValue()));
break;
}
}
Expand Down Expand Up @@ -273,15 +258,15 @@ public static ByteBuffer tryCompress(final ByteBuffer src, final int useArith) {
}

protected void serializeByteStreams(
final List<Token> tokenStream,
final List<ByteBuffer> tokenStream,
final int useArith,
final ByteBuffer outBuffer) {

// Compress and serialise tokenStreams
for (int tokenType = 0; tokenType <= TOKEN_END; tokenType++) {
if (tokenStream.get(tokenType).getByteBuffer().remaining() > 0) {
for (int tokenType = 0; tokenType <= TokenStreams.TOKEN_END; tokenType++) {
if (tokenStream.get(tokenType).remaining() > 0) {
outBuffer.put((byte) (tokenType + ((tokenType == 0) ? 128 : 0)));
ByteBuffer tempOutByteBuffer = tryCompress(tokenStream.get(tokenType).getByteBuffer(), useArith);
ByteBuffer tempOutByteBuffer = tryCompress(tokenStream.get(tokenType), useArith);
writeUint7(tempOutByteBuffer.limit(),outBuffer);
outBuffer.put(tempOutByteBuffer);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package htsjdk.samtools.cram.compression.nametokenisation;

import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.compression.nametokenisation.tokens.Token;
import htsjdk.samtools.cram.compression.range.RangeDecode;
import htsjdk.samtools.cram.compression.rans.RANSDecode;
import htsjdk.samtools.cram.compression.rans.Utils;
Expand Down Expand Up @@ -31,7 +30,7 @@ public class TokenStreams {
private static final int DUP_TOKEN_FLAG_MASK = 0x40;
private static final int TYPE_TOKEN_FLAG_MASK = 0x3F;

private final List<List<Token>> tokenStreams;
private final List<List<ByteBuffer>> tokenStreams;

public TokenStreams() {
tokenStreams = new ArrayList<>(TOTAL_TOKEN_TYPES);
Expand Down Expand Up @@ -70,9 +69,9 @@ public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final
// Ensure that the size of tokenStream for each type of token = tokenPosition
// by adding an empty ByteBuffer if needed
for (int i = 0; i < TOTAL_TOKEN_TYPES; i++) {
final List<Token> currTokenStream = tokenStreams.get(i);
final List<ByteBuffer> currTokenStream = tokenStreams.get(i);
if (currTokenStream.size() < tokenPosition) {
currTokenStream.add(new Token(ByteBuffer.allocate(0)));
currTokenStream.add(ByteBuffer.allocate(0));
}
if (currTokenStream.size() < tokenPosition) {
throw new CRAMException("TokenStream is missing Token(s) at Token Type: " + i);
Expand All @@ -91,12 +90,12 @@ public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final
}
typeDataByteBuffer.rewind();
typeDataByteBuffer.put(0, (byte) tokenType);
tokenStreams.get(0).add(new Token(typeDataByteBuffer));
tokenStreams.get(0).add(typeDataByteBuffer);
}
if (isDupToken) {
final int dupPosition = inputByteBuffer.get() & 0xFF;
final int dupType = inputByteBuffer.get() & 0xFF;
final Token dupTokenStream = new Token(tokenStreams.get(dupType).get(dupPosition).getByteBuffer().duplicate());
final ByteBuffer dupTokenStream = tokenStreams.get(dupType).get(dupPosition).duplicate();
tokenStreams.get(tokenType).add(tokenPosition,dupTokenStream);
} else {
final int clen = Utils.readUint7(inputByteBuffer);
Expand All @@ -111,16 +110,16 @@ public TokenStreams(final ByteBuffer inputByteBuffer, final int useArith, final
RANSDecode ransdecode = new RANSNx16Decode();
uncompressedDataByteBuffer = ransdecode.uncompress(ByteBuffer.wrap(dataBytes));
}
this.getTokenStreamByType(tokenType).add(tokenPosition,new Token(uncompressedDataByteBuffer));
this.getTokenStreamByType(tokenType).add(tokenPosition,uncompressedDataByteBuffer);
}
}
}

public List<Token> getTokenStreamByType(final int tokenType) {
public List<ByteBuffer> getTokenStreamByType(final int tokenType) {
return tokenStreams.get(tokenType);
}

public ByteBuffer getTokenStreamByteBuffer(final int tokenPosition, final int tokenType) {
return tokenStreams.get(tokenType).get(tokenPosition).getByteBuffer();
return tokenStreams.get(tokenType).get(tokenPosition);
}
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public class NameTokenizationInteropTest extends HtsjdkTest {
public static final String COMPRESSED_TOK_DIR = "tok3";

@DataProvider(name = "allNameTokenizationFiles")
public Object[][] getAllRansCodecsForRoundTrip() throws IOException {
public Object[][] getAllNameTokenizationCodecsForRoundTrip() throws IOException {

// params:
// compressed testfile path, uncompressed testfile path, NameTokenization encoder, NameTokenization decoder
Expand Down Expand Up @@ -123,11 +123,10 @@ public static final String getUncompressedFileName(final String compressedFileNa
// Returns original filename from compressed file name
int lastDotIndex = compressedFileName.lastIndexOf(".");
if (lastDotIndex >= 0) {
String fileName = compressedFileName.substring(0, lastDotIndex);
return fileName;
return compressedFileName.substring(0, lastDotIndex);
} else {
throw new CRAMException("The format of the compressed File Name is not as expected. " +
"The name of the compressed file should contain a perios followed by a number that" +
"The name of the compressed file should contain a period followed by a number that" +
"indicates type of compression. Actual compressed file name = "+ compressedFileName);
}
}
Expand Down

0 comments on commit 7d2d0c5

Please sign in to comment.