Skip to content

Commit

Permalink
Make PVUS#assembleAndSum use SimdOps; optimize SimdOps assembleAndSum
Browse files Browse the repository at this point in the history
  • Loading branch information
michaeljmarshall committed Nov 14, 2024
1 parent 20ed4ac commit bebb4f6
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,7 @@ public VectorFloat<?> sub(VectorFloat<?> a, int aOffset, VectorFloat<?> b, int b

/**
 * Sums the elements of {@code data} selected by {@code baseOffsets}.
 *
 * <p>NOTE(review): this span looks like a diff rendering whose +/- markers were lost, so two
 * implementations appear back to back: a scalar loop ending in {@code return sum;} and a
 * one-line delegation to {@code SimdOps.assembleAndSum}. Read literally, the second
 * {@code return} is unreachable. Presumably the loop is the removed code and the delegation
 * is its replacement -- confirm against the actual file before relying on either.
 *
 * @param data        vector the summed elements are read from
 * @param dataBase    stride multiplied by the position {@code i} within {@code baseOffsets}
 * @param baseOffsets per-position byte offsets, read as unsigned values by the scalar loop
 * @return the sum of the selected elements
 */
@Override
public float assembleAndSum(VectorFloat<?> data, int dataBase, ByteSequence<?> baseOffsets) {
// Scalar variant: accumulate one element per entry of baseOffsets.
float sum = 0f;
for (int i = 0; i < baseOffsets.length(); i++) {
// Index = i-th stride start (dataBase * i) plus the i-th offset widened as an unsigned byte.
sum += data.get(dataBase * i + Byte.toUnsignedInt(baseOffsets.get(i)));
}
return sum;
// SIMD variant: unconditional casts to the Array* implementations, then delegate to SimdOps.
// Presumably get() exposes the backing float[] / byte[] -- TODO confirm. The casts throw
// ClassCastException if a different VectorFloat / ByteSequence implementation is passed in.
return SimdOps.assembleAndSum(((ArrayVectorFloat) data).get(), dataBase, ((ArrayByteSequence) baseOffsets).get());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -525,11 +525,10 @@ static float assembleAndSum512(float[] data, int dataBase, byte[] baseOffsets) {
FloatVector sum = FloatVector.zero(FloatVector.SPECIES_512);
int i = 0;
int limit = ByteVector.SPECIES_128.loopBound(baseOffsets.length);
var scale = IntVector.zero(IntVector.SPECIES_512).addIndex(dataBase);

for (; i < limit; i += ByteVector.SPECIES_128.length()) {
var scale = IntVector.zero(IntVector.SPECIES_512).addIndex(1).add(i).mul(dataBase);

ByteVector.fromArray(ByteVector.SPECIES_128, baseOffsets, i)
ByteVector.fromArray(ByteVector.SPECIES_128, baseOffsets, i * dataBase)
.convertShape(VectorOperators.B2I, IntVector.SPECIES_512, 0)
.lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_512)
.reinterpretAsInts()
Expand All @@ -553,11 +552,11 @@ static float assembleAndSum256(float[] data, int dataBase, byte[] baseOffsets) {
FloatVector sum = FloatVector.zero(FloatVector.SPECIES_256);
int i = 0;
int limit = ByteVector.SPECIES_64.loopBound(baseOffsets.length);
var scale = IntVector.zero(IntVector.SPECIES_256).addIndex(dataBase);

for (; i < limit; i += ByteVector.SPECIES_64.length()) {
var scale = IntVector.zero(IntVector.SPECIES_256).addIndex(1).add(i).mul(dataBase);

ByteVector.fromArray(ByteVector.SPECIES_64, baseOffsets, i)
ByteVector.fromArray(ByteVector.SPECIES_64, baseOffsets, i * dataBase)
.convertShape(VectorOperators.B2I, IntVector.SPECIES_256, 0)
.lanewise(VectorOperators.AND, BYTE_TO_INT_MASK_256)
.reinterpretAsInts()
Expand Down

0 comments on commit bebb4f6

Please sign in to comment.