Fix invalid pointer casts in arm64 implementation (#65)

davidben · web-flow · commit d3d60ac6e0f1 · 2024-09-06T15:50:42.000-07:00
* Fix invalid pointer casts in arm64 implementation It is UB in C and C++ to type-pun pointers in this way, for two reasons. First, casting to a pointer is forbidden if the pointer is not aligned. Second, this kind of type-punning is a strict aliasing violation. Instead, the way to read 8 bytes as a time as a uint64_t is to call memcpy. Compilers recognize this pattern and optimize it, lowering it to the same code, but without breaking the language's abstract state machine. This is needed to fix some UBSan warnings in Chromium. * Use crc32c_read_le.h instead In doing so, I fixed up crc32c_read_le.h a bit: 1. The documentation and tests say the buffers need to be aligned, but they don't. 2. Add a 16-bit version. 3. Remove an unnecessary static_cast<uint8_t>. The pointer is already uint8_t. 4. Replace the necessary static_casts with uintN_t{x} per the Google style guide. https://google.github.io/styleguide/cppguide.html#Casting
diff --git a/src/crc32c_arm64.cc b/src/crc32c_arm64.cc
@@ -12,8 +12,10 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 
 #include "./crc32c_internal.h"
+#include "./crc32c_read_le.h"
 #include "crc32c/crc32c_config.h"
 
 #if HAVE_ARM64_CRC32C
@@ -25,16 +27,12 @@
 #define SEGMENTBYTES 256
 
 // compute 8bytes for each segment parallelly
-#define CRC32C32BYTES(P, IND)                                             \
-  do {                                                                    \
-    crc1 = __crc32cd(                                                     \
-        crc1, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 1 + (IND))); \
-    crc2 = __crc32cd(                                                     \
-        crc2, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 2 + (IND))); \
-    crc3 = __crc32cd(                                                     \
-        crc3, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 3 + (IND))); \
-    crc0 = __crc32cd(                                                     \
-        crc0, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 0 + (IND))); \
+#define CRC32C32BYTES(P, IND)                                               \
+  do {                                                                      \
+    crc1 = __crc32cd(crc1, ReadUint64LE((P) + SEGMENTBYTES * 1 + (IND)*8)); \
+    crc2 = __crc32cd(crc2, ReadUint64LE((P) + SEGMENTBYTES * 2 + (IND)*8)); \
+    crc3 = __crc32cd(crc3, ReadUint64LE((P) + SEGMENTBYTES * 3 + (IND)*8)); \
+    crc0 = __crc32cd(crc0, ReadUint64LE((P) + SEGMENTBYTES * 0 + (IND)*8)); \
   } while (0);
 
 // compute 8*8 bytes for each segment parallelly
@@ -86,7 +84,7 @@ uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) {
     t2 = (uint64_t)vmull_p64(crc2, k2);
     t1 = (uint64_t)vmull_p64(crc1, k1);
     t0 = (uint64_t)vmull_p64(crc0, k0);
-    crc = __crc32cd(crc3, *(uint64_t *)data);
+    crc = __crc32cd(crc3, ReadUint64LE(data));
     data += sizeof(uint64_t);
     crc ^= __crc32cd(0, t2);
     crc ^= __crc32cd(0, t1);
@@ -96,18 +94,18 @@ uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) {
   }
 
   while (length >= 8) {
-    crc = __crc32cd(crc, *(uint64_t *)data);
+    crc = __crc32cd(crc, ReadUint64LE(data));
     data += 8;
     length -= 8;
   }
 
   if (length & 4) {
-    crc = __crc32cw(crc, *(uint32_t *)data);
+    crc = __crc32cw(crc, ReadUint32LE(data));
     data += 4;
   }
 
   if (length & 2) {
-    crc = __crc32ch(crc, *(uint16_t *)data);
+    crc = __crc32ch(crc, ReadUint16LE(data));
     data += 2;
   }
 
diff --git a/src/crc32c_read_le.h b/src/crc32c_read_le.h
@@ -12,13 +12,23 @@
 
 namespace crc32c {
 
-// Reads a little-endian 32-bit integer from a 32-bit-aligned buffer.
+// Reads a little-endian 16-bit integer from bytes, not necessarily aligned.
+inline uint16_t ReadUint16LE(const uint8_t* buffer) {
+#if BYTE_ORDER_BIG_ENDIAN
+  return ((uint16_t{buffer[0]}) | (uint16_t{buffer[1]} << 8));
+#else   // !BYTE_ORDER_BIG_ENDIAN
+  uint16_t result;
+  // This should be optimized to a single instruction.
+  std::memcpy(&result, buffer, sizeof(result));
+  return result;
+#endif  // BYTE_ORDER_BIG_ENDIAN
+}
+
+// Reads a little-endian 32-bit integer from bytes, not necessarily aligned.
 inline uint32_t ReadUint32LE(const uint8_t* buffer) {
 #if BYTE_ORDER_BIG_ENDIAN
-  return ((static_cast<uint32_t>(static_cast<uint8_t>(buffer[0]))) |
-          (static_cast<uint32_t>(static_cast<uint8_t>(buffer[1])) << 8) |
-          (static_cast<uint32_t>(static_cast<uint8_t>(buffer[2])) << 16) |
-          (static_cast<uint32_t>(static_cast<uint8_t>(buffer[3])) << 24));
+  return ((uint32_t{buffer[0]}) | (uint32_t{buffer[1]} << 8) |
+          (uint32_t{buffer[2]} << 16) | (uint32_t{buffer[3]} << 24));
 #else   // !BYTE_ORDER_BIG_ENDIAN
   uint32_t result;
   // This should be optimized to a single instruction.
@@ -27,17 +37,13 @@ inline uint32_t ReadUint32LE(const uint8_t* buffer) {
 #endif  // BYTE_ORDER_BIG_ENDIAN
 }
 
-// Reads a little-endian 64-bit integer from a 64-bit-aligned buffer.
+// Reads a little-endian 64-bit integer from bytes, not necessarily aligned.
 inline uint64_t ReadUint64LE(const uint8_t* buffer) {
 #if BYTE_ORDER_BIG_ENDIAN
-  return ((static_cast<uint64_t>(static_cast<uint8_t>(buffer[0]))) |
-          (static_cast<uint64_t>(static_cast<uint8_t>(buffer[1])) << 8) |
-          (static_cast<uint64_t>(static_cast<uint8_t>(buffer[2])) << 16) |
-          (static_cast<uint64_t>(static_cast<uint8_t>(buffer[3])) << 24) |
-          (static_cast<uint64_t>(static_cast<uint8_t>(buffer[4])) << 32) |
-          (static_cast<uint64_t>(static_cast<uint8_t>(buffer[5])) << 40) |
-          (static_cast<uint64_t>(static_cast<uint8_t>(buffer[6])) << 48) |
-          (static_cast<uint64_t>(static_cast<uint8_t>(buffer[7])) << 56));
+  return ((uint64_t{buffer[0]}) | (uint64_t{buffer[1]} << 8) |
+          (uint64_t{buffer[2]} << 16) | (uint64_t{buffer[3]} << 24) |
+          (uint64_t{buffer[4]} << 32) | (uint64_t{buffer[5]} << 40) |
+          (uint64_t{buffer[6]} << 48) | (uint64_t{buffer[7]} << 56));
 #else   // !BYTE_ORDER_BIG_ENDIAN
   uint64_t result;
   // This should be optimized to a single instruction.
diff --git a/src/crc32c_read_le_unittest.cc b/src/crc32c_read_le_unittest.cc
@@ -13,20 +13,25 @@
 
 namespace crc32c {
 
+TEST(Crc32CReadLETest, ReadUint16LE) {
+  // little-endian 0x1234
+  uint8_t bytes[] = {0x34, 0x12};
+
+  EXPECT_EQ(uint16_t{0x1234}, ReadUint16LE(bytes));
+}
+
 TEST(Crc32CReadLETest, ReadUint32LE) {
   // little-endian 0x12345678
-  alignas(4) uint8_t bytes[] = {0x78, 0x56, 0x34, 0x12};
+  uint8_t bytes[] = {0x78, 0x56, 0x34, 0x12};
 
-  ASSERT_EQ(RoundUp<4>(bytes), bytes) << "Stack array is not aligned";
-  EXPECT_EQ(static_cast<uint32_t>(0x12345678), ReadUint32LE(bytes));
+  EXPECT_EQ(uint32_t{0x12345678}, ReadUint32LE(bytes));
 }
 
 TEST(Crc32CReadLETest, ReadUint64LE) {
   // little-endian 0x123456789ABCDEF0
-  alignas(8) uint8_t bytes[] = {0xF0, 0xDE, 0xBC, 0x9A, 0x78, 0x56, 0x34, 0x12};
+  uint8_t bytes[] = {0xF0, 0xDE, 0xBC, 0x9A, 0x78, 0x56, 0x34, 0x12};
 
-  ASSERT_EQ(RoundUp<8>(bytes), bytes) << "Stack array is not aligned";
-  EXPECT_EQ(static_cast<uint64_t>(0x123456789ABCDEF0), ReadUint64LE(bytes));
+  EXPECT_EQ(uint64_t{0x123456789ABCDEF0}, ReadUint64LE(bytes));
 }
 
 }  // namespace crc32c