Don't emit aligned loads to unaligned addresses (#6047)

abadams · alexreinking · commit cf16f008d9b9 · 2021-05-20T12:32:35.000-07:00
* Don't emit aligned loads to unaligned addresses. Fixes #6046. (Cherry picked from commit 626c34a.)
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -1915,8 +1915,12 @@ void CodeGen_LLVM::visit(const Load *op) {
             // Try to rewrite strided loads as shuffles of dense loads,
             // aligned to the stride. This makes adjacent strided loads
             // share the same underlying dense loads.
-            ModulusRemainder align = op->alignment;
             Expr base = ramp->base;
+            // The variable align will track the alignment of the
+            // base. Every time we change base, we also need to update
+            // align.
+            ModulusRemainder align = op->alignment;
+
             int aligned_stride = gcd(stride->value, align.modulus);
             int offset = 0;
             if (aligned_stride == stride->value) {
@@ -1930,7 +1934,7 @@ void CodeGen_LLVM::visit(const Load *op) {
 
             if (offset) {
                 base = simplify(base - offset);
-                align.remainder -= offset;
+                align.remainder = mod_imp(align.remainder - offset, align.modulus);
             }
 
             // We want to load a few more bytes than the original load did.
@@ -1947,6 +1951,11 @@ void CodeGen_LLVM::visit(const Load *op) {
 
             int slice_lanes = native_vector_bits() / op->type.bits();
 
+            // We're going to add multiples of slice_lanes to base in
+            // the loop below, so reduce alignment modulo slice_lanes.
+            align.modulus = gcd(align.modulus, slice_lanes);
+            align.remainder = mod_imp(align.remainder, align.modulus);
+
             // We need to slice the result in to native vector lanes, otherwise
             // LLVM misses optimizations like using ldN on ARM.
             vector<Value *> results;
@@ -1957,7 +1966,7 @@ void CodeGen_LLVM::visit(const Load *op) {
                 Expr slice_base = simplify(base + load_base_i);
 
                 Value *load_i = codegen_dense_vector_load(op->type.with_lanes(load_lanes_i), op->name, slice_base,
-                                                          op->image, op->param, op->alignment, nullptr, false);
+                                                          op->image, op->param, align, nullptr, false);
 
                 SmallVector<Constant *, 256> constants;
                 for (int j = 0; j < lanes_i; j++) {
diff --git a/test/correctness/align_bounds.cpp b/test/correctness/align_bounds.cpp
@@ -197,6 +197,29 @@ int main(int argc, char **argv) {
         }
     }
 
+    // Try a case where aligning a buffer means that strided loads can
+    // do dense aligned loads and then shuffle. This used to trigger a
+    // bug in codegen.
+    {
+        Func f, g;
+        Var x;
+
+        f(x) = x;
+
+        // Do strided loads of every possible alignment
+        Expr e = 0;
+        for (int i = -32; i <= 32; i++) {
+            e += f(3 * x + i);
+        }
+        g(x) = e;
+
+        f.compute_root();
+        g.bound(x, 0, 1024).vectorize(x, 16, TailStrategy::RoundUp);
+
+        // Just check if it crashes
+        g.realize({1024});
+    }
+
     printf("Success!\n");
     return 0;
 }