@@ -1915,8 +1915,12 @@ void CodeGen_LLVM::visit(const Load *op) {
19151915 // Try to rewrite strided loads as shuffles of dense loads,
19161916 // aligned to the stride. This makes adjacent strided loads
19171917 // share the same underlying dense loads.
1918- ModulusRemainder align = op->alignment ;
19191918 Expr base = ramp->base ;
1919+ // The variable align will track the alignment of the
1920+ // base. Every time we change base, we also need to update
1921+ // align.
1922+ ModulusRemainder align = op->alignment ;
1923+
19201924 int aligned_stride = gcd (stride->value , align.modulus );
19211925 int offset = 0 ;
19221926 if (aligned_stride == stride->value ) {
@@ -1930,7 +1934,7 @@ void CodeGen_LLVM::visit(const Load *op) {
19301934
19311935 if (offset) {
19321936 base = simplify (base - offset);
1933- align.remainder -= offset;
1937+ align.remainder = mod_imp (align. remainder - offset, align. modulus ) ;
19341938 }
19351939
19361940 // We want to load a few more bytes than the original load did.
@@ -1947,6 +1951,11 @@ void CodeGen_LLVM::visit(const Load *op) {
19471951
19481952 int slice_lanes = native_vector_bits () / op->type .bits ();
19491953
1954+ // We're going to add multiples of slice_lanes to base in
1955+ // the loop below, so reduce alignment modulo slice_lanes.
1956+ align.modulus = gcd (align.modulus , slice_lanes);
1957+ align.remainder = mod_imp (align.remainder , align.modulus );
1958+
19501959 // We need to slice the result into native vector lanes, otherwise
19511960 // LLVM misses optimizations like using ldN on ARM.
19521961 vector<Value *> results;
@@ -1957,7 +1966,7 @@ void CodeGen_LLVM::visit(const Load *op) {
19571966 Expr slice_base = simplify (base + load_base_i);
19581967
19591968 Value *load_i = codegen_dense_vector_load (op->type .with_lanes (load_lanes_i), op->name , slice_base,
1960- op->image , op->param , op-> alignment , nullptr , false );
1969+ op->image , op->param , align , nullptr , false );
19611970
19621971 SmallVector<Constant *, 256 > constants;
19631972 for (int j = 0 ; j < lanes_i; j++) {
0 commit comments