Skip to content

Commit 8c943eb

Browse files
committed
Make inline array arithmetic use load+extractvalue rather than getelementptr+load to give the optimizer a better hint for vectorization
1 parent 446703b commit 8c943eb

File tree

1 file changed

+42
-7
lines changed

1 file changed

+42
-7
lines changed

src/llvm_backend.cpp

Lines changed: 42 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -6863,20 +6863,46 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r
68636863
lhs = lb_emit_conv(p, lhs, type);
68646864
rhs = lb_emit_conv(p, rhs, type);
68656865

6866-
lbValue x = lb_address_from_load_or_generate_local(p, lhs);
6867-
lbValue y = lb_address_from_load_or_generate_local(p, rhs);
6868-
68696866
GB_ASSERT(is_type_array(type));
68706867
Type *elem_type = base_array_type(type);
68716868

6872-
lbAddr res = lb_add_local_generated(p, type, false);
6873-
68746869
i64 count = base_type(type)->Array.count;
68756870

68766871
bool inline_array_arith = type_size_of(type) <= build_context.max_align;
68776872

68786873
if (inline_array_arith) {
68796874
#if 1
6875+
#if 1
6876+
unsigned n = cast(unsigned)count;
6877+
auto dst_ptrs = array_make<lbValue>(temporary_allocator(), count);
6878+
6879+
auto a_loads = array_make<lbValue>(temporary_allocator(), count);
6880+
auto b_loads = array_make<lbValue>(temporary_allocator(), count);
6881+
auto c_ops = array_make<lbValue>(temporary_allocator(), count);
6882+
6883+
for (unsigned i = 0; i < n; i++) {
6884+
a_loads[i].value = LLVMBuildExtractValue(p->builder, lhs.value, i, "");
6885+
a_loads[i].type = elem_type;
6886+
}
6887+
for (unsigned i = 0; i < n; i++) {
6888+
b_loads[i].value = LLVMBuildExtractValue(p->builder, rhs.value, i, "");
6889+
b_loads[i].type = elem_type;
6890+
}
6891+
for (unsigned i = 0; i < n; i++) {
6892+
c_ops[i] = lb_emit_arith(p, op, a_loads[i], b_loads[i], elem_type);
6893+
}
6894+
6895+
lbAddr res = lb_add_local_generated(p, type, false);
6896+
for (unsigned i = 0; i < n; i++) {
6897+
dst_ptrs[i] = lb_emit_array_epi(p, res.addr, i);
6898+
}
6899+
for (unsigned i = 0; i < n; i++) {
6900+
lb_emit_store(p, dst_ptrs[i], c_ops[i]);
6901+
}
6902+
#else
6903+
lbValue x = lb_address_from_load_or_generate_local(p, lhs);
6904+
lbValue y = lb_address_from_load_or_generate_local(p, rhs);
6905+
68806906
auto a_ptrs = array_make<lbValue>(temporary_allocator(), count);
68816907
auto b_ptrs = array_make<lbValue>(temporary_allocator(), count);
68826908
auto dst_ptrs = array_make<lbValue>(temporary_allocator(), count);
@@ -6901,12 +6927,14 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r
69016927
c_ops[i] = lb_emit_arith(p, op, a_loads[i], b_loads[i], elem_type);
69026928
}
69036929
6930+
lbAddr res = lb_add_local_generated(p, type, false);
69046931
for (i64 i = 0; i < count; i++) {
69056932
dst_ptrs[i] = lb_emit_array_epi(p, res.addr, i);
69066933
}
69076934
for (i64 i = 0; i < count; i++) {
69086935
lb_emit_store(p, dst_ptrs[i], c_ops[i]);
69096936
}
6937+
#endif
69106938
#else
69116939
for (i64 i = 0; i < count; i++) {
69126940
lbValue a_ptr = lb_emit_array_epi(p, x, i);
@@ -6919,7 +6947,14 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r
69196947
lb_emit_store(p, dst_ptr, c);
69206948
}
69216949
#endif
6950+
6951+
return lb_addr_load(p, res);
69226952
} else {
6953+
lbValue x = lb_address_from_load_or_generate_local(p, lhs);
6954+
lbValue y = lb_address_from_load_or_generate_local(p, rhs);
6955+
6956+
lbAddr res = lb_add_local_generated(p, type, false);
6957+
69236958
auto loop_data = lb_loop_start(p, count, t_i32);
69246959

69256960
lbValue a_ptr = lb_emit_array_ep(p, x, loop_data.idx);
@@ -6932,9 +6967,9 @@ lbValue lb_emit_arith_array(lbProcedure *p, TokenKind op, lbValue lhs, lbValue r
69326967
lb_emit_store(p, dst_ptr, c);
69336968

69346969
lb_loop_end(p, loop_data);
6935-
}
69366970

6937-
return lb_addr_load(p, res);
6971+
return lb_addr_load(p, res);
6972+
}
69386973
}
69396974

69406975

0 commit comments

Comments
 (0)