Skip to content

Commit 4369b89

Browse files
ZuseZ4 and wsmoses authored
start fixing gemm (#1448)
* start fixing gemm
* update rule for gemm
* start fixing gemm
* update rule for gemm
* Fix concat
* fix
* Gemm passes
* fix constant 1
* Adding runtime activity tests
* fix attributore
* return early if output arg is runtime inactive
* make runtime activity more aggressive
* simplify generated IR
* update some tests
* update some tests
* fix minor bugs
* fix sdot
* gemf
* update some tests
* f c tp lacpy
* update some tests
* remove test
* byref over
* update some tests
* fix test

---------

Co-authored-by: William S. Moses <[email protected]>
1 parent eb09439 commit 4369b89

26 files changed, with 2330 additions and 1846 deletions.

enzyme/Enzyme/BlasDerivatives.td

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ def tp : MagicInst; // transpose the trans param.
5757
def noop : MagicInst; // gradient is zero
5858
def inactive : MagicInst; // like noop, but assert it's inactive
5959
def Rows : MagicInst; // given a transpose, normal rows, normal cols get the true rows, aka normal rows if N else normal cols
60+
def Concat : MagicInst;
6061

6162
// if !cache_A, then just use $lda.
6263
// if cache_A, then check $transa.
@@ -192,9 +193,7 @@ def gemv : CallBlasPattern<(Op $layout, $transa, $m, $n, $alpha, $A, $lda, $x, $
192193
//} else {
193194
// call sger(m, n, alpha, x, incx, ya, incy, Aa, lda)
194195
//}
195-
/* A */ (b<"ger"> $layout, $m, $n, $alpha, (Rows $transa, adj<"y">, $x),
196-
(Rows $transa, $x, adj<"y">),
197-
adj<"A">),
196+
/* A */ (b<"ger"> $layout, $m, $n, $alpha, (Rows $transa, (Concat adj<"y">, $x), (Concat $x, adj<"y">)), adj<"A">),
198197
/* x */ (b<"gemv"> $layout, transpose<"transa">, $m, $n, $alpha, $A, (ld $A, Char<"N">, $lda, $m, $n), adj<"y">, Constant<"1.0">, adj<"x">),
199198
/* beta */ (b<"dot"> (Rows $transa, $m, $n), adj<"y">, input<"y">),
200199
/* y */ (b<"scal"> (Rows $transa, $m, $n), $beta, adj<"y">)
@@ -225,10 +224,25 @@ def gemm : CallBlasPattern<(Op $layout, $transa, $transb, $m, $n, $k, $alpha, $A
225224
/* alpha */ (Seq<["AB", "product", "m", "n"]>
226225
(b<"gemm"> $layout, $transa, $transb, $m, $n, $k, Constant<"1.0">, $A, (ld $A, $transa, $lda, $m, $k), $B, (ld $B, $transb, $ldb, $k, $n), Constant<"0.0">, use<"AB">, $m),// TODO: check if last arg should be $m or $n
227226
(FrobInnerProd<""> $m, $n, adj<"C">, use<"AB">)),
228-
/* A */ (b<"gemm"> $layout, $transa, transpose<"transb">, $m, $k, $n, $alpha, adj<"C">, $B, (ld $B, $transb, $ldb, $k, $n), $beta, adj<"A">),
229-
/* B */ (b<"gemm"> $layout, transpose<"transa">, $transb, $k, $n, $m, $alpha, $A, (ld $A, $transa, $lda, $m, $k), adj<"C">, $beta, adj<"B">),
227+
/* A */ (b<"gemm"> $layout, (Rows $transa,
228+
(Concat Char<"N">, transpose<"transb">, $m, $k),
229+
(Concat $transb, Char<"T">, $k, $m)),
230+
$n, $alpha,
231+
(Rows $transa,
232+
(Concat adj<"C">, $B, (ld $B, $transb, $ldb, $k, $n)),
233+
(Concat $B, (ld $B, $transb, $ldb, $k, $n), adj<"C">)),
234+
Constant<"1.0">, adj<"A">),
235+
236+
/* B */ (b<"gemm"> $layout, (Rows $transb,
237+
(Concat transpose<"transa">, Char<"N">, $k, $n),
238+
(Concat Char<"T">, $transa, $n, $k)),
239+
$m, $alpha,
240+
(Rows $transb,
241+
(Concat $A, (ld $A, $transa, $lda, $m, $k), adj<"C">),
242+
(Concat adj<"C">, $A, (ld $A, $transa, $lda, $m, $k))),
243+
Constant<"1.0">, adj<"B">),
230244
/* beta */ (FrobInnerProd<""> $m, $n, adj<"C">, input<"C">),
231-
/* C */ (b<"lascl"> $layout, Char<"G">, ConstantInt<0>, ConstantInt<0>, Constant<"1.0">, $beta, $m, $n, adj<"C">, ConstantInt<0>)
245+
/* C */ (b<"lascl"> $layout, Char<"G">, ConstantInt<0>, ConstantInt<0>, Constant<"1.0">, $beta, $m, $n, adj<"C">)
232246
]
233247
>;
234248

enzyme/Enzyme/Utils.h

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1643,15 +1643,18 @@ llvm::Value *get_cached_mat_width(llvm::IRBuilder<> &B,
16431643
llvm::Value *dim_2, bool cacheMat,
16441644
bool byRef);
16451645

1646-
template <typename... T> static inline void nothing(T...){};
1646+
template <typename T>
1647+
static inline void append(llvm::SmallVectorImpl<T> &vec) {}
1648+
template <typename T, typename... T2>
1649+
static inline void append(llvm::SmallVectorImpl<T> &vec, llvm::ArrayRef<T> vals,
1650+
T2 &&...ts) {
1651+
vec.append(vals.begin(), vals.end());
1652+
append(vec, std::forward<T2>(ts)...);
1653+
}
16471654
template <typename... T>
1648-
static inline llvm::SmallVector<llvm::Value *, 1> concat_values(T... t) {
1655+
static inline llvm::SmallVector<llvm::Value *, 1> concat_values(T &&...t) {
16491656
llvm::SmallVector<llvm::Value *, 1> res;
1650-
auto append = [&](llvm::ArrayRef<llvm::Value *> V) {
1651-
res.append(V.begin(), V.end());
1652-
return 0;
1653-
};
1654-
nothing(append(t)...);
1657+
append(res, std::forward<T>(t)...);
16551658
return res;
16561659
}
16571660

enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_runtime_act.ll

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -87,16 +87,16 @@ entry:
8787

8888
; CHECK: define internal void @[[active]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn)
8989
; CHECK-NEXT: entry:
90-
; CHECK-NEXT: %rt.inactive.x = icmp eq float* %"m'", %m
91-
; CHECK-NEXT: %rt.inactive.y = icmp eq float* %"n'", %n
92-
; CHECK-NEXT: br i1 %rt.inactive.x, label %invertentry.x.done, label %invertentry.x.active
90+
; CHECK-NEXT: %[[rtinactivex:.+]] = icmp eq float* %"m'", %m
91+
; CHECK-NEXT: %[[rtinactivey:.+]] = icmp eq float* %"n'", %n
92+
; CHECK-NEXT: br i1 %[[rtinactivex]], label %invertentry.x.done, label %invertentry.x.active
9393

9494
; CHECK: invertentry.x.active: ; preds = %entry
9595
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %n, i32 %incn, float* %"m'", i32 %incm)
9696
; CHECK-NEXT: br label %invertentry.x.done
9797

9898
; CHECK: invertentry.x.done: ; preds = %invertentry.x.active, %entry
99-
; CHECK-NEXT: br i1 %rt.inactive.y, label %invertentry.y.done, label %invertentry.y.active
99+
; CHECK-NEXT: br i1 %[[rtinactivey]], label %invertentry.y.done, label %invertentry.y.active
100100

101101
; CHECK: invertentry.y.active: ; preds = %invertentry.x.done
102102
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %m, i32 %incm, float* %"n'", i32 %incn)
@@ -108,8 +108,8 @@ entry:
108108

109109
; CHECK: define internal void @[[inactiveFirst]](i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn)
110110
; CHECK-NEXT: entry:
111-
; CHECK-NEXT: %rt.inactive.y = icmp eq float* %"n'", %n
112-
; CHECK-NEXT: br i1 %rt.inactive.y, label %invertentry.y.done, label %invertentry.y.active
111+
; CHECK-NEXT: %[[rtinactivey:.+]] = icmp eq float* %"n'", %n
112+
; CHECK-NEXT: br i1 %[[rtinactivey]], label %invertentry.y.done, label %invertentry.y.active
113113

114114
; CHECK: invertentry.y.active: ; preds = %entry
115115
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %m, i32 %incm, float* %"n'", i32 %incn)
@@ -121,8 +121,8 @@ entry:
121121

122122
; CHECK: define internal void @[[inactiveSecond]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, i32 %incn, float %differeturn)
123123
; CHECK-NEXT: entry:
124-
; CHECK-NEXT: %rt.inactive.x = icmp eq float* %"m'", %m
125-
; CHECK-NEXT: br i1 %rt.inactive.x, label %invertentry.x.done, label %invertentry.x.active
124+
; CHECK-NEXT: %[[rtinactivex:.+]] = icmp eq float* %"m'", %m
125+
; CHECK-NEXT: br i1 %[[rtinactivex]], label %invertentry.x.done, label %invertentry.x.active
126126

127127
; CHECK: invertentry.x.active: ; preds = %entry
128128
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %n, i32 %incn, float* %"m'", i32 %incm)
@@ -156,18 +156,18 @@ entry:
156156

157157
; CHECK: define internal void @[[revMod]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn, { float*, float* }
158158
; CHECK-NEXT: entry:
159-
; CHECK-NEXT: %rt.inactive.x = icmp eq float* %"m'", %m
160-
; CHECK-NEXT: %rt.inactive.y = icmp eq float* %"n'", %n
159+
; CHECK-NEXT: %[[rtinactivex:.+]] = icmp eq float* %"m'", %m
160+
; CHECK-NEXT: %[[rtinactivey:.+]] = icmp eq float* %"n'", %n
161161
; CHECK-NEXT: %tape.ext.x = extractvalue { float*, float* } %0, 0
162162
; CHECK-NEXT: %tape.ext.y = extractvalue { float*, float* } %0, 1
163-
; CHECK-NEXT: br i1 %rt.inactive.x, label %invertentry.x.done, label %invertentry.x.active
163+
; CHECK-NEXT: br i1 %[[rtinactivex]], label %invertentry.x.done, label %invertentry.x.active
164164

165165
; CHECK: invertentry.x.active: ; preds = %entry
166166
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %tape.ext.y, i32 1, float* %"m'", i32 %incm)
167167
; CHECK-NEXT: br label %invertentry.x.done
168168

169169
; CHECK: invertentry.x.done: ; preds = %invertentry.x.active, %entry
170-
; CHECK-NEXT: br i1 %rt.inactive.y, label %invertentry.y.done, label %invertentry.y.active
170+
; CHECK-NEXT: br i1 %[[rtinactivey]], label %invertentry.y.done, label %invertentry.y.active
171171

172172
; CHECK: invertentry.y.active: ; preds = %invertentry.x.done
173173
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %tape.ext.x, i32 1, float* %"n'", i32 %incn)
@@ -199,8 +199,8 @@ entry:
199199

200200
; CHECK: define internal void @[[revModFirst]](i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn, float*
201201
; CHECK-NEXT: entry:
202-
; CHECK-NEXT: %rt.inactive.y = icmp eq float* %"n'", %n
203-
; CHECK-NEXT: br i1 %rt.inactive.y, label %invertentry.y.done, label %invertentry.y.active
202+
; CHECK-NEXT: %[[rtinactivey:.+]] = icmp eq float* %"n'", %n
203+
; CHECK-NEXT: br i1 %[[rtinactivey]], label %invertentry.y.done, label %invertentry.y.active
204204

205205
; CHECK: invertentry.y.active: ; preds = %entry
206206
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %0, i32 1, float* %"n'", i32 %incn)
@@ -231,8 +231,8 @@ entry:
231231

232232
; CHECK: define internal void @[[revModSecond]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, i32 %incn, float %differeturn, float*
233233
; CHECK-NEXT: entry:
234-
; CHECK-NEXT: %rt.inactive.x = icmp eq float* %"m'", %m
235-
; CHECK-NEXT: br i1 %rt.inactive.x, label %invertentry.x.done, label %invertentry.x.active
234+
; CHECK-NEXT: %[[rtinactivex:.+]] = icmp eq float* %"m'", %m
235+
; CHECK-NEXT: br i1 %[[rtinactivex]], label %invertentry.x.done, label %invertentry.x.active
236236

237237
; CHECK: invertentry.x.active: ; preds = %entry
238238
; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %0, i32 1, float* %"m'", i32 %incm)

enzyme/test/Enzyme/ReverseMode/blas/gemm_f.ll

Lines changed: 78 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -47,33 +47,38 @@ entry:
4747

4848
; CHECK: define internal void @diffef(i8* %C, i8* %"C'", i8* %A, i8* %"A'", i8* %B, i8* %"B'")
4949
; CHECK-NEXT: entry:
50-
; CHECK-NEXT: %ret = alloca double
51-
; CHECK-NEXT: %byref.transpose.transa = alloca i8
52-
; CHECK-NEXT: %byref.transpose.transb = alloca i8
53-
; CHECK-NEXT: %byref.int.one = alloca i64
54-
; CHECK-NEXT: %byref.constant.char.G = alloca i8
55-
; CHECK-NEXT: %byref.constant.int.0 = alloca i64
56-
; CHECK-NEXT: %[[byrefconstantint1:.+]] = alloca i64
57-
; CHECK-NEXT: %byref.constant.fp.1.0 = alloca double
58-
; CHECK-NEXT: %[[byrefconstantint2:.+]] = alloca i64
59-
; CHECK-NEXT: %transa = alloca i8, align 1
60-
; CHECK-NEXT: %transb = alloca i8, align 1
61-
; CHECK-NEXT: %m = alloca i64, align 16
62-
; CHECK-NEXT: %m_p = bitcast i64* %m to i8*
63-
; CHECK-NEXT: %n = alloca i64, align 16
64-
; CHECK-NEXT: %n_p = bitcast i64* %n to i8*
65-
; CHECK-NEXT: %k = alloca i64, align 16
66-
; CHECK-NEXT: %k_p = bitcast i64* %k to i8*
67-
; CHECK-NEXT: %alpha = alloca double, align 16
68-
; CHECK-NEXT: %alpha_p = bitcast double* %alpha to i8*
69-
; CHECK-NEXT: %lda = alloca i64, align 16
70-
; CHECK-NEXT: %lda_p = bitcast i64* %lda to i8*
71-
; CHECK-NEXT: %ldb = alloca i64, align 16
72-
; CHECK-NEXT: %ldb_p = bitcast i64* %ldb to i8*
73-
; CHECK-NEXT: %beta = alloca double, align 16
74-
; CHECK-NEXT: %beta_p = bitcast double* %beta to i8*
75-
; CHECK-NEXT: %ldc = alloca i64, align 16
76-
; CHECK-NEXT: %ldc_p = bitcast i64* %ldc to i8*
50+
; CHECK-DAG: %ret = alloca double
51+
; CHECK-DAG: %byref.transpose.transa = alloca i8
52+
; CHECK-DAG: %byref.transpose.transb = alloca i8
53+
; CHECK-DAG: %byref.int.one = alloca i64
54+
; CHECK-DAG: %byref.constant.char.T = alloca i8, align 1
55+
; CHECK-DAG: %byref.constant.char.N = alloca i8, align 1
56+
; CHECK-DAG: %byref.constant.fp.1.0 = alloca double
57+
; CHECK-DAG: %byref.constant.char.T2 = alloca i8, align 1
58+
; CHECK-DAG: %byref.constant.char.N3 = alloca i8, align 1
59+
; CHECK-DAG: %byref.constant.fp.1.06 = alloca double
60+
; CHECK-DAG: %byref.constant.char.G = alloca i8
61+
; CHECK-DAG: %byref.constant.int.0 = alloca i64
62+
; CHECK-DAG: %[[byrefconstantint1:.+]] = alloca i64
63+
; CHECK-DAG: %byref.constant.fp.1.010 = alloca double
64+
; CHECK-DAG: %transa = alloca i8, align 1
65+
; CHECK-DAG: %transb = alloca i8, align 1
66+
; CHECK-DAG: %m = alloca i64, align 16
67+
; CHECK-DAG: %m_p = bitcast i64* %m to i8*
68+
; CHECK-DAG: %n = alloca i64, align 16
69+
; CHECK-DAG: %n_p = bitcast i64* %n to i8*
70+
; CHECK-DAG: %k = alloca i64, align 16
71+
; CHECK-DAG: %k_p = bitcast i64* %k to i8*
72+
; CHECK-DAG: %alpha = alloca double, align 16
73+
; CHECK-DAG: %alpha_p = bitcast double* %alpha to i8*
74+
; CHECK-DAG: %lda = alloca i64, align 16
75+
; CHECK-DAG: %lda_p = bitcast i64* %lda to i8*
76+
; CHECK-DAG: %ldb = alloca i64, align 16
77+
; CHECK-DAG: %ldb_p = bitcast i64* %ldb to i8*
78+
; CHECK-DAG: %beta = alloca double, align 16
79+
; CHECK-DAG: %beta_p = bitcast double* %beta to i8*
80+
; CHECK-DAG: %ldc = alloca i64, align 16
81+
; CHECK-DAG: %ldc_p = bitcast i64* %ldc to i8*
7782
; CHECK-NEXT: store i8 78, i8* %transa, align 1
7883
; CHECK-NEXT: store i8 78, i8* %transb, align 1
7984
; CHECK-NEXT: store i64 4, i64* %m, align 16
@@ -110,17 +115,56 @@ entry:
110115
; CHECK-NEXT: store i8 %[[i25]], i8* %byref.transpose.transb
111116
; CHECK-NEXT: store i64 1, i64* %byref.int.one
112117
; CHECK-NEXT: %intcast.int.one = bitcast i64* %byref.int.one to i8*
113-
; CHECK-NEXT: call void @dgemm_64_(i8* %transa, i8* %byref.transpose.transb, i8* %m_p, i8* %k_p, i8* %n_p, i8* %alpha_p, i8* %"C'", i8* %ldc_p, i8* %B, i8* %ldb_p, i8* %beta_p, i8* %"A'", i8* %lda_p, i64 1, i64 1)
114-
; CHECK-NEXT: call void @dgemm_64_(i8* %byref.transpose.transa, i8* %transb, i8* %k_p, i8* %n_p, i8* %m_p, i8* %alpha_p, i8* %A, i8* %lda_p, i8* %"C'", i8* %ldc_p, i8* %beta_p, i8* %"B'", i8* %ldb_p, i64 1, i64 1)
118+
119+
; CHECK-NEXT: store i8 84, i8* %byref.constant.char.T, align 1
120+
; CHECK-NEXT: store i8 78, i8* %byref.constant.char.N, align 1
121+
; CHECK-NEXT: %ld.row.trans = load i8, i8* %transa, align 1
122+
; CHECK-NEXT: %[[a16:.+]] = icmp eq i8 %ld.row.trans, 110
123+
; CHECK-NEXT: %[[a17:.+]] = icmp eq i8 %ld.row.trans, 78
124+
; CHECK-NEXT: %[[a18:.+]] = or i1 %[[a17]], %[[a16]]
125+
; CHECK-NEXT: %[[a19:.+]] = select i1 %[[a18]], i8* %byref.constant.char.N, i8* %transb
126+
; CHECK-NEXT: %[[a20:.+]] = select i1 %[[a18]], i8* %byref.transpose.transb, i8* %byref.constant.char.T
127+
; CHECK-NEXT: %[[a21:.+]] = select i1 %[[a18]], i8* %m_p, i8* %k_p
128+
; CHECK-NEXT: %[[a22:.+]] = select i1 %[[a18]], i8* %k_p, i8* %m_p
129+
; CHECK-NEXT: %ld.row.trans1 = load i8, i8* %transa, align 1
130+
; CHECK-NEXT: %[[a23:.+]] = icmp eq i8 %ld.row.trans1, 110
131+
; CHECK-NEXT: %[[a24:.+]] = icmp eq i8 %ld.row.trans1, 78
132+
; CHECK-NEXT: %[[a25:.+]] = or i1 %[[a24]], %[[a23]]
133+
; CHECK-NEXT: %[[a26:.+]] = select i1 %[[a25]], i8* %"C'", i8* %B
134+
; CHECK-NEXT: %[[a27:.+]] = select i1 %[[a25]], i8* %ldc_p, i8* %ldb_p
135+
; CHECK-NEXT: %[[a28:.+]] = select i1 %[[a25]], i8* %B, i8* %"C'"
136+
; CHECK-NEXT: %[[a29:.+]] = select i1 %[[a25]], i8* %ldb_p, i8* %ldc_p
137+
; CHECK-NEXT: store double 1.000000e+00, double* %byref.constant.fp.1.0, align 8
138+
; CHECK-NEXT: %fpcast.constant.fp.1.0 = bitcast double* %byref.constant.fp.1.0 to i8*
139+
; CHECK-NEXT: call void @dgemm_64_(i8* %[[a19]], i8* %[[a20]], i8* %[[a21]], i8* %[[a22]], i8* %n_p, i8* %alpha_p, i8* %[[a26]], i8* %[[a27]], i8* %[[a28]], i8* %[[a29]], i8* %fpcast.constant.fp.1.0, i8* %"A'", i8* %lda_p, i64 1, i64 1)
140+
; CHECK-NEXT: store i8 84, i8* %byref.constant.char.T2, align 1
141+
; CHECK-NEXT: store i8 78, i8* %byref.constant.char.N3, align 1
142+
; CHECK-NEXT: %ld.row.trans4 = load i8, i8* %transb, align 1
143+
; CHECK-NEXT: %[[a30:.+]] = icmp eq i8 %ld.row.trans4, 110
144+
; CHECK-NEXT: %[[a31:.+]] = icmp eq i8 %ld.row.trans4, 78
145+
; CHECK-NEXT: %[[a32:.+]] = or i1 %[[a31]], %[[a30]]
146+
; CHECK-NEXT: %[[a33:.+]] = select i1 %[[a32]], i8* %byref.transpose.transa, i8* %byref.constant.char.T2
147+
; CHECK-NEXT: %[[a34:.+]] = select i1 %[[a32]], i8* %byref.constant.char.N3, i8* %transa
148+
; CHECK-NEXT: %[[a35:.+]] = select i1 %[[a32]], i8* %k_p, i8* %n_p
149+
; CHECK-NEXT: %[[a36:.+]] = select i1 %[[a32]], i8* %n_p, i8* %k_p
150+
; CHECK-NEXT: %ld.row.trans5 = load i8, i8* %transb, align 1
151+
; CHECK-NEXT: %[[a37:.+]] = icmp eq i8 %ld.row.trans5, 110
152+
; CHECK-NEXT: %[[a38:.+]] = icmp eq i8 %ld.row.trans5, 78
153+
; CHECK-NEXT: %[[a39:.+]] = or i1 %[[a38]], %[[a37]]
154+
; CHECK-NEXT: %[[a40:.+]] = select i1 %[[a39]], i8* %A, i8* %"C'"
155+
; CHECK-NEXT: %[[a41:.+]] = select i1 %[[a39]], i8* %lda_p, i8* %ldc_p
156+
; CHECK-NEXT: %[[a42:.+]] = select i1 %[[a39]], i8* %"C'", i8* %A
157+
; CHECK-NEXT: %[[a43:.+]] = select i1 %[[a39]], i8* %ldc_p, i8* %lda_p
158+
; CHECK-NEXT: store double 1.000000e+00, double* %byref.constant.fp.1.06, align 8
159+
; CHECK-NEXT: %fpcast.constant.fp.1.07 = bitcast double* %byref.constant.fp.1.06 to i8*
160+
; CHECK-NEXT: call void @dgemm_64_(i8* %[[a33]], i8* %[[a34]], i8* %[[a35]], i8* %[[a36]], i8* %m_p, i8* %alpha_p, i8* %[[a40]], i8* %[[a41]], i8* %[[a42]], i8* %[[a43]], i8* %fpcast.constant.fp.1.07, i8* %"B'", i8* %ldb_p, i64 1, i64 1)
115161
; CHECK-NEXT: store i8 71, i8* %byref.constant.char.G
116162
; CHECK-NEXT: store i64 0, i64* %byref.constant.int.0
117163
; CHECK-NEXT: %intcast.constant.int.0 = bitcast i64* %byref.constant.int.0 to i8*
118164
; CHECK-NEXT: store i64 0, i64* %[[byrefconstantint1]]
119-
; CHECK-NEXT: %intcast.constant.int.02 = bitcast i64* %byref.constant.int.01 to i8*
165+
; CHECK-NEXT: %[[int02:.+]] = bitcast i64* %[[byrefconstantint1]] to i8*
120166
; CHECK-NEXT: store double 1.000000e+00, double* %byref.constant.fp.1.0
121-
; CHECK-NEXT: %fpcast.constant.fp.1.0 = bitcast double* %byref.constant.fp.1.0 to i8*
122-
; CHECK-NEXT: store i64 0, i64* %[[byrefconstantint2]]
123-
; CHECK-NEXT: %intcast.constant.int.04 = bitcast i64* %byref.constant.int.03 to i8*
124-
; CHECK-NEXT: call void @dlascl_64_(i8* %byref.constant.char.G, i8* %intcast.constant.int.0, i8* %intcast.constant.int.02, i8* %fpcast.constant.fp.1.0, i8* %beta_p, i8* %m_p, i8* %n_p, i8* %"C'", i8* %ldc_p, i8* %intcast.constant.int.04)
167+
; CHECK-NEXT: %[[fp11:.+]] = bitcast double* %byref.constant.fp.1.010 to i8*
168+
; CHECK-NEXT: call void @dlascl_64_(i8* %byref.constant.char.G, i8* %intcast.constant.int.0, i8* %[[int02]], i8* %[[fp11]], i8* %beta_p, i8* %m_p, i8* %n_p, i8* %"C'", i8* %ldc_p, i64 1)
125169
; CHECK-NEXT: ret void
126170
; CHECK-NEXT: }

0 commit comments

Comments
 (0)