1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI %s
3 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7 ; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.
; Two i32 arguments are sign-extended to i64, multiplied, and the i64 %arg2 is
; added (product on the left of the add). The expected code is a single
; v_mad_i64_i32 for hawaii, gfx90a and gfx1100; tahiti is expected to use a
; v_mul_lo_u32 / v_mul_hi_i32 pair followed by add / add-with-carry. For
; gfx1100 the two multiplicands are first copied into v4/v5.
9 define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
10 ; CI-LABEL: mad_i64_i32_sextops:
12 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
14 ; CI-NEXT: s_setpc_b64 s[30:31]
16 ; SI-LABEL: mad_i64_i32_sextops:
18 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
20 ; SI-NEXT: v_mul_hi_i32 v1, v0, v1
21 ; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2
22 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
23 ; SI-NEXT: s_setpc_b64 s[30:31]
25 ; GFX9-LABEL: mad_i64_i32_sextops:
27 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
29 ; GFX9-NEXT: s_setpc_b64 s[30:31]
31 ; GFX11-LABEL: mad_i64_i32_sextops:
33 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
35 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
36 ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
37 ; GFX11-NEXT: s_setpc_b64 s[30:31]
38 %sext0 = sext i32 %arg0 to i64
39 %sext1 = sext i32 %arg1 to i64
40 %mul = mul i64 %sext0, %sext1
41 %mad = add i64 %mul, %arg2
; Same signed pattern as mad_i64_i32_sextops, but with the add operands swapped
; (%arg2 + %mul). The expected mad instructions for hawaii, gfx90a and gfx1100
; are unchanged; in the tahiti expansion the add / add-with-carry source
; operands appear in the swapped order.
45 define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
46 ; CI-LABEL: mad_i64_i32_sextops_commute:
48 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
50 ; CI-NEXT: s_setpc_b64 s[30:31]
52 ; SI-LABEL: mad_i64_i32_sextops_commute:
54 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
56 ; SI-NEXT: v_mul_hi_i32 v1, v0, v1
57 ; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v4
58 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
59 ; SI-NEXT: s_setpc_b64 s[30:31]
61 ; GFX9-LABEL: mad_i64_i32_sextops_commute:
63 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
65 ; GFX9-NEXT: s_setpc_b64 s[30:31]
67 ; GFX11-LABEL: mad_i64_i32_sextops_commute:
69 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
71 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
72 ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
73 ; GFX11-NEXT: s_setpc_b64 s[30:31]
74 %sext0 = sext i32 %arg0 to i64
75 %sext1 = sext i32 %arg1 to i64
76 %mul = mul i64 %sext0, %sext1
77 %mad = add i64 %arg2, %mul
; Unsigned counterpart of mad_i64_i32_sextops: both i32 arguments are
; zero-extended to i64 before the multiply-add. The expected code is a single
; v_mad_u64_u32 for hawaii, gfx90a and gfx1100; tahiti is expected to use
; v_mul_lo_u32 / v_mul_hi_u32 plus add / add-with-carry.
; Note that the IR values are still named %sext0/%sext1 although they hold
; zext results.
81 define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
82 ; CI-LABEL: mad_u64_u32_zextops:
84 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
86 ; CI-NEXT: s_setpc_b64 s[30:31]
88 ; SI-LABEL: mad_u64_u32_zextops:
90 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
92 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1
93 ; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2
94 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
95 ; SI-NEXT: s_setpc_b64 s[30:31]
97 ; GFX9-LABEL: mad_u64_u32_zextops:
99 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
101 ; GFX9-NEXT: s_setpc_b64 s[30:31]
103 ; GFX11-LABEL: mad_u64_u32_zextops:
105 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
107 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
108 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
109 ; GFX11-NEXT: s_setpc_b64 s[30:31]
110 %sext0 = zext i32 %arg0 to i64
111 %sext1 = zext i32 %arg1 to i64
112 %mul = mul i64 %sext0, %sext1
113 %mad = add i64 %mul, %arg2
; Same unsigned pattern as mad_u64_u32_zextops, but with the add operands
; swapped (%arg2 + %mul). The expected v_mad_u64_u32 for hawaii, gfx90a and
; gfx1100 is unchanged; in the tahiti expansion the add / add-with-carry source
; operands appear in the swapped order.
117 define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
118 ; CI-LABEL: mad_u64_u32_zextops_commute:
120 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
122 ; CI-NEXT: s_setpc_b64 s[30:31]
124 ; SI-LABEL: mad_u64_u32_zextops_commute:
126 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
128 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1
129 ; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v4
130 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
131 ; SI-NEXT: s_setpc_b64 s[30:31]
133 ; GFX9-LABEL: mad_u64_u32_zextops_commute:
135 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
137 ; GFX9-NEXT: s_setpc_b64 s[30:31]
139 ; GFX11-LABEL: mad_u64_u32_zextops_commute:
141 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
143 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
144 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
145 ; GFX11-NEXT: s_setpc_b64 s[30:31]
146 %sext0 = zext i32 %arg0 to i64
147 %sext1 = zext i32 %arg1 to i64
148 %mul = mul i64 %sext0, %sext1
149 %mad = add i64 %arg2, %mul
; Wide variant: both i32 arguments are sign-extended to i128, multiplied as
; i128, and the i128 %arg2 is added. No target is expected to produce a single
; mad here. For hawaii, gfx90a and gfx1100 the expected expansion mixes
; v_mad_u64_u32 and v_mad_i64_i32 with the sign words (v_ashrrev_i32 by 31) and
; carry-propagating adds; for tahiti it is built from v_mul_lo_u32,
; v_mul_hi_u32, v_mul_hi_i32 and v_mul_i32_i24 plus add / add-with-carry only.
; %arg2 arrives in v2..v5 and is folded in by the final four-instruction carry
; chain of each sequence.
153 define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
154 ; CI-LABEL: mad_i64_i32_sextops_i32_i128:
156 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
158 ; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v0
159 ; CI-NEXT: v_mov_b32_e32 v8, 0
160 ; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
161 ; CI-NEXT: v_ashrrev_i32_e32 v14, 31, v1
162 ; CI-NEXT: v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
163 ; CI-NEXT: v_mov_b32_e32 v7, v10
164 ; CI-NEXT: v_mov_b32_e32 v10, v8
165 ; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
166 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
167 ; CI-NEXT: v_add_i32_e32 v9, vcc, v7, v9
168 ; CI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
169 ; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
170 ; CI-NEXT: v_add_i32_e32 v7, vcc, v9, v0
171 ; CI-NEXT: v_addc_u32_e32 v9, vcc, v10, v1, vcc
172 ; CI-NEXT: v_mov_b32_e32 v1, v8
173 ; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2
174 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
175 ; CI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
176 ; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc
177 ; CI-NEXT: s_setpc_b64 s[30:31]
179 ; SI-LABEL: mad_i64_i32_sextops_i32_i128:
181 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0
183 ; SI-NEXT: v_mul_lo_u32 v11, v6, v1
184 ; SI-NEXT: v_mul_hi_u32 v12, v0, v1
185 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
186 ; SI-NEXT: v_mul_hi_u32 v14, v6, v1
187 ; SI-NEXT: v_mul_lo_u32 v13, v0, v7
188 ; SI-NEXT: v_mul_hi_u32 v10, v0, v7
189 ; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12
190 ; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
191 ; SI-NEXT: v_mul_hi_u32 v8, v6, v7
192 ; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12
193 ; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
194 ; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7
195 ; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10
196 ; SI-NEXT: v_mul_hi_i32 v6, v1, v6
197 ; SI-NEXT: v_mul_hi_i32 v7, v7, v0
198 ; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
199 ; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10
200 ; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc
201 ; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v11
202 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1
203 ; SI-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
204 ; SI-NEXT: v_add_i32_e32 v7, vcc, v9, v10
205 ; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc
206 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
207 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v12, v3, vcc
208 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
209 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc
210 ; SI-NEXT: s_setpc_b64 s[30:31]
212 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
214 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
216 ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0
217 ; GFX9-NEXT: v_mov_b32_e32 v9, 0
218 ; GFX9-NEXT: v_mov_b32_e32 v8, v7
219 ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
220 ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1
221 ; GFX9-NEXT: v_mov_b32_e32 v8, v11
222 ; GFX9-NEXT: v_mov_b32_e32 v11, v9
223 ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
224 ; GFX9-NEXT: v_mov_b32_e32 v12, v11
225 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
226 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
227 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
228 ; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
229 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
230 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0
231 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
232 ; GFX9-NEXT: v_mov_b32_e32 v1, v10
233 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
234 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
235 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
236 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
237 ; GFX9-NEXT: s_setpc_b64 s[30:31]
239 ; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
241 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0
243 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
244 ; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0
245 ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1
246 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
247 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
248 ; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
249 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
250 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
251 ; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
252 ; GFX11-NEXT: v_mov_b32_e32 v8, v12
253 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
254 ; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
255 ; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8
256 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
257 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0
258 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
259 ; GFX11-NEXT: v_mov_b32_e32 v7, v11
260 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
261 ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
262 ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
263 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
264 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
265 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
266 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
267 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
268 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
269 ; GFX11-NEXT: s_setpc_b64 s[30:31]
270 %sext0 = sext i32 %arg0 to i128
271 %sext1 = sext i32 %arg1 to i128
272 %mul = mul i128 %sext0, %sext1
273 %mad = add i128 %mul, %arg2
; Non-power-of-two result width: both i32 arguments are sign-extended to i63,
; multiplied as i63, and the i63 %arg2 is added. The expected code is the same
; as in the i64 case: a single v_mad_i64_i32 for hawaii, gfx90a and gfx1100,
; and v_mul_lo_u32 / v_mul_hi_i32 plus add / add-with-carry for tahiti.
277 define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
278 ; CI-LABEL: mad_i64_i32_sextops_i32_i63:
280 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
282 ; CI-NEXT: s_setpc_b64 s[30:31]
284 ; SI-LABEL: mad_i64_i32_sextops_i32_i63:
286 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
287 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
288 ; SI-NEXT: v_mul_hi_i32 v1, v0, v1
289 ; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2
290 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
291 ; SI-NEXT: s_setpc_b64 s[30:31]
293 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i63:
295 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
297 ; GFX9-NEXT: s_setpc_b64 s[30:31]
299 ; GFX11-LABEL: mad_i64_i32_sextops_i32_i63:
301 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
302 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
303 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
304 ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
305 ; GFX11-NEXT: s_setpc_b64 s[30:31]
306 %sext0 = sext i32 %arg0 to i63
307 %sext1 = sext i32 %arg1 to i63
308 %mul = mul i63 %sext0, %sext1
309 %mad = add i63 %mul, %arg2
; Narrow sources: both arguments are i31, sign-extended to i63 before the
; multiply-add. The expected code first sign-extends each operand from 31 bits
; (v_bfe_i32 with offset 0 and width 31 for hawaii, gfx90a and gfx1100; a
; shift-left by 1 followed by v_ashr_i64 by 33 for tahiti) and then uses
; v_mad_i64_i32, or v_mul_lo_u32 / v_mul_hi_i32 plus add / add-with-carry for
; tahiti.
313 define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
314 ; CI-LABEL: mad_i64_i32_sextops_i31_i63:
316 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
317 ; CI-NEXT: v_bfe_i32 v1, v1, 0, 31
318 ; CI-NEXT: v_bfe_i32 v0, v0, 0, 31
319 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
320 ; CI-NEXT: s_setpc_b64 s[30:31]
322 ; SI-LABEL: mad_i64_i32_sextops_i31_i63:
324 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; SI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
326 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1
327 ; SI-NEXT: v_ashr_i64 v[4:5], v[3:4], 33
328 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 33
329 ; SI-NEXT: v_mul_lo_u32 v1, v4, v0
330 ; SI-NEXT: v_mul_hi_i32 v4, v4, v0
331 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v2
332 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
333 ; SI-NEXT: s_setpc_b64 s[30:31]
335 ; GFX9-LABEL: mad_i64_i32_sextops_i31_i63:
337 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
338 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 31
339 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 31
340 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
341 ; GFX9-NEXT: s_setpc_b64 s[30:31]
343 ; GFX11-LABEL: mad_i64_i32_sextops_i31_i63:
345 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346 ; GFX11-NEXT: v_bfe_i32 v4, v1, 0, 31
347 ; GFX11-NEXT: v_bfe_i32 v5, v0, 0, 31
348 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
349 ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
350 ; GFX11-NEXT: s_setpc_b64 s[30:31]
351 %sext0 = sext i31 %arg0 to i63
352 %sext1 = sext i31 %arg1 to i63
353 %mul = mul i63 %sext0, %sext1
354 %mad = add i63 %mul, %arg2
; Mixed extensions: %arg0 is sign-extended while %arg1 is zero-extended, so the
; pattern is neither a purely signed nor a purely unsigned 32x32 multiply-add.
; The expected code is therefore not a single mad. For hawaii, gfx90a and
; gfx1100 the low product and %arg2 go through one v_mad_u64_u32, and the sign
; word of %arg0 (v_ashrrev_i32 by 31) times %arg1 is then added into the high
; dword (v_mul_lo_u32 + add for hawaii, a second v_mad_u64_u32 for gfx90a and
; gfx1100). For tahiti everything is expanded into mul_lo / mul_hi and adds.
358 define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
359 ; CI-LABEL: mad_i64_i32_extops_i32_i64:
361 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362 ; CI-NEXT: v_ashrrev_i32_e32 v4, 31, v0
363 ; CI-NEXT: v_mul_lo_u32 v4, v4, v1
364 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
365 ; CI-NEXT: v_add_i32_e32 v1, vcc, v4, v1
366 ; CI-NEXT: s_setpc_b64 s[30:31]
368 ; SI-LABEL: mad_i64_i32_extops_i32_i64:
370 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v0
372 ; SI-NEXT: v_mul_hi_u32 v5, v0, v1
373 ; SI-NEXT: v_mul_lo_u32 v4, v4, v1
374 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1
375 ; SI-NEXT: v_add_i32_e32 v1, vcc, v5, v4
376 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
377 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
378 ; SI-NEXT: s_setpc_b64 s[30:31]
380 ; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
382 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
384 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0
385 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3]
386 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
387 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3]
388 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
389 ; GFX9-NEXT: s_setpc_b64 s[30:31]
391 ; GFX11-LABEL: mad_i64_i32_extops_i32_i64:
393 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
395 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
396 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
397 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
398 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
399 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
400 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
401 ; GFX11-NEXT: s_setpc_b64 s[30:31]
402 %ext0 = sext i32 %arg0 to i64
403 %ext1 = zext i32 %arg1 to i64
404 %mul = mul i64 %ext0, %ext1
405 %mad = add i64 %mul, %arg2
; The 32-bit operands are expressed with bit operations instead of zext: both
; i64 arguments are masked with 4294967295 (2^32 - 1), which keeps only their
; low 32 bits. The expected code is a single v_mad_u64_u32 on the low dwords
; (v0 and v2) for hawaii, gfx90a and gfx1100, and v_mul_lo_u32 / v_mul_hi_u32
; plus add / add-with-carry for tahiti.
409 define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
410 ; CI-LABEL: mad_u64_u32_bitops:
412 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
414 ; CI-NEXT: s_setpc_b64 s[30:31]
416 ; SI-LABEL: mad_u64_u32_bitops:
418 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419 ; SI-NEXT: v_mul_lo_u32 v1, v0, v2
420 ; SI-NEXT: v_mul_hi_u32 v2, v0, v2
421 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v4
422 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v5, vcc
423 ; SI-NEXT: s_setpc_b64 s[30:31]
425 ; GFX9-LABEL: mad_u64_u32_bitops:
427 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
429 ; GFX9-NEXT: s_setpc_b64 s[30:31]
431 ; GFX11-LABEL: mad_u64_u32_bitops:
433 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
435 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
436 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
437 ; GFX11-NEXT: s_setpc_b64 s[30:31]
438 %trunc.lhs = and i64 %arg0, 4294967295
439 %trunc.rhs = and i64 %arg1, 4294967295
440 %mul = mul i64 %trunc.lhs, %trunc.rhs
441 %add = add i64 %mul, %arg2
; Variant of mad_u64_u32_bitops in which the left-hand mask is one bit too
; wide: 8589934591 is 2^33 - 1, so %trunc.lhs keeps 33 bits, while the
; right-hand side is still masked to 32 bits. The expected code is not a single
; mad: bit 32 of the left operand is isolated (v_and_b32 with 1 on the high
; dword), multiplied by the low dword of the right operand, and added into the
; high dword of the result, next to a v_mad_u64_u32 of the low dwords (hawaii,
; gfx90a, gfx1100) or a mul_lo / mul_hi expansion (tahiti).
445 define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
446 ; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
448 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449 ; CI-NEXT: v_and_b32_e32 v3, 1, v1
450 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
451 ; CI-NEXT: v_mul_lo_u32 v2, v3, v2
452 ; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
453 ; CI-NEXT: s_setpc_b64 s[30:31]
455 ; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
457 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458 ; SI-NEXT: v_and_b32_e32 v1, 1, v1
459 ; SI-NEXT: v_mul_hi_u32 v3, v0, v2
460 ; SI-NEXT: v_mul_lo_u32 v1, v1, v2
461 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
462 ; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
463 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
464 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
465 ; SI-NEXT: s_setpc_b64 s[30:31]
467 ; GFX9-LABEL: mad_u64_u32_bitops_lhs_mask_small:
469 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v1
471 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
472 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
473 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5]
474 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
475 ; GFX9-NEXT: s_setpc_b64 s[30:31]
477 ; GFX11-LABEL: mad_u64_u32_bitops_lhs_mask_small:
479 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
480 ; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
481 ; GFX11-NEXT: v_mov_b32_e32 v6, v1
482 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
483 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
484 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
485 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
486 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
487 ; GFX11-NEXT: s_setpc_b64 s[30:31]
488 %trunc.lhs = and i64 %arg0, 8589934591
489 %trunc.rhs = and i64 %arg1, 4294967295
490 %mul = mul i64 %trunc.lhs, %trunc.rhs
491 %add = add i64 %mul, %arg2
; Mirror image of mad_u64_u32_bitops_lhs_mask_small: here the right-hand mask
; is 8589934591 (2^33 - 1, i.e. 33 bits) and the left-hand side is masked to 32
; bits. The expected code isolates bit 32 of the right operand (v_and_b32 with
; 1 on v3), multiplies it by the low dword of the left operand, and adds that
; into the high dword of the result, next to a v_mad_u64_u32 of the low dwords
; (hawaii, gfx90a, gfx1100) or a mul_lo / mul_hi expansion (tahiti).
495 define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
496 ; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
498 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499 ; CI-NEXT: v_mov_b32_e32 v6, v0
500 ; CI-NEXT: v_and_b32_e32 v3, 1, v3
501 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
502 ; CI-NEXT: v_mul_lo_u32 v2, v6, v3
503 ; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
504 ; CI-NEXT: s_setpc_b64 s[30:31]
506 ; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
508 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
509 ; SI-NEXT: v_and_b32_e32 v1, 1, v3
510 ; SI-NEXT: v_mul_hi_u32 v3, v0, v2
511 ; SI-NEXT: v_mul_lo_u32 v1, v0, v1
512 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
513 ; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
514 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
515 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
516 ; SI-NEXT: s_setpc_b64 s[30:31]
518 ; GFX9-LABEL: mad_u64_u32_bitops_rhs_mask_small:
520 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
522 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
523 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
524 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
525 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3]
526 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
527 ; GFX9-NEXT: s_setpc_b64 s[30:31]
529 ; GFX11-LABEL: mad_u64_u32_bitops_rhs_mask_small:
531 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
532 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
533 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
534 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
535 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
536 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
537 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
538 ; GFX11-NEXT: s_setpc_b64 s[30:31]
539 %trunc.lhs = and i64 %arg0, 4294967295
540 %trunc.rhs = and i64 %arg1, 8589934591
541 %mul = mul i64 %trunc.lhs, %trunc.rhs
542 %add = add i64 %mul, %arg2
; Signed counterpart of mad_u64_u32_bitops: each i64 argument is shifted left
; by 32 and then arithmetically shifted right by 32, which sign-extends its low
; 32 bits. The expected code is a single v_mad_i64_i32 on the low dwords (v0
; and v2) for hawaii, gfx90a and gfx1100, and v_mul_lo_u32 / v_mul_hi_i32 plus
; add / add-with-carry for tahiti.
546 define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
547 ; CI-LABEL: mad_i64_i32_bitops:
549 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
550 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
551 ; CI-NEXT: s_setpc_b64 s[30:31]
553 ; SI-LABEL: mad_i64_i32_bitops:
555 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556 ; SI-NEXT: v_mul_lo_u32 v1, v0, v2
557 ; SI-NEXT: v_mul_hi_i32 v2, v0, v2
558 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v4
559 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v5, vcc
560 ; SI-NEXT: s_setpc_b64 s[30:31]
562 ; GFX9-LABEL: mad_i64_i32_bitops:
564 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
566 ; GFX9-NEXT: s_setpc_b64 s[30:31]
568 ; GFX11-LABEL: mad_i64_i32_bitops:
570 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
571 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
572 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
573 ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
574 ; GFX11-NEXT: s_setpc_b64 s[30:31]
575 %shl.lhs = shl i64 %arg0, 32
576 %trunc.lhs = ashr i64 %shl.lhs, 32
577 %shl.rhs = shl i64 %arg1, 32
578 %trunc.rhs = ashr i64 %shl.rhs, 32
579 %mul = mul i64 %trunc.lhs, %trunc.rhs
580 %add = add i64 %mul, %arg2
584 ; Example from bug report
; A single i64 argument supplies all three operands: its high 32 bits (lshr by
; 32) are multiplied by its low 32 bits (and with 4294967295 = 2^32 - 1), and
; the original value is added. The expected code is one v_mad_u64_u32 with
; v1, v0 as multiplicands and v[0:1] as addend for hawaii and gfx90a; for
; gfx1100 the mad writes v[2:3], which is then copied back to v[0:1]. For
; tahiti the expected code is v_mul_lo_u32 / v_mul_hi_u32 plus add /
; add-with-carry.
585 define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
586 ; CI-LABEL: mad_i64_i32_unpack_i64ops:
588 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
589 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
590 ; CI-NEXT: s_setpc_b64 s[30:31]
592 ; SI-LABEL: mad_i64_i32_unpack_i64ops:
594 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595 ; SI-NEXT: v_mul_lo_u32 v2, v1, v0
596 ; SI-NEXT: v_mul_hi_u32 v3, v1, v0
597 ; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
598 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
599 ; SI-NEXT: s_setpc_b64 s[30:31]
601 ; GFX9-LABEL: mad_i64_i32_unpack_i64ops:
603 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
605 ; GFX9-NEXT: s_setpc_b64 s[30:31]
607 ; GFX11-LABEL: mad_i64_i32_unpack_i64ops:
609 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
611 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
612 ; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
613 ; GFX11-NEXT: s_setpc_b64 s[30:31]
614 %tmp4 = lshr i64 %arg0, 32
615 %tmp5 = and i64 %arg0, 4294967295
616 %mul = mul nuw i64 %tmp4, %tmp5
617 %mad = add i64 %mul, %arg0
; Kernel variant: all operands are kernel arguments, which the expected code
; loads into SGPRs with s_load instructions, and the result is stored to %out.
; Despite the i64_i32 in the name, the IR zero-extends both i32 arguments, so
; this is the unsigned pattern. Expected code per target:
;   hawaii  - one v_mad_u64_u32 taking s6 directly as a source operand;
;   tahiti  - s_mul_i32 for the low half, v_mul_hi_u32 for the high half, then
;             vector add / add-with-carry;
;   gfx90a and gfx1100 - no mad at all; the computation is done with scalar
;             s_mul_i32 / s_mul_hi_u32 / s_add_u32 / s_addc_u32 and only the
;             result is copied to VGPRs for the store.
621 define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
622 ; CI-LABEL: mad_i64_i32_uniform:
624 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
625 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
626 ; CI-NEXT: s_mov_b32 s3, 0xf000
627 ; CI-NEXT: s_mov_b32 s2, -1
628 ; CI-NEXT: s_waitcnt lgkmcnt(0)
629 ; CI-NEXT: v_mov_b32_e32 v2, s7
630 ; CI-NEXT: v_mov_b32_e32 v0, s0
631 ; CI-NEXT: v_mov_b32_e32 v1, s1
632 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
633 ; CI-NEXT: s_mov_b32 s0, s4
634 ; CI-NEXT: s_mov_b32 s1, s5
635 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
638 ; SI-LABEL: mad_i64_i32_uniform:
640 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
641 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
642 ; SI-NEXT: s_mov_b32 s3, 0xf000
643 ; SI-NEXT: s_mov_b32 s2, -1
644 ; SI-NEXT: s_waitcnt lgkmcnt(0)
645 ; SI-NEXT: v_mov_b32_e32 v0, s7
646 ; SI-NEXT: v_mul_hi_u32 v1, s6, v0
647 ; SI-NEXT: s_mov_b32 s0, s4
648 ; SI-NEXT: s_mul_i32 s4, s6, s7
649 ; SI-NEXT: v_mov_b32_e32 v0, s4
650 ; SI-NEXT: v_mov_b32_e32 v2, s9
651 ; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
652 ; SI-NEXT: s_mov_b32 s1, s5
653 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
654 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
657 ; GFX9-LABEL: mad_i64_i32_uniform:
659 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
660 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
661 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
662 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
663 ; GFX9-NEXT: s_mul_i32 s0, s6, s7
664 ; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7
665 ; GFX9-NEXT: s_add_u32 s0, s0, s2
666 ; GFX9-NEXT: s_addc_u32 s1, s1, s3
667 ; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
668 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
669 ; GFX9-NEXT: s_endpgm
671 ; GFX11-LABEL: mad_i64_i32_uniform:
673 ; GFX11-NEXT: s_clause 0x1
674 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
675 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
676 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
677 ; GFX11-NEXT: s_mul_i32 s2, s6, s7
678 ; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7
679 ; GFX11-NEXT: s_add_u32 s0, s2, s0
680 ; GFX11-NEXT: s_addc_u32 s1, s3, s1
681 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
682 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
683 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
684 ; GFX11-NEXT: s_nop 0
685 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
686 ; GFX11-NEXT: s_endpgm
687 %ext0 = zext i32 %arg0 to i64
688 %ext1 = zext i32 %arg1 to i64
689 %mul = mul i64 %ext0, %ext1
690 %mad = add i64 %mul, %arg2
691 store i64 %mad, ptr addrspace(1) %out
; The signed 32x32 product has two users: it is added to %arg2 and to %arg3,
; and the two sums are xor-ed together. For hawaii, gfx90a and gfx1100 the
; expected code repeats the multiply inside two v_mad_i64_i32 instructions
; (one per addend). For tahiti the product is computed once with v_mul_lo_u32 /
; v_mul_hi_i32 and followed by two add / add-with-carry pairs.
695 define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
696 ; CI-LABEL: mad_i64_i32_twice:
698 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
699 ; CI-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
700 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
701 ; CI-NEXT: v_xor_b32_e32 v1, v3, v1
702 ; CI-NEXT: v_xor_b32_e32 v0, v2, v0
703 ; CI-NEXT: s_setpc_b64 s[30:31]
705 ; SI-LABEL: mad_i64_i32_twice:
707 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
708 ; SI-NEXT: v_mul_lo_u32 v6, v0, v1
709 ; SI-NEXT: v_mul_hi_i32 v0, v0, v1
710 ; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2
711 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v0, v3, vcc
712 ; SI-NEXT: v_add_i32_e32 v3, vcc, v6, v4
713 ; SI-NEXT: v_addc_u32_e32 v0, vcc, v0, v5, vcc
714 ; SI-NEXT: v_xor_b32_e32 v1, v1, v0
715 ; SI-NEXT: v_xor_b32_e32 v0, v2, v3
716 ; SI-NEXT: s_setpc_b64 s[30:31]
718 ; GFX9-LABEL: mad_i64_i32_twice:
720 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
721 ; GFX9-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
722 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
723 ; GFX9-NEXT: v_xor_b32_e32 v1, v3, v1
724 ; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0
725 ; GFX9-NEXT: s_setpc_b64 s[30:31]
727 ; GFX11-LABEL: mad_i64_i32_twice:
729 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730 ; GFX11-NEXT: v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
731 ; GFX11-NEXT: v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
732 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
733 ; GFX11-NEXT: v_xor_b32_e32 v0, v6, v2
734 ; GFX11-NEXT: v_xor_b32_e32 v1, v7, v3
735 ; GFX11-NEXT: s_setpc_b64 s[30:31]
736 %sext0 = sext i32 %arg0 to i64
737 %sext1 = sext i32 %arg1 to i64
738 %mul = mul i64 %sext0, %sext1
739 %mad1 = add i64 %mul, %arg2
740 %mad2 = add i64 %mul, %arg3
741 %out = xor i64 %mad1, %mad2
; The signed 32x32 product has three users: it is added to %arg2, %arg3 and
; %arg4, and the three sums are xor-ed together. Expected code per target:
;   hawaii and gfx1100 - the product is computed once by a v_mad_i64_i32 with a
;             zero addend, followed by three add / add-with-carry pairs;
;   gfx90a  - three separate v_mad_i64_i32 instructions, one per addend;
;   tahiti  - v_mul_lo_u32 / v_mul_hi_i32 once, then three add /
;             add-with-carry pairs.
745 define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) #0 {
746 ; CI-LABEL: mad_i64_i32_thrice:
748 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
749 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
750 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
751 ; CI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
752 ; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v4
753 ; CI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
754 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v6
755 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
756 ; CI-NEXT: v_xor_b32_e32 v3, v3, v5
757 ; CI-NEXT: v_xor_b32_e32 v2, v2, v4
758 ; CI-NEXT: v_xor_b32_e32 v1, v3, v1
759 ; CI-NEXT: v_xor_b32_e32 v0, v2, v0
760 ; CI-NEXT: s_setpc_b64 s[30:31]
762 ; SI-LABEL: mad_i64_i32_thrice:
764 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
765 ; SI-NEXT: v_mul_lo_u32 v8, v0, v1
766 ; SI-NEXT: v_mul_hi_i32 v0, v0, v1
767 ; SI-NEXT: v_add_i32_e32 v1, vcc, v8, v2
768 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc
769 ; SI-NEXT: v_add_i32_e32 v3, vcc, v8, v4
770 ; SI-NEXT: v_addc_u32_e32 v4, vcc, v0, v5, vcc
771 ; SI-NEXT: v_add_i32_e32 v5, vcc, v8, v6
772 ; SI-NEXT: v_addc_u32_e32 v0, vcc, v0, v7, vcc
773 ; SI-NEXT: v_xor_b32_e32 v2, v2, v4
774 ; SI-NEXT: v_xor_b32_e32 v3, v1, v3
775 ; SI-NEXT: v_xor_b32_e32 v1, v2, v0
776 ; SI-NEXT: v_xor_b32_e32 v0, v3, v5
777 ; SI-NEXT: s_setpc_b64 s[30:31]
779 ; GFX9-LABEL: mad_i64_i32_thrice:
781 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
782 ; GFX9-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
783 ; GFX9-NEXT: v_mad_i64_i32 v[4:5], s[4:5], v0, v1, v[4:5]
784 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[6:7]
785 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v5
786 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4
787 ; GFX9-NEXT: v_xor_b32_e32 v1, v3, v1
788 ; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0
789 ; GFX9-NEXT: s_setpc_b64 s[30:31]
791 ; GFX11-LABEL: mad_i64_i32_thrice:
793 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
794 ; GFX11-NEXT: v_mad_i64_i32 v[8:9], null, v0, v1, 0
795 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
796 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2
797 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
798 ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
799 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
800 ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6
801 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
802 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
803 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
804 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
805 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
806 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
807 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
808 ; GFX11-NEXT: s_setpc_b64 s[30:31]
809 %sext0 = sext i32 %arg0 to i64
810 %sext1 = sext i32 %arg1 to i64
811 %mul = mul i64 %sext0, %sext1
812 %mad1 = add i64 %mul, %arg2
813 %mad2 = add i64 %mul, %arg3
814 %mad3 = add i64 %mul, %arg4
815 %out.p = xor i64 %mad1, %mad2
816 %out = xor i64 %out.p, %mad3
; The signed 32x32 product is used both by the add with %arg2 and directly by
; the final xor, so the plain product must stay available. Expected code per
; target:
;   hawaii and gfx1100 - one v_mad_i64_i32 with a zero addend produces the
;             product, and the sum is formed by add / add-with-carry;
;   gfx90a  - two v_mad_i64_i32 instructions, one with a zero addend (the
;             product) and one with %arg2 as addend (the sum);
;   tahiti  - v_mul_lo_u32 / v_mul_hi_i32 plus add / add-with-carry.
820 define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
821 ; CI-LABEL: mad_i64_i32_secondary_use:
823 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
824 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
825 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
826 ; CI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
827 ; CI-NEXT: v_xor_b32_e32 v1, v3, v1
828 ; CI-NEXT: v_xor_b32_e32 v0, v2, v0
829 ; CI-NEXT: s_setpc_b64 s[30:31]
831 ; SI-LABEL: mad_i64_i32_secondary_use:
833 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
835 ; SI-NEXT: v_mul_hi_i32 v0, v0, v1
836 ; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
837 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v0, v3, vcc
838 ; SI-NEXT: v_xor_b32_e32 v1, v1, v0
839 ; SI-NEXT: v_xor_b32_e32 v0, v2, v4
840 ; SI-NEXT: s_setpc_b64 s[30:31]
842 ; GFX9-LABEL: mad_i64_i32_secondary_use:
844 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
845 ; GFX9-NEXT: v_mad_i64_i32 v[4:5], s[4:5], v0, v1, 0
846 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
847 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5
848 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
849 ; GFX9-NEXT: s_setpc_b64 s[30:31]
851 ; GFX11-LABEL: mad_i64_i32_secondary_use:
853 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
854 ; GFX11-NEXT: v_mad_i64_i32 v[4:5], null, v0, v1, 0
855 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
856 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
857 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
858 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
859 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
860 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
861 ; GFX11-NEXT: s_setpc_b64 s[30:31]
862 %sext0 = sext i32 %arg0 to i64
863 %sext1 = sext i32 %arg1 to i64
864 %mul = mul i64 %sext0, %sext1
865 %mad = add i64 %mul, %arg2
866 %out = xor i64 %mad, %mul
; Full i48 multiply-add with no extension of narrower values: %arg0 * %arg1 +
; %arg2, all i48. For hawaii, gfx90a and gfx1100 the expected code uses one
; v_mad_u64_u32 on the low dwords with %arg2 as addend, plus two v_mul_lo_u32
; cross products that are added into the high dword (two v_add_i32 for hawaii,
; a single v_add3_u32 for gfx90a and gfx1100). For tahiti the expected code is
; built from v_mul_lo_u32 / v_mul_hi_u32 and adds only.
870 define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
871 ; CI-LABEL: mad_i48_i48:
873 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
874 ; CI-NEXT: v_mov_b32_e32 v6, v1
875 ; CI-NEXT: v_mov_b32_e32 v7, v0
876 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
877 ; CI-NEXT: v_mul_lo_u32 v2, v6, v2
878 ; CI-NEXT: v_mul_lo_u32 v3, v7, v3
879 ; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
880 ; CI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
881 ; CI-NEXT: s_setpc_b64 s[30:31]
883 ; SI-LABEL: mad_i48_i48:
885 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
886 ; SI-NEXT: v_mul_lo_u32 v3, v0, v3
887 ; SI-NEXT: v_mul_hi_u32 v6, v0, v2
888 ; SI-NEXT: v_mul_lo_u32 v1, v1, v2
889 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
890 ; SI-NEXT: v_add_i32_e32 v3, vcc, v6, v3
891 ; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
892 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
893 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
894 ; SI-NEXT: s_setpc_b64 s[30:31]
896 ; GFX9-LABEL: mad_i48_i48:
898 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
900 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
901 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
902 ; GFX9-NEXT: v_mul_lo_u32 v3, v7, v3
903 ; GFX9-NEXT: v_mul_lo_u32 v2, v6, v2
904 ; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
905 ; GFX9-NEXT: s_setpc_b64 s[30:31]
907 ; GFX11-LABEL: mad_i48_i48:
909 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910 ; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
911 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
912 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
913 ; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3
914 ; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
915 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
916 ; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3
917 ; GFX11-NEXT: s_setpc_b64 s[30:31]
918 %m = mul i48 %arg0, %arg1
919 %a = add i48 %m, %arg2
; Attribute groups. Every function visible in this file is tagged with #0
; (nounwind); #1 is not referenced by any function shown here.
923 attributes #0 = { nounwind }
924 attributes #1 = { nounwind readnone speculatable }