1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1150 %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
10 ; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.
; Baseline signed pattern: both i32 arguments are sign-extended to i64,
; multiplied, and %arg2 is added with the product as the first add operand.
; Where the checks show v_mad_i64_i32 / v_mad_co_i64_i32 the whole expression
; is expected to become one 64-bit signed multiply-add; the tahiti run instead
; expects v_mul_lo_u32 + v_mul_hi_i32 followed by an add/addc carry chain.
; The GFX1100 checks additionally expect v0/v1 to be copied to v5/v4 first
; (presumably so the multiply sources stay out of the v[0:1] result pair;
; confirm against the target's operand constraints).
12 define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
13 ; CI-LABEL: mad_i64_i32_sextops:
15 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
17 ; CI-NEXT: s_setpc_b64 s[30:31]
19 ; SI-LABEL: mad_i64_i32_sextops:
21 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
23 ; SI-NEXT: v_mul_hi_i32 v1, v0, v1
24 ; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2
25 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
26 ; SI-NEXT: s_setpc_b64 s[30:31]
28 ; GFX9-LABEL: mad_i64_i32_sextops:
30 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
32 ; GFX9-NEXT: s_setpc_b64 s[30:31]
34 ; GFX1100-LABEL: mad_i64_i32_sextops:
36 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37 ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
38 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
39 ; GFX1100-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
40 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
42 ; GFX1150-LABEL: mad_i64_i32_sextops:
44 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
46 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
48 ; GFX12-LABEL: mad_i64_i32_sextops:
50 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
51 ; GFX12-NEXT: s_wait_expcnt 0x0
52 ; GFX12-NEXT: s_wait_samplecnt 0x0
53 ; GFX12-NEXT: s_wait_bvhcnt 0x0
54 ; GFX12-NEXT: s_wait_kmcnt 0x0
55 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
56 ; GFX12-NEXT: s_setpc_b64 s[30:31]
57 %sext0 = sext i32 %arg0 to i64
58 %sext1 = sext i32 %arg1 to i64
59 %mul = mul i64 %sext0, %sext1
60 %mad = add i64 %mul, %arg2
; Commuted form of the signed pattern: identical sign-extended multiply, but
; %arg2 is the first operand of the add and the product is the second.
; The expected multiply-add instructions match the non-commuted test; only the
; tahiti add/addc expectations differ, with their source operands swapped.
64 define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
65 ; CI-LABEL: mad_i64_i32_sextops_commute:
67 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
69 ; CI-NEXT: s_setpc_b64 s[30:31]
71 ; SI-LABEL: mad_i64_i32_sextops_commute:
73 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
75 ; SI-NEXT: v_mul_hi_i32 v1, v0, v1
76 ; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v4
77 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
78 ; SI-NEXT: s_setpc_b64 s[30:31]
80 ; GFX9-LABEL: mad_i64_i32_sextops_commute:
82 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
84 ; GFX9-NEXT: s_setpc_b64 s[30:31]
86 ; GFX1100-LABEL: mad_i64_i32_sextops_commute:
88 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89 ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
90 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
91 ; GFX1100-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
92 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
94 ; GFX1150-LABEL: mad_i64_i32_sextops_commute:
96 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
98 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
100 ; GFX12-LABEL: mad_i64_i32_sextops_commute:
102 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
103 ; GFX12-NEXT: s_wait_expcnt 0x0
104 ; GFX12-NEXT: s_wait_samplecnt 0x0
105 ; GFX12-NEXT: s_wait_bvhcnt 0x0
106 ; GFX12-NEXT: s_wait_kmcnt 0x0
107 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
108 ; GFX12-NEXT: s_setpc_b64 s[30:31]
109 %sext0 = sext i32 %arg0 to i64
110 %sext1 = sext i32 %arg1 to i64
111 %mul = mul i64 %sext0, %sext1
112 %mad = add i64 %arg2, %mul
; Baseline unsigned pattern: both i32 arguments are zero-extended to i64,
; multiplied, and %arg2 is added with the product as the first add operand.
; Where the checks show v_mad_u64_u32 / v_mad_co_u64_u32 the expression is
; expected to become one unsigned multiply-add; the tahiti run expects
; v_mul_lo_u32 + v_mul_hi_u32 followed by an add/addc carry chain.
; The value names %sext0/%sext1 below are historical: both hold zext results.
116 define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
117 ; CI-LABEL: mad_u64_u32_zextops:
119 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
121 ; CI-NEXT: s_setpc_b64 s[30:31]
123 ; SI-LABEL: mad_u64_u32_zextops:
125 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
127 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1
128 ; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2
129 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
130 ; SI-NEXT: s_setpc_b64 s[30:31]
132 ; GFX9-LABEL: mad_u64_u32_zextops:
134 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
136 ; GFX9-NEXT: s_setpc_b64 s[30:31]
138 ; GFX1100-LABEL: mad_u64_u32_zextops:
140 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141 ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
142 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
143 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
144 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
146 ; GFX1150-LABEL: mad_u64_u32_zextops:
148 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
150 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
152 ; GFX12-LABEL: mad_u64_u32_zextops:
154 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
155 ; GFX12-NEXT: s_wait_expcnt 0x0
156 ; GFX12-NEXT: s_wait_samplecnt 0x0
157 ; GFX12-NEXT: s_wait_bvhcnt 0x0
158 ; GFX12-NEXT: s_wait_kmcnt 0x0
159 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
160 ; GFX12-NEXT: s_setpc_b64 s[30:31]
161 %sext0 = zext i32 %arg0 to i64
162 %sext1 = zext i32 %arg1 to i64
163 %mul = mul i64 %sext0, %sext1
164 %mad = add i64 %mul, %arg2
; Commuted form of the unsigned pattern: identical zero-extended multiply, but
; %arg2 is the first operand of the add and the product is the second.
; The expected multiply-add instructions match the non-commuted test; only the
; tahiti add/addc expectations differ, with their source operands swapped.
; The value names %sext0/%sext1 below are historical: both hold zext results.
168 define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
169 ; CI-LABEL: mad_u64_u32_zextops_commute:
171 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
173 ; CI-NEXT: s_setpc_b64 s[30:31]
175 ; SI-LABEL: mad_u64_u32_zextops_commute:
177 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
179 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1
180 ; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v4
181 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
182 ; SI-NEXT: s_setpc_b64 s[30:31]
184 ; GFX9-LABEL: mad_u64_u32_zextops_commute:
186 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
187 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
188 ; GFX9-NEXT: s_setpc_b64 s[30:31]
190 ; GFX1100-LABEL: mad_u64_u32_zextops_commute:
192 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193 ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
194 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
195 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
196 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
198 ; GFX1150-LABEL: mad_u64_u32_zextops_commute:
200 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
202 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
204 ; GFX12-LABEL: mad_u64_u32_zextops_commute:
206 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
207 ; GFX12-NEXT: s_wait_expcnt 0x0
208 ; GFX12-NEXT: s_wait_samplecnt 0x0
209 ; GFX12-NEXT: s_wait_bvhcnt 0x0
210 ; GFX12-NEXT: s_wait_kmcnt 0x0
211 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
212 ; GFX12-NEXT: s_setpc_b64 s[30:31]
213 %sext0 = zext i32 %arg0 to i64
214 %sext1 = zext i32 %arg1 to i64
215 %mul = mul i64 %sext0, %sext1
216 %mad = add i64 %arg2, %mul
; Same sign-extended multiply-add shape widened to i128: both i32 arguments
; are sign-extended to i128, multiplied, and the i128 %arg2 is added.
; No run is expected to produce a single multiply-add here. The checks show
; the product assembled from several 32-bit multiplies (v_mad_u64_u32 and
; v_mad_i64_i32 where available, v_mul_lo/v_mul_hi on tahiti) that use
; v_ashrrev_i32 ..., 31 sign words of the inputs, followed by a four-dword
; carry chain that adds %arg2 from v[2:5] into the result in v[0:3].
220 define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
221 ; CI-LABEL: mad_i64_i32_sextops_i32_i128:
223 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224 ; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
225 ; CI-NEXT: v_ashrrev_i32_e32 v12, 31, v0
226 ; CI-NEXT: v_mov_b32_e32 v8, 0
227 ; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v1, v[7:8]
228 ; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v1
229 ; CI-NEXT: v_mov_b32_e32 v11, v10
230 ; CI-NEXT: v_mov_b32_e32 v10, v8
231 ; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v13, v[9:10]
232 ; CI-NEXT: v_add_i32_e32 v8, vcc, v11, v8
233 ; CI-NEXT: v_mad_i64_i32 v[10:11], s[4:5], v1, v12, 0
234 ; CI-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc
235 ; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v13, v[8:9]
236 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v13, v0, v[10:11]
237 ; CI-NEXT: v_add_i32_e32 v8, vcc, v8, v0
238 ; CI-NEXT: v_addc_u32_e32 v9, vcc, v9, v1, vcc
239 ; CI-NEXT: v_mov_b32_e32 v1, v7
240 ; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2
241 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
242 ; CI-NEXT: v_addc_u32_e32 v2, vcc, v8, v4, vcc
243 ; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc
244 ; CI-NEXT: s_setpc_b64 s[30:31]
246 ; SI-LABEL: mad_i64_i32_sextops_i32_i128:
248 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249 ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0
250 ; SI-NEXT: v_mul_lo_u32 v11, v6, v1
251 ; SI-NEXT: v_mul_hi_u32 v12, v0, v1
252 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
253 ; SI-NEXT: v_mul_hi_u32 v14, v6, v1
254 ; SI-NEXT: v_mul_lo_u32 v13, v0, v7
255 ; SI-NEXT: v_mul_hi_u32 v10, v0, v7
256 ; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12
257 ; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
258 ; SI-NEXT: v_mul_hi_u32 v8, v6, v7
259 ; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12
260 ; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
261 ; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7
262 ; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10
263 ; SI-NEXT: v_mul_hi_i32 v6, v1, v6
264 ; SI-NEXT: v_mul_hi_i32 v7, v7, v0
265 ; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
266 ; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10
267 ; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc
268 ; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v11
269 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1
270 ; SI-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
271 ; SI-NEXT: v_add_i32_e32 v7, vcc, v9, v10
272 ; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc
273 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
274 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v12, v3, vcc
275 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
276 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc
277 ; SI-NEXT: s_setpc_b64 s[30:31]
279 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
281 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
283 ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0
284 ; GFX9-NEXT: v_mov_b32_e32 v9, 0
285 ; GFX9-NEXT: v_mov_b32_e32 v8, v7
286 ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
287 ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1
288 ; GFX9-NEXT: v_mov_b32_e32 v8, v11
289 ; GFX9-NEXT: v_mov_b32_e32 v11, v9
290 ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
291 ; GFX9-NEXT: v_mov_b32_e32 v12, v11
292 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
293 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
294 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
295 ; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
296 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
297 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0
298 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
299 ; GFX9-NEXT: v_mov_b32_e32 v1, v10
300 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
301 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
302 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
303 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
304 ; GFX9-NEXT: s_setpc_b64 s[30:31]
306 ; GFX1100-LABEL: mad_i64_i32_sextops_i32_i128:
308 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309 ; GFX1100-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0
310 ; GFX1100-NEXT: v_mov_b32_e32 v8, 0
311 ; GFX1100-NEXT: v_ashrrev_i32_e32 v14, 31, v0
312 ; GFX1100-NEXT: v_ashrrev_i32_e32 v15, 31, v1
313 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
314 ; GFX1100-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
315 ; GFX1100-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
316 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
317 ; GFX1100-NEXT: v_mad_u64_u32 v[7:8], null, v0, v15, v[9:10]
318 ; GFX1100-NEXT: v_mov_b32_e32 v10, v8
319 ; GFX1100-NEXT: v_mad_i64_i32 v[8:9], null, v1, v14, 0
320 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
321 ; GFX1100-NEXT: v_add_co_u32 v10, s0, v11, v10
322 ; GFX1100-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0
323 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
324 ; GFX1100-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[8:9]
325 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[10:11]
326 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
327 ; GFX1100-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
328 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
329 ; GFX1100-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
330 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
331 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
332 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
333 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
334 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
336 ; GFX1150-LABEL: mad_i64_i32_sextops_i32_i128:
338 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339 ; GFX1150-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0
340 ; GFX1150-NEXT: v_mov_b32_e32 v8, 0
341 ; GFX1150-NEXT: v_ashrrev_i32_e32 v12, 31, v0
342 ; GFX1150-NEXT: v_ashrrev_i32_e32 v13, 31, v1
343 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
344 ; GFX1150-NEXT: v_mad_u64_u32 v[9:10], null, v12, v1, v[7:8]
345 ; GFX1150-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
346 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
347 ; GFX1150-NEXT: v_mad_u64_u32 v[7:8], null, v0, v13, v[9:10]
348 ; GFX1150-NEXT: v_mov_b32_e32 v10, v8
349 ; GFX1150-NEXT: v_mad_i64_i32 v[8:9], null, v1, v12, 0
350 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
351 ; GFX1150-NEXT: v_add_co_u32 v10, s0, v11, v10
352 ; GFX1150-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0
353 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
354 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v13, v0, v[8:9]
355 ; GFX1150-NEXT: v_mad_u64_u32 v[8:9], null, v12, v13, v[10:11]
356 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
357 ; GFX1150-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0
358 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
359 ; GFX1150-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
360 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
361 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
362 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
363 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
364 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
366 ; GFX12-LABEL: mad_i64_i32_sextops_i32_i128:
368 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
369 ; GFX12-NEXT: s_wait_expcnt 0x0
370 ; GFX12-NEXT: s_wait_samplecnt 0x0
371 ; GFX12-NEXT: s_wait_bvhcnt 0x0
372 ; GFX12-NEXT: s_wait_kmcnt 0x0
373 ; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v1, 0
374 ; GFX12-NEXT: v_mov_b32_e32 v8, 0
375 ; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v0
376 ; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v1
377 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
378 ; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v12, v1, v[7:8]
379 ; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
380 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
381 ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v13, v[9:10]
382 ; GFX12-NEXT: v_mov_b32_e32 v10, v8
383 ; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v1, v12, 0
384 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
385 ; GFX12-NEXT: v_add_co_u32 v10, s0, v11, v10
386 ; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0
387 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
388 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v13, v0, v[8:9]
389 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v12, v13, v[10:11]
390 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
391 ; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0
392 ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
393 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
394 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
395 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
396 ; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
397 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
398 ; GFX12-NEXT: s_setpc_b64 s[30:31]
399 %sext0 = sext i32 %arg0 to i128
400 %sext1 = sext i32 %arg1 to i128
401 %mul = mul i128 %sext0, %sext1
402 %mad = add i128 %mul, %arg2
; Signed pattern on a non-power-of-two result type: both i32 arguments are
; sign-extended to i63, multiplied, and the i63 %arg2 is added.
; The expected instruction sequences are the same as for the i64
; mad_i64_i32_sextops test above, i.e. the odd width is not expected to block
; selection of the 64-bit signed multiply-add.
406 define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
407 ; CI-LABEL: mad_i64_i32_sextops_i32_i63:
409 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
411 ; CI-NEXT: s_setpc_b64 s[30:31]
413 ; SI-LABEL: mad_i64_i32_sextops_i32_i63:
415 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
417 ; SI-NEXT: v_mul_hi_i32 v1, v0, v1
418 ; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v2
419 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
420 ; SI-NEXT: s_setpc_b64 s[30:31]
422 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i63:
424 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
426 ; GFX9-NEXT: s_setpc_b64 s[30:31]
428 ; GFX1100-LABEL: mad_i64_i32_sextops_i32_i63:
430 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
431 ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
432 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
433 ; GFX1100-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
434 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
436 ; GFX1150-LABEL: mad_i64_i32_sextops_i32_i63:
438 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
440 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
442 ; GFX12-LABEL: mad_i64_i32_sextops_i32_i63:
444 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
445 ; GFX12-NEXT: s_wait_expcnt 0x0
446 ; GFX12-NEXT: s_wait_samplecnt 0x0
447 ; GFX12-NEXT: s_wait_bvhcnt 0x0
448 ; GFX12-NEXT: s_wait_kmcnt 0x0
449 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
450 ; GFX12-NEXT: s_setpc_b64 s[30:31]
451 %sext0 = sext i32 %arg0 to i63
452 %sext1 = sext i32 %arg1 to i63
453 %mul = mul i63 %sext0, %sext1
454 %mad = add i63 %mul, %arg2
; Signed pattern with narrow sources: both i31 arguments are sign-extended to
; i63, multiplied, and the i63 %arg2 is added.
; The checks still expect the signed multiply-add (or the tahiti
; v_mul_lo_u32/v_mul_hi_i32 expansion), but preceded by an explicit in-register
; sign extension of each operand from bit 30: v_bfe_i32 ..., 0, 31 on most
; runs, and a shift-left-by-1 plus v_ashr_i64 ..., 33 pair on tahiti.
458 define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
459 ; CI-LABEL: mad_i64_i32_sextops_i31_i63:
461 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
462 ; CI-NEXT: v_bfe_i32 v1, v1, 0, 31
463 ; CI-NEXT: v_bfe_i32 v0, v0, 0, 31
464 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
465 ; CI-NEXT: s_setpc_b64 s[30:31]
467 ; SI-LABEL: mad_i64_i32_sextops_i31_i63:
469 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470 ; SI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
471 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1
472 ; SI-NEXT: v_ashr_i64 v[4:5], v[3:4], 33
473 ; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 33
474 ; SI-NEXT: v_mul_lo_u32 v1, v4, v0
475 ; SI-NEXT: v_mul_hi_i32 v4, v4, v0
476 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v2
477 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v4, v3, vcc
478 ; SI-NEXT: s_setpc_b64 s[30:31]
480 ; GFX9-LABEL: mad_i64_i32_sextops_i31_i63:
482 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 31
484 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 31
485 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
486 ; GFX9-NEXT: s_setpc_b64 s[30:31]
488 ; GFX1100-LABEL: mad_i64_i32_sextops_i31_i63:
490 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491 ; GFX1100-NEXT: v_bfe_i32 v4, v1, 0, 31
492 ; GFX1100-NEXT: v_bfe_i32 v5, v0, 0, 31
493 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
494 ; GFX1100-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
495 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
497 ; GFX1150-LABEL: mad_i64_i32_sextops_i31_i63:
499 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500 ; GFX1150-NEXT: v_bfe_i32 v1, v1, 0, 31
501 ; GFX1150-NEXT: v_bfe_i32 v0, v0, 0, 31
502 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
503 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
504 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
506 ; GFX12-LABEL: mad_i64_i32_sextops_i31_i63:
508 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
509 ; GFX12-NEXT: s_wait_expcnt 0x0
510 ; GFX12-NEXT: s_wait_samplecnt 0x0
511 ; GFX12-NEXT: s_wait_bvhcnt 0x0
512 ; GFX12-NEXT: s_wait_kmcnt 0x0
513 ; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 31
514 ; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 31
515 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
516 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
517 ; GFX12-NEXT: s_setpc_b64 s[30:31]
518 %sext0 = sext i31 %arg0 to i63
519 %sext1 = sext i31 %arg1 to i63
520 %mul = mul i63 %sext0, %sext1
521 %mad = add i63 %mul, %arg2
; Mixed-extension case: %arg0 is sign-extended while %arg1 is zero-extended
; before the i64 multiply and the add of %arg2.
; No run is expected to fold this into a single multiply-add. The checks show
; an unsigned low product (v_mad_u64_u32 / v_mad_co_u64_u32, or
; v_mul_lo_u32 + v_mul_hi_u32 on tahiti) plus a correction term built from the
; sign word of %arg0 (v_ashrrev_i32 ..., 31) multiplied by %arg1 and added into
; the high dword, either via v_mul_lo_u32 + add or via a second multiply-add.
525 define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
526 ; CI-LABEL: mad_i64_i32_extops_i32_i64:
528 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529 ; CI-NEXT: v_ashrrev_i32_e32 v4, 31, v0
530 ; CI-NEXT: v_mul_lo_u32 v4, v4, v1
531 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
532 ; CI-NEXT: v_add_i32_e32 v1, vcc, v4, v1
533 ; CI-NEXT: s_setpc_b64 s[30:31]
535 ; SI-LABEL: mad_i64_i32_extops_i32_i64:
537 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v0
539 ; SI-NEXT: v_mul_hi_u32 v5, v0, v1
540 ; SI-NEXT: v_mul_lo_u32 v4, v4, v1
541 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1
542 ; SI-NEXT: v_add_i32_e32 v1, vcc, v5, v4
543 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
544 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
545 ; SI-NEXT: s_setpc_b64 s[30:31]
547 ; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
549 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
550 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
551 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0
552 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3]
553 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
554 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3]
555 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
556 ; GFX9-NEXT: s_setpc_b64 s[30:31]
558 ; GFX1100-LABEL: mad_i64_i32_extops_i32_i64:
560 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
561 ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
562 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
563 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
564 ; GFX1100-NEXT: v_ashrrev_i32_e32 v5, 31, v5
565 ; GFX1100-NEXT: v_mov_b32_e32 v3, v1
566 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
567 ; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
568 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
570 ; GFX1150-LABEL: mad_i64_i32_extops_i32_i64:
572 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573 ; GFX1150-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
574 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
575 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
576 ; GFX1150-NEXT: v_ashrrev_i32_e32 v2, 31, v5
577 ; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v2, v4, v[1:2]
578 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
580 ; GFX12-LABEL: mad_i64_i32_extops_i32_i64:
582 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
583 ; GFX12-NEXT: s_wait_expcnt 0x0
584 ; GFX12-NEXT: s_wait_samplecnt 0x0
585 ; GFX12-NEXT: s_wait_bvhcnt 0x0
586 ; GFX12-NEXT: s_wait_kmcnt 0x0
587 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
588 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
589 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3]
590 ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v5
591 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2]
592 ; GFX12-NEXT: s_setpc_b64 s[30:31]
593 %ext0 = sext i32 %arg0 to i64
594 %ext1 = zext i32 %arg1 to i64
595 %mul = mul i64 %ext0, %ext1
596 %mad = add i64 %mul, %arg2
; Unsigned pattern expressed with masks instead of zext: both i64 arguments
; are ANDed with 4294967295 (0xFFFFFFFF), multiplied, and %arg2 is added.
; The checks expect the masks to disappear and only the low dwords (v0 and v2)
; to feed the multiply, as one v_mad_u64_u32 / v_mad_co_u64_u32 with v[4:5]
; as the addend, or as v_mul_lo_u32 + v_mul_hi_u32 + add/addc on tahiti.
600 define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
601 ; CI-LABEL: mad_u64_u32_bitops:
603 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
605 ; CI-NEXT: s_setpc_b64 s[30:31]
607 ; SI-LABEL: mad_u64_u32_bitops:
609 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610 ; SI-NEXT: v_mul_lo_u32 v1, v0, v2
611 ; SI-NEXT: v_mul_hi_u32 v2, v0, v2
612 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v4
613 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v5, vcc
614 ; SI-NEXT: s_setpc_b64 s[30:31]
616 ; GFX9-LABEL: mad_u64_u32_bitops:
618 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
619 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
620 ; GFX9-NEXT: s_setpc_b64 s[30:31]
622 ; GFX1100-LABEL: mad_u64_u32_bitops:
624 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625 ; GFX1100-NEXT: v_mov_b32_e32 v3, v0
626 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
627 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
628 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
630 ; GFX1150-LABEL: mad_u64_u32_bitops:
632 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v0, v2, v[4:5]
634 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
636 ; GFX12-LABEL: mad_u64_u32_bitops:
638 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
639 ; GFX12-NEXT: s_wait_expcnt 0x0
640 ; GFX12-NEXT: s_wait_samplecnt 0x0
641 ; GFX12-NEXT: s_wait_bvhcnt 0x0
642 ; GFX12-NEXT: s_wait_kmcnt 0x0
643 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5]
644 ; GFX12-NEXT: s_setpc_b64 s[30:31]
645 %trunc.lhs = and i64 %arg0, 4294967295
646 %trunc.rhs = and i64 %arg1, 4294967295
647 %mul = mul i64 %trunc.lhs, %trunc.rhs
648 %add = add i64 %mul, %arg2
; Negative variant of mad_u64_u32_bitops: the left-hand mask is 8589934591
; (0x1FFFFFFFF, 33 bits), so %arg0 is no longer limited to 32 bits, while the
; right-hand mask stays at 4294967295.
; A single 32x32 multiply-add is therefore not expected. The checks keep the
; low-dword multiply and add an extra term for bit 32 of %arg0
; (v_and_b32 ..., 1, <high dword>) multiplied by the low dword of %arg1 and
; accumulated into the high dword of the result.
652 define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
653 ; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
655 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656 ; CI-NEXT: v_and_b32_e32 v3, 1, v1
657 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
658 ; CI-NEXT: v_mul_lo_u32 v2, v3, v2
659 ; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
660 ; CI-NEXT: s_setpc_b64 s[30:31]
662 ; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
664 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665 ; SI-NEXT: v_and_b32_e32 v1, 1, v1
666 ; SI-NEXT: v_mul_hi_u32 v3, v0, v2
667 ; SI-NEXT: v_mul_lo_u32 v1, v1, v2
668 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
669 ; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
670 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
671 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
672 ; SI-NEXT: s_setpc_b64 s[30:31]
674 ; GFX9-LABEL: mad_u64_u32_bitops_lhs_mask_small:
676 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v1
678 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
679 ; GFX9-NEXT: v_mov_b32_e32 v4, v1
680 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5]
681 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
682 ; GFX9-NEXT: s_setpc_b64 s[30:31]
684 ; GFX1100-LABEL: mad_u64_u32_bitops_lhs_mask_small:
686 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
687 ; GFX1100-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
688 ; GFX1100-NEXT: v_mov_b32_e32 v6, v1
689 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
690 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
691 ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
692 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
693 ; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
694 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
696 ; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small:
698 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
699 ; GFX1150-NEXT: v_mov_b32_e32 v3, v1
700 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v0, v2, v[4:5]
701 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
702 ; GFX1150-NEXT: v_and_b32_e32 v3, 1, v3
703 ; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v3, v2, v[1:2]
704 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
706 ; GFX12-LABEL: mad_u64_u32_bitops_lhs_mask_small:
708 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
709 ; GFX12-NEXT: s_wait_expcnt 0x0
710 ; GFX12-NEXT: s_wait_samplecnt 0x0
711 ; GFX12-NEXT: s_wait_bvhcnt 0x0
712 ; GFX12-NEXT: s_wait_kmcnt 0x0
713 ; GFX12-NEXT: v_mov_b32_e32 v3, v1
714 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5]
715 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
716 ; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
717 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2]
718 ; GFX12-NEXT: s_setpc_b64 s[30:31]
719 %trunc.lhs = and i64 %arg0, 8589934591
720 %trunc.rhs = and i64 %arg1, 4294967295
721 %mul = mul i64 %trunc.lhs, %trunc.rhs
722 %add = add i64 %mul, %arg2
; Mirror of mad_u64_u32_bitops_lhs_mask_small: here the right-hand mask is
; 8589934591 (0x1FFFFFFFF, 33 bits) and the left-hand mask is 4294967295, so
; %arg1 is the operand that may exceed 32 bits.
; A single 32x32 multiply-add is therefore not expected. The checks keep the
; low-dword multiply and add an extra term for bit 32 of %arg1
; (v_and_b32 ..., 1, v3) multiplied by the low dword of %arg0 and accumulated
; into the high dword of the result.
726 define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
727 ; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
729 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730 ; CI-NEXT: v_mov_b32_e32 v6, v0
731 ; CI-NEXT: v_and_b32_e32 v3, 1, v3
732 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
733 ; CI-NEXT: v_mul_lo_u32 v2, v6, v3
734 ; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
735 ; CI-NEXT: s_setpc_b64 s[30:31]
737 ; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
739 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
740 ; SI-NEXT: v_and_b32_e32 v1, 1, v3
741 ; SI-NEXT: v_mul_hi_u32 v3, v0, v2
742 ; SI-NEXT: v_mul_lo_u32 v1, v0, v1
743 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
744 ; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
745 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
746 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
747 ; SI-NEXT: s_setpc_b64 s[30:31]
749 ; GFX9-LABEL: mad_u64_u32_bitops_rhs_mask_small:
751 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752 ; GFX9-NEXT: v_mov_b32_e32 v6, v0
753 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
754 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
755 ; GFX9-NEXT: v_mov_b32_e32 v2, v1
756 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3]
757 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
758 ; GFX9-NEXT: s_setpc_b64 s[30:31]
760 ; GFX1100-LABEL: mad_u64_u32_bitops_rhs_mask_small:
762 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
763 ; GFX1100-NEXT: v_mov_b32_e32 v6, v0
764 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
765 ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
766 ; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
767 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
768 ; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
769 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
771 ; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small:
773 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
774 ; GFX1150-NEXT: v_mov_b32_e32 v6, v0
775 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
776 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
777 ; GFX1150-NEXT: v_and_b32_e32 v2, 1, v3
778 ; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v6, v2, v[1:2]
779 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
781 ; GFX12-LABEL: mad_u64_u32_bitops_rhs_mask_small:
783 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
784 ; GFX12-NEXT: s_wait_expcnt 0x0
785 ; GFX12-NEXT: s_wait_samplecnt 0x0
786 ; GFX12-NEXT: s_wait_bvhcnt 0x0
787 ; GFX12-NEXT: s_wait_kmcnt 0x0
788 ; GFX12-NEXT: v_mov_b32_e32 v6, v0
789 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
790 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v2, v[4:5]
791 ; GFX12-NEXT: v_and_b32_e32 v2, 1, v3
792 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2]
793 ; GFX12-NEXT: s_setpc_b64 s[30:31]
794 %trunc.lhs = and i64 %arg0, 4294967295
795 %trunc.rhs = and i64 %arg1, 8589934591
796 %mul = mul i64 %trunc.lhs, %trunc.rhs
797 %add = add i64 %mul, %arg2
; Signed multiply-add where the 32-bit sign extension of both multiplicands is
; spelled as shl/ashr by 32 on i64 values instead of a sext instruction.
; The assertions below expect a single v_mad_i64_i32 (v_mad_co_i64_i32 on
; GFX12) on every target except SI, where they expect v_mul_lo_u32 +
; v_mul_hi_i32 followed by an add/addc pair.
; GFX1100 additionally expects v0 to be copied to v3 before the mad, while
; GFX1150 and GFX12 use v0 directly; presumably this avoids a destination /
; source register overlap on GFX1100 -- TODO confirm against the backend.
801 define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
802 ; CI-LABEL: mad_i64_i32_bitops:
804 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
805 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
806 ; CI-NEXT: s_setpc_b64 s[30:31]
808 ; SI-LABEL: mad_i64_i32_bitops:
810 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
811 ; SI-NEXT: v_mul_lo_u32 v1, v0, v2
812 ; SI-NEXT: v_mul_hi_i32 v2, v0, v2
813 ; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v4
814 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v5, vcc
815 ; SI-NEXT: s_setpc_b64 s[30:31]
817 ; GFX9-LABEL: mad_i64_i32_bitops:
819 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
820 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
821 ; GFX9-NEXT: s_setpc_b64 s[30:31]
823 ; GFX1100-LABEL: mad_i64_i32_bitops:
825 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
826 ; GFX1100-NEXT: v_mov_b32_e32 v3, v0
827 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
828 ; GFX1100-NEXT: v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
829 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
831 ; GFX1150-LABEL: mad_i64_i32_bitops:
833 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v2, v[4:5]
835 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
837 ; GFX12-LABEL: mad_i64_i32_bitops:
839 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
840 ; GFX12-NEXT: s_wait_expcnt 0x0
841 ; GFX12-NEXT: s_wait_samplecnt 0x0
842 ; GFX12-NEXT: s_wait_bvhcnt 0x0
843 ; GFX12-NEXT: s_wait_kmcnt 0x0
844 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v2, v[4:5]
845 ; GFX12-NEXT: s_setpc_b64 s[30:31]
; Sign-extend the low 32 bits of each multiplicand in place (shl 32, ashr 32).
846 %shl.lhs = shl i64 %arg0, 32
847 %trunc.lhs = ashr i64 %shl.lhs, 32
848 %shl.rhs = shl i64 %arg1, 32
849 %trunc.rhs = ashr i64 %shl.rhs, 32
; 64-bit product of the two sign-extended values, then add the i64 addend.
850 %mul = mul i64 %trunc.lhs, %trunc.rhs
851 %add = add i64 %mul, %arg2
855 ; Example from bug report
; The high and low 32-bit halves of one i64 argument are multiplied together
; (mul nuw) and the original i64 argument is added back, so the same register
; pair supplies both multiplicands and the addend.
; The assertions below expect a single v_mad_u64_u32 (v_mad_co_u64_u32 on
; GFX12) reading v1, v0 and v[0:1] on every target except SI, where they
; expect v_mul_lo_u32 + v_mul_hi_u32 and an add/addc pair.
; GFX1100 expects the result in v[2:3] followed by a copy back to v[0:1],
; while GFX1150 and GFX12 write v[0:1] directly; presumably a GFX1100
; operand-overlap restriction -- TODO confirm against the backend.
856 define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
857 ; CI-LABEL: mad_i64_i32_unpack_i64ops:
859 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
861 ; CI-NEXT: s_setpc_b64 s[30:31]
863 ; SI-LABEL: mad_i64_i32_unpack_i64ops:
865 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
866 ; SI-NEXT: v_mul_lo_u32 v2, v1, v0
867 ; SI-NEXT: v_mul_hi_u32 v3, v1, v0
868 ; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
869 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
870 ; SI-NEXT: s_setpc_b64 s[30:31]
872 ; GFX9-LABEL: mad_i64_i32_unpack_i64ops:
874 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
875 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
876 ; GFX9-NEXT: s_setpc_b64 s[30:31]
878 ; GFX1100-LABEL: mad_i64_i32_unpack_i64ops:
880 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881 ; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
882 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
883 ; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
884 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
886 ; GFX1150-LABEL: mad_i64_i32_unpack_i64ops:
888 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, v[0:1]
890 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
892 ; GFX12-LABEL: mad_i64_i32_unpack_i64ops:
894 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
895 ; GFX12-NEXT: s_wait_expcnt 0x0
896 ; GFX12-NEXT: s_wait_samplecnt 0x0
897 ; GFX12-NEXT: s_wait_bvhcnt 0x0
898 ; GFX12-NEXT: s_wait_kmcnt 0x0
899 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[0:1]
900 ; GFX12-NEXT: s_setpc_b64 s[30:31]
; %tmp4 is the high 32 bits of %arg0, %tmp5 the low 32 bits (mask 0xffffffff).
901 %tmp4 = lshr i64 %arg0, 32
902 %tmp5 = and i64 %arg0, 4294967295
; Both multiplicands fit in 32 bits, so the product cannot wrap (hence nuw).
903 %mul = mul nuw i64 %tmp4, %tmp5
904 %mad = add i64 %mul, %arg0
; Kernel variant: both i32 multiplicands and the i64 addend are kernel
; arguments, which the assertions below expect to be loaded into SGPRs with
; s_load instructions. Note that the IR zero-extends the multiplicands (zext),
; even though the function name says "i64_i32".
; Expected lowering per target, as recorded by the assertions:
;   - CI expects one v_mad_u64_u32 with one multiplicand copied to a VGPR.
;   - SI expects v_mul_hi_u32 + s_mul_i32 and a vector add/addc pair.
;   - GFX9 and GFX11 expect scalar s_mul_i32 / s_mul_hi_u32 / s_add_u32 /
;     s_addc_u32, with no mad instruction.
;   - GFX12 expects s_mul_u64 followed by s_add_nc_u64.
908 define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
909 ; CI-LABEL: mad_i64_i32_uniform:
911 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
912 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
913 ; CI-NEXT: s_mov_b32 s3, 0xf000
914 ; CI-NEXT: s_mov_b32 s2, -1
915 ; CI-NEXT: s_waitcnt lgkmcnt(0)
916 ; CI-NEXT: v_mov_b32_e32 v2, s7
917 ; CI-NEXT: v_mov_b32_e32 v0, s0
918 ; CI-NEXT: v_mov_b32_e32 v1, s1
919 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
920 ; CI-NEXT: s_mov_b32 s0, s4
921 ; CI-NEXT: s_mov_b32 s1, s5
922 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
925 ; SI-LABEL: mad_i64_i32_uniform:
927 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
928 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
929 ; SI-NEXT: s_mov_b32 s11, 0xf000
930 ; SI-NEXT: s_mov_b32 s10, -1
931 ; SI-NEXT: s_waitcnt lgkmcnt(0)
932 ; SI-NEXT: v_mov_b32_e32 v0, s7
933 ; SI-NEXT: v_mul_hi_u32 v1, s6, v0
934 ; SI-NEXT: s_mul_i32 s2, s6, s7
935 ; SI-NEXT: v_mov_b32_e32 v0, s2
936 ; SI-NEXT: v_mov_b32_e32 v2, s1
937 ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
938 ; SI-NEXT: s_mov_b32 s8, s4
939 ; SI-NEXT: s_mov_b32 s9, s5
940 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
941 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
944 ; GFX9-LABEL: mad_i64_i32_uniform:
946 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
947 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
948 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
949 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX9-NEXT: s_mul_i32 s3, s6, s7
951 ; GFX9-NEXT: s_mul_hi_u32 s2, s6, s7
952 ; GFX9-NEXT: s_add_u32 s0, s3, s0
953 ; GFX9-NEXT: s_addc_u32 s1, s2, s1
954 ; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
955 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
956 ; GFX9-NEXT: s_endpgm
958 ; GFX11-LABEL: mad_i64_i32_uniform:
960 ; GFX11-NEXT: s_clause 0x1
961 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
962 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
963 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
964 ; GFX11-NEXT: s_mul_i32 s2, s6, s7
965 ; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7
966 ; GFX11-NEXT: s_add_u32 s0, s2, s0
967 ; GFX11-NEXT: s_addc_u32 s1, s3, s1
968 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
969 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
970 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
971 ; GFX11-NEXT: s_nop 0
972 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
973 ; GFX11-NEXT: s_endpgm
975 ; GFX12-LABEL: mad_i64_i32_uniform:
977 ; GFX12-NEXT: s_clause 0x1
978 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
979 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
980 ; GFX12-NEXT: s_mov_b32 s3, 0
981 ; GFX12-NEXT: s_wait_kmcnt 0x0
982 ; GFX12-NEXT: s_mov_b32 s2, s6
983 ; GFX12-NEXT: s_mov_b32 s6, s7
984 ; GFX12-NEXT: s_mov_b32 s7, s3
985 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
986 ; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[6:7]
987 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
988 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
989 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
990 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
991 ; GFX12-NEXT: s_nop 0
992 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
993 ; GFX12-NEXT: s_endpgm
; Unsigned widening of both i32 kernel arguments before the 64-bit multiply.
994 %ext0 = zext i32 %arg0 to i64
995 %ext1 = zext i32 %arg1 to i64
996 %mul = mul i64 %ext0, %ext1
997 %mad = add i64 %mul, %arg2
; The result is written to global memory (addrspace(1)) through %out.
998 store i64 %mad, ptr addrspace(1) %out
; One signed 32-bit x 32-bit product feeds two different i64 adds, and the two
; sums are combined with xor so that both stay live.
; The assertions below expect the multiply to be duplicated into two
; v_mad_i64_i32 instructions (v_mad_co_i64_i32 on GFX12), one per addend, on
; every target except SI. SI expects a single v_mul_lo_u32 / v_mul_hi_i32 pair
; shared by two add/addc pairs.
; GFX1100 expects fresh destination registers (v[6:7] and v[2:3]) instead of
; writing over the inputs as GFX1150 does; presumably an operand-overlap
; restriction -- TODO confirm against the backend.
1002 define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
1003 ; CI-LABEL: mad_i64_i32_twice:
1005 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1006 ; CI-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
1007 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
1008 ; CI-NEXT: v_xor_b32_e32 v1, v3, v1
1009 ; CI-NEXT: v_xor_b32_e32 v0, v2, v0
1010 ; CI-NEXT: s_setpc_b64 s[30:31]
1012 ; SI-LABEL: mad_i64_i32_twice:
1014 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1015 ; SI-NEXT: v_mul_lo_u32 v6, v0, v1
1016 ; SI-NEXT: v_mul_hi_i32 v0, v0, v1
1017 ; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2
1018 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v0, v3, vcc
1019 ; SI-NEXT: v_add_i32_e32 v3, vcc, v6, v4
1020 ; SI-NEXT: v_addc_u32_e32 v0, vcc, v0, v5, vcc
1021 ; SI-NEXT: v_xor_b32_e32 v1, v1, v0
1022 ; SI-NEXT: v_xor_b32_e32 v0, v2, v3
1023 ; SI-NEXT: s_setpc_b64 s[30:31]
1025 ; GFX9-LABEL: mad_i64_i32_twice:
1027 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028 ; GFX9-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
1029 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
1030 ; GFX9-NEXT: v_xor_b32_e32 v1, v3, v1
1031 ; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0
1032 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1034 ; GFX1100-LABEL: mad_i64_i32_twice:
1036 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037 ; GFX1100-NEXT: v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
1038 ; GFX1100-NEXT: v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
1039 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1040 ; GFX1100-NEXT: v_xor_b32_e32 v0, v6, v2
1041 ; GFX1100-NEXT: v_xor_b32_e32 v1, v7, v3
1042 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
1044 ; GFX1150-LABEL: mad_i64_i32_twice:
1046 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047 ; GFX1150-NEXT: v_mad_i64_i32 v[2:3], null, v0, v1, v[2:3]
1048 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, v[4:5]
1049 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1050 ; GFX1150-NEXT: v_xor_b32_e32 v0, v2, v0
1051 ; GFX1150-NEXT: v_xor_b32_e32 v1, v3, v1
1052 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
1054 ; GFX12-LABEL: mad_i64_i32_twice:
1056 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1057 ; GFX12-NEXT: s_wait_expcnt 0x0
1058 ; GFX12-NEXT: s_wait_samplecnt 0x0
1059 ; GFX12-NEXT: s_wait_bvhcnt 0x0
1060 ; GFX12-NEXT: s_wait_kmcnt 0x0
1061 ; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, v0, v1, v[2:3]
1062 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[4:5]
1063 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1064 ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
1065 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
1066 ; GFX12-NEXT: s_setpc_b64 s[30:31]
1067 %sext0 = sext i32 %arg0 to i64
1068 %sext1 = sext i32 %arg1 to i64
; %mul has two users (%mad1 and %mad2); that double use is the point of the test.
1069 %mul = mul i64 %sext0, %sext1
1070 %mad1 = add i64 %mul, %arg2
1071 %mad2 = add i64 %mul, %arg3
1072 %out = xor i64 %mad1, %mad2
; One signed 32-bit x 32-bit product feeds three different i64 adds, and the
; three sums are folded together with xor so that all of them stay live.
; Expected lowering per target, as recorded by the assertions below:
;   - CI, GFX1100, GFX1150 and GFX12 expect the product to be computed once by
;     a mad with a zero addend, followed by three separate add/add-with-carry
;     pairs.
;   - GFX9 expects three v_mad_i64_i32 instructions, one per addend.
;   - SI expects one v_mul_lo_u32 / v_mul_hi_i32 pair and three add/addc pairs.
; Compare with mad_i64_i32_twice, where two uses are still expected to become
; two mads on CI and GFX11/GFX12.
1076 define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) #0 {
1077 ; CI-LABEL: mad_i64_i32_thrice:
1079 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1080 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
1081 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
1082 ; CI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
1083 ; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v4
1084 ; CI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
1085 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v6
1086 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
1087 ; CI-NEXT: v_xor_b32_e32 v3, v3, v5
1088 ; CI-NEXT: v_xor_b32_e32 v2, v2, v4
1089 ; CI-NEXT: v_xor_b32_e32 v1, v3, v1
1090 ; CI-NEXT: v_xor_b32_e32 v0, v2, v0
1091 ; CI-NEXT: s_setpc_b64 s[30:31]
1093 ; SI-LABEL: mad_i64_i32_thrice:
1095 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1096 ; SI-NEXT: v_mul_lo_u32 v8, v0, v1
1097 ; SI-NEXT: v_mul_hi_i32 v0, v0, v1
1098 ; SI-NEXT: v_add_i32_e32 v1, vcc, v8, v2
1099 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc
1100 ; SI-NEXT: v_add_i32_e32 v3, vcc, v8, v4
1101 ; SI-NEXT: v_addc_u32_e32 v4, vcc, v0, v5, vcc
1102 ; SI-NEXT: v_add_i32_e32 v5, vcc, v8, v6
1103 ; SI-NEXT: v_addc_u32_e32 v0, vcc, v0, v7, vcc
1104 ; SI-NEXT: v_xor_b32_e32 v2, v2, v4
1105 ; SI-NEXT: v_xor_b32_e32 v3, v1, v3
1106 ; SI-NEXT: v_xor_b32_e32 v1, v2, v0
1107 ; SI-NEXT: v_xor_b32_e32 v0, v3, v5
1108 ; SI-NEXT: s_setpc_b64 s[30:31]
1110 ; GFX9-LABEL: mad_i64_i32_thrice:
1112 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1113 ; GFX9-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
1114 ; GFX9-NEXT: v_mad_i64_i32 v[4:5], s[4:5], v0, v1, v[4:5]
1115 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[6:7]
1116 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v5
1117 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4
1118 ; GFX9-NEXT: v_xor_b32_e32 v1, v3, v1
1119 ; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0
1120 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1122 ; GFX1100-LABEL: mad_i64_i32_thrice:
1124 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125 ; GFX1100-NEXT: v_mad_i64_i32 v[8:9], null, v0, v1, 0
1126 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1127 ; GFX1100-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2
1128 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
1129 ; GFX1100-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
1130 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
1131 ; GFX1100-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6
1132 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
1133 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1134 ; GFX1100-NEXT: v_xor_b32_e32 v0, v0, v2
1135 ; GFX1100-NEXT: v_xor_b32_e32 v1, v1, v3
1136 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1137 ; GFX1100-NEXT: v_xor_b32_e32 v0, v0, v4
1138 ; GFX1100-NEXT: v_xor_b32_e32 v1, v1, v5
1139 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
1141 ; GFX1150-LABEL: mad_i64_i32_thrice:
1143 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, 0
1145 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1146 ; GFX1150-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
1147 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1148 ; GFX1150-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4
1149 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
1150 ; GFX1150-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
1151 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
1152 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1153 ; GFX1150-NEXT: v_xor_b32_e32 v2, v2, v4
1154 ; GFX1150-NEXT: v_xor_b32_e32 v3, v3, v5
1155 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1156 ; GFX1150-NEXT: v_xor_b32_e32 v0, v2, v0
1157 ; GFX1150-NEXT: v_xor_b32_e32 v1, v3, v1
1158 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
1160 ; GFX12-LABEL: mad_i64_i32_thrice:
1162 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1163 ; GFX12-NEXT: s_wait_expcnt 0x0
1164 ; GFX12-NEXT: s_wait_samplecnt 0x0
1165 ; GFX12-NEXT: s_wait_bvhcnt 0x0
1166 ; GFX12-NEXT: s_wait_kmcnt 0x0
1167 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
1168 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1169 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
1170 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1171 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4
1172 ; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
1173 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
1174 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
1175 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1176 ; GFX12-NEXT: v_xor_b32_e32 v2, v2, v4
1177 ; GFX12-NEXT: v_xor_b32_e32 v3, v3, v5
1178 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1179 ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
1180 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
1181 ; GFX12-NEXT: s_setpc_b64 s[30:31]
1182 %sext0 = sext i32 %arg0 to i64
1183 %sext1 = sext i32 %arg1 to i64
; %mul has three users (%mad1, %mad2, %mad3).
1184 %mul = mul i64 %sext0, %sext1
1185 %mad1 = add i64 %mul, %arg2
1186 %mad2 = add i64 %mul, %arg3
1187 %mad3 = add i64 %mul, %arg4
; Combine all three sums so none of the adds is dead.
1188 %out.p = xor i64 %mad1, %mad2
1189 %out = xor i64 %out.p, %mad3
; The signed 32-bit x 32-bit product is used by one add and also directly (by
; the final xor), so the plain product must remain available next to the sum.
; Expected lowering per target, as recorded by the assertions below:
;   - CI, GFX1100, GFX1150 and GFX12 expect one mad with a zero addend to
;     produce the product, then a separate add/add-with-carry pair for the sum.
;   - GFX9 expects two v_mad_i64_i32 instructions: one with a zero addend for
;     the product and one with v[2:3] as addend for the sum.
;   - SI expects v_mul_lo_u32 / v_mul_hi_i32 and one add/addc pair.
1193 define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
1194 ; CI-LABEL: mad_i64_i32_secondary_use:
1196 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1197 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
1198 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
1199 ; CI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
1200 ; CI-NEXT: v_xor_b32_e32 v1, v3, v1
1201 ; CI-NEXT: v_xor_b32_e32 v0, v2, v0
1202 ; CI-NEXT: s_setpc_b64 s[30:31]
1204 ; SI-LABEL: mad_i64_i32_secondary_use:
1206 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1207 ; SI-NEXT: v_mul_lo_u32 v4, v0, v1
1208 ; SI-NEXT: v_mul_hi_i32 v0, v0, v1
1209 ; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
1210 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v0, v3, vcc
1211 ; SI-NEXT: v_xor_b32_e32 v1, v1, v0
1212 ; SI-NEXT: v_xor_b32_e32 v0, v2, v4
1213 ; SI-NEXT: s_setpc_b64 s[30:31]
1215 ; GFX9-LABEL: mad_i64_i32_secondary_use:
1217 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1218 ; GFX9-NEXT: v_mad_i64_i32 v[4:5], s[4:5], v0, v1, 0
1219 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
1220 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5
1221 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
1222 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1224 ; GFX1100-LABEL: mad_i64_i32_secondary_use:
1226 ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1227 ; GFX1100-NEXT: v_mad_i64_i32 v[4:5], null, v0, v1, 0
1228 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1229 ; GFX1100-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
1230 ; GFX1100-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
1231 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1232 ; GFX1100-NEXT: v_xor_b32_e32 v0, v0, v4
1233 ; GFX1100-NEXT: v_xor_b32_e32 v1, v1, v5
1234 ; GFX1100-NEXT: s_setpc_b64 s[30:31]
1236 ; GFX1150-LABEL: mad_i64_i32_secondary_use:
1238 ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1239 ; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, v0, v1, 0
1240 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1241 ; GFX1150-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
1242 ; GFX1150-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1243 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1244 ; GFX1150-NEXT: v_xor_b32_e32 v0, v2, v0
1245 ; GFX1150-NEXT: v_xor_b32_e32 v1, v3, v1
1246 ; GFX1150-NEXT: s_setpc_b64 s[30:31]
1248 ; GFX12-LABEL: mad_i64_i32_secondary_use:
1250 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1251 ; GFX12-NEXT: s_wait_expcnt 0x0
1252 ; GFX12-NEXT: s_wait_samplecnt 0x0
1253 ; GFX12-NEXT: s_wait_bvhcnt 0x0
1254 ; GFX12-NEXT: s_wait_kmcnt 0x0
1255 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
1256 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1257 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
1258 ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1259 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1260 ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
1261 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
1262 ; GFX12-NEXT: s_setpc_b64 s[30:31]
1263 %sext0 = sext i32 %arg0 to i64
1264 %sext1 = sext i32 %arg1 to i64
1265 %mul = mul i64 %sext0, %sext1
1266 %mad = add i64 %mul, %arg2
; Second, non-add use of %mul: the raw product is xor'ed with the sum.
1267 %out = xor i64 %mad, %mul
; Multiply-add on the non-power-of-two integer type i48, with no explicit
; extension or masking of the operands in the IR.
; Expected lowering per target, as recorded by the assertions below:
;   - CI, GFX9, GFX11 and GFX12 expect one v_mad_u64_u32 (v_mad_co_u64_u32 on
;     GFX12) of the two low dwords with the addend, plus two v_mul_lo_u32 cross
;     products that are added into the high result dword (two v_add_i32 on CI,
;     one v_add3_u32 on GFX9 and later).
;   - SI expects v_mul_hi_u32 and three v_mul_lo_u32 followed by adds, with no
;     mad instruction.
; Both gfx1100 and gfx1150 share the common GFX11 assertions for this function.
1271 define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
1272 ; CI-LABEL: mad_i48_i48:
1274 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1275 ; CI-NEXT: v_mov_b32_e32 v6, v1
1276 ; CI-NEXT: v_mov_b32_e32 v7, v0
1277 ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
1278 ; CI-NEXT: v_mul_lo_u32 v2, v6, v2
1279 ; CI-NEXT: v_mul_lo_u32 v3, v7, v3
1280 ; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
1281 ; CI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
1282 ; CI-NEXT: s_setpc_b64 s[30:31]
1284 ; SI-LABEL: mad_i48_i48:
1286 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1287 ; SI-NEXT: v_mul_lo_u32 v3, v0, v3
1288 ; SI-NEXT: v_mul_hi_u32 v6, v0, v2
1289 ; SI-NEXT: v_mul_lo_u32 v1, v1, v2
1290 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
1291 ; SI-NEXT: v_add_i32_e32 v3, vcc, v6, v3
1292 ; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1
1293 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
1294 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
1295 ; SI-NEXT: s_setpc_b64 s[30:31]
1297 ; GFX9-LABEL: mad_i48_i48:
1299 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1300 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
1301 ; GFX9-NEXT: v_mov_b32_e32 v7, v0
1302 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
1303 ; GFX9-NEXT: v_mul_lo_u32 v3, v7, v3
1304 ; GFX9-NEXT: v_mul_lo_u32 v2, v6, v2
1305 ; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
1306 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1308 ; GFX11-LABEL: mad_i48_i48:
1310 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1311 ; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
1312 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1313 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
1314 ; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3
1315 ; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
1316 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1317 ; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3
1318 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1320 ; GFX12-LABEL: mad_i48_i48:
1322 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1323 ; GFX12-NEXT: s_wait_expcnt 0x0
1324 ; GFX12-NEXT: s_wait_samplecnt 0x0
1325 ; GFX12-NEXT: s_wait_bvhcnt 0x0
1326 ; GFX12-NEXT: s_wait_kmcnt 0x0
1327 ; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
1328 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1329 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v7, v2, v[4:5]
1330 ; GFX12-NEXT: v_mul_lo_u32 v3, v7, v3
1331 ; GFX12-NEXT: v_mul_lo_u32 v2, v6, v2
1332 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1333 ; GFX12-NEXT: v_add3_u32 v1, v2, v1, v3
1334 ; GFX12-NEXT: s_setpc_b64 s[30:31]
; Full-width i48 multiply followed by an i48 add; no 32-bit narrowing in the IR.
1335 %m = mul i48 %arg0, %arg1
1336 %a = add i48 %m, %arg2
; Attribute groups. #0 (nounwind) is the group attached to every function
; definition visible in this part of the file.
1340 attributes #0 = { nounwind }
; #1 is not referenced by any definition visible in this part of the file;
; presumably it is used by a declaration elsewhere -- TODO confirm before removing.
1341 attributes #1 = { nounwind readnone speculatable }