1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
10 ; mul24 and mad24 are affected
; Vector multiply of two <2 x i32> values loaded from adjacent slots of %in
; (%in itself and %in advanced by one <2 x i32> element); the product is
; stored to %out. The checks expect one 32-bit low multiply per lane
; (v_mul_lo_u32 on the amdgcn runs, MULLO_INT on the r600 run).
12 define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
13 ; SI-LABEL: test_mul_v2i32:
14 ; SI: ; %bb.0: ; %entry
15 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
16 ; SI-NEXT: s_mov_b32 s7, 0xf000
17 ; SI-NEXT: s_mov_b32 s6, -1
18 ; SI-NEXT: s_mov_b32 s10, s6
19 ; SI-NEXT: s_mov_b32 s11, s7
20 ; SI-NEXT: s_waitcnt lgkmcnt(0)
21 ; SI-NEXT: s_mov_b32 s8, s2
22 ; SI-NEXT: s_mov_b32 s9, s3
23 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
24 ; SI-NEXT: s_mov_b32 s4, s0
25 ; SI-NEXT: s_mov_b32 s5, s1
26 ; SI-NEXT: s_waitcnt vmcnt(0)
27 ; SI-NEXT: v_mul_lo_u32 v1, v1, v3
28 ; SI-NEXT: v_mul_lo_u32 v0, v0, v2
29 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
32 ; VI-LABEL: test_mul_v2i32:
33 ; VI: ; %bb.0: ; %entry
34 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
35 ; VI-NEXT: s_mov_b32 s7, 0xf000
36 ; VI-NEXT: s_mov_b32 s6, -1
37 ; VI-NEXT: s_mov_b32 s10, s6
38 ; VI-NEXT: s_mov_b32 s11, s7
39 ; VI-NEXT: s_waitcnt lgkmcnt(0)
40 ; VI-NEXT: s_mov_b32 s8, s2
41 ; VI-NEXT: s_mov_b32 s9, s3
42 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
43 ; VI-NEXT: s_mov_b32 s4, s0
44 ; VI-NEXT: s_mov_b32 s5, s1
45 ; VI-NEXT: s_waitcnt vmcnt(0)
46 ; VI-NEXT: v_mul_lo_u32 v1, v1, v3
47 ; VI-NEXT: v_mul_lo_u32 v0, v0, v2
48 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
51 ; GFX9-LABEL: test_mul_v2i32:
52 ; GFX9: ; %bb.0: ; %entry
53 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
54 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
55 ; GFX9-NEXT: s_mov_b32 s2, -1
56 ; GFX9-NEXT: s_mov_b32 s10, s2
57 ; GFX9-NEXT: s_mov_b32 s11, s3
58 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
59 ; GFX9-NEXT: s_mov_b32 s8, s6
60 ; GFX9-NEXT: s_mov_b32 s9, s7
61 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
62 ; GFX9-NEXT: s_mov_b32 s0, s4
63 ; GFX9-NEXT: s_mov_b32 s1, s5
64 ; GFX9-NEXT: s_waitcnt vmcnt(0)
65 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3
66 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
67 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
70 ; GFX10-LABEL: test_mul_v2i32:
71 ; GFX10: ; %bb.0: ; %entry
72 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
73 ; GFX10-NEXT: s_mov_b32 s2, -1
74 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
75 ; GFX10-NEXT: s_mov_b32 s10, s2
76 ; GFX10-NEXT: s_mov_b32 s11, s3
77 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
78 ; GFX10-NEXT: s_mov_b32 s8, s6
79 ; GFX10-NEXT: s_mov_b32 s9, s7
80 ; GFX10-NEXT: s_mov_b32 s0, s4
81 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
82 ; GFX10-NEXT: s_mov_b32 s1, s5
83 ; GFX10-NEXT: s_waitcnt vmcnt(0)
84 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
85 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
86 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
87 ; GFX10-NEXT: s_endpgm
89 ; GFX11-LABEL: test_mul_v2i32:
90 ; GFX11: ; %bb.0: ; %entry
91 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
92 ; GFX11-NEXT: s_mov_b32 s6, -1
93 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
94 ; GFX11-NEXT: s_mov_b32 s10, s6
95 ; GFX11-NEXT: s_mov_b32 s11, s7
96 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX11-NEXT: s_mov_b32 s8, s2
98 ; GFX11-NEXT: s_mov_b32 s9, s3
99 ; GFX11-NEXT: s_mov_b32 s4, s0
100 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
101 ; GFX11-NEXT: s_mov_b32 s5, s1
102 ; GFX11-NEXT: s_waitcnt vmcnt(0)
103 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3
104 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2
105 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
106 ; GFX11-NEXT: s_nop 0
107 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
108 ; GFX11-NEXT: s_endpgm
110 ; GFX12-LABEL: test_mul_v2i32:
111 ; GFX12: ; %bb.0: ; %entry
112 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
113 ; GFX12-NEXT: s_mov_b32 s6, -1
114 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
115 ; GFX12-NEXT: s_mov_b32 s10, s6
116 ; GFX12-NEXT: s_mov_b32 s11, s7
117 ; GFX12-NEXT: s_wait_kmcnt 0x0
118 ; GFX12-NEXT: s_mov_b32 s8, s2
119 ; GFX12-NEXT: s_mov_b32 s9, s3
120 ; GFX12-NEXT: s_mov_b32 s4, s0
121 ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
122 ; GFX12-NEXT: s_mov_b32 s5, s1
123 ; GFX12-NEXT: s_wait_loadcnt 0x0
124 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
125 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
126 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
127 ; GFX12-NEXT: s_nop 0
128 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
129 ; GFX12-NEXT: s_endpgm
131 ; EG-LABEL: test_mul_v2i32:
132 ; EG: ; %bb.0: ; %entry
133 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
135 ; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
136 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
139 ; EG-NEXT: Fetch clause starting at 6:
140 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
141 ; EG-NEXT: ALU clause starting at 8:
142 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
143 ; EG-NEXT: ALU clause starting at 9:
144 ; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T0.W,
145 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
146 ; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Z,
147 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
149 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
150 %a = load <2 x i32>, ptr addrspace(1) %in
151 %b = load <2 x i32>, ptr addrspace(1) %b_ptr
152 %result = mul <2 x i32> %a, %b
153 store <2 x i32> %result, ptr addrspace(1) %out
; Vector multiply of two <4 x i32> values loaded from adjacent slots of %in
; (%in itself and %in advanced by one <4 x i32> element, i.e. byte offset 16
; in the checked loads); the product is stored to %out. The checks expect four
; 32-bit low multiplies, one per lane (v_mul_lo_u32 on the amdgcn runs,
; MULLO_INT on the r600 run).
157 define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
158 ; SI-LABEL: v_mul_v4i32:
159 ; SI: ; %bb.0: ; %entry
160 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
161 ; SI-NEXT: s_mov_b32 s7, 0xf000
162 ; SI-NEXT: s_mov_b32 s6, -1
163 ; SI-NEXT: s_mov_b32 s10, s6
164 ; SI-NEXT: s_mov_b32 s11, s7
165 ; SI-NEXT: s_waitcnt lgkmcnt(0)
166 ; SI-NEXT: s_mov_b32 s8, s2
167 ; SI-NEXT: s_mov_b32 s9, s3
168 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
169 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
170 ; SI-NEXT: s_mov_b32 s4, s0
171 ; SI-NEXT: s_mov_b32 s5, s1
172 ; SI-NEXT: s_waitcnt vmcnt(0)
173 ; SI-NEXT: v_mul_lo_u32 v3, v3, v7
174 ; SI-NEXT: v_mul_lo_u32 v2, v2, v6
175 ; SI-NEXT: v_mul_lo_u32 v1, v1, v5
176 ; SI-NEXT: v_mul_lo_u32 v0, v0, v4
177 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
180 ; VI-LABEL: v_mul_v4i32:
181 ; VI: ; %bb.0: ; %entry
182 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
183 ; VI-NEXT: s_mov_b32 s7, 0xf000
184 ; VI-NEXT: s_mov_b32 s6, -1
185 ; VI-NEXT: s_mov_b32 s10, s6
186 ; VI-NEXT: s_mov_b32 s11, s7
187 ; VI-NEXT: s_waitcnt lgkmcnt(0)
188 ; VI-NEXT: s_mov_b32 s8, s2
189 ; VI-NEXT: s_mov_b32 s9, s3
190 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
191 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
192 ; VI-NEXT: s_mov_b32 s4, s0
193 ; VI-NEXT: s_mov_b32 s5, s1
194 ; VI-NEXT: s_waitcnt vmcnt(0)
195 ; VI-NEXT: v_mul_lo_u32 v3, v3, v7
196 ; VI-NEXT: v_mul_lo_u32 v2, v2, v6
197 ; VI-NEXT: v_mul_lo_u32 v1, v1, v5
198 ; VI-NEXT: v_mul_lo_u32 v0, v0, v4
199 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
202 ; GFX9-LABEL: v_mul_v4i32:
203 ; GFX9: ; %bb.0: ; %entry
204 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
205 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
206 ; GFX9-NEXT: s_mov_b32 s2, -1
207 ; GFX9-NEXT: s_mov_b32 s10, s2
208 ; GFX9-NEXT: s_mov_b32 s11, s3
209 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX9-NEXT: s_mov_b32 s8, s6
211 ; GFX9-NEXT: s_mov_b32 s9, s7
212 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
213 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
214 ; GFX9-NEXT: s_mov_b32 s0, s4
215 ; GFX9-NEXT: s_mov_b32 s1, s5
216 ; GFX9-NEXT: s_waitcnt vmcnt(0)
217 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7
218 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6
219 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5
220 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4
221 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
222 ; GFX9-NEXT: s_endpgm
224 ; GFX10-LABEL: v_mul_v4i32:
225 ; GFX10: ; %bb.0: ; %entry
226 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
227 ; GFX10-NEXT: s_mov_b32 s2, -1
228 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
229 ; GFX10-NEXT: s_mov_b32 s10, s2
230 ; GFX10-NEXT: s_mov_b32 s11, s3
231 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
232 ; GFX10-NEXT: s_mov_b32 s8, s6
233 ; GFX10-NEXT: s_mov_b32 s9, s7
234 ; GFX10-NEXT: s_clause 0x1
235 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
236 ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
237 ; GFX10-NEXT: s_mov_b32 s0, s4
238 ; GFX10-NEXT: s_mov_b32 s1, s5
239 ; GFX10-NEXT: s_waitcnt vmcnt(0)
240 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7
241 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6
242 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5
243 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
244 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
245 ; GFX10-NEXT: s_endpgm
247 ; GFX11-LABEL: v_mul_v4i32:
248 ; GFX11: ; %bb.0: ; %entry
249 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
250 ; GFX11-NEXT: s_mov_b32 s6, -1
251 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
252 ; GFX11-NEXT: s_mov_b32 s10, s6
253 ; GFX11-NEXT: s_mov_b32 s11, s7
254 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX11-NEXT: s_mov_b32 s8, s2
256 ; GFX11-NEXT: s_mov_b32 s9, s3
257 ; GFX11-NEXT: s_clause 0x1
258 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
259 ; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
260 ; GFX11-NEXT: s_mov_b32 s4, s0
261 ; GFX11-NEXT: s_mov_b32 s5, s1
262 ; GFX11-NEXT: s_waitcnt vmcnt(0)
263 ; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7
264 ; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6
265 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5
266 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4
267 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
268 ; GFX11-NEXT: s_nop 0
269 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
270 ; GFX11-NEXT: s_endpgm
272 ; GFX12-LABEL: v_mul_v4i32:
273 ; GFX12: ; %bb.0: ; %entry
274 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
275 ; GFX12-NEXT: s_mov_b32 s6, -1
276 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
277 ; GFX12-NEXT: s_mov_b32 s10, s6
278 ; GFX12-NEXT: s_mov_b32 s11, s7
279 ; GFX12-NEXT: s_wait_kmcnt 0x0
280 ; GFX12-NEXT: s_mov_b32 s8, s2
281 ; GFX12-NEXT: s_mov_b32 s9, s3
282 ; GFX12-NEXT: s_clause 0x1
283 ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
284 ; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16
285 ; GFX12-NEXT: s_mov_b32 s4, s0
286 ; GFX12-NEXT: s_mov_b32 s5, s1
287 ; GFX12-NEXT: s_wait_loadcnt 0x0
288 ; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7
289 ; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6
290 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5
291 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4
292 ; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
293 ; GFX12-NEXT: s_nop 0
294 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
295 ; GFX12-NEXT: s_endpgm
297 ; EG-LABEL: v_mul_v4i32:
298 ; EG: ; %bb.0: ; %entry
299 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
301 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
302 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
305 ; EG-NEXT: Fetch clause starting at 6:
306 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
307 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
308 ; EG-NEXT: ALU clause starting at 10:
309 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
310 ; EG-NEXT: ALU clause starting at 11:
311 ; EG-NEXT: MULLO_INT * T0.W, T0.W, T1.W,
312 ; EG-NEXT: MULLO_INT * T0.Z, T0.Z, T1.Z,
313 ; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.Y,
314 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
315 ; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X,
316 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
318 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
319 %a = load <4 x i32>, ptr addrspace(1) %in
320 %b = load <4 x i32>, ptr addrspace(1) %b_ptr
321 %result = mul <4 x i32> %a, %b
322 store <4 x i32> %result, ptr addrspace(1) %out
; Multiplies the two i64 kernel arguments %a and %b, truncates the product to
; i32 and stores it to %out. Because only the low 32 bits are kept, the checks
; expect a single scalar 32-bit multiply of the low halves (s_mul_i32 on the
; amdgcn runs, MULLO_INT on the r600 run) and no high-half multiply.
326 define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
327 ; SI-LABEL: s_trunc_i64_mul_to_i32:
328 ; SI: ; %bb.0: ; %entry
329 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
330 ; SI-NEXT: s_waitcnt lgkmcnt(0)
331 ; SI-NEXT: s_load_dword s7, s[2:3], 0xd
332 ; SI-NEXT: s_mov_b32 s3, 0xf000
333 ; SI-NEXT: s_mov_b32 s2, -1
334 ; SI-NEXT: s_mov_b32 s0, s4
335 ; SI-NEXT: s_waitcnt lgkmcnt(0)
336 ; SI-NEXT: s_mul_i32 s4, s7, s6
337 ; SI-NEXT: s_mov_b32 s1, s5
338 ; SI-NEXT: v_mov_b32_e32 v0, s4
339 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
342 ; VI-LABEL: s_trunc_i64_mul_to_i32:
343 ; VI: ; %bb.0: ; %entry
344 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
345 ; VI-NEXT: s_waitcnt lgkmcnt(0)
346 ; VI-NEXT: s_load_dword s7, s[2:3], 0x34
347 ; VI-NEXT: s_mov_b32 s3, 0xf000
348 ; VI-NEXT: s_mov_b32 s2, -1
349 ; VI-NEXT: s_mov_b32 s0, s4
350 ; VI-NEXT: s_waitcnt lgkmcnt(0)
351 ; VI-NEXT: s_mul_i32 s4, s7, s6
352 ; VI-NEXT: s_mov_b32 s1, s5
353 ; VI-NEXT: v_mov_b32_e32 v0, s4
354 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
357 ; GFX9-LABEL: s_trunc_i64_mul_to_i32:
358 ; GFX9: ; %bb.0: ; %entry
359 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
360 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
361 ; GFX9-NEXT: s_load_dword s7, s[2:3], 0x34
362 ; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3
363 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
364 ; GFX9-NEXT: s_mov_b32 s2, -1
365 ; GFX9-NEXT: s_mov_b32 s0, s4
366 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
367 ; GFX9-NEXT: s_mul_i32 s4, s7, s6
368 ; GFX9-NEXT: s_mov_b32 s1, s5
369 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
370 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
371 ; GFX9-NEXT: s_endpgm
373 ; GFX10-LABEL: s_trunc_i64_mul_to_i32:
374 ; GFX10: ; %bb.0: ; %entry
375 ; GFX10-NEXT: s_clause 0x1
376 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
377 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x34
378 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
379 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
380 ; GFX10-NEXT: s_mul_i32 s0, s0, s6
381 ; GFX10-NEXT: s_mov_b32 s6, -1
382 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
383 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
384 ; GFX10-NEXT: s_endpgm
386 ; GFX11-LABEL: s_trunc_i64_mul_to_i32:
387 ; GFX11: ; %bb.0: ; %entry
388 ; GFX11-NEXT: s_clause 0x1
389 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
390 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34
391 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
392 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
393 ; GFX11-NEXT: s_mul_i32 s0, s0, s6
394 ; GFX11-NEXT: s_mov_b32 s6, -1
395 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
396 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
397 ; GFX11-NEXT: s_nop 0
398 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
399 ; GFX11-NEXT: s_endpgm
401 ; GFX12-LABEL: s_trunc_i64_mul_to_i32:
402 ; GFX12: ; %bb.0: ; %entry
403 ; GFX12-NEXT: s_clause 0x1
404 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
405 ; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x34
406 ; GFX12-NEXT: s_wait_kmcnt 0x0
407 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
408 ; GFX12-NEXT: s_mul_i32 s0, s0, s6
409 ; GFX12-NEXT: s_mov_b32 s6, -1
410 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
411 ; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
412 ; GFX12-NEXT: s_nop 0
413 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
414 ; GFX12-NEXT: s_endpgm
416 ; EG-LABEL: s_trunc_i64_mul_to_i32:
417 ; EG: ; %bb.0: ; %entry
418 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
419 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
422 ; EG-NEXT: ALU clause starting at 4:
423 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
424 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
425 ; EG-NEXT: MULLO_INT * T1.X, KC0[3].Y, KC0[2].W,
427 %mul = mul i64 %b, %a
428 %trunc = trunc i64 %mul to i32
429 store i32 %trunc, ptr addrspace(1) %out, align 8
; Memory-operand variant of the truncated 64-bit multiply: two i64 values are
; loaded from %aptr and %bptr, multiplied, truncated to i32 and stored to
; %out. The checks expect only a 32-bit load per operand (buffer_load_dword /
; buffer_load_b32 on the amdgcn runs, VTX_READ_32 on the r600 run) followed by
; a single low multiply (v_mul_lo_u32 / MULLO_INT).
433 define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
434 ; SI-LABEL: v_trunc_i64_mul_to_i32:
435 ; SI: ; %bb.0: ; %entry
436 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
437 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
438 ; SI-NEXT: s_mov_b32 s11, 0xf000
439 ; SI-NEXT: s_mov_b32 s10, -1
440 ; SI-NEXT: s_mov_b32 s14, s10
441 ; SI-NEXT: s_waitcnt lgkmcnt(0)
442 ; SI-NEXT: s_mov_b32 s12, s6
443 ; SI-NEXT: s_mov_b32 s13, s7
444 ; SI-NEXT: s_mov_b32 s15, s11
445 ; SI-NEXT: s_mov_b32 s2, s10
446 ; SI-NEXT: s_mov_b32 s3, s11
447 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
448 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
449 ; SI-NEXT: s_mov_b32 s8, s4
450 ; SI-NEXT: s_mov_b32 s9, s5
451 ; SI-NEXT: s_waitcnt vmcnt(0)
452 ; SI-NEXT: v_mul_lo_u32 v0, v1, v0
453 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
456 ; VI-LABEL: v_trunc_i64_mul_to_i32:
457 ; VI: ; %bb.0: ; %entry
458 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
459 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
460 ; VI-NEXT: s_mov_b32 s11, 0xf000
461 ; VI-NEXT: s_mov_b32 s10, -1
462 ; VI-NEXT: s_mov_b32 s14, s10
463 ; VI-NEXT: s_waitcnt lgkmcnt(0)
464 ; VI-NEXT: s_mov_b32 s12, s6
465 ; VI-NEXT: s_mov_b32 s13, s7
466 ; VI-NEXT: s_mov_b32 s15, s11
467 ; VI-NEXT: s_mov_b32 s2, s10
468 ; VI-NEXT: s_mov_b32 s3, s11
469 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
470 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0
471 ; VI-NEXT: s_mov_b32 s8, s4
472 ; VI-NEXT: s_mov_b32 s9, s5
473 ; VI-NEXT: s_waitcnt vmcnt(0)
474 ; VI-NEXT: v_mul_lo_u32 v0, v1, v0
475 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
478 ; GFX9-LABEL: v_trunc_i64_mul_to_i32:
479 ; GFX9: ; %bb.0: ; %entry
480 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
481 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
482 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
483 ; GFX9-NEXT: s_mov_b32 s10, -1
484 ; GFX9-NEXT: s_mov_b32 s14, s10
485 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
486 ; GFX9-NEXT: s_mov_b32 s12, s6
487 ; GFX9-NEXT: s_mov_b32 s13, s7
488 ; GFX9-NEXT: s_mov_b32 s15, s11
489 ; GFX9-NEXT: s_mov_b32 s2, s10
490 ; GFX9-NEXT: s_mov_b32 s3, s11
491 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0
492 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0
493 ; GFX9-NEXT: s_mov_b32 s8, s4
494 ; GFX9-NEXT: s_mov_b32 s9, s5
495 ; GFX9-NEXT: s_waitcnt vmcnt(0)
496 ; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0
497 ; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
498 ; GFX9-NEXT: s_endpgm
500 ; GFX10-LABEL: v_trunc_i64_mul_to_i32:
501 ; GFX10: ; %bb.0: ; %entry
502 ; GFX10-NEXT: s_clause 0x1
503 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
504 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
505 ; GFX10-NEXT: s_mov_b32 s10, -1
506 ; GFX10-NEXT: s_mov_b32 s11, 0x31016000
507 ; GFX10-NEXT: s_mov_b32 s14, s10
508 ; GFX10-NEXT: s_mov_b32 s15, s11
509 ; GFX10-NEXT: s_mov_b32 s2, s10
510 ; GFX10-NEXT: s_mov_b32 s3, s11
511 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
512 ; GFX10-NEXT: s_mov_b32 s12, s6
513 ; GFX10-NEXT: s_mov_b32 s13, s7
514 ; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0
515 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0
516 ; GFX10-NEXT: s_mov_b32 s8, s4
517 ; GFX10-NEXT: s_mov_b32 s9, s5
518 ; GFX10-NEXT: s_waitcnt vmcnt(0)
519 ; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0
520 ; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0
521 ; GFX10-NEXT: s_endpgm
523 ; GFX11-LABEL: v_trunc_i64_mul_to_i32:
524 ; GFX11: ; %bb.0: ; %entry
525 ; GFX11-NEXT: s_clause 0x1
526 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
527 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
528 ; GFX11-NEXT: s_mov_b32 s10, -1
529 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
530 ; GFX11-NEXT: s_mov_b32 s14, s10
531 ; GFX11-NEXT: s_mov_b32 s15, s11
532 ; GFX11-NEXT: s_mov_b32 s2, s10
533 ; GFX11-NEXT: s_mov_b32 s3, s11
534 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX11-NEXT: s_mov_b32 s12, s6
536 ; GFX11-NEXT: s_mov_b32 s13, s7
537 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
538 ; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
539 ; GFX11-NEXT: s_mov_b32 s8, s4
540 ; GFX11-NEXT: s_mov_b32 s9, s5
541 ; GFX11-NEXT: s_waitcnt vmcnt(0)
542 ; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0
543 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
544 ; GFX11-NEXT: s_nop 0
545 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
546 ; GFX11-NEXT: s_endpgm
548 ; GFX12-LABEL: v_trunc_i64_mul_to_i32:
549 ; GFX12: ; %bb.0: ; %entry
550 ; GFX12-NEXT: s_clause 0x1
551 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
552 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
553 ; GFX12-NEXT: s_mov_b32 s10, -1
554 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
555 ; GFX12-NEXT: s_mov_b32 s14, s10
556 ; GFX12-NEXT: s_mov_b32 s15, s11
557 ; GFX12-NEXT: s_mov_b32 s2, s10
558 ; GFX12-NEXT: s_mov_b32 s3, s11
559 ; GFX12-NEXT: s_wait_kmcnt 0x0
560 ; GFX12-NEXT: s_mov_b32 s12, s6
561 ; GFX12-NEXT: s_mov_b32 s13, s7
562 ; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null
563 ; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null
564 ; GFX12-NEXT: s_mov_b32 s8, s4
565 ; GFX12-NEXT: s_mov_b32 s9, s5
566 ; GFX12-NEXT: s_wait_loadcnt 0x0
567 ; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0
568 ; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
569 ; GFX12-NEXT: s_nop 0
570 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
571 ; GFX12-NEXT: s_endpgm
573 ; EG-LABEL: v_trunc_i64_mul_to_i32:
574 ; EG: ; %bb.0: ; %entry
575 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
577 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
578 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
581 ; EG-NEXT: Fetch clause starting at 6:
582 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
583 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
584 ; EG-NEXT: ALU clause starting at 10:
585 ; EG-NEXT: MOV T0.X, KC0[2].Z,
586 ; EG-NEXT: MOV * T1.X, KC0[2].W,
587 ; EG-NEXT: ALU clause starting at 12:
588 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
589 ; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
590 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
592 %a = load i64, ptr addrspace(1) %aptr, align 8
593 %b = load i64, ptr addrspace(1) %bptr, align 8
594 %mul = mul i64 %b, %a
595 %trunc = trunc i64 %mul to i32
596 store i32 %trunc, ptr addrspace(1) %out, align 8
600 ; This 64-bit multiply should only need MUL_HI and MUL_LO, since the top
601 ; 32 bits of both operands are sign bits.
; Scalar case: the i32 kernel argument %in is sign-extended to i64 and an i64
; result %1 is stored to %out. Every checked sequence multiplies by the
; constant 0x50 (80).
; NOTE(review): %1 is presumably the product of %0 and 80 -- confirm against
; the function body.
; Expected lowering per target, as pinned by the checks below:
;   SI         - v_mul_hi_i32 for the high half, s_mulk_i32 for the low half
;   VI         - a single v_mad_i64_i32 with a zero addend
;   GFX9       - s_mul_hi_i32 + s_mulk_i32
;   GFX10/11   - s_mul_hi_i32 + s_mul_i32
;   GFX12      - s_ashr_i32 to form the high half, then one s_mul_u64
;   EG (r600)  - MULHI_INT + MULLO_INT
603 define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
604 ; SI-LABEL: mul64_sext_c:
605 ; SI: ; %bb.0: ; %entry
606 ; SI-NEXT: s_load_dword s4, s[2:3], 0xb
607 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
608 ; SI-NEXT: v_mov_b32_e32 v0, 0x50
609 ; SI-NEXT: s_mov_b32 s3, 0xf000
610 ; SI-NEXT: s_mov_b32 s2, -1
611 ; SI-NEXT: s_waitcnt lgkmcnt(0)
612 ; SI-NEXT: v_mul_hi_i32 v1, s4, v0
613 ; SI-NEXT: s_mulk_i32 s4, 0x50
614 ; SI-NEXT: v_mov_b32_e32 v0, s4
615 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
618 ; VI-LABEL: mul64_sext_c:
619 ; VI: ; %bb.0: ; %entry
620 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
621 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
622 ; VI-NEXT: v_mov_b32_e32 v0, 0x50
623 ; VI-NEXT: s_waitcnt lgkmcnt(0)
624 ; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s4, v0, 0
625 ; VI-NEXT: s_mov_b32 s3, 0xf000
626 ; VI-NEXT: s_mov_b32 s2, -1
628 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
631 ; GFX9-LABEL: mul64_sext_c:
632 ; GFX9: ; %bb.0: ; %entry
633 ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c
634 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
635 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
636 ; GFX9-NEXT: s_mov_b32 s2, -1
637 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
638 ; GFX9-NEXT: s_mul_hi_i32 s5, s4, 0x50
639 ; GFX9-NEXT: s_mulk_i32 s4, 0x50
640 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
641 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
642 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
643 ; GFX9-NEXT: s_endpgm
645 ; GFX10-LABEL: mul64_sext_c:
646 ; GFX10: ; %bb.0: ; %entry
647 ; GFX10-NEXT: s_clause 0x1
648 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c
649 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
650 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
651 ; GFX10-NEXT: s_mul_i32 s2, s4, 0x50
652 ; GFX10-NEXT: s_mul_hi_i32 s3, s4, 0x50
653 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
654 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
655 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
656 ; GFX10-NEXT: s_mov_b32 s2, -1
657 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
658 ; GFX10-NEXT: s_endpgm
660 ; GFX11-LABEL: mul64_sext_c:
661 ; GFX11: ; %bb.0: ; %entry
662 ; GFX11-NEXT: s_clause 0x1
663 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
664 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
665 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
666 ; GFX11-NEXT: s_mul_i32 s2, s4, 0x50
667 ; GFX11-NEXT: s_mul_hi_i32 s3, s4, 0x50
668 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
669 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
670 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
671 ; GFX11-NEXT: s_mov_b32 s2, -1
672 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
673 ; GFX11-NEXT: s_nop 0
674 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
675 ; GFX11-NEXT: s_endpgm
677 ; GFX12-LABEL: mul64_sext_c:
678 ; GFX12: ; %bb.0: ; %entry
679 ; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
680 ; GFX12-NEXT: s_wait_kmcnt 0x0
681 ; GFX12-NEXT: s_ashr_i32 s3, s2, 31
682 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
683 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
684 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000
685 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
686 ; GFX12-NEXT: s_mov_b32 s2, -1
687 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
688 ; GFX12-NEXT: s_nop 0
689 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
690 ; GFX12-NEXT: s_endpgm
692 ; EG-LABEL: mul64_sext_c:
693 ; EG: ; %bb.0: ; %entry
694 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
695 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
698 ; EG-NEXT: ALU clause starting at 4:
699 ; EG-NEXT: MULHI_INT * T0.Y, KC0[2].Z, literal.x,
700 ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
701 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
702 ; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y,
703 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
705 %0 = sext i32 %in to i64
707 store i64 %1, ptr addrspace(1) %out
; Unsigned counterpart of the sign-extended constant multiply: the i32 kernel
; argument %in is zero-extended to i64 and an i64 result %1 is stored to %out.
; Every checked sequence multiplies by the constant 0x50 (80).
; NOTE(review): %1 is presumably the product of %0 and 80 -- confirm against
; the function body.
; Expected lowering per target, as pinned by the checks below:
;   SI         - v_mul_hi_u32 for the high half, s_mulk_i32 for the low half
;   VI         - a single v_mad_u64_u32 with a zero addend
;   GFX9       - s_mul_hi_u32 + s_mulk_i32
;   GFX10/11   - s_mul_hi_u32 + s_mul_i32
;   GFX12      - high half set to 0 with s_mov_b32, then one s_mul_u64
;   EG (r600)  - MULHI + MULLO_INT
711 define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
712 ; SI-LABEL: mul64_zext_c:
713 ; SI: ; %bb.0: ; %entry
714 ; SI-NEXT: s_load_dword s4, s[2:3], 0xb
715 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
716 ; SI-NEXT: v_mov_b32_e32 v0, 0x50
717 ; SI-NEXT: s_mov_b32 s3, 0xf000
718 ; SI-NEXT: s_mov_b32 s2, -1
719 ; SI-NEXT: s_waitcnt lgkmcnt(0)
720 ; SI-NEXT: v_mul_hi_u32 v1, s4, v0
721 ; SI-NEXT: s_mulk_i32 s4, 0x50
722 ; SI-NEXT: v_mov_b32_e32 v0, s4
723 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
726 ; VI-LABEL: mul64_zext_c:
727 ; VI: ; %bb.0: ; %entry
728 ; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
729 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
730 ; VI-NEXT: v_mov_b32_e32 v0, 0x50
731 ; VI-NEXT: s_waitcnt lgkmcnt(0)
732 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0
733 ; VI-NEXT: s_mov_b32 s3, 0xf000
734 ; VI-NEXT: s_mov_b32 s2, -1
736 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
739 ; GFX9-LABEL: mul64_zext_c:
740 ; GFX9: ; %bb.0: ; %entry
741 ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c
742 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
743 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
744 ; GFX9-NEXT: s_mov_b32 s2, -1
745 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
746 ; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0x50
747 ; GFX9-NEXT: s_mulk_i32 s4, 0x50
748 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
749 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
750 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
751 ; GFX9-NEXT: s_endpgm
753 ; GFX10-LABEL: mul64_zext_c:
754 ; GFX10: ; %bb.0: ; %entry
755 ; GFX10-NEXT: s_clause 0x1
756 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x2c
757 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
758 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
759 ; GFX10-NEXT: s_mul_i32 s2, s4, 0x50
760 ; GFX10-NEXT: s_mul_hi_u32 s3, s4, 0x50
761 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
762 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
763 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
764 ; GFX10-NEXT: s_mov_b32 s2, -1
765 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
766 ; GFX10-NEXT: s_endpgm
768 ; GFX11-LABEL: mul64_zext_c:
769 ; GFX11: ; %bb.0: ; %entry
770 ; GFX11-NEXT: s_clause 0x1
771 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
772 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
773 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
774 ; GFX11-NEXT: s_mul_i32 s2, s4, 0x50
775 ; GFX11-NEXT: s_mul_hi_u32 s3, s4, 0x50
776 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
777 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
778 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
779 ; GFX11-NEXT: s_mov_b32 s2, -1
780 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
781 ; GFX11-NEXT: s_nop 0
782 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
783 ; GFX11-NEXT: s_endpgm
785 ; GFX12-LABEL: mul64_zext_c:
786 ; GFX12: ; %bb.0: ; %entry
787 ; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
788 ; GFX12-NEXT: s_mov_b32 s3, 0
789 ; GFX12-NEXT: s_wait_kmcnt 0x0
790 ; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50
791 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000
792 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
793 ; GFX12-NEXT: s_mov_b32 s2, -1
794 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
795 ; GFX12-NEXT: s_nop 0
796 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
797 ; GFX12-NEXT: s_endpgm
799 ; EG-LABEL: mul64_zext_c:
800 ; EG: ; %bb.0: ; %entry
801 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
802 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
805 ; EG-NEXT: ALU clause starting at 4:
806 ; EG-NEXT: MULHI * T0.Y, KC0[2].Z, literal.x,
807 ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
808 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
809 ; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y,
810 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
812 %0 = zext i32 %in to i64
814 store i64 %1, ptr addrspace(1) %out
; v_mul64_sext_c: load an i32 from %in, sign-extend it to i64, multiply it by
; the constant 80 and store the 64-bit product to %out.
; Per the checks below, VI is expected to use a single v_mad_i64_i32, the other
; GCN targets a v_mul_hi_i32 + v_mul_lo_u32 pair, and EG MULHI_INT + MULLO_INT.
; The constant is moved into an SGPR with s_movk_i32 on SI/VI/GFX9 and shows up
; as the literal operand 0x50 on GFX10 and later.
818 define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
819 ; SI-LABEL: v_mul64_sext_c:
820 ; SI: ; %bb.0: ; %entry
821 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
822 ; SI-NEXT: s_mov_b32 s7, 0xf000
823 ; SI-NEXT: s_mov_b32 s6, -1
824 ; SI-NEXT: s_mov_b32 s10, s6
825 ; SI-NEXT: s_mov_b32 s11, s7
826 ; SI-NEXT: s_waitcnt lgkmcnt(0)
827 ; SI-NEXT: s_mov_b32 s8, s2
828 ; SI-NEXT: s_mov_b32 s9, s3
829 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
830 ; SI-NEXT: s_movk_i32 s2, 0x50
831 ; SI-NEXT: s_mov_b32 s4, s0
832 ; SI-NEXT: s_mov_b32 s5, s1
833 ; SI-NEXT: s_waitcnt vmcnt(0)
834 ; SI-NEXT: v_mul_hi_i32 v1, v0, s2
835 ; SI-NEXT: v_mul_lo_u32 v0, v0, s2
836 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
839 ; VI-LABEL: v_mul64_sext_c:
840 ; VI: ; %bb.0: ; %entry
841 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
842 ; VI-NEXT: s_mov_b32 s7, 0xf000
843 ; VI-NEXT: s_mov_b32 s6, -1
844 ; VI-NEXT: s_mov_b32 s10, s6
845 ; VI-NEXT: s_mov_b32 s11, s7
846 ; VI-NEXT: s_waitcnt lgkmcnt(0)
847 ; VI-NEXT: s_mov_b32 s8, s2
848 ; VI-NEXT: s_mov_b32 s9, s3
849 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
850 ; VI-NEXT: s_movk_i32 s2, 0x50
851 ; VI-NEXT: s_mov_b32 s4, s0
852 ; VI-NEXT: s_mov_b32 s5, s1
853 ; VI-NEXT: s_waitcnt vmcnt(0)
854 ; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
855 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
858 ; GFX9-LABEL: v_mul64_sext_c:
859 ; GFX9: ; %bb.0: ; %entry
860 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
861 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
862 ; GFX9-NEXT: s_mov_b32 s2, -1
863 ; GFX9-NEXT: s_mov_b32 s10, s2
864 ; GFX9-NEXT: s_mov_b32 s11, s3
865 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
866 ; GFX9-NEXT: s_mov_b32 s8, s6
867 ; GFX9-NEXT: s_mov_b32 s9, s7
868 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
869 ; GFX9-NEXT: s_movk_i32 s0, 0x50
870 ; GFX9-NEXT: s_mov_b32 s1, s5
871 ; GFX9-NEXT: s_waitcnt vmcnt(0)
872 ; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
873 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
874 ; GFX9-NEXT: s_mov_b32 s0, s4
875 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
876 ; GFX9-NEXT: s_endpgm
878 ; GFX10-LABEL: v_mul64_sext_c:
879 ; GFX10: ; %bb.0: ; %entry
880 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
881 ; GFX10-NEXT: s_mov_b32 s2, -1
882 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
883 ; GFX10-NEXT: s_mov_b32 s10, s2
884 ; GFX10-NEXT: s_mov_b32 s11, s3
885 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
886 ; GFX10-NEXT: s_mov_b32 s8, s6
887 ; GFX10-NEXT: s_mov_b32 s9, s7
888 ; GFX10-NEXT: s_mov_b32 s0, s4
889 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
890 ; GFX10-NEXT: s_mov_b32 s1, s5
891 ; GFX10-NEXT: s_waitcnt vmcnt(0)
892 ; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0
893 ; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
894 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
895 ; GFX10-NEXT: s_endpgm
897 ; GFX11-LABEL: v_mul64_sext_c:
898 ; GFX11: ; %bb.0: ; %entry
899 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
900 ; GFX11-NEXT: s_mov_b32 s6, -1
901 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
902 ; GFX11-NEXT: s_mov_b32 s10, s6
903 ; GFX11-NEXT: s_mov_b32 s11, s7
904 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
905 ; GFX11-NEXT: s_mov_b32 s8, s2
906 ; GFX11-NEXT: s_mov_b32 s9, s3
907 ; GFX11-NEXT: s_mov_b32 s4, s0
908 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
909 ; GFX11-NEXT: s_mov_b32 s5, s1
910 ; GFX11-NEXT: s_waitcnt vmcnt(0)
911 ; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0
912 ; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
913 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
914 ; GFX11-NEXT: s_nop 0
915 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
916 ; GFX11-NEXT: s_endpgm
918 ; GFX12-LABEL: v_mul64_sext_c:
919 ; GFX12: ; %bb.0: ; %entry
920 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
921 ; GFX12-NEXT: s_mov_b32 s6, -1
922 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
923 ; GFX12-NEXT: s_mov_b32 s10, s6
924 ; GFX12-NEXT: s_mov_b32 s11, s7
925 ; GFX12-NEXT: s_wait_kmcnt 0x0
926 ; GFX12-NEXT: s_mov_b32 s8, s2
927 ; GFX12-NEXT: s_mov_b32 s9, s3
928 ; GFX12-NEXT: s_mov_b32 s4, s0
929 ; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
930 ; GFX12-NEXT: s_mov_b32 s5, s1
931 ; GFX12-NEXT: s_wait_loadcnt 0x0
932 ; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0
933 ; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
934 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
935 ; GFX12-NEXT: s_nop 0
936 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
937 ; GFX12-NEXT: s_endpgm
939 ; EG-LABEL: v_mul64_sext_c:
940 ; EG: ; %bb.0: ; %entry
941 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
943 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
944 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
947 ; EG-NEXT: Fetch clause starting at 6:
948 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
949 ; EG-NEXT: ALU clause starting at 8:
950 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
951 ; EG-NEXT: ALU clause starting at 9:
952 ; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x,
953 ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
954 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
955 ; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y,
956 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
958 %val = load i32, ptr addrspace(1) %in, align 4
959 %ext = sext i32 %val to i64
960 %mul = mul i64 %ext, 80
961 store i64 %mul, ptr addrspace(1) %out, align 8
; v_mul64_zext_c: load an i32 from %in, zero-extend it to i64, multiply it by
; the constant 80 and store the 64-bit product to %out.
; Unsigned counterpart of the sign-extending variant: per the checks below, VI
; is expected to use a single v_mad_u64_u32, the other GCN targets a
; v_mul_hi_u32 + v_mul_lo_u32 pair, and EG MULHI + MULLO_INT.
965 define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
966 ; SI-LABEL: v_mul64_zext_c:
967 ; SI: ; %bb.0: ; %entry
968 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
969 ; SI-NEXT: s_mov_b32 s7, 0xf000
970 ; SI-NEXT: s_mov_b32 s6, -1
971 ; SI-NEXT: s_mov_b32 s10, s6
972 ; SI-NEXT: s_mov_b32 s11, s7
973 ; SI-NEXT: s_waitcnt lgkmcnt(0)
974 ; SI-NEXT: s_mov_b32 s8, s2
975 ; SI-NEXT: s_mov_b32 s9, s3
976 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
977 ; SI-NEXT: s_movk_i32 s2, 0x50
978 ; SI-NEXT: s_mov_b32 s4, s0
979 ; SI-NEXT: s_mov_b32 s5, s1
980 ; SI-NEXT: s_waitcnt vmcnt(0)
981 ; SI-NEXT: v_mul_hi_u32 v1, v0, s2
982 ; SI-NEXT: v_mul_lo_u32 v0, v0, s2
983 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
986 ; VI-LABEL: v_mul64_zext_c:
987 ; VI: ; %bb.0: ; %entry
988 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
989 ; VI-NEXT: s_mov_b32 s7, 0xf000
990 ; VI-NEXT: s_mov_b32 s6, -1
991 ; VI-NEXT: s_mov_b32 s10, s6
992 ; VI-NEXT: s_mov_b32 s11, s7
993 ; VI-NEXT: s_waitcnt lgkmcnt(0)
994 ; VI-NEXT: s_mov_b32 s8, s2
995 ; VI-NEXT: s_mov_b32 s9, s3
996 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
997 ; VI-NEXT: s_movk_i32 s2, 0x50
998 ; VI-NEXT: s_mov_b32 s4, s0
999 ; VI-NEXT: s_mov_b32 s5, s1
1000 ; VI-NEXT: s_waitcnt vmcnt(0)
1001 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0
1002 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1005 ; GFX9-LABEL: v_mul64_zext_c:
1006 ; GFX9: ; %bb.0: ; %entry
1007 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1008 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1009 ; GFX9-NEXT: s_mov_b32 s2, -1
1010 ; GFX9-NEXT: s_mov_b32 s10, s2
1011 ; GFX9-NEXT: s_mov_b32 s11, s3
1012 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1013 ; GFX9-NEXT: s_mov_b32 s8, s6
1014 ; GFX9-NEXT: s_mov_b32 s9, s7
1015 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
1016 ; GFX9-NEXT: s_movk_i32 s0, 0x50
1017 ; GFX9-NEXT: s_mov_b32 s1, s5
1018 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1019 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, s0
1020 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
1021 ; GFX9-NEXT: s_mov_b32 s0, s4
1022 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1023 ; GFX9-NEXT: s_endpgm
1025 ; GFX10-LABEL: v_mul64_zext_c:
1026 ; GFX10: ; %bb.0: ; %entry
1027 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1028 ; GFX10-NEXT: s_mov_b32 s2, -1
1029 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1030 ; GFX10-NEXT: s_mov_b32 s10, s2
1031 ; GFX10-NEXT: s_mov_b32 s11, s3
1032 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1033 ; GFX10-NEXT: s_mov_b32 s8, s6
1034 ; GFX10-NEXT: s_mov_b32 s9, s7
1035 ; GFX10-NEXT: s_mov_b32 s0, s4
1036 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
1037 ; GFX10-NEXT: s_mov_b32 s1, s5
1038 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1039 ; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0
1040 ; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
1041 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1042 ; GFX10-NEXT: s_endpgm
1044 ; GFX11-LABEL: v_mul64_zext_c:
1045 ; GFX11: ; %bb.0: ; %entry
1046 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1047 ; GFX11-NEXT: s_mov_b32 s6, -1
1048 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
1049 ; GFX11-NEXT: s_mov_b32 s10, s6
1050 ; GFX11-NEXT: s_mov_b32 s11, s7
1051 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1052 ; GFX11-NEXT: s_mov_b32 s8, s2
1053 ; GFX11-NEXT: s_mov_b32 s9, s3
1054 ; GFX11-NEXT: s_mov_b32 s4, s0
1055 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
1056 ; GFX11-NEXT: s_mov_b32 s5, s1
1057 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1058 ; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0
1059 ; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
1060 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
1061 ; GFX11-NEXT: s_nop 0
1062 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1063 ; GFX11-NEXT: s_endpgm
1065 ; GFX12-LABEL: v_mul64_zext_c:
1066 ; GFX12: ; %bb.0: ; %entry
1067 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1068 ; GFX12-NEXT: s_mov_b32 s6, -1
1069 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
1070 ; GFX12-NEXT: s_mov_b32 s10, s6
1071 ; GFX12-NEXT: s_mov_b32 s11, s7
1072 ; GFX12-NEXT: s_wait_kmcnt 0x0
1073 ; GFX12-NEXT: s_mov_b32 s8, s2
1074 ; GFX12-NEXT: s_mov_b32 s9, s3
1075 ; GFX12-NEXT: s_mov_b32 s4, s0
1076 ; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
1077 ; GFX12-NEXT: s_mov_b32 s5, s1
1078 ; GFX12-NEXT: s_wait_loadcnt 0x0
1079 ; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0
1080 ; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
1081 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
1082 ; GFX12-NEXT: s_nop 0
1083 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1084 ; GFX12-NEXT: s_endpgm
1086 ; EG-LABEL: v_mul64_zext_c:
1087 ; EG: ; %bb.0: ; %entry
1088 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1090 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
1091 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1094 ; EG-NEXT: Fetch clause starting at 6:
1095 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1096 ; EG-NEXT: ALU clause starting at 8:
1097 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1098 ; EG-NEXT: ALU clause starting at 9:
1099 ; EG-NEXT: MULHI * T0.Y, T0.X, literal.x,
1100 ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
1101 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
1102 ; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y,
1103 ; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
1105 %val = load i32, ptr addrspace(1) %in, align 4
1106 %ext = zext i32 %val to i64
1107 %mul = mul i64 %ext, 80
1108 store i64 %mul, ptr addrspace(1) %out, align 8
; v_mul64_sext_inline_imm: load an i32 from %in, sign-extend it to i64,
; multiply it by the constant 9 and store the 64-bit product to %out.
; Same shape as the sign-extending multiply by 80, but with a small constant:
; in the GCN checks below the 9 appears directly as an instruction operand and
; no s_movk_i32 is emitted. VI is expected to use v_mad_i64_i32, the other GCN
; targets v_mul_hi_i32 + v_mul_lo_u32, and EG MULHI_INT + MULLO_INT.
1112 define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1113 ; SI-LABEL: v_mul64_sext_inline_imm:
1114 ; SI: ; %bb.0: ; %entry
1115 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1116 ; SI-NEXT: s_mov_b32 s7, 0xf000
1117 ; SI-NEXT: s_mov_b32 s6, -1
1118 ; SI-NEXT: s_mov_b32 s10, s6
1119 ; SI-NEXT: s_mov_b32 s11, s7
1120 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1121 ; SI-NEXT: s_mov_b32 s8, s2
1122 ; SI-NEXT: s_mov_b32 s9, s3
1123 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1124 ; SI-NEXT: s_mov_b32 s4, s0
1125 ; SI-NEXT: s_mov_b32 s5, s1
1126 ; SI-NEXT: s_waitcnt vmcnt(0)
1127 ; SI-NEXT: v_mul_hi_i32 v1, v0, 9
1128 ; SI-NEXT: v_mul_lo_u32 v0, v0, 9
1129 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1132 ; VI-LABEL: v_mul64_sext_inline_imm:
1133 ; VI: ; %bb.0: ; %entry
1134 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1135 ; VI-NEXT: s_mov_b32 s7, 0xf000
1136 ; VI-NEXT: s_mov_b32 s6, -1
1137 ; VI-NEXT: s_mov_b32 s10, s6
1138 ; VI-NEXT: s_mov_b32 s11, s7
1139 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1140 ; VI-NEXT: s_mov_b32 s8, s2
1141 ; VI-NEXT: s_mov_b32 s9, s3
1142 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1143 ; VI-NEXT: s_mov_b32 s4, s0
1144 ; VI-NEXT: s_mov_b32 s5, s1
1145 ; VI-NEXT: s_waitcnt vmcnt(0)
1146 ; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
1147 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1150 ; GFX9-LABEL: v_mul64_sext_inline_imm:
1151 ; GFX9: ; %bb.0: ; %entry
1152 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1153 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1154 ; GFX9-NEXT: s_mov_b32 s2, -1
1155 ; GFX9-NEXT: s_mov_b32 s10, s2
1156 ; GFX9-NEXT: s_mov_b32 s11, s3
1157 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1158 ; GFX9-NEXT: s_mov_b32 s8, s6
1159 ; GFX9-NEXT: s_mov_b32 s9, s7
1160 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
1161 ; GFX9-NEXT: s_mov_b32 s0, s4
1162 ; GFX9-NEXT: s_mov_b32 s1, s5
1163 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1164 ; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9
1165 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9
1166 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1167 ; GFX9-NEXT: s_endpgm
1169 ; GFX10-LABEL: v_mul64_sext_inline_imm:
1170 ; GFX10: ; %bb.0: ; %entry
1171 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1172 ; GFX10-NEXT: s_mov_b32 s2, -1
1173 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1174 ; GFX10-NEXT: s_mov_b32 s10, s2
1175 ; GFX10-NEXT: s_mov_b32 s11, s3
1176 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1177 ; GFX10-NEXT: s_mov_b32 s8, s6
1178 ; GFX10-NEXT: s_mov_b32 s9, s7
1179 ; GFX10-NEXT: s_mov_b32 s0, s4
1180 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
1181 ; GFX10-NEXT: s_mov_b32 s1, s5
1182 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1183 ; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9
1184 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9
1185 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1186 ; GFX10-NEXT: s_endpgm
1188 ; GFX11-LABEL: v_mul64_sext_inline_imm:
1189 ; GFX11: ; %bb.0: ; %entry
1190 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1191 ; GFX11-NEXT: s_mov_b32 s6, -1
1192 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
1193 ; GFX11-NEXT: s_mov_b32 s10, s6
1194 ; GFX11-NEXT: s_mov_b32 s11, s7
1195 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1196 ; GFX11-NEXT: s_mov_b32 s8, s2
1197 ; GFX11-NEXT: s_mov_b32 s9, s3
1198 ; GFX11-NEXT: s_mov_b32 s4, s0
1199 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
1200 ; GFX11-NEXT: s_mov_b32 s5, s1
1201 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1202 ; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9
1203 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9
1204 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
1205 ; GFX11-NEXT: s_nop 0
1206 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1207 ; GFX11-NEXT: s_endpgm
1209 ; GFX12-LABEL: v_mul64_sext_inline_imm:
1210 ; GFX12: ; %bb.0: ; %entry
1211 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1212 ; GFX12-NEXT: s_mov_b32 s6, -1
1213 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
1214 ; GFX12-NEXT: s_mov_b32 s10, s6
1215 ; GFX12-NEXT: s_mov_b32 s11, s7
1216 ; GFX12-NEXT: s_wait_kmcnt 0x0
1217 ; GFX12-NEXT: s_mov_b32 s8, s2
1218 ; GFX12-NEXT: s_mov_b32 s9, s3
1219 ; GFX12-NEXT: s_mov_b32 s4, s0
1220 ; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
1221 ; GFX12-NEXT: s_mov_b32 s5, s1
1222 ; GFX12-NEXT: s_wait_loadcnt 0x0
1223 ; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0
1224 ; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0
1225 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
1226 ; GFX12-NEXT: s_nop 0
1227 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1228 ; GFX12-NEXT: s_endpgm
1230 ; EG-LABEL: v_mul64_sext_inline_imm:
1231 ; EG: ; %bb.0: ; %entry
1232 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1234 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
1235 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1238 ; EG-NEXT: Fetch clause starting at 6:
1239 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1240 ; EG-NEXT: ALU clause starting at 8:
1241 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1242 ; EG-NEXT: ALU clause starting at 9:
1243 ; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x,
1244 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1245 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
1246 ; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y,
1247 ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44)
1249 %val = load i32, ptr addrspace(1) %in, align 4
1250 %ext = sext i32 %val to i64
1251 %mul = mul i64 %ext, 9
1252 store i64 %mul, ptr addrspace(1) %out, align 8
; s_mul_i32: multiply the two i32 kernel arguments %a and %b and store the i32
; product to %out. Unnamed [8 x i32] arguments sit before %a and before %b, so
; the checks below load the two operands from separate kernarg offsets.
; The GCN checks expect a scalar s_mul_i32 whose result is copied to a VGPR for
; the store; the EG checks expect MULLO_INT reading both operands from KC0.
1256 define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
1257 ; SI-LABEL: s_mul_i32:
1258 ; SI: ; %bb.0: ; %entry
1259 ; SI-NEXT: s_load_dword s4, s[2:3], 0x13
1260 ; SI-NEXT: s_load_dword s5, s[2:3], 0x1c
1261 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
1262 ; SI-NEXT: s_mov_b32 s3, 0xf000
1263 ; SI-NEXT: s_mov_b32 s2, -1
1264 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1265 ; SI-NEXT: s_mul_i32 s4, s4, s5
1266 ; SI-NEXT: v_mov_b32_e32 v0, s4
1267 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1270 ; VI-LABEL: s_mul_i32:
1271 ; VI: ; %bb.0: ; %entry
1272 ; VI-NEXT: s_load_dword s4, s[2:3], 0x4c
1273 ; VI-NEXT: s_load_dword s5, s[2:3], 0x70
1274 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1275 ; VI-NEXT: s_mov_b32 s3, 0xf000
1276 ; VI-NEXT: s_mov_b32 s2, -1
1277 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1278 ; VI-NEXT: s_mul_i32 s4, s4, s5
1279 ; VI-NEXT: v_mov_b32_e32 v0, s4
1280 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1283 ; GFX9-LABEL: s_mul_i32:
1284 ; GFX9: ; %bb.0: ; %entry
1285 ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x4c
1286 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x70
1287 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1288 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1289 ; GFX9-NEXT: s_mov_b32 s2, -1
1290 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1291 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
1292 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1293 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
1294 ; GFX9-NEXT: s_endpgm
1296 ; GFX10-LABEL: s_mul_i32:
1297 ; GFX10: ; %bb.0: ; %entry
1298 ; GFX10-NEXT: s_clause 0x2
1299 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c
1300 ; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70
1301 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1302 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1303 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1304 ; GFX10-NEXT: s_mul_i32 s2, s4, s5
1305 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
1306 ; GFX10-NEXT: s_mov_b32 s2, -1
1307 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
1308 ; GFX10-NEXT: s_endpgm
1310 ; GFX11-LABEL: s_mul_i32:
1311 ; GFX11: ; %bb.0: ; %entry
1312 ; GFX11-NEXT: s_clause 0x2
1313 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c
1314 ; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70
1315 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1316 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
1317 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1318 ; GFX11-NEXT: s_mul_i32 s2, s4, s5
1319 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1320 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1321 ; GFX11-NEXT: s_mov_b32 s2, -1
1322 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
1323 ; GFX11-NEXT: s_nop 0
1324 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1325 ; GFX11-NEXT: s_endpgm
1327 ; GFX12-LABEL: s_mul_i32:
1328 ; GFX12: ; %bb.0: ; %entry
1329 ; GFX12-NEXT: s_clause 0x2
1330 ; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c
1331 ; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70
1332 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1333 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000
1334 ; GFX12-NEXT: s_wait_kmcnt 0x0
1335 ; GFX12-NEXT: s_mul_i32 s2, s4, s5
1336 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1337 ; GFX12-NEXT: v_mov_b32_e32 v0, s2
1338 ; GFX12-NEXT: s_mov_b32 s2, -1
1339 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
1340 ; GFX12-NEXT: s_nop 0
1341 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1342 ; GFX12-NEXT: s_endpgm
1344 ; EG-LABEL: s_mul_i32:
1345 ; EG: ; %bb.0: ; %entry
1346 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1347 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1350 ; EG-NEXT: ALU clause starting at 4:
1351 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1352 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1353 ; EG-NEXT: MULLO_INT * T1.X, KC0[4].Z, KC0[6].W,
1355 %mul = mul i32 %a, %b
1356 store i32 %mul, ptr addrspace(1) %out, align 4
; v_mul_i32: load two adjacent i32 values from %in (elements 0 and 1),
; multiply them and store the i32 product to %out.
; The checks below expect the two loads to be combined into one 64-bit load
; (buffer_load_dwordx2 / buffer_load_b64 / VTX_READ_64), followed by a single
; v_mul_lo_u32 on the GCN targets or MULLO_INT on EG.
1360 define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1361 ; SI-LABEL: v_mul_i32:
1362 ; SI: ; %bb.0: ; %entry
1363 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1364 ; SI-NEXT: s_mov_b32 s7, 0xf000
1365 ; SI-NEXT: s_mov_b32 s6, -1
1366 ; SI-NEXT: s_mov_b32 s10, s6
1367 ; SI-NEXT: s_mov_b32 s11, s7
1368 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1369 ; SI-NEXT: s_mov_b32 s8, s2
1370 ; SI-NEXT: s_mov_b32 s9, s3
1371 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1372 ; SI-NEXT: s_mov_b32 s4, s0
1373 ; SI-NEXT: s_mov_b32 s5, s1
1374 ; SI-NEXT: s_waitcnt vmcnt(0)
1375 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1
1376 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1379 ; VI-LABEL: v_mul_i32:
1380 ; VI: ; %bb.0: ; %entry
1381 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1382 ; VI-NEXT: s_mov_b32 s7, 0xf000
1383 ; VI-NEXT: s_mov_b32 s6, -1
1384 ; VI-NEXT: s_mov_b32 s10, s6
1385 ; VI-NEXT: s_mov_b32 s11, s7
1386 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1387 ; VI-NEXT: s_mov_b32 s8, s2
1388 ; VI-NEXT: s_mov_b32 s9, s3
1389 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1390 ; VI-NEXT: s_mov_b32 s4, s0
1391 ; VI-NEXT: s_mov_b32 s5, s1
1392 ; VI-NEXT: s_waitcnt vmcnt(0)
1393 ; VI-NEXT: v_mul_lo_u32 v0, v0, v1
1394 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1397 ; GFX9-LABEL: v_mul_i32:
1398 ; GFX9: ; %bb.0: ; %entry
1399 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1400 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1401 ; GFX9-NEXT: s_mov_b32 s2, -1
1402 ; GFX9-NEXT: s_mov_b32 s10, s2
1403 ; GFX9-NEXT: s_mov_b32 s11, s3
1404 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1405 ; GFX9-NEXT: s_mov_b32 s8, s6
1406 ; GFX9-NEXT: s_mov_b32 s9, s7
1407 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1408 ; GFX9-NEXT: s_mov_b32 s0, s4
1409 ; GFX9-NEXT: s_mov_b32 s1, s5
1410 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1411 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1
1412 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
1413 ; GFX9-NEXT: s_endpgm
1415 ; GFX10-LABEL: v_mul_i32:
1416 ; GFX10: ; %bb.0: ; %entry
1417 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1418 ; GFX10-NEXT: s_mov_b32 s2, -1
1419 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1420 ; GFX10-NEXT: s_mov_b32 s10, s2
1421 ; GFX10-NEXT: s_mov_b32 s11, s3
1422 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1423 ; GFX10-NEXT: s_mov_b32 s8, s6
1424 ; GFX10-NEXT: s_mov_b32 s9, s7
1425 ; GFX10-NEXT: s_mov_b32 s0, s4
1426 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1427 ; GFX10-NEXT: s_mov_b32 s1, s5
1428 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1429 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
1430 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
1431 ; GFX10-NEXT: s_endpgm
1433 ; GFX11-LABEL: v_mul_i32:
1434 ; GFX11: ; %bb.0: ; %entry
1435 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1436 ; GFX11-NEXT: s_mov_b32 s6, -1
1437 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
1438 ; GFX11-NEXT: s_mov_b32 s10, s6
1439 ; GFX11-NEXT: s_mov_b32 s11, s7
1440 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1441 ; GFX11-NEXT: s_mov_b32 s8, s2
1442 ; GFX11-NEXT: s_mov_b32 s9, s3
1443 ; GFX11-NEXT: s_mov_b32 s4, s0
1444 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
1445 ; GFX11-NEXT: s_mov_b32 s5, s1
1446 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1447 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
1448 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
1449 ; GFX11-NEXT: s_nop 0
1450 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1451 ; GFX11-NEXT: s_endpgm
1453 ; GFX12-LABEL: v_mul_i32:
1454 ; GFX12: ; %bb.0: ; %entry
1455 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1456 ; GFX12-NEXT: s_mov_b32 s6, -1
1457 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
1458 ; GFX12-NEXT: s_mov_b32 s10, s6
1459 ; GFX12-NEXT: s_mov_b32 s11, s7
1460 ; GFX12-NEXT: s_wait_kmcnt 0x0
1461 ; GFX12-NEXT: s_mov_b32 s8, s2
1462 ; GFX12-NEXT: s_mov_b32 s9, s3
1463 ; GFX12-NEXT: s_mov_b32 s4, s0
1464 ; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
1465 ; GFX12-NEXT: s_mov_b32 s5, s1
1466 ; GFX12-NEXT: s_wait_loadcnt 0x0
1467 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
1468 ; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
1469 ; GFX12-NEXT: s_nop 0
1470 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1471 ; GFX12-NEXT: s_endpgm
1473 ; EG-LABEL: v_mul_i32:
1474 ; EG: ; %bb.0: ; %entry
1475 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1477 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1478 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1481 ; EG-NEXT: Fetch clause starting at 6:
1482 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
1483 ; EG-NEXT: ALU clause starting at 8:
1484 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1485 ; EG-NEXT: ALU clause starting at 9:
1486 ; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
1487 ; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Y,
1488 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1490 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1491 %a = load i32, ptr addrspace(1) %in
1492 %b = load i32, ptr addrspace(1) %b_ptr
1493 %result = mul i32 %a, %b
1494 store i32 %result, ptr addrspace(1) %out
; s_mul_i1: multiply the two i1 kernel arguments %a and %b and store the i1
; product to %out. Unnamed [8 x i32] arguments sit before %a and before %b.
; Per the checks below, SI is expected to use s_mul_i32 followed by s_and_b32
; with 1, while VI and later use v_mul_lo_u16 followed by v_and_b32 with 1; the
; result is written with a byte store. The EG checks expect MULLO_INT, AND_INT
; with 1 and a MEM_RAT MSKOR store.
1498 define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind {
1499 ; SI-LABEL: s_mul_i1:
1500 ; SI: ; %bb.0: ; %entry
1501 ; SI-NEXT: s_load_dword s4, s[2:3], 0x13
1502 ; SI-NEXT: s_load_dword s5, s[2:3], 0x1c
1503 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
1504 ; SI-NEXT: s_mov_b32 s3, 0xf000
1505 ; SI-NEXT: s_mov_b32 s2, -1
1506 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1507 ; SI-NEXT: s_mul_i32 s4, s4, s5
1508 ; SI-NEXT: s_and_b32 s4, s4, 1
1509 ; SI-NEXT: v_mov_b32_e32 v0, s4
1510 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1513 ; VI-LABEL: s_mul_i1:
1514 ; VI: ; %bb.0: ; %entry
1515 ; VI-NEXT: s_load_dword s4, s[2:3], 0x70
1516 ; VI-NEXT: s_load_dword s5, s[2:3], 0x4c
1517 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1518 ; VI-NEXT: s_mov_b32 s3, 0xf000
1519 ; VI-NEXT: s_mov_b32 s2, -1
1520 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1521 ; VI-NEXT: v_mov_b32_e32 v0, s4
1522 ; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0
1523 ; VI-NEXT: v_and_b32_e32 v0, 1, v0
1524 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
1527 ; GFX9-LABEL: s_mul_i1:
1528 ; GFX9: ; %bb.0: ; %entry
1529 ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x70
1530 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x4c
1531 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1532 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1533 ; GFX9-NEXT: s_mov_b32 s2, -1
1534 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1535 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1536 ; GFX9-NEXT: v_mul_lo_u16_e32 v0, s5, v0
1537 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
1538 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
1539 ; GFX9-NEXT: s_endpgm
1541 ; GFX10-LABEL: s_mul_i1:
1542 ; GFX10: ; %bb.0: ; %entry
1543 ; GFX10-NEXT: s_clause 0x2
1544 ; GFX10-NEXT: s_load_dword s4, s[2:3], 0x4c
1545 ; GFX10-NEXT: s_load_dword s5, s[2:3], 0x70
1546 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1547 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1548 ; GFX10-NEXT: s_mov_b32 s2, -1
1549 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1550 ; GFX10-NEXT: v_mul_lo_u16 v0, s4, s5
1551 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
1552 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
1553 ; GFX10-NEXT: s_endpgm
1555 ; GFX11-LABEL: s_mul_i1:
1556 ; GFX11: ; %bb.0: ; %entry
1557 ; GFX11-NEXT: s_clause 0x2
1558 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x4c
1559 ; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x70
1560 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1561 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
1562 ; GFX11-NEXT: s_mov_b32 s2, -1
1563 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1564 ; GFX11-NEXT: v_mul_lo_u16 v0, s4, s5
1565 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1566 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
1567 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
1568 ; GFX11-NEXT: s_nop 0
1569 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1570 ; GFX11-NEXT: s_endpgm
1572 ; GFX12-LABEL: s_mul_i1:
1573 ; GFX12: ; %bb.0: ; %entry
1574 ; GFX12-NEXT: s_clause 0x2
1575 ; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x4c
1576 ; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x70
1577 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1578 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000
1579 ; GFX12-NEXT: s_mov_b32 s2, -1
1580 ; GFX12-NEXT: s_wait_kmcnt 0x0
1581 ; GFX12-NEXT: v_mul_lo_u16 v0, s4, s5
1582 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1583 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
1584 ; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
1585 ; GFX12-NEXT: s_nop 0
1586 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1587 ; GFX12-NEXT: s_endpgm
1589 ; EG-LABEL: s_mul_i1:
1590 ; EG: ; %bb.0: ; %entry
1591 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
1593 ; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
1594 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1597 ; EG-NEXT: Fetch clause starting at 6:
1598 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
1599 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
1600 ; EG-NEXT: ALU clause starting at 10:
1601 ; EG-NEXT: MOV * T0.X, 0.0,
1602 ; EG-NEXT: ALU clause starting at 11:
1603 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
1604 ; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
1605 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1606 ; EG-NEXT: AND_INT T1.W, PS, 1,
1607 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1608 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1609 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1610 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1611 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1612 ; EG-NEXT: MOV T0.Y, 0.0,
1613 ; EG-NEXT: MOV * T0.Z, 0.0,
1614 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1615 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1617 %mul = mul i1 %a, %b
1618 store i1 %mul, ptr addrspace(1) %out, align 4
; v_mul_i1: load one i1 from %in and another from one i32 element past %in
; (the getelementptr below steps by i32, which is the offset:4 seen in the GCN
; checks), multiply them and store the i1 product to %out.
; Per the checks below, both operands are fetched with unsigned byte loads; SI
; is expected to multiply with v_mul_lo_u32 and VI and later with
; v_mul_lo_u16, each followed by v_and_b32 with 1 and a byte store. The EG
; checks expect MULLO_INT, AND_INT with 1 and a MEM_RAT MSKOR store.
1622 define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1623 ; SI-LABEL: v_mul_i1:
1624 ; SI: ; %bb.0: ; %entry
1625 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
1626 ; SI-NEXT: s_mov_b32 s7, 0xf000
1627 ; SI-NEXT: s_mov_b32 s6, -1
1628 ; SI-NEXT: s_mov_b32 s10, s6
1629 ; SI-NEXT: s_mov_b32 s11, s7
1630 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1631 ; SI-NEXT: s_mov_b32 s8, s2
1632 ; SI-NEXT: s_mov_b32 s9, s3
1633 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
1634 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1635 ; SI-NEXT: s_mov_b32 s4, s0
1636 ; SI-NEXT: s_mov_b32 s5, s1
1637 ; SI-NEXT: s_waitcnt vmcnt(0)
1638 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1
1639 ; SI-NEXT: v_and_b32_e32 v0, 1, v0
1640 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1643 ; VI-LABEL: v_mul_i1:
1644 ; VI: ; %bb.0: ; %entry
1645 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1646 ; VI-NEXT: s_mov_b32 s7, 0xf000
1647 ; VI-NEXT: s_mov_b32 s6, -1
1648 ; VI-NEXT: s_mov_b32 s10, s6
1649 ; VI-NEXT: s_mov_b32 s11, s7
1650 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1651 ; VI-NEXT: s_mov_b32 s8, s2
1652 ; VI-NEXT: s_mov_b32 s9, s3
1653 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
1654 ; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1655 ; VI-NEXT: s_mov_b32 s4, s0
1656 ; VI-NEXT: s_mov_b32 s5, s1
1657 ; VI-NEXT: s_waitcnt vmcnt(0)
1658 ; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1
1659 ; VI-NEXT: v_and_b32_e32 v0, 1, v0
1660 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
1663 ; GFX9-LABEL: v_mul_i1:
1664 ; GFX9: ; %bb.0: ; %entry
1665 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1666 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1667 ; GFX9-NEXT: s_mov_b32 s2, -1
1668 ; GFX9-NEXT: s_mov_b32 s10, s2
1669 ; GFX9-NEXT: s_mov_b32 s11, s3
1670 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1671 ; GFX9-NEXT: s_mov_b32 s8, s6
1672 ; GFX9-NEXT: s_mov_b32 s9, s7
1673 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
1674 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1675 ; GFX9-NEXT: s_mov_b32 s0, s4
1676 ; GFX9-NEXT: s_mov_b32 s1, s5
1677 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1678 ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
1679 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
1680 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
1681 ; GFX9-NEXT: s_endpgm
1683 ; GFX10-LABEL: v_mul_i1:
1684 ; GFX10: ; %bb.0: ; %entry
1685 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1686 ; GFX10-NEXT: s_mov_b32 s2, -1
1687 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1688 ; GFX10-NEXT: s_mov_b32 s10, s2
1689 ; GFX10-NEXT: s_mov_b32 s11, s3
1690 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1691 ; GFX10-NEXT: s_mov_b32 s8, s6
1692 ; GFX10-NEXT: s_mov_b32 s9, s7
1693 ; GFX10-NEXT: s_clause 0x1
1694 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
1695 ; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1696 ; GFX10-NEXT: s_mov_b32 s0, s4
1697 ; GFX10-NEXT: s_mov_b32 s1, s5
1698 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1699 ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
1700 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
1701 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
1702 ; GFX10-NEXT: s_endpgm
1704 ; GFX11-LABEL: v_mul_i1:
1705 ; GFX11: ; %bb.0: ; %entry
1706 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1707 ; GFX11-NEXT: s_mov_b32 s6, -1
1708 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
1709 ; GFX11-NEXT: s_mov_b32 s10, s6
1710 ; GFX11-NEXT: s_mov_b32 s11, s7
1711 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1712 ; GFX11-NEXT: s_mov_b32 s8, s2
1713 ; GFX11-NEXT: s_mov_b32 s9, s3
1714 ; GFX11-NEXT: s_clause 0x1
1715 ; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0
1716 ; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4
1717 ; GFX11-NEXT: s_mov_b32 s4, s0
1718 ; GFX11-NEXT: s_mov_b32 s5, s1
1719 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1720 ; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1
1721 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1722 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
1723 ; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
1724 ; GFX11-NEXT: s_nop 0
1725 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1726 ; GFX11-NEXT: s_endpgm
1728 ; GFX12-LABEL: v_mul_i1:
1729 ; GFX12: ; %bb.0: ; %entry
1730 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1731 ; GFX12-NEXT: s_mov_b32 s6, -1
1732 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
1733 ; GFX12-NEXT: s_mov_b32 s10, s6
1734 ; GFX12-NEXT: s_mov_b32 s11, s7
1735 ; GFX12-NEXT: s_wait_kmcnt 0x0
1736 ; GFX12-NEXT: s_mov_b32 s8, s2
1737 ; GFX12-NEXT: s_mov_b32 s9, s3
1738 ; GFX12-NEXT: s_clause 0x1
1739 ; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null
1740 ; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4
1741 ; GFX12-NEXT: s_mov_b32 s4, s0
1742 ; GFX12-NEXT: s_mov_b32 s5, s1
1743 ; GFX12-NEXT: s_wait_loadcnt 0x0
1744 ; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
1745 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1746 ; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
1747 ; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
1748 ; GFX12-NEXT: s_nop 0
1749 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1750 ; GFX12-NEXT: s_endpgm
1752 ; EG-LABEL: v_mul_i1:
1753 ; EG: ; %bb.0: ; %entry
1754 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1756 ; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
1757 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1760 ; EG-NEXT: Fetch clause starting at 6:
1761 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1
1762 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1763 ; EG-NEXT: ALU clause starting at 10:
1764 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1765 ; EG-NEXT: ALU clause starting at 11:
1766 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
1767 ; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X,
1768 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1769 ; EG-NEXT: AND_INT T1.W, PS, 1,
1770 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1771 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1772 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1773 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1774 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1775 ; EG-NEXT: MOV T0.Y, 0.0,
1776 ; EG-NEXT: MOV * T0.Z, 0.0,
1777 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1778 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1780 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1781 %a = load i1, ptr addrspace(1) %in
1782 %b = load i1, ptr addrspace(1) %b_ptr
1783 %result = mul i1 %a, %b
1784 store i1 %result, ptr addrspace(1) %out
1788 ; A standard 64-bit multiply. The expansion should be around 6 instructions.
1789 ; This test originally checked only the function label, because matching the
1790 ; expansion by hand needed a really complicated list of FileCheck expressions
1791 ; and a correct optimization could easily 'break' it; the goal was just to make
1792 ; sure the compiler does not crash with a 'failed to select' error. The checks
1793 ; below are now autogenerated, so regenerate them if the expansion changes.
; s_mul_i64 multiplies the two i64 kernel arguments %a and %b and stores the
; 64-bit product to %out. The expected code differs per target: the GFX12
; checks expect a single s_mul_u64, while the other GCN targets expect the
; product to be assembled from 32-bit multiply / multiply-high instructions
; (v_mad_u64_u32 on VI) plus adds, and the EG checks use MULHI, MULLO_INT
; and ADD_INT.
1795 define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
1796 ; SI-LABEL: s_mul_i64:
1797 ; SI: ; %bb.0: ; %entry
1798 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
1799 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
1800 ; SI-NEXT: s_mov_b32 s3, 0xf000
1801 ; SI-NEXT: s_mov_b32 s2, -1
1802 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1803 ; SI-NEXT: s_mov_b32 s0, s4
1804 ; SI-NEXT: v_mov_b32_e32 v0, s8
1805 ; SI-NEXT: v_mul_hi_u32 v0, s6, v0
1806 ; SI-NEXT: s_mul_i32 s4, s6, s9
1807 ; SI-NEXT: s_mov_b32 s1, s5
1808 ; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
1809 ; SI-NEXT: s_mul_i32 s4, s7, s8
1810 ; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v0
1811 ; SI-NEXT: s_mul_i32 s4, s6, s8
1812 ; SI-NEXT: v_mov_b32_e32 v0, s4
1813 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1816 ; VI-LABEL: s_mul_i64:
1817 ; VI: ; %bb.0: ; %entry
1818 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1819 ; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34
1820 ; VI-NEXT: s_mov_b32 s3, 0xf000
1821 ; VI-NEXT: s_mov_b32 s2, -1
1822 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1823 ; VI-NEXT: s_mov_b32 s0, s4
1824 ; VI-NEXT: v_mov_b32_e32 v0, s8
1825 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0
1826 ; VI-NEXT: s_mul_i32 s4, s6, s9
1827 ; VI-NEXT: s_mov_b32 s1, s5
1828 ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
1829 ; VI-NEXT: s_mul_i32 s4, s7, s8
1830 ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
1831 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1834 ; GFX9-LABEL: s_mul_i64:
1835 ; GFX9: ; %bb.0: ; %entry
1836 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1837 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34
1838 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1839 ; GFX9-NEXT: s_mov_b32 s2, -1
1840 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1841 ; GFX9-NEXT: s_mov_b32 s0, s4
1842 ; GFX9-NEXT: s_mov_b32 s1, s5
1843 ; GFX9-NEXT: s_mul_i32 s4, s6, s9
1844 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s8
1845 ; GFX9-NEXT: s_add_i32 s4, s5, s4
1846 ; GFX9-NEXT: s_mul_i32 s5, s7, s8
1847 ; GFX9-NEXT: s_add_i32 s4, s4, s5
1848 ; GFX9-NEXT: s_mul_i32 s5, s6, s8
1849 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
1850 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1851 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1852 ; GFX9-NEXT: s_endpgm
1854 ; GFX10-LABEL: s_mul_i64:
1855 ; GFX10: ; %bb.0: ; %entry
1856 ; GFX10-NEXT: s_clause 0x1
1857 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1858 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1859 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1860 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1861 ; GFX10-NEXT: s_mul_i32 s1, s6, s1
1862 ; GFX10-NEXT: s_mul_hi_u32 s2, s6, s0
1863 ; GFX10-NEXT: s_add_i32 s1, s2, s1
1864 ; GFX10-NEXT: s_mul_i32 s2, s7, s0
1865 ; GFX10-NEXT: s_mul_i32 s0, s6, s0
1866 ; GFX10-NEXT: s_add_i32 s1, s1, s2
1867 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1868 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1869 ; GFX10-NEXT: s_mov_b32 s2, -1
1870 ; GFX10-NEXT: s_mov_b32 s0, s4
1871 ; GFX10-NEXT: s_mov_b32 s1, s5
1872 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1873 ; GFX10-NEXT: s_endpgm
1875 ; GFX11-LABEL: s_mul_i64:
1876 ; GFX11: ; %bb.0: ; %entry
1877 ; GFX11-NEXT: s_clause 0x1
1878 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
1879 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
1880 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
1881 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1882 ; GFX11-NEXT: s_mul_i32 s1, s6, s1
1883 ; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0
1884 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1885 ; GFX11-NEXT: s_add_i32 s1, s2, s1
1886 ; GFX11-NEXT: s_mul_i32 s2, s7, s0
1887 ; GFX11-NEXT: s_mul_i32 s0, s6, s0
1888 ; GFX11-NEXT: s_add_i32 s1, s1, s2
1889 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1890 ; GFX11-NEXT: s_mov_b32 s2, -1
1891 ; GFX11-NEXT: s_mov_b32 s0, s4
1892 ; GFX11-NEXT: s_mov_b32 s1, s5
1893 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
1894 ; GFX11-NEXT: s_nop 0
1895 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1896 ; GFX11-NEXT: s_endpgm
1898 ; GFX12-LABEL: s_mul_i64:
1899 ; GFX12: ; %bb.0: ; %entry
1900 ; GFX12-NEXT: s_clause 0x1
1901 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
1902 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
1903 ; GFX12-NEXT: s_wait_kmcnt 0x0
1904 ; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1]
1905 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
1906 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1907 ; GFX12-NEXT: s_mov_b32 s6, -1
1908 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
1909 ; GFX12-NEXT: s_nop 0
1910 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1911 ; GFX12-NEXT: s_endpgm
1913 ; EG-LABEL: s_mul_i64:
1914 ; EG: ; %bb.0: ; %entry
1915 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
1916 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1919 ; EG-NEXT: ALU clause starting at 4:
1920 ; EG-NEXT: MULHI * T0.X, KC0[2].W, KC0[3].Y,
1921 ; EG-NEXT: MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z,
1922 ; EG-NEXT: ADD_INT T0.W, T0.X, PS,
1923 ; EG-NEXT: MULLO_INT * T0.X, KC0[3].X, KC0[3].Y,
1924 ; EG-NEXT: ADD_INT * T0.Y, PV.W, PS,
1925 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1926 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1927 ; EG-NEXT: MULLO_INT * T0.X, KC0[2].W, KC0[3].Y,
; Both operands are the kernel arguments themselves; the product is written
; to %out with 8-byte alignment.
1929 %mul = mul i64 %a, %b
1930 store i64 %mul, ptr addrspace(1) %out, align 8
; v_mul_i64 loads one i64 from %aptr and one from %bptr, multiplies them and
; stores the 64-bit product to %out. In the expected GCN code both operands
; arrive through 64-bit buffer loads and the product is assembled from
; v_mul_lo_u32 / v_mul_hi_u32 (v_mad_u64_u32 on VI) plus adds; unlike the
; kernel-argument variant, the GFX12 checks here contain no s_mul_u64.
1934 define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
1935 ; SI-LABEL: v_mul_i64:
1936 ; SI: ; %bb.0: ; %entry
1937 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
1938 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
1939 ; SI-NEXT: s_mov_b32 s11, 0xf000
1940 ; SI-NEXT: s_mov_b32 s10, -1
1941 ; SI-NEXT: s_mov_b32 s2, s10
1942 ; SI-NEXT: s_mov_b32 s3, s11
1943 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1944 ; SI-NEXT: s_mov_b32 s12, s6
1945 ; SI-NEXT: s_mov_b32 s13, s7
1946 ; SI-NEXT: s_mov_b32 s14, s10
1947 ; SI-NEXT: s_mov_b32 s15, s11
1948 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
1949 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1950 ; SI-NEXT: s_mov_b32 s8, s4
1951 ; SI-NEXT: s_mov_b32 s9, s5
1952 ; SI-NEXT: s_waitcnt vmcnt(0)
1953 ; SI-NEXT: v_mul_lo_u32 v1, v2, v1
1954 ; SI-NEXT: v_mul_hi_u32 v4, v2, v0
1955 ; SI-NEXT: v_mul_lo_u32 v3, v3, v0
1956 ; SI-NEXT: v_mul_lo_u32 v0, v2, v0
1957 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4
1958 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
1959 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1962 ; VI-LABEL: v_mul_i64:
1963 ; VI: ; %bb.0: ; %entry
1964 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1965 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1966 ; VI-NEXT: s_mov_b32 s11, 0xf000
1967 ; VI-NEXT: s_mov_b32 s10, -1
1968 ; VI-NEXT: s_mov_b32 s2, s10
1969 ; VI-NEXT: s_mov_b32 s3, s11
1970 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1971 ; VI-NEXT: s_mov_b32 s12, s6
1972 ; VI-NEXT: s_mov_b32 s13, s7
1973 ; VI-NEXT: s_mov_b32 s14, s10
1974 ; VI-NEXT: s_mov_b32 s15, s11
1975 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
1976 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1977 ; VI-NEXT: s_mov_b32 s8, s4
1978 ; VI-NEXT: s_mov_b32 s9, s5
1979 ; VI-NEXT: s_waitcnt vmcnt(0)
1980 ; VI-NEXT: v_mul_lo_u32 v4, v2, v1
1981 ; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, v0, 0
1982 ; VI-NEXT: v_mul_lo_u32 v0, v3, v0
1983 ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2
1984 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
1985 ; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[8:11], 0
1988 ; GFX9-LABEL: v_mul_i64:
1989 ; GFX9: ; %bb.0: ; %entry
1990 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1991 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
1992 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
1993 ; GFX9-NEXT: s_mov_b32 s10, -1
1994 ; GFX9-NEXT: s_mov_b32 s2, s10
1995 ; GFX9-NEXT: s_mov_b32 s3, s11
1996 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1997 ; GFX9-NEXT: s_mov_b32 s12, s6
1998 ; GFX9-NEXT: s_mov_b32 s13, s7
1999 ; GFX9-NEXT: s_mov_b32 s14, s10
2000 ; GFX9-NEXT: s_mov_b32 s15, s11
2001 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2002 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
2003 ; GFX9-NEXT: s_mov_b32 s8, s4
2004 ; GFX9-NEXT: s_mov_b32 s9, s5
2005 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2006 ; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1
2007 ; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0
2008 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0
2009 ; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0
2010 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v1
2011 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
2012 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2013 ; GFX9-NEXT: s_endpgm
2015 ; GFX10-LABEL: v_mul_i64:
2016 ; GFX10: ; %bb.0: ; %entry
2017 ; GFX10-NEXT: s_clause 0x1
2018 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2019 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2020 ; GFX10-NEXT: s_mov_b32 s10, -1
2021 ; GFX10-NEXT: s_mov_b32 s11, 0x31016000
2022 ; GFX10-NEXT: s_mov_b32 s2, s10
2023 ; GFX10-NEXT: s_mov_b32 s3, s11
2024 ; GFX10-NEXT: s_mov_b32 s14, s10
2025 ; GFX10-NEXT: s_mov_b32 s15, s11
2026 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2027 ; GFX10-NEXT: s_mov_b32 s12, s6
2028 ; GFX10-NEXT: s_mov_b32 s13, s7
2029 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2030 ; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
2031 ; GFX10-NEXT: s_mov_b32 s8, s4
2032 ; GFX10-NEXT: s_mov_b32 s9, s5
2033 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2034 ; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1
2035 ; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0
2036 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0
2037 ; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0
2038 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1
2039 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
2040 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2041 ; GFX10-NEXT: s_endpgm
2043 ; GFX11-LABEL: v_mul_i64:
2044 ; GFX11: ; %bb.0: ; %entry
2045 ; GFX11-NEXT: s_clause 0x1
2046 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
2047 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
2048 ; GFX11-NEXT: s_mov_b32 s10, -1
2049 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
2050 ; GFX11-NEXT: s_mov_b32 s2, s10
2051 ; GFX11-NEXT: s_mov_b32 s3, s11
2052 ; GFX11-NEXT: s_mov_b32 s14, s10
2053 ; GFX11-NEXT: s_mov_b32 s15, s11
2054 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2055 ; GFX11-NEXT: s_mov_b32 s12, s6
2056 ; GFX11-NEXT: s_mov_b32 s13, s7
2057 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
2058 ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
2059 ; GFX11-NEXT: s_mov_b32 s8, s4
2060 ; GFX11-NEXT: s_mov_b32 s9, s5
2061 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2062 ; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
2063 ; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0
2064 ; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0
2065 ; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0
2066 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2067 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1
2068 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
2069 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
2070 ; GFX11-NEXT: s_nop 0
2071 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2072 ; GFX11-NEXT: s_endpgm
2074 ; GFX12-LABEL: v_mul_i64:
2075 ; GFX12: ; %bb.0: ; %entry
2076 ; GFX12-NEXT: s_clause 0x1
2077 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
2078 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
2079 ; GFX12-NEXT: s_mov_b32 s10, -1
2080 ; GFX12-NEXT: s_mov_b32 s11, 0x31016000
2081 ; GFX12-NEXT: s_mov_b32 s2, s10
2082 ; GFX12-NEXT: s_mov_b32 s3, s11
2083 ; GFX12-NEXT: s_mov_b32 s14, s10
2084 ; GFX12-NEXT: s_mov_b32 s15, s11
2085 ; GFX12-NEXT: s_wait_kmcnt 0x0
2086 ; GFX12-NEXT: s_mov_b32 s12, s6
2087 ; GFX12-NEXT: s_mov_b32 s13, s7
2088 ; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
2089 ; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null
2090 ; GFX12-NEXT: s_mov_b32 s8, s4
2091 ; GFX12-NEXT: s_mov_b32 s9, s5
2092 ; GFX12-NEXT: s_wait_loadcnt 0x0
2093 ; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
2094 ; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
2095 ; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2
2096 ; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
2097 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2098 ; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1
2099 ; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4
2100 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2101 ; GFX12-NEXT: s_nop 0
2102 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2103 ; GFX12-NEXT: s_endpgm
2105 ; EG-LABEL: v_mul_i64:
2106 ; EG: ; %bb.0: ; %entry
2107 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2109 ; EG-NEXT: ALU 7, @12, KC0[CB0:0-32], KC1[]
2110 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
2113 ; EG-NEXT: Fetch clause starting at 6:
2114 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
2115 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
2116 ; EG-NEXT: ALU clause starting at 10:
2117 ; EG-NEXT: MOV T0.X, KC0[2].Z,
2118 ; EG-NEXT: MOV * T1.X, KC0[2].W,
2119 ; EG-NEXT: ALU clause starting at 12:
2120 ; EG-NEXT: MULHI * T0.Z, T0.X, T1.X,
2121 ; EG-NEXT: MULLO_INT * T0.W, T0.X, T1.Y,
2122 ; EG-NEXT: ADD_INT T0.W, T0.Z, PS,
2123 ; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.X,
2124 ; EG-NEXT: ADD_INT * T0.Y, PV.W, PS,
2125 ; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
2126 ; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X,
2127 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Both operands are read from global memory (8-byte aligned) before the
; multiply; the product is written to %out with the same alignment.
2129 %a = load i64, ptr addrspace(1) %aptr, align 8
2130 %b = load i64, ptr addrspace(1) %bptr, align 8
2131 %mul = mul i64 %a, %b
2132 store i64 %mul, ptr addrspace(1) %out, align 8
; mul32_in_branch branches on whether the i32 kernel argument %a is zero.
; The %if path loads an i32 from %in; a phi then selects between that loaded
; value (%1, from %if) and %2 (from %else), and the selected value is stored
; to %out. In the expected GCN code the %else section holds a single
; s_mul_i32, and the EG checks contain a MULLO_INT.
2136 define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) {
2137 ; SI-LABEL: mul32_in_branch:
2138 ; SI: ; %bb.0: ; %entry
2139 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
2140 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2141 ; SI-NEXT: s_cmp_lg_u32 s0, 0
2142 ; SI-NEXT: s_cbranch_scc0 .LBB15_2
2143 ; SI-NEXT: ; %bb.1: ; %else
2144 ; SI-NEXT: s_mul_i32 s6, s0, s1
2145 ; SI-NEXT: s_mov_b64 s[4:5], 0
2146 ; SI-NEXT: s_branch .LBB15_3
2147 ; SI-NEXT: .LBB15_2:
2148 ; SI-NEXT: s_mov_b64 s[4:5], -1
2149 ; SI-NEXT: ; implicit-def: $sgpr6
2150 ; SI-NEXT: .LBB15_3: ; %Flow
2151 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
2152 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
2153 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2154 ; SI-NEXT: s_mov_b64 vcc, vcc
2155 ; SI-NEXT: s_cbranch_vccnz .LBB15_5
2156 ; SI-NEXT: ; %bb.4: ; %if
2157 ; SI-NEXT: s_mov_b32 s7, 0xf000
2158 ; SI-NEXT: s_mov_b32 s6, -1
2159 ; SI-NEXT: s_mov_b32 s4, s2
2160 ; SI-NEXT: s_mov_b32 s5, s3
2161 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
2162 ; SI-NEXT: s_branch .LBB15_6
2163 ; SI-NEXT: .LBB15_5:
2164 ; SI-NEXT: v_mov_b32_e32 v0, s6
2165 ; SI-NEXT: .LBB15_6: ; %endif
2166 ; SI-NEXT: s_mov_b32 s3, 0xf000
2167 ; SI-NEXT: s_mov_b32 s2, -1
2168 ; SI-NEXT: s_waitcnt vmcnt(0)
2169 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2172 ; VI-LABEL: mul32_in_branch:
2173 ; VI: ; %bb.0: ; %entry
2174 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2175 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2176 ; VI-NEXT: s_cmp_lg_u32 s0, 0
2177 ; VI-NEXT: s_cbranch_scc0 .LBB15_2
2178 ; VI-NEXT: ; %bb.1: ; %else
2179 ; VI-NEXT: s_mul_i32 s6, s0, s1
2180 ; VI-NEXT: s_mov_b64 s[4:5], 0
2181 ; VI-NEXT: s_branch .LBB15_3
2182 ; VI-NEXT: .LBB15_2:
2183 ; VI-NEXT: s_mov_b64 s[4:5], -1
2184 ; VI-NEXT: ; implicit-def: $sgpr6
2185 ; VI-NEXT: .LBB15_3: ; %Flow
2186 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2187 ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
2188 ; VI-NEXT: s_cbranch_vccnz .LBB15_5
2189 ; VI-NEXT: ; %bb.4: ; %if
2190 ; VI-NEXT: s_mov_b32 s7, 0xf000
2191 ; VI-NEXT: s_mov_b32 s6, -1
2192 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2193 ; VI-NEXT: s_mov_b32 s4, s2
2194 ; VI-NEXT: s_mov_b32 s5, s3
2195 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
2196 ; VI-NEXT: s_branch .LBB15_6
2197 ; VI-NEXT: .LBB15_5:
2198 ; VI-NEXT: v_mov_b32_e32 v0, s6
2199 ; VI-NEXT: .LBB15_6: ; %endif
2200 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2201 ; VI-NEXT: s_mov_b32 s3, 0xf000
2202 ; VI-NEXT: s_mov_b32 s2, -1
2203 ; VI-NEXT: s_waitcnt vmcnt(0)
2204 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2207 ; GFX9-LABEL: mul32_in_branch:
2208 ; GFX9: ; %bb.0: ; %entry
2209 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2210 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2211 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0
2212 ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2
2213 ; GFX9-NEXT: ; %bb.1: ; %else
2214 ; GFX9-NEXT: s_mul_i32 s8, s0, s1
2215 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2216 ; GFX9-NEXT: s_branch .LBB15_3
2217 ; GFX9-NEXT: .LBB15_2:
2218 ; GFX9-NEXT: s_mov_b64 s[0:1], -1
2219 ; GFX9-NEXT: ; implicit-def: $sgpr8
2220 ; GFX9-NEXT: .LBB15_3: ; %Flow
2221 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2222 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
2223 ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
2224 ; GFX9-NEXT: ; %bb.4: ; %if
2225 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
2226 ; GFX9-NEXT: s_mov_b32 s2, -1
2227 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2228 ; GFX9-NEXT: s_mov_b32 s0, s6
2229 ; GFX9-NEXT: s_mov_b32 s1, s7
2230 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0
2231 ; GFX9-NEXT: s_branch .LBB15_6
2232 ; GFX9-NEXT: .LBB15_5:
2233 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
2234 ; GFX9-NEXT: .LBB15_6: ; %endif
2235 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2236 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
2237 ; GFX9-NEXT: s_mov_b32 s6, -1
2238 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2239 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
2240 ; GFX9-NEXT: s_endpgm
2242 ; GFX10-LABEL: mul32_in_branch:
2243 ; GFX10: ; %bb.0: ; %entry
2244 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
2245 ; GFX10-NEXT: s_mov_b32 s8, 0
2246 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2247 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0
2248 ; GFX10-NEXT: s_cbranch_scc0 .LBB15_2
2249 ; GFX10-NEXT: ; %bb.1: ; %else
2250 ; GFX10-NEXT: s_mul_i32 s0, s0, s1
2251 ; GFX10-NEXT: s_branch .LBB15_3
2252 ; GFX10-NEXT: .LBB15_2:
2253 ; GFX10-NEXT: s_mov_b32 s8, -1
2254 ; GFX10-NEXT: ; implicit-def: $sgpr0
2255 ; GFX10-NEXT: .LBB15_3: ; %Flow
2256 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2257 ; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
2258 ; GFX10-NEXT: s_cbranch_vccnz .LBB15_5
2259 ; GFX10-NEXT: ; %bb.4: ; %if
2260 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
2261 ; GFX10-NEXT: s_mov_b32 s2, -1
2262 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2263 ; GFX10-NEXT: s_mov_b32 s0, s6
2264 ; GFX10-NEXT: s_mov_b32 s1, s7
2265 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0
2266 ; GFX10-NEXT: s_branch .LBB15_6
2267 ; GFX10-NEXT: .LBB15_5:
2268 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
2269 ; GFX10-NEXT: .LBB15_6: ; %endif
2270 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2271 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
2272 ; GFX10-NEXT: s_mov_b32 s6, -1
2273 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2274 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
2275 ; GFX10-NEXT: s_endpgm
2277 ; GFX11-LABEL: mul32_in_branch:
2278 ; GFX11: ; %bb.0: ; %entry
2279 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
2280 ; GFX11-NEXT: s_mov_b32 s4, 0
2281 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2282 ; GFX11-NEXT: s_cmp_lg_u32 s0, 0
2283 ; GFX11-NEXT: s_cbranch_scc0 .LBB15_2
2284 ; GFX11-NEXT: ; %bb.1: ; %else
2285 ; GFX11-NEXT: s_mul_i32 s5, s0, s1
2286 ; GFX11-NEXT: s_branch .LBB15_3
2287 ; GFX11-NEXT: .LBB15_2:
2288 ; GFX11-NEXT: s_mov_b32 s4, -1
2289 ; GFX11-NEXT: ; implicit-def: $sgpr5
2290 ; GFX11-NEXT: .LBB15_3: ; %Flow
2291 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2292 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
2293 ; GFX11-NEXT: s_cbranch_vccnz .LBB15_5
2294 ; GFX11-NEXT: ; %bb.4: ; %if
2295 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
2296 ; GFX11-NEXT: s_mov_b32 s6, -1
2297 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2298 ; GFX11-NEXT: s_mov_b32 s4, s2
2299 ; GFX11-NEXT: s_mov_b32 s5, s3
2300 ; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
2301 ; GFX11-NEXT: s_branch .LBB15_6
2302 ; GFX11-NEXT: .LBB15_5:
2303 ; GFX11-NEXT: v_mov_b32_e32 v0, s5
2304 ; GFX11-NEXT: .LBB15_6: ; %endif
2305 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2306 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
2307 ; GFX11-NEXT: s_mov_b32 s2, -1
2308 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2309 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
2310 ; GFX11-NEXT: s_nop 0
2311 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2312 ; GFX11-NEXT: s_endpgm
2314 ; GFX12-LABEL: mul32_in_branch:
2315 ; GFX12: ; %bb.0: ; %entry
2316 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
2317 ; GFX12-NEXT: s_mov_b32 s4, 0
2318 ; GFX12-NEXT: s_wait_kmcnt 0x0
2319 ; GFX12-NEXT: s_cmp_lg_u32 s0, 0
2320 ; GFX12-NEXT: s_cbranch_scc0 .LBB15_2
2321 ; GFX12-NEXT: ; %bb.1: ; %else
2322 ; GFX12-NEXT: s_mul_i32 s5, s0, s1
2323 ; GFX12-NEXT: s_branch .LBB15_3
2324 ; GFX12-NEXT: .LBB15_2:
2325 ; GFX12-NEXT: s_mov_b32 s4, -1
2326 ; GFX12-NEXT: ; implicit-def: $sgpr5
2327 ; GFX12-NEXT: .LBB15_3: ; %Flow
2328 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2329 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
2330 ; GFX12-NEXT: s_cbranch_vccnz .LBB15_5
2331 ; GFX12-NEXT: ; %bb.4: ; %if
2332 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
2333 ; GFX12-NEXT: s_mov_b32 s6, -1
2334 ; GFX12-NEXT: s_wait_kmcnt 0x0
2335 ; GFX12-NEXT: s_mov_b32 s4, s2
2336 ; GFX12-NEXT: s_mov_b32 s5, s3
2337 ; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
2338 ; GFX12-NEXT: s_branch .LBB15_6
2339 ; GFX12-NEXT: .LBB15_5:
2340 ; GFX12-NEXT: v_mov_b32_e32 v0, s5
2341 ; GFX12-NEXT: .LBB15_6: ; %endif
2342 ; GFX12-NEXT: s_wait_kmcnt 0x0
2343 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000
2344 ; GFX12-NEXT: s_mov_b32 s2, -1
2345 ; GFX12-NEXT: s_wait_loadcnt 0x0
2346 ; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
2347 ; GFX12-NEXT: s_nop 0
2348 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2349 ; GFX12-NEXT: s_endpgm
2351 ; EG-LABEL: mul32_in_branch:
2352 ; EG: ; %bb.0: ; %entry
2353 ; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
2354 ; EG-NEXT: JUMP @3 POP:1
2355 ; EG-NEXT: ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[]
2356 ; EG-NEXT: ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[]
2357 ; EG-NEXT: JUMP @8 POP:1
2358 ; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
2359 ; EG-NEXT: TEX 0 @12
2360 ; EG-NEXT: POP @8 POP:1
2361 ; EG-NEXT: ALU 1, @27, KC0[], KC1[]
2362 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2365 ; EG-NEXT: Fetch clause starting at 12:
2366 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2367 ; EG-NEXT: ALU clause starting at 14:
2368 ; EG-NEXT: MOV T0.W, literal.x,
2369 ; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0,
2370 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
2371 ; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2372 ; EG-NEXT: ALU clause starting at 18:
2373 ; EG-NEXT: MOV T1.W, KC0[2].W,
2374 ; EG-NEXT: MOV * T2.W, KC0[3].X,
2375 ; EG-NEXT: MOV T0.W, literal.x,
2376 ; EG-NEXT: MULLO_INT * T0.X, PV.W, PS,
2377 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
2378 ; EG-NEXT: ALU clause starting at 23:
2379 ; EG-NEXT: MOV T1.W, KC0[2].Y,
2380 ; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0,
2381 ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2382 ; EG-NEXT: ALU clause starting at 26:
2383 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2384 ; EG-NEXT: ALU clause starting at 27:
2385 ; EG-NEXT: LSHR * T1.X, T1.W, literal.x,
2386 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Take the %if path when %a is zero, the %else path otherwise.
2388 %0 = icmp eq i32 %a, 0
2389 br i1 %0, label %if, label %else
; %if path reads an i32 from %in.
2392 %1 = load i32, ptr addrspace(1) %in
; Merge the value produced by whichever path ran and write it to %out.
2400 %3 = phi i32 [%1, %if], [%2, %else]
2401 store i32 %3, ptr addrspace(1) %out
2405 define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
2406 ; SI-LABEL: mul64_in_branch:
2407 ; SI: ; %bb.0: ; %entry
2408 ; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
2409 ; SI-NEXT: s_mov_b64 s[8:9], 0
2410 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2411 ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
2412 ; SI-NEXT: s_and_b64 vcc, exec, s[10:11]
2413 ; SI-NEXT: s_cbranch_vccz .LBB16_4
2414 ; SI-NEXT: ; %bb.1: ; %else
2415 ; SI-NEXT: v_mov_b32_e32 v0, s6
2416 ; SI-NEXT: v_mul_hi_u32 v0, s4, v0
2417 ; SI-NEXT: s_mul_i32 s7, s4, s7
2418 ; SI-NEXT: s_mul_i32 s5, s5, s6
2419 ; SI-NEXT: s_mul_i32 s4, s4, s6
2420 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0
2421 ; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0
2422 ; SI-NEXT: v_mov_b32_e32 v0, s4
2423 ; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
2424 ; SI-NEXT: s_cbranch_vccnz .LBB16_3
2425 ; SI-NEXT: .LBB16_2: ; %if
2426 ; SI-NEXT: s_mov_b32 s7, 0xf000
2427 ; SI-NEXT: s_mov_b32 s6, -1
2428 ; SI-NEXT: s_mov_b32 s4, s2
2429 ; SI-NEXT: s_mov_b32 s5, s3
2430 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2431 ; SI-NEXT: .LBB16_3: ; %endif
2432 ; SI-NEXT: s_mov_b32 s3, 0xf000
2433 ; SI-NEXT: s_mov_b32 s2, -1
2434 ; SI-NEXT: s_waitcnt vmcnt(0)
2435 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2437 ; SI-NEXT: .LBB16_4:
2438 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
2439 ; SI-NEXT: s_branch .LBB16_2
2441 ; VI-LABEL: mul64_in_branch:
2442 ; VI: ; %bb.0: ; %entry
2443 ; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
2444 ; VI-NEXT: s_mov_b64 s[8:9], 0
2445 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2446 ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
2447 ; VI-NEXT: s_cbranch_scc0 .LBB16_4
2448 ; VI-NEXT: ; %bb.1: ; %else
2449 ; VI-NEXT: v_mov_b32_e32 v0, s6
2450 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
2451 ; VI-NEXT: s_mul_i32 s4, s4, s7
2452 ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
2453 ; VI-NEXT: s_mul_i32 s4, s5, s6
2454 ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
2455 ; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
2456 ; VI-NEXT: s_cbranch_vccnz .LBB16_3
2457 ; VI-NEXT: .LBB16_2: ; %if
2458 ; VI-NEXT: s_mov_b32 s7, 0xf000
2459 ; VI-NEXT: s_mov_b32 s6, -1
2460 ; VI-NEXT: s_mov_b32 s4, s2
2461 ; VI-NEXT: s_mov_b32 s5, s3
2462 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2463 ; VI-NEXT: .LBB16_3: ; %endif
2464 ; VI-NEXT: s_mov_b32 s3, 0xf000
2465 ; VI-NEXT: s_mov_b32 s2, -1
2466 ; VI-NEXT: s_waitcnt vmcnt(0)
2467 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2469 ; VI-NEXT: .LBB16_4:
2470 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
2471 ; VI-NEXT: s_branch .LBB16_2
2473 ; GFX9-LABEL: mul64_in_branch:
2474 ; GFX9: ; %bb.0: ; %entry
2475 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
2476 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2477 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2478 ; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
2479 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_3
2480 ; GFX9-NEXT: ; %bb.1: ; %else
2481 ; GFX9-NEXT: s_mul_i32 s2, s8, s11
2482 ; GFX9-NEXT: s_mul_hi_u32 s3, s8, s10
2483 ; GFX9-NEXT: s_add_i32 s2, s3, s2
2484 ; GFX9-NEXT: s_mul_i32 s3, s9, s10
2485 ; GFX9-NEXT: s_add_i32 s3, s2, s3
2486 ; GFX9-NEXT: s_mul_i32 s2, s8, s10
2487 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
2488 ; GFX9-NEXT: s_cbranch_vccnz .LBB16_4
2489 ; GFX9-NEXT: .LBB16_2: ; %if
2490 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
2491 ; GFX9-NEXT: s_mov_b32 s2, -1
2492 ; GFX9-NEXT: s_mov_b32 s0, s6
2493 ; GFX9-NEXT: s_mov_b32 s1, s7
2494 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2495 ; GFX9-NEXT: s_branch .LBB16_5
2496 ; GFX9-NEXT: .LBB16_3:
2497 ; GFX9-NEXT: ; implicit-def: $sgpr2_sgpr3
2498 ; GFX9-NEXT: s_branch .LBB16_2
2499 ; GFX9-NEXT: .LBB16_4:
2500 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2501 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2502 ; GFX9-NEXT: .LBB16_5: ; %endif
2503 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
2504 ; GFX9-NEXT: s_mov_b32 s6, -1
2505 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2506 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2507 ; GFX9-NEXT: s_endpgm
2509 ; GFX10-LABEL: mul64_in_branch:
2510 ; GFX10: ; %bb.0: ; %entry
2511 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
2512 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2513 ; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
2514 ; GFX10-NEXT: s_cbranch_scc0 .LBB16_3
2515 ; GFX10-NEXT: ; %bb.1: ; %else
2516 ; GFX10-NEXT: s_mul_i32 s0, s8, s11
2517 ; GFX10-NEXT: s_mul_hi_u32 s1, s8, s10
2518 ; GFX10-NEXT: s_mul_i32 s2, s9, s10
2519 ; GFX10-NEXT: s_add_i32 s0, s1, s0
2520 ; GFX10-NEXT: s_add_i32 s1, s0, s2
2521 ; GFX10-NEXT: s_mul_i32 s0, s8, s10
2522 ; GFX10-NEXT: s_cbranch_execnz .LBB16_4
2523 ; GFX10-NEXT: .LBB16_2: ; %if
2524 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
2525 ; GFX10-NEXT: s_mov_b32 s2, -1
2526 ; GFX10-NEXT: s_mov_b32 s0, s6
2527 ; GFX10-NEXT: s_mov_b32 s1, s7
2528 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2529 ; GFX10-NEXT: s_branch .LBB16_5
2530 ; GFX10-NEXT: .LBB16_3:
2531 ; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
2532 ; GFX10-NEXT: s_branch .LBB16_2
2533 ; GFX10-NEXT: .LBB16_4:
2534 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
2535 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
2536 ; GFX10-NEXT: .LBB16_5: ; %endif
2537 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
2538 ; GFX10-NEXT: s_mov_b32 s6, -1
2539 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2540 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2541 ; GFX10-NEXT: s_endpgm
2543 ; GFX11-LABEL: mul64_in_branch:
2544 ; GFX11: ; %bb.0: ; %entry
2545 ; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
2546 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2547 ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
2548 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_3
2549 ; GFX11-NEXT: ; %bb.1: ; %else
2550 ; GFX11-NEXT: s_mul_i32 s7, s4, s7
2551 ; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
2552 ; GFX11-NEXT: s_mul_i32 s5, s5, s6
2553 ; GFX11-NEXT: s_add_i32 s7, s8, s7
2554 ; GFX11-NEXT: s_mul_i32 s4, s4, s6
2555 ; GFX11-NEXT: s_add_i32 s5, s7, s5
2556 ; GFX11-NEXT: s_cbranch_execnz .LBB16_4
2557 ; GFX11-NEXT: .LBB16_2: ; %if
2558 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
2559 ; GFX11-NEXT: s_mov_b32 s6, -1
2560 ; GFX11-NEXT: s_mov_b32 s4, s2
2561 ; GFX11-NEXT: s_mov_b32 s5, s3
2562 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
2563 ; GFX11-NEXT: s_branch .LBB16_5
2564 ; GFX11-NEXT: .LBB16_3:
2565 ; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
2566 ; GFX11-NEXT: s_branch .LBB16_2
2567 ; GFX11-NEXT: .LBB16_4:
2568 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2569 ; GFX11-NEXT: .LBB16_5: ; %endif
2570 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
2571 ; GFX11-NEXT: s_mov_b32 s2, -1
2572 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2573 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
2574 ; GFX11-NEXT: s_nop 0
2575 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2576 ; GFX11-NEXT: s_endpgm
2578 ; GFX12-LABEL: mul64_in_branch:
2579 ; GFX12: ; %bb.0: ; %entry
2580 ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
2581 ; GFX12-NEXT: s_wait_kmcnt 0x0
2582 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
2583 ; GFX12-NEXT: s_cbranch_scc0 .LBB16_3
2584 ; GFX12-NEXT: ; %bb.1: ; %else
2585 ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
2586 ; GFX12-NEXT: s_cbranch_execnz .LBB16_4
2587 ; GFX12-NEXT: .LBB16_2: ; %if
2588 ; GFX12-NEXT: s_mov_b32 s7, 0x31016000
2589 ; GFX12-NEXT: s_mov_b32 s6, -1
2590 ; GFX12-NEXT: s_mov_b32 s4, s2
2591 ; GFX12-NEXT: s_mov_b32 s5, s3
2592 ; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
2593 ; GFX12-NEXT: s_branch .LBB16_5
2594 ; GFX12-NEXT: .LBB16_3:
2595 ; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
2596 ; GFX12-NEXT: s_branch .LBB16_2
2597 ; GFX12-NEXT: .LBB16_4:
2598 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2599 ; GFX12-NEXT: .LBB16_5: ; %endif
2600 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000
2601 ; GFX12-NEXT: s_mov_b32 s2, -1
2602 ; GFX12-NEXT: s_wait_loadcnt 0x0
2603 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
2604 ; GFX12-NEXT: s_nop 0
2605 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2606 ; GFX12-NEXT: s_endpgm
2608 ; EG-LABEL: mul64_in_branch:
2609 ; EG: ; %bb.0: ; %entry
2610 ; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
2611 ; EG-NEXT: JUMP @3 POP:1
2612 ; EG-NEXT: ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[]
2613 ; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
2614 ; EG-NEXT: JUMP @8 POP:1
2615 ; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[]
2616 ; EG-NEXT: TEX 0 @12
2617 ; EG-NEXT: POP @8 POP:1
2618 ; EG-NEXT: ALU 1, @35, KC0[], KC1[]
2619 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2622 ; EG-NEXT: Fetch clause starting at 12:
2623 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
2624 ; EG-NEXT: ALU clause starting at 14:
2625 ; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X,
2626 ; EG-NEXT: MOV * T1.W, literal.x,
2627 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
2628 ; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0,
2629 ; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
2630 ; EG-NEXT: ALU clause starting at 19:
2631 ; EG-NEXT: MOV T0.W, KC0[2].W,
2632 ; EG-NEXT: MOV * T1.W, KC0[3].Z,
2633 ; EG-NEXT: MOV T2.W, KC0[3].Y,
2634 ; EG-NEXT: MULLO_INT * T0.X, PV.W, PS,
2635 ; EG-NEXT: MOV T1.W, KC0[3].X,
2636 ; EG-NEXT: MULHI * T0.Y, T0.W, PV.W,
2637 ; EG-NEXT: ADD_INT T3.W, PS, T0.X,
2638 ; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
2639 ; EG-NEXT: ADD_INT T0.Y, PV.W, PS,
2640 ; EG-NEXT: MOV T1.W, literal.x,
2641 ; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W,
2642 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
2643 ; EG-NEXT: ALU clause starting at 31:
2644 ; EG-NEXT: MOV T0.W, KC0[2].Y,
2645 ; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0,
2646 ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2647 ; EG-NEXT: ALU clause starting at 34:
2648 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2649 ; EG-NEXT: ALU clause starting at 35:
2650 ; EG-NEXT: LSHR * T1.X, T0.W, literal.x,
2651 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2653 %0 = icmp eq i64 %a, 0
2654 br i1 %0, label %if, label %else
2657 %1 = load i64, ptr addrspace(1) %in
2665 %3 = phi i64 [%1, %if], [%2, %else]
2666 store i64 %3, ptr addrspace(1) %out
; s_mul_i128: multiplies the two i128 kernel arguments %a and %b and stores the
; 128-bit product to %out. Each i128 argument is preceded by an unnamed
; [8 x i32] argument in the signature.
;
; The autogenerated checks below pin the expansion for every RUN line. As
; recorded here: the verde checks build the high halves with v_mul_hi_u32, the
; tonga checks use v_mad_u64_u32, the gfx900 / gfx1010 / gfx1100 checks stay on
; scalar s_mul_i32 + s_mul_hi_u32 with s_add_u32 / s_addc_u32 carry chains, and
; the gfx1200 checks use s_mul_u64 with s_add_nc_u64.
;
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py after any change to the IR.
2670 define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
2671 ; SI-LABEL: s_mul_i128:
2672 ; SI: ; %bb.0: ; %entry
2673 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x13
2674 ; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x1f
2675 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
2676 ; SI-NEXT: s_mov_b32 s3, 0xf000
2677 ; SI-NEXT: s_mov_b32 s2, -1
2678 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2679 ; SI-NEXT: v_mov_b32_e32 v0, s6
2680 ; SI-NEXT: v_mul_hi_u32 v0, s8, v0
2681 ; SI-NEXT: v_mov_b32_e32 v1, s4
2682 ; SI-NEXT: v_mul_hi_u32 v1, s10, v1
2683 ; SI-NEXT: s_mul_i32 s7, s8, s7
2684 ; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0
2685 ; SI-NEXT: s_mul_i32 s7, s10, s5
2686 ; SI-NEXT: s_mul_i32 s12, s9, s6
2687 ; SI-NEXT: s_mul_i32 s6, s8, s6
2688 ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1
2689 ; SI-NEXT: s_mul_i32 s7, s11, s4
2690 ; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
2691 ; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1
2692 ; SI-NEXT: s_mul_i32 s7, s10, s4
2693 ; SI-NEXT: v_mov_b32_e32 v2, s6
2694 ; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2
2695 ; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc
2696 ; SI-NEXT: v_mov_b32_e32 v1, s8
2697 ; SI-NEXT: v_mul_hi_u32 v5, s4, v1
2698 ; SI-NEXT: v_mul_hi_u32 v1, s5, v1
2699 ; SI-NEXT: v_mov_b32_e32 v3, s9
2700 ; SI-NEXT: v_mul_hi_u32 v4, s4, v3
2701 ; SI-NEXT: s_mul_i32 s7, s5, s8
2702 ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5
2703 ; SI-NEXT: s_mul_i32 s6, s4, s9
2704 ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
2705 ; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v5
2706 ; SI-NEXT: v_mul_hi_u32 v3, s5, v3
2707 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
2708 ; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4
2709 ; SI-NEXT: s_mul_i32 s5, s5, s9
2710 ; SI-NEXT: v_addc_u32_e64 v5, s[6:7], 0, 0, vcc
2711 ; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4
2712 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
2713 ; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
2714 ; SI-NEXT: s_mul_i32 s4, s4, s8
2715 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc
2716 ; SI-NEXT: v_mov_b32_e32 v0, s4
2717 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2720 ; VI-LABEL: s_mul_i128:
2721 ; VI: ; %bb.0: ; %entry
2722 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c
2723 ; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c
2724 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2725 ; VI-NEXT: v_mov_b32_e32 v5, 0
2726 ; VI-NEXT: s_mov_b32 s3, 0xf000
2727 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2728 ; VI-NEXT: v_mov_b32_e32 v0, s6
2729 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
2730 ; VI-NEXT: s_mul_i32 s7, s8, s7
2731 ; VI-NEXT: v_mov_b32_e32 v6, s8
2732 ; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3
2733 ; VI-NEXT: s_mul_i32 s12, s9, s6
2734 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
2735 ; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3
2736 ; VI-NEXT: v_mov_b32_e32 v4, v1
2737 ; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
2738 ; VI-NEXT: v_mov_b32_e32 v8, s4
2739 ; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
2740 ; VI-NEXT: v_mov_b32_e32 v3, v7
2741 ; VI-NEXT: v_mov_b32_e32 v7, v5
2742 ; VI-NEXT: v_mov_b32_e32 v8, s9
2743 ; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
2744 ; VI-NEXT: s_mul_i32 s8, s11, s4
2745 ; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2
2746 ; VI-NEXT: v_mov_b32_e32 v2, v5
2747 ; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2
2748 ; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
2749 ; VI-NEXT: s_mul_i32 s8, s10, s5
2750 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
2751 ; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6
2752 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
2753 ; VI-NEXT: s_mov_b32 s2, -1
2754 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
2755 ; VI-NEXT: v_mov_b32_e32 v1, v4
2756 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2759 ; GFX9-LABEL: s_mul_i128:
2760 ; GFX9: ; %bb.0: ; %entry
2761 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c
2762 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c
2763 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2764 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
2765 ; GFX9-NEXT: s_mov_b32 s2, -1
2766 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2767 ; GFX9-NEXT: s_mul_i32 s7, s8, s7
2768 ; GFX9-NEXT: s_mul_hi_u32 s12, s8, s6
2769 ; GFX9-NEXT: s_add_i32 s7, s12, s7
2770 ; GFX9-NEXT: s_mul_i32 s12, s9, s6
2771 ; GFX9-NEXT: s_add_i32 s7, s7, s12
2772 ; GFX9-NEXT: s_mul_i32 s12, s10, s5
2773 ; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
2774 ; GFX9-NEXT: s_add_i32 s12, s13, s12
2775 ; GFX9-NEXT: s_mul_i32 s11, s11, s4
2776 ; GFX9-NEXT: s_mul_i32 s6, s8, s6
2777 ; GFX9-NEXT: s_add_i32 s12, s12, s11
2778 ; GFX9-NEXT: s_mul_i32 s10, s10, s4
2779 ; GFX9-NEXT: s_add_u32 s10, s10, s6
2780 ; GFX9-NEXT: s_addc_u32 s11, s12, s7
2781 ; GFX9-NEXT: s_mul_i32 s14, s5, s8
2782 ; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
2783 ; GFX9-NEXT: s_mul_hi_u32 s13, s5, s8
2784 ; GFX9-NEXT: s_add_u32 s14, s14, s15
2785 ; GFX9-NEXT: s_mul_i32 s7, s4, s9
2786 ; GFX9-NEXT: s_addc_u32 s13, s13, 0
2787 ; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
2788 ; GFX9-NEXT: s_add_u32 s7, s7, s14
2789 ; GFX9-NEXT: s_addc_u32 s12, s12, 0
2790 ; GFX9-NEXT: s_add_u32 s12, s13, s12
2791 ; GFX9-NEXT: s_addc_u32 s13, 0, 0
2792 ; GFX9-NEXT: s_mul_hi_u32 s14, s5, s9
2793 ; GFX9-NEXT: s_mul_i32 s5, s5, s9
2794 ; GFX9-NEXT: s_add_u32 s5, s5, s12
2795 ; GFX9-NEXT: s_mov_b32 s6, 0
2796 ; GFX9-NEXT: s_addc_u32 s9, s14, s13
2797 ; GFX9-NEXT: s_add_u32 s10, s5, s10
2798 ; GFX9-NEXT: s_mul_i32 s4, s4, s8
2799 ; GFX9-NEXT: s_mov_b32 s5, s6
2800 ; GFX9-NEXT: s_addc_u32 s9, s9, s11
2801 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
2802 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2803 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2804 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
2805 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
2806 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2807 ; GFX9-NEXT: s_endpgm
2809 ; GFX10-LABEL: s_mul_i128:
2810 ; GFX10: ; %bb.0: ; %entry
2811 ; GFX10-NEXT: s_clause 0x2
2812 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x4c
2813 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x7c
2814 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2815 ; GFX10-NEXT: s_mov_b32 s12, 0
2816 ; GFX10-NEXT: s_mov_b32 s3, s12
2817 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2818 ; GFX10-NEXT: s_mul_i32 s2, s8, s7
2819 ; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6
2820 ; GFX10-NEXT: s_mul_i32 s14, s10, s5
2821 ; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4
2822 ; GFX10-NEXT: s_mul_i32 s13, s9, s6
2823 ; GFX10-NEXT: s_mul_i32 s11, s11, s4
2824 ; GFX10-NEXT: s_add_i32 s2, s7, s2
2825 ; GFX10-NEXT: s_add_i32 s7, s15, s14
2826 ; GFX10-NEXT: s_mul_i32 s6, s8, s6
2827 ; GFX10-NEXT: s_mul_i32 s10, s10, s4
2828 ; GFX10-NEXT: s_add_i32 s2, s2, s13
2829 ; GFX10-NEXT: s_add_i32 s7, s7, s11
2830 ; GFX10-NEXT: s_mul_i32 s19, s5, s8
2831 ; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
2832 ; GFX10-NEXT: s_add_u32 s6, s10, s6
2833 ; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8
2834 ; GFX10-NEXT: s_addc_u32 s7, s7, s2
2835 ; GFX10-NEXT: s_mul_i32 s17, s4, s9
2836 ; GFX10-NEXT: s_add_u32 s2, s19, s20
2837 ; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9
2838 ; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9
2839 ; GFX10-NEXT: s_mul_i32 s5, s5, s9
2840 ; GFX10-NEXT: s_addc_u32 s9, s18, 0
2841 ; GFX10-NEXT: s_add_u32 s13, s17, s2
2842 ; GFX10-NEXT: s_addc_u32 s10, s16, 0
2843 ; GFX10-NEXT: s_mul_i32 s2, s4, s8
2844 ; GFX10-NEXT: s_add_u32 s4, s9, s10
2845 ; GFX10-NEXT: s_addc_u32 s8, 0, 0
2846 ; GFX10-NEXT: s_add_u32 s4, s5, s4
2847 ; GFX10-NEXT: s_addc_u32 s5, s21, s8
2848 ; GFX10-NEXT: s_add_u32 s4, s4, s6
2849 ; GFX10-NEXT: s_addc_u32 s5, s5, s7
2850 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13]
2851 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
2852 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
2853 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
2854 ; GFX10-NEXT: v_mov_b32_e32 v3, s5
2855 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
2856 ; GFX10-NEXT: s_mov_b32 s2, -1
2857 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2858 ; GFX10-NEXT: s_endpgm
2860 ; GFX11-LABEL: s_mul_i128:
2861 ; GFX11: ; %bb.0: ; %entry
2862 ; GFX11-NEXT: s_clause 0x2
2863 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x4c
2864 ; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x7c
2865 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2866 ; GFX11-NEXT: s_mov_b32 s12, 0
2867 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2868 ; GFX11-NEXT: s_mov_b32 s3, s12
2869 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2870 ; GFX11-NEXT: s_mul_i32 s2, s8, s7
2871 ; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6
2872 ; GFX11-NEXT: s_mul_i32 s14, s10, s5
2873 ; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4
2874 ; GFX11-NEXT: s_mul_i32 s13, s9, s6
2875 ; GFX11-NEXT: s_mul_i32 s11, s11, s4
2876 ; GFX11-NEXT: s_add_i32 s2, s7, s2
2877 ; GFX11-NEXT: s_add_i32 s7, s15, s14
2878 ; GFX11-NEXT: s_mul_i32 s6, s8, s6
2879 ; GFX11-NEXT: s_mul_i32 s10, s10, s4
2880 ; GFX11-NEXT: s_add_i32 s2, s2, s13
2881 ; GFX11-NEXT: s_add_i32 s7, s7, s11
2882 ; GFX11-NEXT: s_mul_i32 s19, s5, s8
2883 ; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8
2884 ; GFX11-NEXT: s_add_u32 s6, s10, s6
2885 ; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8
2886 ; GFX11-NEXT: s_addc_u32 s7, s7, s2
2887 ; GFX11-NEXT: s_mul_i32 s17, s4, s9
2888 ; GFX11-NEXT: s_add_u32 s2, s19, s20
2889 ; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9
2890 ; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9
2891 ; GFX11-NEXT: s_mul_i32 s5, s5, s9
2892 ; GFX11-NEXT: s_addc_u32 s9, s18, 0
2893 ; GFX11-NEXT: s_add_u32 s13, s17, s2
2894 ; GFX11-NEXT: s_addc_u32 s10, s16, 0
2895 ; GFX11-NEXT: s_mul_i32 s2, s4, s8
2896 ; GFX11-NEXT: s_add_u32 s4, s9, s10
2897 ; GFX11-NEXT: s_addc_u32 s8, 0, 0
2898 ; GFX11-NEXT: s_add_u32 s4, s5, s4
2899 ; GFX11-NEXT: s_addc_u32 s5, s21, s8
2900 ; GFX11-NEXT: s_add_u32 s4, s4, s6
2901 ; GFX11-NEXT: s_addc_u32 s5, s5, s7
2902 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13]
2903 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2904 ; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
2905 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
2906 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
2907 ; GFX11-NEXT: s_mov_b32 s2, -1
2908 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
2909 ; GFX11-NEXT: s_nop 0
2910 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2911 ; GFX11-NEXT: s_endpgm
2913 ; GFX12-LABEL: s_mul_i128:
2914 ; GFX12: ; %bb.0: ; %entry
2915 ; GFX12-NEXT: s_clause 0x1
2916 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x7c
2917 ; GFX12-NEXT: s_load_b128 s[8:11], s[2:3], 0x4c
2918 ; GFX12-NEXT: s_mov_b32 s13, 0
2919 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2920 ; GFX12-NEXT: s_mov_b32 s15, s13
2921 ; GFX12-NEXT: s_mov_b32 s3, s13
2922 ; GFX12-NEXT: s_mov_b32 s17, s13
2923 ; GFX12-NEXT: s_mov_b32 s19, s13
2924 ; GFX12-NEXT: s_mov_b32 s24, s13
2925 ; GFX12-NEXT: s_wait_kmcnt 0x0
2926 ; GFX12-NEXT: s_mov_b32 s12, s4
2927 ; GFX12-NEXT: s_mov_b32 s14, s8
2928 ; GFX12-NEXT: s_mov_b32 s2, s9
2929 ; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13]
2930 ; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13]
2931 ; GFX12-NEXT: s_mov_b32 s12, s23
2932 ; GFX12-NEXT: s_mov_b32 s16, s5
2933 ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11]
2934 ; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[12:13]
2935 ; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9]
2936 ; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17]
2937 ; GFX12-NEXT: s_mov_b32 s12, s11
2938 ; GFX12-NEXT: s_mov_b32 s11, s13
2939 ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5]
2940 ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11]
2941 ; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[16:17]
2942 ; GFX12-NEXT: s_mov_b32 s18, s7
2943 ; GFX12-NEXT: s_mov_b32 s25, s6
2944 ; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19]
2945 ; GFX12-NEXT: s_mov_b32 s23, s13
2946 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7]
2947 ; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25]
2948 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
2949 ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
2950 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2951 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000
2952 ; GFX12-NEXT: s_mov_b32 s2, -1
2953 ; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
2954 ; GFX12-NEXT: s_nop 0
2955 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2956 ; GFX12-NEXT: s_endpgm
2958 ; EG-LABEL: s_mul_i128:
2959 ; EG: ; %bb.0: ; %entry
2960 ; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[]
2961 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2964 ; EG-NEXT: ALU clause starting at 4:
2965 ; EG-NEXT: MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
2966 ; EG-NEXT: MULHI * T0.Y, KC0[5].X, KC0[8].X,
2967 ; EG-NEXT: MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
2968 ; EG-NEXT: MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
2969 ; EG-NEXT: MULHI * T1.X, KC0[5].X, KC0[7].W,
2970 ; EG-NEXT: MULHI * T1.Y, KC0[4].W, KC0[8].X,
2971 ; EG-NEXT: MULHI * T1.Z, KC0[8].Y, KC0[4].W,
2972 ; EG-NEXT: MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
2973 ; EG-NEXT: MULHI * T2.X, KC0[7].W, KC0[5].Y,
2974 ; EG-NEXT: MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
2975 ; EG-NEXT: MULHI * T2.Z, KC0[4].W, KC0[7].W,
2976 ; EG-NEXT: ADD_INT T2.W, T2.Y, PS,
2977 ; EG-NEXT: MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
2978 ; EG-NEXT: ADDC_UINT T2.Z, T2.Y, T2.Z,
2979 ; EG-NEXT: ADDC_UINT T3.W, PS, PV.W,
2980 ; EG-NEXT: MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
2981 ; EG-NEXT: ADD_INT T2.X, T2.X, PS,
2982 ; EG-NEXT: ADD_INT T2.Y, T1.Z, T1.W,
2983 ; EG-NEXT: ADD_INT T1.Z, T1.Y, PV.W,
2984 ; EG-NEXT: ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
2985 ; EG-NEXT: MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
2986 ; EG-NEXT: ADD_INT T4.X, PV.W, PV.Z,
2987 ; EG-NEXT: ADDC_UINT T1.Y, PV.W, PV.Z,
2988 ; EG-NEXT: ADD_INT T1.Z, PV.Y, PS,
2989 ; EG-NEXT: ADD_INT T0.W, PV.X, T0.W,
2990 ; EG-NEXT: MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
2991 ; EG-NEXT: ADD_INT T2.Y, PV.Z, PV.W,
2992 ; EG-NEXT: ADDC_UINT T1.Z, T0.Z, PS,
2993 ; EG-NEXT: ADD_INT T0.W, T0.Y, PV.Y,
2994 ; EG-NEXT: ADDC_UINT * T1.W, T0.X, PV.X,
2995 ; EG-NEXT: ADD_INT T0.Y, T0.X, T4.X,
2996 ; EG-NEXT: ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
2997 ; EG-NEXT: ADD_INT T0.W, PV.W, PS,
2998 ; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z,
2999 ; EG-NEXT: ADD_INT T0.W, PV.W, PS,
3000 ; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z,
3001 ; EG-NEXT: ADD_INT * T0.W, PV.W, PS,
3002 ; EG-NEXT: ADD_INT * T0.Z, T0.Y, T0.Z,
3003 ; EG-NEXT: ADD_INT * T0.Y, T3.X, T2.W,
3004 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3005 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3006 ; EG-NEXT: MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
; IR under test: one i128 multiply of the two kernel arguments, stored to %out.
3008 %mul = mul i128 %a, %b
3009 store i128 %mul, ptr addrspace(1) %out
; v_mul_i128: per-workitem i128 multiply. Each workitem indexes %aptr and %bptr
; by llvm.amdgcn.workitem.id.x, loads one i128 from each, multiplies them and
; stores the 128-bit product through %gep.out.
;
; NOTE(review): %gep.out is computed from %bptr, not %out, so %out is never
; referenced in the body and the product is written back over the b operand.
; The checks below encode that (for example the gfx900 checks load the second
; operand from s[6:7] and store to s[6:7]). This looks unintentional - confirm
; before changing, and regenerate every check block with
; utils/update_llc_test_checks.py if the IR is edited.
3013 define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
3014 ; SI-LABEL: v_mul_i128:
3015 ; SI: ; %bb.0: ; %entry
3016 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xb
3017 ; SI-NEXT: s_mov_b32 s7, 0xf000
3018 ; SI-NEXT: s_mov_b32 s6, 0
3019 ; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0
3020 ; SI-NEXT: v_mov_b32_e32 v9, 0
3021 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3022 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
3023 ; SI-NEXT: s_mov_b64 s[0:1], s[2:3]
3024 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
3025 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
3026 ; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
3027 ; SI-NEXT: s_waitcnt vmcnt(0)
3028 ; SI-NEXT: v_mul_lo_u32 v3, v4, v3
3029 ; SI-NEXT: v_mul_hi_u32 v10, v4, v2
3030 ; SI-NEXT: v_mul_lo_u32 v12, v6, v1
3031 ; SI-NEXT: v_mul_hi_u32 v13, v6, v0
3032 ; SI-NEXT: v_mul_lo_u32 v17, v1, v4
3033 ; SI-NEXT: v_mul_hi_u32 v18, v0, v4
3034 ; SI-NEXT: v_mul_lo_u32 v11, v5, v2
3035 ; SI-NEXT: v_mul_lo_u32 v7, v7, v0
3036 ; SI-NEXT: v_mul_hi_u32 v16, v1, v4
3037 ; SI-NEXT: v_mul_lo_u32 v15, v0, v5
3038 ; SI-NEXT: v_mul_hi_u32 v14, v0, v5
3039 ; SI-NEXT: v_mul_hi_u32 v19, v1, v5
3040 ; SI-NEXT: v_mul_lo_u32 v5, v1, v5
3041 ; SI-NEXT: v_add_i32_e32 v1, vcc, v10, v3
3042 ; SI-NEXT: v_add_i32_e32 v3, vcc, v13, v12
3043 ; SI-NEXT: v_mul_lo_u32 v2, v4, v2
3044 ; SI-NEXT: v_mul_lo_u32 v6, v6, v0
3045 ; SI-NEXT: v_mul_lo_u32 v0, v0, v4
3046 ; SI-NEXT: v_add_i32_e32 v4, vcc, v17, v18
3047 ; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v16, vcc
3048 ; SI-NEXT: v_add_i32_e32 v11, vcc, v1, v11
3049 ; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
3050 ; SI-NEXT: v_add_i32_e32 v1, vcc, v15, v4
3051 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc
3052 ; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2
3053 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
3054 ; SI-NEXT: v_add_i32_e32 v4, vcc, v10, v4
3055 ; SI-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
3056 ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
3057 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v19, v6, vcc
3058 ; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
3059 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
3060 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
3063 ; VI-LABEL: v_mul_i128:
3064 ; VI: ; %bb.0: ; %entry
3065 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x2c
3066 ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0
3067 ; VI-NEXT: v_mov_b32_e32 v11, 0
3068 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3069 ; VI-NEXT: v_mov_b32_e32 v1, s1
3070 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3071 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3072 ; VI-NEXT: v_mov_b32_e32 v3, s3
3073 ; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2
3074 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
3075 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
3076 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9]
3077 ; VI-NEXT: s_waitcnt vmcnt(0)
3078 ; VI-NEXT: v_mul_lo_u32 v10, v4, v3
3079 ; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0
3080 ; VI-NEXT: v_mul_lo_u32 v14, v5, v2
3081 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
3082 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10
3083 ; VI-NEXT: v_mov_b32_e32 v10, v3
3084 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
3085 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v14
3086 ; VI-NEXT: v_mov_b32_e32 v10, v4
3087 ; VI-NEXT: v_mov_b32_e32 v4, v11
3088 ; VI-NEXT: v_mul_lo_u32 v7, v7, v0
3089 ; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13]
3090 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
3091 ; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v13
3092 ; VI-NEXT: v_mov_b32_e32 v0, v4
3093 ; VI-NEXT: v_mul_lo_u32 v11, v6, v1
3094 ; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v0
3095 ; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc
3096 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
3097 ; VI-NEXT: v_add_u32_e32 v5, vcc, v11, v13
3098 ; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12
3099 ; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
3100 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
3103 ; GFX9-LABEL: v_mul_i128:
3104 ; GFX9: ; %bb.0: ; %entry
3105 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
3106 ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0
3107 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
3108 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3109 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
3110 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
3111 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3112 ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
3113 ; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2
3114 ; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3
3115 ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10]
3116 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
3117 ; GFX9-NEXT: v_mul_lo_u32 v16, v7, v0
3118 ; GFX9-NEXT: v_mov_b32_e32 v7, v12
3119 ; GFX9-NEXT: v_mov_b32_e32 v12, v10
3120 ; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12]
3121 ; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14
3122 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3]
3123 ; GFX9-NEXT: v_mov_b32_e32 v0, v10
3124 ; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1
3125 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v0
3126 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc
3127 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
3128 ; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4
3129 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2
3130 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
3131 ; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
3132 ; GFX9-NEXT: s_endpgm
3134 ; GFX10-LABEL: v_mul_i128:
3135 ; GFX10: ; %bb.0: ; %entry
3136 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
3137 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0
3138 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
3139 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3140 ; GFX10-NEXT: s_clause 0x1
3141 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
3142 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
3143 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3144 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0
3145 ; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2
3146 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, v0
3147 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
3148 ; GFX10-NEXT: v_mov_b32_e32 v14, v12
3149 ; GFX10-NEXT: v_mov_b32_e32 v12, v10
3150 ; GFX10-NEXT: v_mad_u64_u32 v[9:10], s0, v0, v5, v[11:12]
3151 ; GFX10-NEXT: v_mul_lo_u32 v11, v4, v3
3152 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v2, 0
3153 ; GFX10-NEXT: v_mul_lo_u32 v12, v6, v1
3154 ; GFX10-NEXT: v_mov_b32_e32 v4, v10
3155 ; GFX10-NEXT: v_add3_u32 v3, v3, v11, v15
3156 ; GFX10-NEXT: v_add_co_u32 v10, s0, v14, v4
3157 ; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s0, 0, 0, s0
3158 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
3159 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v5, v[10:11]
3160 ; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12
3161 ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
3162 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
3163 ; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
3164 ; GFX10-NEXT: s_endpgm
3166 ; GFX11-LABEL: v_mul_i128:
3167 ; GFX11: ; %bb.0: ; %entry
3168 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c
3169 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
3170 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3171 ; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
3172 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3173 ; GFX11-NEXT: s_clause 0x1
3174 ; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1]
3175 ; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3]
3176 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3177 ; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0
3178 ; GFX11-NEXT: v_mul_lo_u32 v14, v5, v2
3179 ; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
3180 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3181 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
3182 ; GFX11-NEXT: v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
3183 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3184 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12]
3185 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v4, v2, 0
3186 ; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1
3187 ; GFX11-NEXT: v_mov_b32_e32 v2, v10
3188 ; GFX11-NEXT: v_mul_lo_u32 v10, v7, v0
3189 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
3190 ; GFX11-NEXT: v_add3_u32 v12, v12, v3, v14
3191 ; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2
3192 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3193 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
3194 ; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v6, v0, v[11:12]
3195 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3196 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
3197 ; GFX11-NEXT: v_add3_u32 v0, v10, v14, v4
3198 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3199 ; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13
3200 ; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
3201 ; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3]
3202 ; GFX11-NEXT: s_nop 0
3203 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3204 ; GFX11-NEXT: s_endpgm
3206 ; GFX12-LABEL: v_mul_i128:
3207 ; GFX12: ; %bb.0: ; %entry
3208 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x2c
3209 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
3210 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3211 ; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
3212 ; GFX12-NEXT: s_wait_kmcnt 0x0
3213 ; GFX12-NEXT: s_clause 0x1
3214 ; GFX12-NEXT: global_load_b128 v[0:3], v13, s[0:1]
3215 ; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3]
3216 ; GFX12-NEXT: s_wait_loadcnt 0x0
3217 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
3218 ; GFX12-NEXT: v_mul_lo_u32 v15, v5, v2
3219 ; GFX12-NEXT: v_mul_lo_u32 v7, v7, v0
3220 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3221 ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
3222 ; GFX12-NEXT: v_mov_b32_e32 v14, v12
3223 ; GFX12-NEXT: v_mov_b32_e32 v12, v10
3224 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3225 ; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
3226 ; GFX12-NEXT: v_mul_lo_u32 v11, v4, v3
3227 ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0
3228 ; GFX12-NEXT: v_mul_lo_u32 v12, v6, v1
3229 ; GFX12-NEXT: v_mov_b32_e32 v4, v10
3230 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3231 ; GFX12-NEXT: v_add3_u32 v3, v3, v11, v15
3232 ; GFX12-NEXT: v_add_co_u32 v10, s0, v14, v4
3233 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3234 ; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0
3235 ; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3]
3236 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3237 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11]
3238 ; GFX12-NEXT: v_add3_u32 v3, v7, v3, v12
3239 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3240 ; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
3241 ; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
3242 ; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
3243 ; GFX12-NEXT: s_nop 0
3244 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3245 ; GFX12-NEXT: s_endpgm
3247 ; EG-LABEL: v_mul_i128:
3248 ; EG: ; %bb.0: ; %entry
3249 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
3251 ; EG-NEXT: ALU 41, @14, KC0[], KC1[]
3252 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
3255 ; EG-NEXT: Fetch clause starting at 6:
3256 ; EG-NEXT: VTX_READ_128 T2.XYZW, T1.X, 0, #1
3257 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
3258 ; EG-NEXT: ALU clause starting at 10:
3259 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
3260 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
3261 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
3262 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
3263 ; EG-NEXT: ALU clause starting at 14:
3264 ; EG-NEXT: MULLO_INT * T1.Y, T0.Y, T2.Y,
3265 ; EG-NEXT: MULHI * T1.Z, T0.Y, T2.Y,
3266 ; EG-NEXT: MULLO_INT * T1.W, T2.Z, T0.X,
3267 ; EG-NEXT: MULLO_INT * T3.X, T2.Y, T0.Z,
3268 ; EG-NEXT: MULHI * T3.Y, T0.Y, T2.X,
3269 ; EG-NEXT: MULHI * T3.Z, T0.X, T2.Y,
3270 ; EG-NEXT: MULHI * T3.W, T2.Z, T0.X,
3271 ; EG-NEXT: MULLO_INT * T2.Z, T2.Z, T0.Y,
3272 ; EG-NEXT: MULHI * T4.X, T2.X, T0.Z,
3273 ; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T2.X,
3274 ; EG-NEXT: MULHI * T4.Y, T0.X, T2.X,
3275 ; EG-NEXT: ADD_INT T4.W, T0.Y, PS,
3276 ; EG-NEXT: MULLO_INT * T2.Y, T0.X, T2.Y,
3277 ; EG-NEXT: ADDC_UINT T4.Z, T0.Y, T4.Y,
3278 ; EG-NEXT: ADDC_UINT T5.W, PS, PV.W,
3279 ; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.W,
3280 ; EG-NEXT: ADD_INT T4.X, T4.X, PS,
3281 ; EG-NEXT: ADD_INT T0.Y, T3.W, T2.Z,
3282 ; EG-NEXT: ADD_INT T2.Z, T3.Z, PV.W,
3283 ; EG-NEXT: ADD_INT T0.W, T3.Y, PV.Z,
3284 ; EG-NEXT: MULLO_INT * T2.W, T2.W, T0.X,
3285 ; EG-NEXT: ADD_INT T5.X, PV.W, PV.Z,
3286 ; EG-NEXT: ADDC_UINT T3.Y, PV.W, PV.Z,
3287 ; EG-NEXT: ADD_INT T2.Z, PV.Y, PS,
3288 ; EG-NEXT: ADD_INT T0.W, PV.X, T3.X,
3289 ; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.Z,
3290 ; EG-NEXT: ADD_INT T4.Y, PV.Z, PV.W,
3291 ; EG-NEXT: ADDC_UINT T0.Z, T1.W, PS,
3292 ; EG-NEXT: ADD_INT T0.W, T1.Z, PV.Y,
3293 ; EG-NEXT: ADDC_UINT * T2.W, T1.Y, PV.X,
3294 ; EG-NEXT: ADD_INT T1.Y, T1.Y, T5.X,
3295 ; EG-NEXT: ADD_INT T1.Z, T1.W, T0.Y,
3296 ; EG-NEXT: ADD_INT T0.W, PV.W, PS,
3297 ; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z,
3298 ; EG-NEXT: ADD_INT T0.W, PV.W, PS,
3299 ; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z,
3300 ; EG-NEXT: ADD_INT * T0.W, PV.W, PS,
3301 ; EG-NEXT: ADD_INT * T0.Z, T1.Y, T1.Z,
3302 ; EG-NEXT: ADD_INT * T0.Y, T2.Y, T4.W,
3303 ; EG-NEXT: LSHR T1.X, T1.X, literal.x,
3304 ; EG-NEXT: MULLO_INT * T0.X, T0.X, T2.X,
3305 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: index both operand arrays by the workitem id, then multiply.
3307 %tid = call i32 @llvm.amdgcn.workitem.id.x()
3308 %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
3309 %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
; NOTE(review): the destination is based on %bptr rather than %out (see the
; header comment); the store below therefore aliases the %gep.b load.
3310 %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
3311 %a = load i128, ptr addrspace(1) %gep.a
3312 %b = load i128, ptr addrspace(1) %gep.b
3313 %mul = mul i128 %a, %b
3314 store i128 %mul, ptr addrspace(1) %gep.out
; mul_pow2_plus_1: non-kernel function that computes %val * 9, i.e. a multiply
; by (2^3 + 1). As recorded in the checks below, the verde and tonga runs keep
; a v_mul_lo_u32 by the constant 9, while the gfx900, gfx1010, gfx1100 and
; gfx1200 runs emit a single v_lshl_add_u32 v0, v0, 3, v0 (shift left by 3,
; then add the original value).
3318 define i32 @mul_pow2_plus_1(i32 %val) {
3319 ; SI-LABEL: mul_pow2_plus_1:
3321 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322 ; SI-NEXT: v_mul_lo_u32 v0, v0, 9
3323 ; SI-NEXT: s_setpc_b64 s[30:31]
3325 ; VI-LABEL: mul_pow2_plus_1:
3327 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3328 ; VI-NEXT: v_mul_lo_u32 v0, v0, 9
3329 ; VI-NEXT: s_setpc_b64 s[30:31]
3331 ; GFX9-LABEL: mul_pow2_plus_1:
3333 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3334 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v0
3335 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3337 ; GFX10-LABEL: mul_pow2_plus_1:
3339 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3340 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v0
3341 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3343 ; GFX11-LABEL: mul_pow2_plus_1:
3345 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3346 ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0
3347 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3349 ; GFX12-LABEL: mul_pow2_plus_1:
3351 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3352 ; GFX12-NEXT: s_wait_expcnt 0x0
3353 ; GFX12-NEXT: s_wait_samplecnt 0x0
3354 ; GFX12-NEXT: s_wait_bvhcnt 0x0
3355 ; GFX12-NEXT: s_wait_kmcnt 0x0
3356 ; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
3357 ; GFX12-NEXT: s_setpc_b64 s[30:31]
3359 ; EG-LABEL: mul_pow2_plus_1:
; IR under test: multiply by the constant 9.
3363 %mul = mul i32 %val, 9
; Intrinsic called by v_mul_i128 to obtain the workitem id used as the
; per-lane index into the operand arrays.
3367 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to the s_mul_i128 and v_mul_i128 kernels;
; group #1 is attached to the workitem-id intrinsic declaration above.
3369 attributes #0 = { nounwind }
3370 attributes #1 = { nounwind readnone}