1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; fmul_f16 - scalar half multiply.
; The kernel does a volatile load of one half from %a and one from %b,
; multiplies them and stores the product through %r.
; Expected code, as recorded by the autogenerated checks below:
;   * tahiti (SI prefix): both operands are widened with v_cvt_f32_f16, multiplied
;     with v_mul_f32 and the result narrowed again with v_cvt_f16_f32.
;   * fiji and gfx900 (shared GFX89 prefix) and gfx1100: a single v_mul_f16_e32.
; In every variant each volatile load is followed by its own s_waitcnt vmcnt(0).
7 define amdgpu_kernel void @fmul_f16(
9 ; SI: ; %bb.0: ; %entry
10 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
11 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
12 ; SI-NEXT: s_mov_b32 s7, 0xf000
13 ; SI-NEXT: s_mov_b32 s6, -1
14 ; SI-NEXT: s_mov_b32 s14, s6
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b32 s12, s2
17 ; SI-NEXT: s_mov_b32 s13, s3
18 ; SI-NEXT: s_mov_b32 s15, s7
19 ; SI-NEXT: s_mov_b32 s10, s6
20 ; SI-NEXT: s_mov_b32 s11, s7
21 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
22 ; SI-NEXT: s_waitcnt vmcnt(0)
23 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
24 ; SI-NEXT: s_waitcnt vmcnt(0)
25 ; SI-NEXT: s_mov_b32 s4, s0
26 ; SI-NEXT: s_mov_b32 s5, s1
27 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
28 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
29 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
30 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
31 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
34 ; GFX89-LABEL: fmul_f16:
35 ; GFX89: ; %bb.0: ; %entry
36 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
37 ; GFX89-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
38 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
39 ; GFX89-NEXT: s_mov_b32 s6, -1
40 ; GFX89-NEXT: s_mov_b32 s14, s6
41 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX89-NEXT: s_mov_b32 s12, s2
43 ; GFX89-NEXT: s_mov_b32 s13, s3
44 ; GFX89-NEXT: s_mov_b32 s15, s7
45 ; GFX89-NEXT: s_mov_b32 s10, s6
46 ; GFX89-NEXT: s_mov_b32 s11, s7
47 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
48 ; GFX89-NEXT: s_waitcnt vmcnt(0)
49 ; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
50 ; GFX89-NEXT: s_waitcnt vmcnt(0)
51 ; GFX89-NEXT: s_mov_b32 s4, s0
52 ; GFX89-NEXT: s_mov_b32 s5, s1
53 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
54 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
55 ; GFX89-NEXT: s_endpgm
57 ; GFX11-LABEL: fmul_f16:
58 ; GFX11: ; %bb.0: ; %entry
59 ; GFX11-NEXT: s_clause 0x1
60 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
61 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
62 ; GFX11-NEXT: s_mov_b32 s10, -1
63 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
64 ; GFX11-NEXT: s_mov_b32 s14, s10
65 ; GFX11-NEXT: s_mov_b32 s15, s11
66 ; GFX11-NEXT: s_mov_b32 s6, s10
67 ; GFX11-NEXT: s_mov_b32 s7, s11
68 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-NEXT: s_mov_b32 s12, s2
70 ; GFX11-NEXT: s_mov_b32 s13, s3
71 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
72 ; GFX11-NEXT: s_waitcnt vmcnt(0)
73 ; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: s_mov_b32 s8, s0
76 ; GFX11-NEXT: s_mov_b32 s9, s1
77 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
78 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
79 ; GFX11-NEXT: s_endpgm
82 ptr addrspace(1) %b) {
84 %a.val = load volatile half, ptr addrspace(1) %a
85 %b.val = load volatile half, ptr addrspace(1) %b
86 %r.val = fmul half %a.val, %b.val
87 store half %r.val, ptr addrspace(1) %r
; fmul_f16_imm_a - scalar half multiply with the constant 3.0 as the first operand.
; The kernel does a volatile load of one half from %b, computes 3.0 * %b.val and
; stores the product through %r.
; Expected code, as recorded by the autogenerated checks below:
;   * tahiti (SI prefix): the loaded value is widened to f32 and multiplied by the
;     32-bit literal 0x40400000 (3.0 as a float), then narrowed back to half.
;   * GFX89 and GFX11 prefixes: v_mul_f16_e32 with the 16-bit literal 0x4200
;     (3.0 as a half) as its first source.
91 define amdgpu_kernel void @fmul_f16_imm_a(
92 ; SI-LABEL: fmul_f16_imm_a:
93 ; SI: ; %bb.0: ; %entry
94 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
95 ; SI-NEXT: s_mov_b32 s7, 0xf000
96 ; SI-NEXT: s_mov_b32 s6, -1
97 ; SI-NEXT: s_mov_b32 s10, s6
98 ; SI-NEXT: s_mov_b32 s11, s7
99 ; SI-NEXT: s_waitcnt lgkmcnt(0)
100 ; SI-NEXT: s_mov_b32 s8, s2
101 ; SI-NEXT: s_mov_b32 s9, s3
102 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
103 ; SI-NEXT: s_waitcnt vmcnt(0)
104 ; SI-NEXT: s_mov_b32 s4, s0
105 ; SI-NEXT: s_mov_b32 s5, s1
106 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
107 ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0
108 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
109 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
112 ; GFX89-LABEL: fmul_f16_imm_a:
113 ; GFX89: ; %bb.0: ; %entry
114 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
115 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
116 ; GFX89-NEXT: s_mov_b32 s6, -1
117 ; GFX89-NEXT: s_mov_b32 s10, s6
118 ; GFX89-NEXT: s_mov_b32 s11, s7
119 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX89-NEXT: s_mov_b32 s8, s2
121 ; GFX89-NEXT: s_mov_b32 s9, s3
122 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
123 ; GFX89-NEXT: s_waitcnt vmcnt(0)
124 ; GFX89-NEXT: s_mov_b32 s4, s0
125 ; GFX89-NEXT: s_mov_b32 s5, s1
126 ; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0
127 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
128 ; GFX89-NEXT: s_endpgm
130 ; GFX11-LABEL: fmul_f16_imm_a:
131 ; GFX11: ; %bb.0: ; %entry
132 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
133 ; GFX11-NEXT: s_mov_b32 s6, -1
134 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
135 ; GFX11-NEXT: s_mov_b32 s10, s6
136 ; GFX11-NEXT: s_mov_b32 s11, s7
137 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
138 ; GFX11-NEXT: s_mov_b32 s8, s2
139 ; GFX11-NEXT: s_mov_b32 s9, s3
140 ; GFX11-NEXT: s_mov_b32 s4, s0
141 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
142 ; GFX11-NEXT: s_waitcnt vmcnt(0)
143 ; GFX11-NEXT: s_mov_b32 s5, s1
144 ; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0
145 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
146 ; GFX11-NEXT: s_endpgm
148 ptr addrspace(1) %b) {
150 %b.val = load volatile half, ptr addrspace(1) %b
151 %r.val = fmul half 3.0, %b.val
152 store half %r.val, ptr addrspace(1) %r
; fmul_f16_imm_b - scalar half multiply with the constant 4.0 as the second operand.
; The kernel does a volatile load of one half from %a, computes %a.val * 4.0 and
; stores the product through %r.
; Expected code, as recorded by the autogenerated checks below: on every checked
; target the constant appears directly as the operand "4.0" of the multiply
; (v_mul_f32_e32 after widening for the SI prefix, v_mul_f16_e32 for the GFX89
; and GFX11 prefixes); no hexadecimal literal is used, unlike the 3.0 case.
156 define amdgpu_kernel void @fmul_f16_imm_b(
157 ; SI-LABEL: fmul_f16_imm_b:
158 ; SI: ; %bb.0: ; %entry
159 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
160 ; SI-NEXT: s_mov_b32 s7, 0xf000
161 ; SI-NEXT: s_mov_b32 s6, -1
162 ; SI-NEXT: s_mov_b32 s10, s6
163 ; SI-NEXT: s_mov_b32 s11, s7
164 ; SI-NEXT: s_waitcnt lgkmcnt(0)
165 ; SI-NEXT: s_mov_b32 s8, s2
166 ; SI-NEXT: s_mov_b32 s9, s3
167 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
168 ; SI-NEXT: s_waitcnt vmcnt(0)
169 ; SI-NEXT: s_mov_b32 s4, s0
170 ; SI-NEXT: s_mov_b32 s5, s1
171 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
172 ; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0
173 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
174 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
177 ; GFX89-LABEL: fmul_f16_imm_b:
178 ; GFX89: ; %bb.0: ; %entry
179 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
180 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
181 ; GFX89-NEXT: s_mov_b32 s6, -1
182 ; GFX89-NEXT: s_mov_b32 s10, s6
183 ; GFX89-NEXT: s_mov_b32 s11, s7
184 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
185 ; GFX89-NEXT: s_mov_b32 s8, s2
186 ; GFX89-NEXT: s_mov_b32 s9, s3
187 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
188 ; GFX89-NEXT: s_waitcnt vmcnt(0)
189 ; GFX89-NEXT: s_mov_b32 s4, s0
190 ; GFX89-NEXT: s_mov_b32 s5, s1
191 ; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0
192 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
193 ; GFX89-NEXT: s_endpgm
195 ; GFX11-LABEL: fmul_f16_imm_b:
196 ; GFX11: ; %bb.0: ; %entry
197 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
198 ; GFX11-NEXT: s_mov_b32 s6, -1
199 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
200 ; GFX11-NEXT: s_mov_b32 s10, s6
201 ; GFX11-NEXT: s_mov_b32 s11, s7
202 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX11-NEXT: s_mov_b32 s8, s2
204 ; GFX11-NEXT: s_mov_b32 s9, s3
205 ; GFX11-NEXT: s_mov_b32 s4, s0
206 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
207 ; GFX11-NEXT: s_waitcnt vmcnt(0)
208 ; GFX11-NEXT: s_mov_b32 s5, s1
209 ; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0
210 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
211 ; GFX11-NEXT: s_endpgm
213 ptr addrspace(1) %a) {
215 %a.val = load volatile half, ptr addrspace(1) %a
216 %r.val = fmul half %a.val, 4.0
217 store half %r.val, ptr addrspace(1) %r
; fmul_v2f16 - element-wise multiply of two <2 x half> vectors.
; The kernel loads one <2 x half> from %a and one from %b (plain, non-volatile
; loads), multiplies them and stores the <2 x half> product through %r.
; Expected code, as recorded by the autogenerated checks below:
;   * SI prefix: each dword is split with a 16-bit right shift, all four halves are
;     widened to f32, two v_mul_f32 are issued, and the narrowed results are
;     repacked with v_lshlrev_b32 / v_or_b32.
;   * VI prefix: the high halves are multiplied with v_mul_f16_sdwa (WORD_1
;     selects), the low halves with v_mul_f16_e32, then merged with v_or_b32.
;   * GFX9 and GFX11 prefixes: one packed v_pk_mul_f16.
221 define amdgpu_kernel void @fmul_v2f16(
222 ; SI-LABEL: fmul_v2f16:
223 ; SI: ; %bb.0: ; %entry
224 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
225 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
226 ; SI-NEXT: s_mov_b32 s7, 0xf000
227 ; SI-NEXT: s_mov_b32 s6, -1
228 ; SI-NEXT: s_mov_b32 s10, s6
229 ; SI-NEXT: s_mov_b32 s11, s7
230 ; SI-NEXT: s_waitcnt lgkmcnt(0)
231 ; SI-NEXT: s_mov_b32 s12, s2
232 ; SI-NEXT: s_mov_b32 s13, s3
233 ; SI-NEXT: s_mov_b32 s14, s6
234 ; SI-NEXT: s_mov_b32 s15, s7
235 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
236 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
237 ; SI-NEXT: s_mov_b32 s4, s0
238 ; SI-NEXT: s_mov_b32 s5, s1
239 ; SI-NEXT: s_waitcnt vmcnt(1)
240 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
241 ; SI-NEXT: s_waitcnt vmcnt(0)
242 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
243 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
244 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
245 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
246 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
247 ; SI-NEXT: v_mul_f32_e32 v2, v3, v2
248 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
249 ; SI-NEXT: v_mul_f32_e32 v0, v1, v0
250 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
251 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
252 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
253 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
256 ; VI-LABEL: fmul_v2f16:
257 ; VI: ; %bb.0: ; %entry
258 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
259 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
260 ; VI-NEXT: s_mov_b32 s7, 0xf000
261 ; VI-NEXT: s_mov_b32 s6, -1
262 ; VI-NEXT: s_mov_b32 s10, s6
263 ; VI-NEXT: s_mov_b32 s11, s7
264 ; VI-NEXT: s_waitcnt lgkmcnt(0)
265 ; VI-NEXT: s_mov_b32 s12, s2
266 ; VI-NEXT: s_mov_b32 s13, s3
267 ; VI-NEXT: s_mov_b32 s14, s6
268 ; VI-NEXT: s_mov_b32 s15, s7
269 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
270 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
271 ; VI-NEXT: s_mov_b32 s4, s0
272 ; VI-NEXT: s_mov_b32 s5, s1
273 ; VI-NEXT: s_waitcnt vmcnt(0)
274 ; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
275 ; VI-NEXT: v_mul_f16_e32 v0, v1, v0
276 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
277 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
280 ; GFX9-LABEL: fmul_v2f16:
281 ; GFX9: ; %bb.0: ; %entry
282 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
283 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
284 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
285 ; GFX9-NEXT: s_mov_b32 s6, -1
286 ; GFX9-NEXT: s_mov_b32 s14, s6
287 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
288 ; GFX9-NEXT: s_mov_b32 s12, s2
289 ; GFX9-NEXT: s_mov_b32 s13, s3
290 ; GFX9-NEXT: s_mov_b32 s15, s7
291 ; GFX9-NEXT: s_mov_b32 s10, s6
292 ; GFX9-NEXT: s_mov_b32 s11, s7
293 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0
294 ; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0
295 ; GFX9-NEXT: s_mov_b32 s4, s0
296 ; GFX9-NEXT: s_mov_b32 s5, s1
297 ; GFX9-NEXT: s_waitcnt vmcnt(0)
298 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
299 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
300 ; GFX9-NEXT: s_endpgm
302 ; GFX11-LABEL: fmul_v2f16:
303 ; GFX11: ; %bb.0: ; %entry
304 ; GFX11-NEXT: s_clause 0x1
305 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
306 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
307 ; GFX11-NEXT: s_mov_b32 s10, -1
308 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
309 ; GFX11-NEXT: s_mov_b32 s14, s10
310 ; GFX11-NEXT: s_mov_b32 s15, s11
311 ; GFX11-NEXT: s_mov_b32 s6, s10
312 ; GFX11-NEXT: s_mov_b32 s7, s11
313 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
314 ; GFX11-NEXT: s_mov_b32 s12, s2
315 ; GFX11-NEXT: s_mov_b32 s13, s3
316 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
317 ; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0
318 ; GFX11-NEXT: s_mov_b32 s8, s0
319 ; GFX11-NEXT: s_mov_b32 s9, s1
320 ; GFX11-NEXT: s_waitcnt vmcnt(0)
321 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1
322 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
323 ; GFX11-NEXT: s_endpgm
326 ptr addrspace(1) %b) {
328 %a.val = load <2 x half>, ptr addrspace(1) %a
329 %b.val = load <2 x half>, ptr addrspace(1) %b
330 %r.val = fmul <2 x half> %a.val, %b.val
331 store <2 x half> %r.val, ptr addrspace(1) %r
; fmul_v2f16_imm_a - <2 x half> multiply with the constant vector <3.0, 4.0> as
; the first operand.
; The kernel loads one <2 x half> from %b, multiplies it element-wise by
; <3.0, 4.0> and stores the product through %r.
; Expected code, as recorded by the autogenerated checks below:
;   * SI prefix: the high half is multiplied by the operand 4.0 and the low half by
;     the literal 0x40400000 (3.0 as a float), both in f32, then repacked.
;   * VI prefix: 0x4400 (4.0 as a half) is first moved into a VGPR for the
;     v_mul_f16_sdwa on the high half; the low half uses the literal 0x4200.
;   * GFX9 prefix: the packed constant 0x44004200 is materialised in an SGPR with
;     s_mov_b32 and used as the second source of v_pk_mul_f16.
;   * GFX11 prefix: 0x44004200 is a literal operand of v_pk_mul_f16 itself.
335 define amdgpu_kernel void @fmul_v2f16_imm_a(
336 ; SI-LABEL: fmul_v2f16_imm_a:
337 ; SI: ; %bb.0: ; %entry
338 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
339 ; SI-NEXT: s_mov_b32 s7, 0xf000
340 ; SI-NEXT: s_mov_b32 s6, -1
341 ; SI-NEXT: s_mov_b32 s10, s6
342 ; SI-NEXT: s_mov_b32 s11, s7
343 ; SI-NEXT: s_waitcnt lgkmcnt(0)
344 ; SI-NEXT: s_mov_b32 s8, s2
345 ; SI-NEXT: s_mov_b32 s9, s3
346 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
347 ; SI-NEXT: s_mov_b32 s4, s0
348 ; SI-NEXT: s_mov_b32 s5, s1
349 ; SI-NEXT: s_waitcnt vmcnt(0)
350 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
351 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
352 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
353 ; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1
354 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
355 ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0
356 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
357 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
358 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
359 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
362 ; VI-LABEL: fmul_v2f16_imm_a:
363 ; VI: ; %bb.0: ; %entry
364 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
365 ; VI-NEXT: s_mov_b32 s7, 0xf000
366 ; VI-NEXT: s_mov_b32 s6, -1
367 ; VI-NEXT: s_mov_b32 s10, s6
368 ; VI-NEXT: s_mov_b32 s11, s7
369 ; VI-NEXT: s_waitcnt lgkmcnt(0)
370 ; VI-NEXT: s_mov_b32 s8, s2
371 ; VI-NEXT: s_mov_b32 s9, s3
372 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
373 ; VI-NEXT: v_mov_b32_e32 v1, 0x4400
374 ; VI-NEXT: s_mov_b32 s4, s0
375 ; VI-NEXT: s_mov_b32 s5, s1
376 ; VI-NEXT: s_waitcnt vmcnt(0)
377 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
378 ; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0
379 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
380 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
383 ; GFX9-LABEL: fmul_v2f16_imm_a:
384 ; GFX9: ; %bb.0: ; %entry
385 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
386 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
387 ; GFX9-NEXT: s_mov_b32 s6, -1
388 ; GFX9-NEXT: s_mov_b32 s10, s6
389 ; GFX9-NEXT: s_mov_b32 s11, s7
390 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
391 ; GFX9-NEXT: s_mov_b32 s8, s2
392 ; GFX9-NEXT: s_mov_b32 s9, s3
393 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
394 ; GFX9-NEXT: s_mov_b32 s4, s0
395 ; GFX9-NEXT: s_mov_b32 s0, 0x44004200
396 ; GFX9-NEXT: s_mov_b32 s5, s1
397 ; GFX9-NEXT: s_waitcnt vmcnt(0)
398 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
399 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
400 ; GFX9-NEXT: s_endpgm
402 ; GFX11-LABEL: fmul_v2f16_imm_a:
403 ; GFX11: ; %bb.0: ; %entry
404 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
405 ; GFX11-NEXT: s_mov_b32 s6, -1
406 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
407 ; GFX11-NEXT: s_mov_b32 s10, s6
408 ; GFX11-NEXT: s_mov_b32 s11, s7
409 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
410 ; GFX11-NEXT: s_mov_b32 s8, s2
411 ; GFX11-NEXT: s_mov_b32 s9, s3
412 ; GFX11-NEXT: s_mov_b32 s4, s0
413 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
414 ; GFX11-NEXT: s_mov_b32 s5, s1
415 ; GFX11-NEXT: s_waitcnt vmcnt(0)
416 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0
417 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
418 ; GFX11-NEXT: s_endpgm
420 ptr addrspace(1) %b) {
422 %b.val = load <2 x half>, ptr addrspace(1) %b
423 %r.val = fmul <2 x half> <half 3.0, half 4.0>, %b.val
424 store <2 x half> %r.val, ptr addrspace(1) %r
; fmul_v2f16_imm_b - <2 x half> multiply with the constant vector <4.0, 3.0> as
; the second operand.
; The kernel loads one <2 x half> from %a, multiplies it element-wise by
; <4.0, 3.0> and stores the product through %r.
; Expected code, as recorded by the autogenerated checks below (element order is
; swapped relative to fmul_v2f16_imm_a, so the constants trade places):
;   * SI prefix: the high half is multiplied by the literal 0x40400000 (3.0 as a
;     float) and the low half by the operand 4.0, both in f32, then repacked.
;   * VI prefix: 0x4200 (3.0 as a half) is moved into a VGPR for the
;     v_mul_f16_sdwa on the high half; the low half uses the operand 4.0.
;   * GFX9 prefix: the packed constant 0x42004400 is materialised in an SGPR with
;     s_mov_b32 and used as the second source of v_pk_mul_f16.
;   * GFX11 prefix: 0x42004400 is a literal operand of v_pk_mul_f16 itself.
428 define amdgpu_kernel void @fmul_v2f16_imm_b(
429 ; SI-LABEL: fmul_v2f16_imm_b:
430 ; SI: ; %bb.0: ; %entry
431 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
432 ; SI-NEXT: s_mov_b32 s7, 0xf000
433 ; SI-NEXT: s_mov_b32 s6, -1
434 ; SI-NEXT: s_mov_b32 s10, s6
435 ; SI-NEXT: s_mov_b32 s11, s7
436 ; SI-NEXT: s_waitcnt lgkmcnt(0)
437 ; SI-NEXT: s_mov_b32 s8, s2
438 ; SI-NEXT: s_mov_b32 s9, s3
439 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
440 ; SI-NEXT: s_mov_b32 s4, s0
441 ; SI-NEXT: s_mov_b32 s5, s1
442 ; SI-NEXT: s_waitcnt vmcnt(0)
443 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
444 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
445 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
446 ; SI-NEXT: v_mul_f32_e32 v1, 0x40400000, v1
447 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
448 ; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0
449 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
450 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
451 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
452 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
455 ; VI-LABEL: fmul_v2f16_imm_b:
456 ; VI: ; %bb.0: ; %entry
457 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
458 ; VI-NEXT: s_mov_b32 s7, 0xf000
459 ; VI-NEXT: s_mov_b32 s6, -1
460 ; VI-NEXT: s_mov_b32 s10, s6
461 ; VI-NEXT: s_mov_b32 s11, s7
462 ; VI-NEXT: s_waitcnt lgkmcnt(0)
463 ; VI-NEXT: s_mov_b32 s8, s2
464 ; VI-NEXT: s_mov_b32 s9, s3
465 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
466 ; VI-NEXT: v_mov_b32_e32 v1, 0x4200
467 ; VI-NEXT: s_mov_b32 s4, s0
468 ; VI-NEXT: s_mov_b32 s5, s1
469 ; VI-NEXT: s_waitcnt vmcnt(0)
470 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
471 ; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
472 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
473 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
476 ; GFX9-LABEL: fmul_v2f16_imm_b:
477 ; GFX9: ; %bb.0: ; %entry
478 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
479 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
480 ; GFX9-NEXT: s_mov_b32 s6, -1
481 ; GFX9-NEXT: s_mov_b32 s10, s6
482 ; GFX9-NEXT: s_mov_b32 s11, s7
483 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
484 ; GFX9-NEXT: s_mov_b32 s8, s2
485 ; GFX9-NEXT: s_mov_b32 s9, s3
486 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
487 ; GFX9-NEXT: s_mov_b32 s4, s0
488 ; GFX9-NEXT: s_mov_b32 s0, 0x42004400
489 ; GFX9-NEXT: s_mov_b32 s5, s1
490 ; GFX9-NEXT: s_waitcnt vmcnt(0)
491 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
492 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
493 ; GFX9-NEXT: s_endpgm
495 ; GFX11-LABEL: fmul_v2f16_imm_b:
496 ; GFX11: ; %bb.0: ; %entry
497 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
498 ; GFX11-NEXT: s_mov_b32 s6, -1
499 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
500 ; GFX11-NEXT: s_mov_b32 s10, s6
501 ; GFX11-NEXT: s_mov_b32 s11, s7
502 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
503 ; GFX11-NEXT: s_mov_b32 s8, s2
504 ; GFX11-NEXT: s_mov_b32 s9, s3
505 ; GFX11-NEXT: s_mov_b32 s4, s0
506 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
507 ; GFX11-NEXT: s_mov_b32 s5, s1
508 ; GFX11-NEXT: s_waitcnt vmcnt(0)
509 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0
510 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
511 ; GFX11-NEXT: s_endpgm
513 ptr addrspace(1) %a) {
515 %a.val = load <2 x half>, ptr addrspace(1) %a
516 %r.val = fmul <2 x half> %a.val, <half 4.0, half 3.0>
517 store <2 x half> %r.val, ptr addrspace(1) %r
; fmul_v4f16 - element-wise multiply of two <4 x half> vectors.
; The kernel loads one <4 x half> from %a and one from %b (plain, non-volatile
; loads), multiplies them and stores the <4 x half> product through %r.
; Expected code, as recorded by the autogenerated checks below:
;   * SI prefix: all eight halves are extracted and widened to f32, four v_mul_f32
;     are issued, and the narrowed results are repacked into two dwords.
;   * VI prefix: per dword, one v_mul_f16_sdwa (high halves) plus one v_mul_f16_e32
;     (low halves), merged with v_or_b32.
;   * GFX9 and GFX11 prefixes: two v_pk_mul_f16, one per dword.
521 define amdgpu_kernel void @fmul_v4f16(
522 ; SI-LABEL: fmul_v4f16:
523 ; SI: ; %bb.0: ; %entry
524 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
525 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
526 ; SI-NEXT: s_mov_b32 s3, 0xf000
527 ; SI-NEXT: s_mov_b32 s2, -1
528 ; SI-NEXT: s_mov_b32 s6, s2
529 ; SI-NEXT: s_waitcnt lgkmcnt(0)
530 ; SI-NEXT: s_mov_b32 s12, s10
531 ; SI-NEXT: s_mov_b32 s7, s3
532 ; SI-NEXT: s_mov_b32 s13, s11
533 ; SI-NEXT: s_mov_b32 s14, s2
534 ; SI-NEXT: s_mov_b32 s15, s3
535 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
536 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
537 ; SI-NEXT: s_mov_b32 s0, s8
538 ; SI-NEXT: s_mov_b32 s1, s9
539 ; SI-NEXT: s_waitcnt vmcnt(1)
540 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
541 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
542 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v1
543 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
544 ; SI-NEXT: s_waitcnt vmcnt(0)
545 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v2
546 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
547 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v3
548 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
549 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
550 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
551 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
552 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
553 ; SI-NEXT: v_mul_f32_e32 v5, v7, v5
554 ; SI-NEXT: v_mul_f32_e32 v4, v6, v4
555 ; SI-NEXT: v_mul_f32_e32 v1, v3, v1
556 ; SI-NEXT: v_mul_f32_e32 v0, v2, v0
557 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
558 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
559 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
560 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v4
561 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
562 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
563 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
564 ; SI-NEXT: v_or_b32_e32 v0, v3, v0
565 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
568 ; VI-LABEL: fmul_v4f16:
569 ; VI: ; %bb.0: ; %entry
570 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
571 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
572 ; VI-NEXT: s_mov_b32 s7, 0xf000
573 ; VI-NEXT: s_mov_b32 s6, -1
574 ; VI-NEXT: s_mov_b32 s10, s6
575 ; VI-NEXT: s_mov_b32 s11, s7
576 ; VI-NEXT: s_waitcnt lgkmcnt(0)
577 ; VI-NEXT: s_mov_b32 s12, s2
578 ; VI-NEXT: s_mov_b32 s13, s3
579 ; VI-NEXT: s_mov_b32 s14, s6
580 ; VI-NEXT: s_mov_b32 s15, s7
581 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
582 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
583 ; VI-NEXT: s_mov_b32 s4, s0
584 ; VI-NEXT: s_mov_b32 s5, s1
585 ; VI-NEXT: s_waitcnt vmcnt(0)
586 ; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
587 ; VI-NEXT: v_mul_f16_e32 v1, v3, v1
588 ; VI-NEXT: v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
589 ; VI-NEXT: v_mul_f16_e32 v0, v2, v0
590 ; VI-NEXT: v_or_b32_e32 v1, v1, v4
591 ; VI-NEXT: v_or_b32_e32 v0, v0, v3
592 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
595 ; GFX9-LABEL: fmul_v4f16:
596 ; GFX9: ; %bb.0: ; %entry
597 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
598 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
599 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
600 ; GFX9-NEXT: s_mov_b32 s6, -1
601 ; GFX9-NEXT: s_mov_b32 s10, s6
602 ; GFX9-NEXT: s_mov_b32 s11, s7
603 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
604 ; GFX9-NEXT: s_mov_b32 s12, s2
605 ; GFX9-NEXT: s_mov_b32 s13, s3
606 ; GFX9-NEXT: s_mov_b32 s14, s6
607 ; GFX9-NEXT: s_mov_b32 s15, s7
608 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
609 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
610 ; GFX9-NEXT: s_mov_b32 s4, s0
611 ; GFX9-NEXT: s_mov_b32 s5, s1
612 ; GFX9-NEXT: s_waitcnt vmcnt(0)
613 ; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1
614 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0
615 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
616 ; GFX9-NEXT: s_endpgm
618 ; GFX11-LABEL: fmul_v4f16:
619 ; GFX11: ; %bb.0: ; %entry
620 ; GFX11-NEXT: s_clause 0x1
621 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
622 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
623 ; GFX11-NEXT: s_mov_b32 s10, -1
624 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
625 ; GFX11-NEXT: s_mov_b32 s6, s10
626 ; GFX11-NEXT: s_mov_b32 s7, s11
627 ; GFX11-NEXT: s_mov_b32 s14, s10
628 ; GFX11-NEXT: s_mov_b32 s15, s11
629 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
630 ; GFX11-NEXT: s_mov_b32 s12, s2
631 ; GFX11-NEXT: s_mov_b32 s13, s3
632 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
633 ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
634 ; GFX11-NEXT: s_mov_b32 s8, s0
635 ; GFX11-NEXT: s_mov_b32 s9, s1
636 ; GFX11-NEXT: s_waitcnt vmcnt(0)
637 ; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1
638 ; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0
639 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
640 ; GFX11-NEXT: s_endpgm
643 ptr addrspace(1) %b) {
645 %a.val = load <4 x half>, ptr addrspace(1) %a
646 %b.val = load <4 x half>, ptr addrspace(1) %b
647 %r.val = fmul <4 x half> %a.val, %b.val
648 store <4 x half> %r.val, ptr addrspace(1) %r
; fmul_v4f16_imm_a - <4 x half> multiply with the constant vector
; <8.0, 2.0, 3.0, 4.0> as the first operand.
; The kernel loads one <4 x half> from %b, multiplies it element-wise by the
; constant vector and stores the product through %r.
; Expected code, as recorded by the autogenerated checks below:
;   * SI prefix: in f32, element 0 uses the literal 0x41000000 (8.0), element 2 the
;     literal 0x40400000 (3.0), element 3 the operand 4.0, and the multiply by
;     2.0 (element 1) shows up as v_add_f32 of the value with itself.
;   * VI prefix: the multiply by 2.0 shows up as v_add_f16_sdwa of the high half
;     with itself; 0x4800 (8.0) and 0x4200 (3.0) are literals on v_mul_f16_e32,
;     and 0x4400 (4.0) is moved into a VGPR for the other v_mul_f16_sdwa.
;   * GFX9 prefix: the packed constants 0x44004200 and 0x40004800 are materialised
;     in SGPRs and fed to two v_pk_mul_f16.
;   * GFX11 prefix: the same two packed constants are literal operands of the two
;     v_pk_mul_f16.
652 define amdgpu_kernel void @fmul_v4f16_imm_a(
653 ; SI-LABEL: fmul_v4f16_imm_a:
654 ; SI: ; %bb.0: ; %entry
655 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
656 ; SI-NEXT: s_mov_b32 s7, 0xf000
657 ; SI-NEXT: s_mov_b32 s6, -1
658 ; SI-NEXT: s_mov_b32 s10, s6
659 ; SI-NEXT: s_mov_b32 s11, s7
660 ; SI-NEXT: s_waitcnt lgkmcnt(0)
661 ; SI-NEXT: s_mov_b32 s8, s2
662 ; SI-NEXT: s_mov_b32 s9, s3
663 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
664 ; SI-NEXT: s_mov_b32 s4, s0
665 ; SI-NEXT: s_mov_b32 s5, s1
666 ; SI-NEXT: s_waitcnt vmcnt(0)
667 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
668 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
669 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
670 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
671 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
672 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
673 ; SI-NEXT: v_mul_f32_e32 v3, 0x40400000, v3
674 ; SI-NEXT: v_mul_f32_e32 v2, 0x41000000, v2
675 ; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1
676 ; SI-NEXT: v_add_f32_e32 v0, v0, v0
677 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
678 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
679 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
680 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
681 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
682 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
683 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
684 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
685 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
688 ; VI-LABEL: fmul_v4f16_imm_a:
689 ; VI: ; %bb.0: ; %entry
690 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
691 ; VI-NEXT: s_mov_b32 s7, 0xf000
692 ; VI-NEXT: s_mov_b32 s6, -1
693 ; VI-NEXT: s_mov_b32 s10, s6
694 ; VI-NEXT: s_mov_b32 s11, s7
695 ; VI-NEXT: s_waitcnt lgkmcnt(0)
696 ; VI-NEXT: s_mov_b32 s8, s2
697 ; VI-NEXT: s_mov_b32 s9, s3
698 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
699 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400
700 ; VI-NEXT: s_mov_b32 s4, s0
701 ; VI-NEXT: s_mov_b32 s5, s1
702 ; VI-NEXT: s_waitcnt vmcnt(0)
703 ; VI-NEXT: v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
704 ; VI-NEXT: v_mul_f16_e32 v1, 0x4200, v1
705 ; VI-NEXT: v_add_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
706 ; VI-NEXT: v_mul_f16_e32 v0, 0x4800, v0
707 ; VI-NEXT: v_or_b32_e32 v1, v1, v2
708 ; VI-NEXT: v_or_b32_e32 v0, v0, v3
709 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
712 ; GFX9-LABEL: fmul_v4f16_imm_a:
713 ; GFX9: ; %bb.0: ; %entry
714 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
715 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
716 ; GFX9-NEXT: s_mov_b32 s6, -1
717 ; GFX9-NEXT: s_mov_b32 s10, s6
718 ; GFX9-NEXT: s_mov_b32 s11, s7
719 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
720 ; GFX9-NEXT: s_mov_b32 s8, s2
721 ; GFX9-NEXT: s_mov_b32 s9, s3
722 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
723 ; GFX9-NEXT: s_mov_b32 s2, 0x44004200
724 ; GFX9-NEXT: s_mov_b32 s3, 0x40004800
725 ; GFX9-NEXT: s_mov_b32 s4, s0
726 ; GFX9-NEXT: s_mov_b32 s5, s1
727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
728 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2
729 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3
730 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
731 ; GFX9-NEXT: s_endpgm
733 ; GFX11-LABEL: fmul_v4f16_imm_a:
734 ; GFX11: ; %bb.0: ; %entry
735 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
736 ; GFX11-NEXT: s_mov_b32 s6, -1
737 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
738 ; GFX11-NEXT: s_mov_b32 s10, s6
739 ; GFX11-NEXT: s_mov_b32 s11, s7
740 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX11-NEXT: s_mov_b32 s8, s2
742 ; GFX11-NEXT: s_mov_b32 s9, s3
743 ; GFX11-NEXT: s_mov_b32 s4, s0
744 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
745 ; GFX11-NEXT: s_mov_b32 s5, s1
746 ; GFX11-NEXT: s_waitcnt vmcnt(0)
747 ; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1
748 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0
749 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
750 ; GFX11-NEXT: s_endpgm
752 ptr addrspace(1) %b) {
754 %b.val = load <4 x half>, ptr addrspace(1) %b
755 %r.val = fmul <4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, %b.val
756 store <4 x half> %r.val, ptr addrspace(1) %r