1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; fmul_f16 - scalar half multiply. Both operands come from volatile loads
; (%a and %b) and the product is stored to %r.
; Expected lowering, as pinned by the assertions below:
;   - tahiti (SI prefix) widens the operation: both inputs go through
;     v_cvt_f32_f16, are multiplied with v_mul_f32, and the result is
;     narrowed again with v_cvt_f16_f32.
;   - fiji and gfx900 (shared GFX89 prefix) and gfx1100 (GFX11 prefix) use a
;     single v_mul_f16.
; In every expected sequence each load carries glc and is followed by its own
; s_waitcnt vmcnt(0) (presumably a consequence of the loads being volatile).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
7 define amdgpu_kernel void @fmul_f16(
9 ; SI: ; %bb.0: ; %entry
10 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
11 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
12 ; SI-NEXT: s_mov_b32 s3, 0xf000
13 ; SI-NEXT: s_mov_b32 s2, -1
14 ; SI-NEXT: s_mov_b32 s14, s2
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b32 s12, s6
17 ; SI-NEXT: s_mov_b32 s13, s7
18 ; SI-NEXT: s_mov_b32 s15, s3
19 ; SI-NEXT: s_mov_b32 s10, s2
20 ; SI-NEXT: s_mov_b32 s11, s3
21 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
22 ; SI-NEXT: s_waitcnt vmcnt(0)
23 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
24 ; SI-NEXT: s_waitcnt vmcnt(0)
25 ; SI-NEXT: s_mov_b32 s0, s4
26 ; SI-NEXT: s_mov_b32 s1, s5
27 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
28 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
29 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
30 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
31 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
34 ; GFX89-LABEL: fmul_f16:
35 ; GFX89: ; %bb.0: ; %entry
36 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
37 ; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
38 ; GFX89-NEXT: s_mov_b32 s3, 0xf000
39 ; GFX89-NEXT: s_mov_b32 s2, -1
40 ; GFX89-NEXT: s_mov_b32 s14, s2
41 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX89-NEXT: s_mov_b32 s12, s6
43 ; GFX89-NEXT: s_mov_b32 s13, s7
44 ; GFX89-NEXT: s_mov_b32 s15, s3
45 ; GFX89-NEXT: s_mov_b32 s10, s2
46 ; GFX89-NEXT: s_mov_b32 s11, s3
47 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
48 ; GFX89-NEXT: s_waitcnt vmcnt(0)
49 ; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
50 ; GFX89-NEXT: s_waitcnt vmcnt(0)
51 ; GFX89-NEXT: s_mov_b32 s0, s4
52 ; GFX89-NEXT: s_mov_b32 s1, s5
53 ; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
54 ; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
55 ; GFX89-NEXT: s_endpgm
57 ; GFX11-LABEL: fmul_f16:
58 ; GFX11: ; %bb.0: ; %entry
59 ; GFX11-NEXT: s_clause 0x1
60 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
61 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
62 ; GFX11-NEXT: s_mov_b32 s10, -1
63 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
64 ; GFX11-NEXT: s_mov_b32 s14, s10
65 ; GFX11-NEXT: s_mov_b32 s15, s11
66 ; GFX11-NEXT: s_mov_b32 s2, s10
67 ; GFX11-NEXT: s_mov_b32 s3, s11
68 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-NEXT: s_mov_b32 s12, s6
70 ; GFX11-NEXT: s_mov_b32 s13, s7
71 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
72 ; GFX11-NEXT: s_waitcnt vmcnt(0)
73 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: s_mov_b32 s8, s4
76 ; GFX11-NEXT: s_mov_b32 s9, s5
77 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
78 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
80 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
81 ; GFX11-NEXT: s_endpgm
84 ptr addrspace(1) %b) {
86 %a.val = load volatile half, ptr addrspace(1) %a
87 %b.val = load volatile half, ptr addrspace(1) %b
88 %r.val = fmul half %a.val, %b.val
89 store half %r.val, ptr addrspace(1) %r
; fmul_f16_imm_a - scalar half multiply with the constant 3.0 as the
; left-hand operand; the other operand is a volatile load from %b and the
; product is stored to %r.
; The assertions pin how the constant is encoded:
;   - tahiti (SI prefix) multiplies in f32, so 3.0 shows up as the 32-bit
;     literal 0x40400000 on v_mul_f32.
;   - fiji/gfx900 (GFX89 prefix) and gfx1100 (GFX11 prefix) use the half bit
;     pattern 0x4200 as a literal operand of v_mul_f16.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
93 define amdgpu_kernel void @fmul_f16_imm_a(
94 ; SI-LABEL: fmul_f16_imm_a:
95 ; SI: ; %bb.0: ; %entry
96 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
97 ; SI-NEXT: s_mov_b32 s7, 0xf000
98 ; SI-NEXT: s_mov_b32 s6, -1
99 ; SI-NEXT: s_mov_b32 s10, s6
100 ; SI-NEXT: s_mov_b32 s11, s7
101 ; SI-NEXT: s_waitcnt lgkmcnt(0)
102 ; SI-NEXT: s_mov_b32 s8, s2
103 ; SI-NEXT: s_mov_b32 s9, s3
104 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
105 ; SI-NEXT: s_waitcnt vmcnt(0)
106 ; SI-NEXT: s_mov_b32 s4, s0
107 ; SI-NEXT: s_mov_b32 s5, s1
108 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
109 ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0
110 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
111 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
114 ; GFX89-LABEL: fmul_f16_imm_a:
115 ; GFX89: ; %bb.0: ; %entry
116 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
117 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
118 ; GFX89-NEXT: s_mov_b32 s6, -1
119 ; GFX89-NEXT: s_mov_b32 s10, s6
120 ; GFX89-NEXT: s_mov_b32 s11, s7
121 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
122 ; GFX89-NEXT: s_mov_b32 s8, s2
123 ; GFX89-NEXT: s_mov_b32 s9, s3
124 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
125 ; GFX89-NEXT: s_waitcnt vmcnt(0)
126 ; GFX89-NEXT: s_mov_b32 s4, s0
127 ; GFX89-NEXT: s_mov_b32 s5, s1
128 ; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0
129 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
130 ; GFX89-NEXT: s_endpgm
132 ; GFX11-LABEL: fmul_f16_imm_a:
133 ; GFX11: ; %bb.0: ; %entry
134 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
135 ; GFX11-NEXT: s_mov_b32 s6, -1
136 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
137 ; GFX11-NEXT: s_mov_b32 s10, s6
138 ; GFX11-NEXT: s_mov_b32 s11, s7
139 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX11-NEXT: s_mov_b32 s8, s2
141 ; GFX11-NEXT: s_mov_b32 s9, s3
142 ; GFX11-NEXT: s_mov_b32 s4, s0
143 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
144 ; GFX11-NEXT: s_waitcnt vmcnt(0)
145 ; GFX11-NEXT: s_mov_b32 s5, s1
146 ; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0
147 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
148 ; GFX11-NEXT: s_nop 0
149 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
150 ; GFX11-NEXT: s_endpgm
152 ptr addrspace(1) %b) {
154 %b.val = load volatile half, ptr addrspace(1) %b
155 %r.val = fmul half 3.0, %b.val
156 store half %r.val, ptr addrspace(1) %r
; fmul_f16_imm_b - scalar half multiply with the constant 4.0 as the
; right-hand operand; the other operand is a volatile load from %a and the
; product is stored to %r.
; Unlike the 3.0 case in fmul_f16_imm_a, every expected sequence below spells
; the constant as the operand "4.0" directly on the multiply (v_mul_f32 on
; tahiti after widening, v_mul_f16 on the other targets); no hex literal is
; expected. Although the constant is the second fmul operand in the IR, it is
; expected as the first source operand of the multiply instruction.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
160 define amdgpu_kernel void @fmul_f16_imm_b(
161 ; SI-LABEL: fmul_f16_imm_b:
162 ; SI: ; %bb.0: ; %entry
163 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
164 ; SI-NEXT: s_mov_b32 s7, 0xf000
165 ; SI-NEXT: s_mov_b32 s6, -1
166 ; SI-NEXT: s_mov_b32 s10, s6
167 ; SI-NEXT: s_mov_b32 s11, s7
168 ; SI-NEXT: s_waitcnt lgkmcnt(0)
169 ; SI-NEXT: s_mov_b32 s8, s2
170 ; SI-NEXT: s_mov_b32 s9, s3
171 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
172 ; SI-NEXT: s_waitcnt vmcnt(0)
173 ; SI-NEXT: s_mov_b32 s4, s0
174 ; SI-NEXT: s_mov_b32 s5, s1
175 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
176 ; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0
177 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
178 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
181 ; GFX89-LABEL: fmul_f16_imm_b:
182 ; GFX89: ; %bb.0: ; %entry
183 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
184 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
185 ; GFX89-NEXT: s_mov_b32 s6, -1
186 ; GFX89-NEXT: s_mov_b32 s10, s6
187 ; GFX89-NEXT: s_mov_b32 s11, s7
188 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
189 ; GFX89-NEXT: s_mov_b32 s8, s2
190 ; GFX89-NEXT: s_mov_b32 s9, s3
191 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
192 ; GFX89-NEXT: s_waitcnt vmcnt(0)
193 ; GFX89-NEXT: s_mov_b32 s4, s0
194 ; GFX89-NEXT: s_mov_b32 s5, s1
195 ; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0
196 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
197 ; GFX89-NEXT: s_endpgm
199 ; GFX11-LABEL: fmul_f16_imm_b:
200 ; GFX11: ; %bb.0: ; %entry
201 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
202 ; GFX11-NEXT: s_mov_b32 s6, -1
203 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
204 ; GFX11-NEXT: s_mov_b32 s10, s6
205 ; GFX11-NEXT: s_mov_b32 s11, s7
206 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX11-NEXT: s_mov_b32 s8, s2
208 ; GFX11-NEXT: s_mov_b32 s9, s3
209 ; GFX11-NEXT: s_mov_b32 s4, s0
210 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
211 ; GFX11-NEXT: s_waitcnt vmcnt(0)
212 ; GFX11-NEXT: s_mov_b32 s5, s1
213 ; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0
214 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
215 ; GFX11-NEXT: s_nop 0
216 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
217 ; GFX11-NEXT: s_endpgm
219 ptr addrspace(1) %a) {
221 %a.val = load volatile half, ptr addrspace(1) %a
222 %r.val = fmul half %a.val, 4.0
223 store half %r.val, ptr addrspace(1) %r
; fmul_v2f16 - element-wise multiply of two <2 x half> vectors loaded
; (non-volatile) from %a and %b; the product is stored to %r.
; From here on fiji and gfx900 no longer share assertions, so the VI and GFX9
; prefixes are used separately. Expected lowering per target:
;   - tahiti (SI prefix) splits each dword with a 16-bit right shift, widens
;     all four halves with v_cvt_f32_f16, does two v_mul_f32, narrows with
;     v_cvt_f16_f32 and repacks using a 16-bit left shift plus v_or_b32.
;   - fiji (VI prefix) multiplies the high halves with v_mul_f16_sdwa (all
;     selects are WORD_1), the low halves with v_mul_f16_e32, and merges the
;     two results with v_or_b32.
;   - gfx900 (GFX9 prefix) and gfx1100 (GFX11 prefix) use one v_pk_mul_f16.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
227 define amdgpu_kernel void @fmul_v2f16(
228 ; SI-LABEL: fmul_v2f16:
229 ; SI: ; %bb.0: ; %entry
230 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
231 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
232 ; SI-NEXT: s_mov_b32 s3, 0xf000
233 ; SI-NEXT: s_mov_b32 s2, -1
234 ; SI-NEXT: s_mov_b32 s10, s2
235 ; SI-NEXT: s_mov_b32 s11, s3
236 ; SI-NEXT: s_waitcnt lgkmcnt(0)
237 ; SI-NEXT: s_mov_b32 s12, s6
238 ; SI-NEXT: s_mov_b32 s13, s7
239 ; SI-NEXT: s_mov_b32 s14, s2
240 ; SI-NEXT: s_mov_b32 s15, s3
241 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
242 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
243 ; SI-NEXT: s_mov_b32 s0, s4
244 ; SI-NEXT: s_mov_b32 s1, s5
245 ; SI-NEXT: s_waitcnt vmcnt(1)
246 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
247 ; SI-NEXT: s_waitcnt vmcnt(0)
248 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
249 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
250 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
251 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
252 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
253 ; SI-NEXT: v_mul_f32_e32 v2, v3, v2
254 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
255 ; SI-NEXT: v_mul_f32_e32 v0, v1, v0
256 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
257 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
258 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
259 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
262 ; VI-LABEL: fmul_v2f16:
263 ; VI: ; %bb.0: ; %entry
264 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
265 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
266 ; VI-NEXT: s_mov_b32 s3, 0xf000
267 ; VI-NEXT: s_mov_b32 s2, -1
268 ; VI-NEXT: s_mov_b32 s10, s2
269 ; VI-NEXT: s_mov_b32 s11, s3
270 ; VI-NEXT: s_waitcnt lgkmcnt(0)
271 ; VI-NEXT: s_mov_b32 s12, s6
272 ; VI-NEXT: s_mov_b32 s13, s7
273 ; VI-NEXT: s_mov_b32 s14, s2
274 ; VI-NEXT: s_mov_b32 s15, s3
275 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
276 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
277 ; VI-NEXT: s_mov_b32 s0, s4
278 ; VI-NEXT: s_mov_b32 s1, s5
279 ; VI-NEXT: s_waitcnt vmcnt(0)
280 ; VI-NEXT: v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
281 ; VI-NEXT: v_mul_f16_e32 v0, v1, v0
282 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
283 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
286 ; GFX9-LABEL: fmul_v2f16:
287 ; GFX9: ; %bb.0: ; %entry
288 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
289 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
290 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
291 ; GFX9-NEXT: s_mov_b32 s2, -1
292 ; GFX9-NEXT: s_mov_b32 s14, s2
293 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX9-NEXT: s_mov_b32 s12, s6
295 ; GFX9-NEXT: s_mov_b32 s13, s7
296 ; GFX9-NEXT: s_mov_b32 s15, s3
297 ; GFX9-NEXT: s_mov_b32 s10, s2
298 ; GFX9-NEXT: s_mov_b32 s11, s3
299 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0
300 ; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0
301 ; GFX9-NEXT: s_mov_b32 s0, s4
302 ; GFX9-NEXT: s_mov_b32 s1, s5
303 ; GFX9-NEXT: s_waitcnt vmcnt(0)
304 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
305 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
306 ; GFX9-NEXT: s_endpgm
308 ; GFX11-LABEL: fmul_v2f16:
309 ; GFX11: ; %bb.0: ; %entry
310 ; GFX11-NEXT: s_clause 0x1
311 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
312 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
313 ; GFX11-NEXT: s_mov_b32 s10, -1
314 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
315 ; GFX11-NEXT: s_mov_b32 s14, s10
316 ; GFX11-NEXT: s_mov_b32 s15, s11
317 ; GFX11-NEXT: s_mov_b32 s2, s10
318 ; GFX11-NEXT: s_mov_b32 s3, s11
319 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
320 ; GFX11-NEXT: s_mov_b32 s12, s6
321 ; GFX11-NEXT: s_mov_b32 s13, s7
322 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
323 ; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
324 ; GFX11-NEXT: s_mov_b32 s8, s4
325 ; GFX11-NEXT: s_mov_b32 s9, s5
326 ; GFX11-NEXT: s_waitcnt vmcnt(0)
327 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1
328 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
329 ; GFX11-NEXT: s_nop 0
330 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
331 ; GFX11-NEXT: s_endpgm
334 ptr addrspace(1) %b) {
336 %a.val = load <2 x half>, ptr addrspace(1) %a
337 %b.val = load <2 x half>, ptr addrspace(1) %b
338 %r.val = fmul <2 x half> %a.val, %b.val
339 store <2 x half> %r.val, ptr addrspace(1) %r
; fmul_v2f16_imm_a - <2 x half> multiply with the constant vector
; <3.0, 4.0> as the left-hand operand; the other operand is loaded from %b
; and the product is stored to %r.
; How the assertions expect the constant to appear:
;   - tahiti (SI prefix) multiplies the high element by the operand "4.0"
;     and the low element by the f32 literal 0x40400000, both in f32.
;   - fiji (VI prefix) moves 0x4400 into a VGPR for the v_mul_f16_sdwa on the
;     high element and uses the literal 0x4200 on v_mul_f16_e32 for the low
;     element.
;   - gfx900 (GFX9 prefix) loads the packed value 0x44004200 into an SGPR
;     with s_mov_b32 and passes that register to v_pk_mul_f16.
;   - gfx1100 (GFX11 prefix) puts 0x44004200 directly on v_pk_mul_f16.
; In the packed value element 0 (3.0, 0x4200) sits in the low 16 bits and
; element 1 (4.0, 0x4400) in the high 16 bits.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
343 define amdgpu_kernel void @fmul_v2f16_imm_a(
344 ; SI-LABEL: fmul_v2f16_imm_a:
345 ; SI: ; %bb.0: ; %entry
346 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
347 ; SI-NEXT: s_mov_b32 s7, 0xf000
348 ; SI-NEXT: s_mov_b32 s6, -1
349 ; SI-NEXT: s_mov_b32 s10, s6
350 ; SI-NEXT: s_mov_b32 s11, s7
351 ; SI-NEXT: s_waitcnt lgkmcnt(0)
352 ; SI-NEXT: s_mov_b32 s8, s2
353 ; SI-NEXT: s_mov_b32 s9, s3
354 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
355 ; SI-NEXT: s_mov_b32 s4, s0
356 ; SI-NEXT: s_mov_b32 s5, s1
357 ; SI-NEXT: s_waitcnt vmcnt(0)
358 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
359 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
360 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
361 ; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1
362 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
363 ; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v0
364 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
365 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
366 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
367 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
370 ; VI-LABEL: fmul_v2f16_imm_a:
371 ; VI: ; %bb.0: ; %entry
372 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
373 ; VI-NEXT: s_mov_b32 s7, 0xf000
374 ; VI-NEXT: s_mov_b32 s6, -1
375 ; VI-NEXT: s_mov_b32 s10, s6
376 ; VI-NEXT: s_mov_b32 s11, s7
377 ; VI-NEXT: s_waitcnt lgkmcnt(0)
378 ; VI-NEXT: s_mov_b32 s8, s2
379 ; VI-NEXT: s_mov_b32 s9, s3
380 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
381 ; VI-NEXT: v_mov_b32_e32 v1, 0x4400
382 ; VI-NEXT: s_mov_b32 s4, s0
383 ; VI-NEXT: s_mov_b32 s5, s1
384 ; VI-NEXT: s_waitcnt vmcnt(0)
385 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
386 ; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0
387 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
388 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
391 ; GFX9-LABEL: fmul_v2f16_imm_a:
392 ; GFX9: ; %bb.0: ; %entry
393 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
394 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
395 ; GFX9-NEXT: s_mov_b32 s6, -1
396 ; GFX9-NEXT: s_mov_b32 s10, s6
397 ; GFX9-NEXT: s_mov_b32 s11, s7
398 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX9-NEXT: s_mov_b32 s8, s2
400 ; GFX9-NEXT: s_mov_b32 s9, s3
401 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
402 ; GFX9-NEXT: s_mov_b32 s4, s0
403 ; GFX9-NEXT: s_mov_b32 s0, 0x44004200
404 ; GFX9-NEXT: s_mov_b32 s5, s1
405 ; GFX9-NEXT: s_waitcnt vmcnt(0)
406 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
407 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
408 ; GFX9-NEXT: s_endpgm
410 ; GFX11-LABEL: fmul_v2f16_imm_a:
411 ; GFX11: ; %bb.0: ; %entry
412 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
413 ; GFX11-NEXT: s_mov_b32 s6, -1
414 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
415 ; GFX11-NEXT: s_mov_b32 s10, s6
416 ; GFX11-NEXT: s_mov_b32 s11, s7
417 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
418 ; GFX11-NEXT: s_mov_b32 s8, s2
419 ; GFX11-NEXT: s_mov_b32 s9, s3
420 ; GFX11-NEXT: s_mov_b32 s4, s0
421 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
422 ; GFX11-NEXT: s_mov_b32 s5, s1
423 ; GFX11-NEXT: s_waitcnt vmcnt(0)
424 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0
425 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
426 ; GFX11-NEXT: s_nop 0
427 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
428 ; GFX11-NEXT: s_endpgm
430 ptr addrspace(1) %b) {
432 %b.val = load <2 x half>, ptr addrspace(1) %b
433 %r.val = fmul <2 x half> <half 3.0, half 4.0>, %b.val
434 store <2 x half> %r.val, ptr addrspace(1) %r
; fmul_v2f16_imm_b - <2 x half> multiply with the constant vector
; <4.0, 3.0> as the right-hand operand; the other operand is loaded from %a
; and the product is stored to %r.
; This mirrors fmul_v2f16_imm_a with the constant on the other side and its
; elements swapped, so the expected encodings swap as well:
;   - tahiti (SI prefix) multiplies the high element by the f32 literal
;     0x40400000 and the low element by the operand "4.0".
;   - fiji (VI prefix) moves 0x4200 into a VGPR for the v_mul_f16_sdwa on the
;     high element and uses "4.0" on v_mul_f16_e32 for the low element.
;   - gfx900 (GFX9 prefix) loads the packed value 0x42004400 into an SGPR
;     with s_mov_b32 and passes that register to v_pk_mul_f16.
;   - gfx1100 (GFX11 prefix) puts 0x42004400 directly on v_pk_mul_f16.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
438 define amdgpu_kernel void @fmul_v2f16_imm_b(
439 ; SI-LABEL: fmul_v2f16_imm_b:
440 ; SI: ; %bb.0: ; %entry
441 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
442 ; SI-NEXT: s_mov_b32 s7, 0xf000
443 ; SI-NEXT: s_mov_b32 s6, -1
444 ; SI-NEXT: s_mov_b32 s10, s6
445 ; SI-NEXT: s_mov_b32 s11, s7
446 ; SI-NEXT: s_waitcnt lgkmcnt(0)
447 ; SI-NEXT: s_mov_b32 s8, s2
448 ; SI-NEXT: s_mov_b32 s9, s3
449 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
450 ; SI-NEXT: s_mov_b32 s4, s0
451 ; SI-NEXT: s_mov_b32 s5, s1
452 ; SI-NEXT: s_waitcnt vmcnt(0)
453 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
454 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
455 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
456 ; SI-NEXT: v_mul_f32_e32 v1, 0x40400000, v1
457 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
458 ; SI-NEXT: v_mul_f32_e32 v0, 4.0, v0
459 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
460 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
461 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
462 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
465 ; VI-LABEL: fmul_v2f16_imm_b:
466 ; VI: ; %bb.0: ; %entry
467 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
468 ; VI-NEXT: s_mov_b32 s7, 0xf000
469 ; VI-NEXT: s_mov_b32 s6, -1
470 ; VI-NEXT: s_mov_b32 s10, s6
471 ; VI-NEXT: s_mov_b32 s11, s7
472 ; VI-NEXT: s_waitcnt lgkmcnt(0)
473 ; VI-NEXT: s_mov_b32 s8, s2
474 ; VI-NEXT: s_mov_b32 s9, s3
475 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
476 ; VI-NEXT: v_mov_b32_e32 v1, 0x4200
477 ; VI-NEXT: s_mov_b32 s4, s0
478 ; VI-NEXT: s_mov_b32 s5, s1
479 ; VI-NEXT: s_waitcnt vmcnt(0)
480 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
481 ; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
482 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
483 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
486 ; GFX9-LABEL: fmul_v2f16_imm_b:
487 ; GFX9: ; %bb.0: ; %entry
488 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
489 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
490 ; GFX9-NEXT: s_mov_b32 s6, -1
491 ; GFX9-NEXT: s_mov_b32 s10, s6
492 ; GFX9-NEXT: s_mov_b32 s11, s7
493 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
494 ; GFX9-NEXT: s_mov_b32 s8, s2
495 ; GFX9-NEXT: s_mov_b32 s9, s3
496 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
497 ; GFX9-NEXT: s_mov_b32 s4, s0
498 ; GFX9-NEXT: s_mov_b32 s0, 0x42004400
499 ; GFX9-NEXT: s_mov_b32 s5, s1
500 ; GFX9-NEXT: s_waitcnt vmcnt(0)
501 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
502 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
503 ; GFX9-NEXT: s_endpgm
505 ; GFX11-LABEL: fmul_v2f16_imm_b:
506 ; GFX11: ; %bb.0: ; %entry
507 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
508 ; GFX11-NEXT: s_mov_b32 s6, -1
509 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
510 ; GFX11-NEXT: s_mov_b32 s10, s6
511 ; GFX11-NEXT: s_mov_b32 s11, s7
512 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
513 ; GFX11-NEXT: s_mov_b32 s8, s2
514 ; GFX11-NEXT: s_mov_b32 s9, s3
515 ; GFX11-NEXT: s_mov_b32 s4, s0
516 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
517 ; GFX11-NEXT: s_mov_b32 s5, s1
518 ; GFX11-NEXT: s_waitcnt vmcnt(0)
519 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0
520 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
521 ; GFX11-NEXT: s_nop 0
522 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
523 ; GFX11-NEXT: s_endpgm
525 ptr addrspace(1) %a) {
527 %a.val = load <2 x half>, ptr addrspace(1) %a
528 %r.val = fmul <2 x half> %a.val, <half 4.0, half 3.0>
529 store <2 x half> %r.val, ptr addrspace(1) %r
; fmul_v4f16 - element-wise multiply of two <4 x half> vectors loaded from
; %a and %b; the product is stored to %r. Each operand occupies two dwords.
; Expected lowering per target:
;   - tahiti (SI prefix) handles every element separately: eight
;     v_cvt_f32_f16 conversions, four v_mul_f32, four v_cvt_f16_f32, then the
;     halves are repacked into two dwords with 16-bit left shifts and
;     v_or_b32.
;   - fiji (VI prefix) uses, for each dword, a v_mul_f16_sdwa on the high
;     halves and a v_mul_f16_e32 on the low halves, merged with v_or_b32.
;   - gfx900 (GFX9 prefix) and gfx1100 (GFX11 prefix) use two v_pk_mul_f16,
;     one per dword.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
533 define amdgpu_kernel void @fmul_v4f16(
534 ; SI-LABEL: fmul_v4f16:
535 ; SI: ; %bb.0: ; %entry
536 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
537 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
538 ; SI-NEXT: s_mov_b32 s3, 0xf000
539 ; SI-NEXT: s_mov_b32 s2, -1
540 ; SI-NEXT: s_mov_b32 s10, s2
541 ; SI-NEXT: s_waitcnt lgkmcnt(0)
542 ; SI-NEXT: s_mov_b32 s12, s6
543 ; SI-NEXT: s_mov_b32 s11, s3
544 ; SI-NEXT: s_mov_b32 s13, s7
545 ; SI-NEXT: s_mov_b32 s14, s2
546 ; SI-NEXT: s_mov_b32 s15, s3
547 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
548 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
549 ; SI-NEXT: s_mov_b32 s0, s4
550 ; SI-NEXT: s_mov_b32 s1, s5
551 ; SI-NEXT: s_waitcnt vmcnt(1)
552 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
553 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
554 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v1
555 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
556 ; SI-NEXT: s_waitcnt vmcnt(0)
557 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v2
558 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
559 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v3
560 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
561 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
562 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
563 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
564 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
565 ; SI-NEXT: v_mul_f32_e32 v5, v7, v5
566 ; SI-NEXT: v_mul_f32_e32 v4, v6, v4
567 ; SI-NEXT: v_mul_f32_e32 v1, v3, v1
568 ; SI-NEXT: v_mul_f32_e32 v0, v2, v0
569 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
570 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
571 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v5
572 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v4
573 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
574 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
575 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
576 ; SI-NEXT: v_or_b32_e32 v0, v3, v0
577 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
580 ; VI-LABEL: fmul_v4f16:
581 ; VI: ; %bb.0: ; %entry
582 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
583 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
584 ; VI-NEXT: s_mov_b32 s3, 0xf000
585 ; VI-NEXT: s_mov_b32 s2, -1
586 ; VI-NEXT: s_mov_b32 s10, s2
587 ; VI-NEXT: s_mov_b32 s11, s3
588 ; VI-NEXT: s_waitcnt lgkmcnt(0)
589 ; VI-NEXT: s_mov_b32 s12, s6
590 ; VI-NEXT: s_mov_b32 s13, s7
591 ; VI-NEXT: s_mov_b32 s14, s2
592 ; VI-NEXT: s_mov_b32 s15, s3
593 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
594 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
595 ; VI-NEXT: s_mov_b32 s0, s4
596 ; VI-NEXT: s_mov_b32 s1, s5
597 ; VI-NEXT: s_waitcnt vmcnt(0)
598 ; VI-NEXT: v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
599 ; VI-NEXT: v_mul_f16_e32 v1, v3, v1
600 ; VI-NEXT: v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
601 ; VI-NEXT: v_mul_f16_e32 v0, v2, v0
602 ; VI-NEXT: v_or_b32_e32 v1, v1, v4
603 ; VI-NEXT: v_or_b32_e32 v0, v0, v3
604 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
607 ; GFX9-LABEL: fmul_v4f16:
608 ; GFX9: ; %bb.0: ; %entry
609 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
610 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
611 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
612 ; GFX9-NEXT: s_mov_b32 s2, -1
613 ; GFX9-NEXT: s_mov_b32 s10, s2
614 ; GFX9-NEXT: s_mov_b32 s11, s3
615 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
616 ; GFX9-NEXT: s_mov_b32 s12, s6
617 ; GFX9-NEXT: s_mov_b32 s13, s7
618 ; GFX9-NEXT: s_mov_b32 s14, s2
619 ; GFX9-NEXT: s_mov_b32 s15, s3
620 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
621 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
622 ; GFX9-NEXT: s_mov_b32 s0, s4
623 ; GFX9-NEXT: s_mov_b32 s1, s5
624 ; GFX9-NEXT: s_waitcnt vmcnt(0)
625 ; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1
626 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0
627 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
628 ; GFX9-NEXT: s_endpgm
630 ; GFX11-LABEL: fmul_v4f16:
631 ; GFX11: ; %bb.0: ; %entry
632 ; GFX11-NEXT: s_clause 0x1
633 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
634 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
635 ; GFX11-NEXT: s_mov_b32 s10, -1
636 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
637 ; GFX11-NEXT: s_mov_b32 s2, s10
638 ; GFX11-NEXT: s_mov_b32 s3, s11
639 ; GFX11-NEXT: s_mov_b32 s14, s10
640 ; GFX11-NEXT: s_mov_b32 s15, s11
641 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
642 ; GFX11-NEXT: s_mov_b32 s12, s6
643 ; GFX11-NEXT: s_mov_b32 s13, s7
644 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
645 ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
646 ; GFX11-NEXT: s_mov_b32 s8, s4
647 ; GFX11-NEXT: s_mov_b32 s9, s5
648 ; GFX11-NEXT: s_waitcnt vmcnt(0)
649 ; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1
650 ; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0
651 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
652 ; GFX11-NEXT: s_nop 0
653 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
654 ; GFX11-NEXT: s_endpgm
657 ptr addrspace(1) %b) {
659 %a.val = load <4 x half>, ptr addrspace(1) %a
660 %b.val = load <4 x half>, ptr addrspace(1) %b
661 %r.val = fmul <4 x half> %a.val, %b.val
662 store <4 x half> %r.val, ptr addrspace(1) %r
666 define amdgpu_kernel void @fmul_v4f16_imm_a(
667 ; SI-LABEL: fmul_v4f16_imm_a:
668 ; SI: ; %bb.0: ; %entry
669 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
670 ; SI-NEXT: s_mov_b32 s7, 0xf000
671 ; SI-NEXT: s_mov_b32 s6, -1
672 ; SI-NEXT: s_mov_b32 s10, s6
673 ; SI-NEXT: s_mov_b32 s11, s7
674 ; SI-NEXT: s_waitcnt lgkmcnt(0)
675 ; SI-NEXT: s_mov_b32 s8, s2
676 ; SI-NEXT: s_mov_b32 s9, s3
677 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
678 ; SI-NEXT: s_mov_b32 s4, s0
679 ; SI-NEXT: s_mov_b32 s5, s1
680 ; SI-NEXT: s_waitcnt vmcnt(0)
681 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
682 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
683 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
684 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
685 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
686 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
687 ; SI-NEXT: v_mul_f32_e32 v3, 0x40400000, v3
688 ; SI-NEXT: v_mul_f32_e32 v2, 0x41000000, v2
689 ; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1
690 ; SI-NEXT: v_add_f32_e32 v0, v0, v0
691 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
692 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
693 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
694 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
695 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
696 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
697 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
698 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
699 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
702 ; VI-LABEL: fmul_v4f16_imm_a:
703 ; VI: ; %bb.0: ; %entry
704 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
705 ; VI-NEXT: s_mov_b32 s7, 0xf000
706 ; VI-NEXT: s_mov_b32 s6, -1
707 ; VI-NEXT: s_mov_b32 s10, s6
708 ; VI-NEXT: s_mov_b32 s11, s7
709 ; VI-NEXT: s_waitcnt lgkmcnt(0)
710 ; VI-NEXT: s_mov_b32 s8, s2
711 ; VI-NEXT: s_mov_b32 s9, s3
712 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
713 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400
714 ; VI-NEXT: s_mov_b32 s4, s0
715 ; VI-NEXT: s_mov_b32 s5, s1
716 ; VI-NEXT: s_waitcnt vmcnt(0)
717 ; VI-NEXT: v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
718 ; VI-NEXT: v_mul_f16_e32 v1, 0x4200, v1
719 ; VI-NEXT: v_add_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
720 ; VI-NEXT: v_mul_f16_e32 v0, 0x4800, v0
721 ; VI-NEXT: v_or_b32_e32 v1, v1, v2
722 ; VI-NEXT: v_or_b32_e32 v0, v0, v3
723 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
726 ; GFX9-LABEL: fmul_v4f16_imm_a:
727 ; GFX9: ; %bb.0: ; %entry
728 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
729 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
730 ; GFX9-NEXT: s_mov_b32 s6, -1
731 ; GFX9-NEXT: s_mov_b32 s10, s6
732 ; GFX9-NEXT: s_mov_b32 s11, s7
733 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
734 ; GFX9-NEXT: s_mov_b32 s8, s2
735 ; GFX9-NEXT: s_mov_b32 s9, s3
736 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
737 ; GFX9-NEXT: s_mov_b32 s2, 0x44004200
738 ; GFX9-NEXT: s_mov_b32 s3, 0x40004800
739 ; GFX9-NEXT: s_mov_b32 s4, s0
740 ; GFX9-NEXT: s_mov_b32 s5, s1
741 ; GFX9-NEXT: s_waitcnt vmcnt(0)
742 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2
743 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3
744 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
745 ; GFX9-NEXT: s_endpgm
747 ; GFX11-LABEL: fmul_v4f16_imm_a:
748 ; GFX11: ; %bb.0: ; %entry
749 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
750 ; GFX11-NEXT: s_mov_b32 s6, -1
751 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
752 ; GFX11-NEXT: s_mov_b32 s10, s6
753 ; GFX11-NEXT: s_mov_b32 s11, s7
754 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
755 ; GFX11-NEXT: s_mov_b32 s8, s2
756 ; GFX11-NEXT: s_mov_b32 s9, s3
757 ; GFX11-NEXT: s_mov_b32 s4, s0
758 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
759 ; GFX11-NEXT: s_mov_b32 s5, s1
760 ; GFX11-NEXT: s_waitcnt vmcnt(0)
761 ; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1
762 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0
763 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
764 ; GFX11-NEXT: s_nop 0
765 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
766 ; GFX11-NEXT: s_endpgm
768 ptr addrspace(1) %b) {
770 %b.val = load <4 x half>, ptr addrspace(1) %b
771 %r.val = fmul <4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, %b.val
772 store <4 x half> %r.val, ptr addrspace(1) %r