1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; Kernel fsub_f16 subtracts two volatile-loaded half values (%a.val - %b.val)
; and stores the difference through %r.
; Expected code, per the check lines below: the tahiti run widens both
; operands with v_cvt_f32_f16, subtracts with v_sub_f32 and narrows the result
; with v_cvt_f16_f32; the fiji, gfx900 and gfx1100 runs use one v_sub_f16.
7 define amdgpu_kernel void @fsub_f16(
8 ; SI-LABEL: fsub_f16:
9 ; SI: ; %bb.0: ; %entry
10 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
11 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
12 ; SI-NEXT: s_mov_b32 s3, 0xf000
13 ; SI-NEXT: s_mov_b32 s2, -1
14 ; SI-NEXT: s_mov_b32 s14, s2
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b32 s12, s6
17 ; SI-NEXT: s_mov_b32 s13, s7
18 ; SI-NEXT: s_mov_b32 s15, s3
19 ; SI-NEXT: s_mov_b32 s10, s2
20 ; SI-NEXT: s_mov_b32 s11, s3
21 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
22 ; SI-NEXT: s_waitcnt vmcnt(0)
23 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
24 ; SI-NEXT: s_waitcnt vmcnt(0)
25 ; SI-NEXT: s_mov_b32 s0, s4
26 ; SI-NEXT: s_mov_b32 s1, s5
27 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
28 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
29 ; SI-NEXT: v_sub_f32_e32 v0, v0, v1
30 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
31 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
32 ; SI-NEXT: s_endpgm
33 ;
34 ; GFX89-LABEL: fsub_f16:
35 ; GFX89: ; %bb.0: ; %entry
36 ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
37 ; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
38 ; GFX89-NEXT: s_mov_b32 s3, 0xf000
39 ; GFX89-NEXT: s_mov_b32 s2, -1
40 ; GFX89-NEXT: s_mov_b32 s14, s2
41 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX89-NEXT: s_mov_b32 s12, s6
43 ; GFX89-NEXT: s_mov_b32 s13, s7
44 ; GFX89-NEXT: s_mov_b32 s15, s3
45 ; GFX89-NEXT: s_mov_b32 s10, s2
46 ; GFX89-NEXT: s_mov_b32 s11, s3
47 ; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
48 ; GFX89-NEXT: s_waitcnt vmcnt(0)
49 ; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
50 ; GFX89-NEXT: s_waitcnt vmcnt(0)
51 ; GFX89-NEXT: s_mov_b32 s0, s4
52 ; GFX89-NEXT: s_mov_b32 s1, s5
53 ; GFX89-NEXT: v_sub_f16_e32 v0, v0, v1
54 ; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
55 ; GFX89-NEXT: s_endpgm
56 ;
57 ; GFX11-LABEL: fsub_f16:
58 ; GFX11: ; %bb.0: ; %entry
59 ; GFX11-NEXT: s_clause 0x1
60 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
61 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
62 ; GFX11-NEXT: s_mov_b32 s10, -1
63 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
64 ; GFX11-NEXT: s_mov_b32 s14, s10
65 ; GFX11-NEXT: s_mov_b32 s15, s11
66 ; GFX11-NEXT: s_mov_b32 s2, s10
67 ; GFX11-NEXT: s_mov_b32 s3, s11
68 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-NEXT: s_mov_b32 s12, s6
70 ; GFX11-NEXT: s_mov_b32 s13, s7
71 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
72 ; GFX11-NEXT: s_waitcnt vmcnt(0)
73 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: s_mov_b32 s8, s4
76 ; GFX11-NEXT: s_mov_b32 s9, s5
77 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1
78 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
79 ; GFX11-NEXT: s_nop 0
80 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
81 ; GFX11-NEXT: s_endpgm
82 ptr addrspace(1) %r,
83 ptr addrspace(1) %a,
84 ptr addrspace(1) %b) {
85 entry:
86 %a.val = load volatile half, ptr addrspace(1) %a
87 %b.val = load volatile half, ptr addrspace(1) %b
88 %r.val = fsub half %a.val, %b.val
89 store half %r.val, ptr addrspace(1) %r
90 ret void
91 }
; Kernel fsub_f16_imm_a computes 1.0 - %b.val for a volatile-loaded half and
; stores the difference through %r.
; Expected code, per the check lines below: the constant 1.0 appears directly
; as the first source operand of the subtract (v_sub_f32 between f16/f32
; conversions on the tahiti run, v_sub_f16 on the fiji, gfx900 and gfx1100
; runs).
93 define amdgpu_kernel void @fsub_f16_imm_a(
94 ; SI-LABEL: fsub_f16_imm_a:
95 ; SI: ; %bb.0: ; %entry
96 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
97 ; SI-NEXT: s_mov_b32 s7, 0xf000
98 ; SI-NEXT: s_mov_b32 s6, -1
99 ; SI-NEXT: s_mov_b32 s10, s6
100 ; SI-NEXT: s_mov_b32 s11, s7
101 ; SI-NEXT: s_waitcnt lgkmcnt(0)
102 ; SI-NEXT: s_mov_b32 s8, s2
103 ; SI-NEXT: s_mov_b32 s9, s3
104 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
105 ; SI-NEXT: s_waitcnt vmcnt(0)
106 ; SI-NEXT: s_mov_b32 s4, s0
107 ; SI-NEXT: s_mov_b32 s5, s1
108 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
109 ; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0
110 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
111 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
112 ; SI-NEXT: s_endpgm
113 ;
114 ; GFX89-LABEL: fsub_f16_imm_a:
115 ; GFX89: ; %bb.0: ; %entry
116 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
117 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
118 ; GFX89-NEXT: s_mov_b32 s6, -1
119 ; GFX89-NEXT: s_mov_b32 s10, s6
120 ; GFX89-NEXT: s_mov_b32 s11, s7
121 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
122 ; GFX89-NEXT: s_mov_b32 s8, s2
123 ; GFX89-NEXT: s_mov_b32 s9, s3
124 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
125 ; GFX89-NEXT: s_waitcnt vmcnt(0)
126 ; GFX89-NEXT: s_mov_b32 s4, s0
127 ; GFX89-NEXT: s_mov_b32 s5, s1
128 ; GFX89-NEXT: v_sub_f16_e32 v0, 1.0, v0
129 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
130 ; GFX89-NEXT: s_endpgm
131 ;
132 ; GFX11-LABEL: fsub_f16_imm_a:
133 ; GFX11: ; %bb.0: ; %entry
134 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
135 ; GFX11-NEXT: s_mov_b32 s6, -1
136 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
137 ; GFX11-NEXT: s_mov_b32 s10, s6
138 ; GFX11-NEXT: s_mov_b32 s11, s7
139 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX11-NEXT: s_mov_b32 s8, s2
141 ; GFX11-NEXT: s_mov_b32 s9, s3
142 ; GFX11-NEXT: s_mov_b32 s4, s0
143 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
144 ; GFX11-NEXT: s_waitcnt vmcnt(0)
145 ; GFX11-NEXT: s_mov_b32 s5, s1
146 ; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0
147 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
148 ; GFX11-NEXT: s_nop 0
149 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
150 ; GFX11-NEXT: s_endpgm
151 ptr addrspace(1) %r,
152 ptr addrspace(1) %b) {
153 entry:
154 %b.val = load volatile half, ptr addrspace(1) %b
155 %r.val = fsub half 1.0, %b.val
156 store half %r.val, ptr addrspace(1) %r
157 ret void
158 }
; Kernel fsub_f16_imm_b computes %a.val - 2.0 for a volatile-loaded half and
; stores the difference through %r.
; Expected code, per the check lines below: the subtraction of a constant is
; emitted as an add of the negated constant (-2.0), using v_add_f32 between
; f16/f32 conversions on the tahiti run and v_add_f16 on the fiji, gfx900 and
; gfx1100 runs.
160 define amdgpu_kernel void @fsub_f16_imm_b(
161 ; SI-LABEL: fsub_f16_imm_b:
162 ; SI: ; %bb.0: ; %entry
163 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
164 ; SI-NEXT: s_mov_b32 s7, 0xf000
165 ; SI-NEXT: s_mov_b32 s6, -1
166 ; SI-NEXT: s_mov_b32 s10, s6
167 ; SI-NEXT: s_mov_b32 s11, s7
168 ; SI-NEXT: s_waitcnt lgkmcnt(0)
169 ; SI-NEXT: s_mov_b32 s8, s2
170 ; SI-NEXT: s_mov_b32 s9, s3
171 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
172 ; SI-NEXT: s_waitcnt vmcnt(0)
173 ; SI-NEXT: s_mov_b32 s4, s0
174 ; SI-NEXT: s_mov_b32 s5, s1
175 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
176 ; SI-NEXT: v_add_f32_e32 v0, -2.0, v0
177 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
178 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
179 ; SI-NEXT: s_endpgm
180 ;
181 ; GFX89-LABEL: fsub_f16_imm_b:
182 ; GFX89: ; %bb.0: ; %entry
183 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
184 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
185 ; GFX89-NEXT: s_mov_b32 s6, -1
186 ; GFX89-NEXT: s_mov_b32 s10, s6
187 ; GFX89-NEXT: s_mov_b32 s11, s7
188 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
189 ; GFX89-NEXT: s_mov_b32 s8, s2
190 ; GFX89-NEXT: s_mov_b32 s9, s3
191 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
192 ; GFX89-NEXT: s_waitcnt vmcnt(0)
193 ; GFX89-NEXT: s_mov_b32 s4, s0
194 ; GFX89-NEXT: s_mov_b32 s5, s1
195 ; GFX89-NEXT: v_add_f16_e32 v0, -2.0, v0
196 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
197 ; GFX89-NEXT: s_endpgm
198 ;
199 ; GFX11-LABEL: fsub_f16_imm_b:
200 ; GFX11: ; %bb.0: ; %entry
201 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
202 ; GFX11-NEXT: s_mov_b32 s6, -1
203 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
204 ; GFX11-NEXT: s_mov_b32 s10, s6
205 ; GFX11-NEXT: s_mov_b32 s11, s7
206 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX11-NEXT: s_mov_b32 s8, s2
208 ; GFX11-NEXT: s_mov_b32 s9, s3
209 ; GFX11-NEXT: s_mov_b32 s4, s0
210 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
211 ; GFX11-NEXT: s_waitcnt vmcnt(0)
212 ; GFX11-NEXT: s_mov_b32 s5, s1
213 ; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0
214 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
215 ; GFX11-NEXT: s_nop 0
216 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
217 ; GFX11-NEXT: s_endpgm
218 ptr addrspace(1) %r,
219 ptr addrspace(1) %a) {
220 entry:
221 %a.val = load volatile half, ptr addrspace(1) %a
222 %r.val = fsub half %a.val, 2.0
223 store half %r.val, ptr addrspace(1) %r
224 ret void
225 }
; Kernel fsub_v2f16 subtracts two <2 x half> vectors (%a.val - %b.val) and
; stores the result through %r.
; Expected code, per the check lines below:
;  - tahiti run: each 16-bit lane is split out, converted to f32, subtracted
;    with v_sub_f32, converted back and repacked with a shift and v_or_b32;
;  - fiji run: v_sub_f16_sdwa handles the upper half (WORD_1), v_sub_f16 the
;    lower half, and v_or_b32 merges them;
;  - gfx900 and gfx1100 runs: one v_pk_add_f16 with neg_lo/neg_hi set on the
;    second source.
227 define amdgpu_kernel void @fsub_v2f16(
228 ; SI-LABEL: fsub_v2f16:
229 ; SI: ; %bb.0: ; %entry
230 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
231 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
232 ; SI-NEXT: s_mov_b32 s3, 0xf000
233 ; SI-NEXT: s_mov_b32 s2, -1
234 ; SI-NEXT: s_mov_b32 s10, s2
235 ; SI-NEXT: s_mov_b32 s11, s3
236 ; SI-NEXT: s_waitcnt lgkmcnt(0)
237 ; SI-NEXT: s_mov_b32 s12, s6
238 ; SI-NEXT: s_mov_b32 s13, s7
239 ; SI-NEXT: s_mov_b32 s14, s2
240 ; SI-NEXT: s_mov_b32 s15, s3
241 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
242 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
243 ; SI-NEXT: s_mov_b32 s0, s4
244 ; SI-NEXT: s_mov_b32 s1, s5
245 ; SI-NEXT: s_waitcnt vmcnt(1)
246 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
247 ; SI-NEXT: s_waitcnt vmcnt(0)
248 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
249 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
250 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
251 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
252 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
253 ; SI-NEXT: v_sub_f32_e32 v2, v3, v2
254 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
255 ; SI-NEXT: v_sub_f32_e32 v0, v1, v0
256 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
257 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
258 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
259 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
260 ; SI-NEXT: s_endpgm
261 ;
262 ; VI-LABEL: fsub_v2f16:
263 ; VI: ; %bb.0: ; %entry
264 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
265 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
266 ; VI-NEXT: s_mov_b32 s3, 0xf000
267 ; VI-NEXT: s_mov_b32 s2, -1
268 ; VI-NEXT: s_mov_b32 s10, s2
269 ; VI-NEXT: s_mov_b32 s11, s3
270 ; VI-NEXT: s_waitcnt lgkmcnt(0)
271 ; VI-NEXT: s_mov_b32 s12, s6
272 ; VI-NEXT: s_mov_b32 s13, s7
273 ; VI-NEXT: s_mov_b32 s14, s2
274 ; VI-NEXT: s_mov_b32 s15, s3
275 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
276 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
277 ; VI-NEXT: s_mov_b32 s0, s4
278 ; VI-NEXT: s_mov_b32 s1, s5
279 ; VI-NEXT: s_waitcnt vmcnt(0)
280 ; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
281 ; VI-NEXT: v_sub_f16_e32 v0, v1, v0
282 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
283 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
284 ; VI-NEXT: s_endpgm
285 ;
286 ; GFX9-LABEL: fsub_v2f16:
287 ; GFX9: ; %bb.0: ; %entry
288 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
289 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
290 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
291 ; GFX9-NEXT: s_mov_b32 s2, -1
292 ; GFX9-NEXT: s_mov_b32 s14, s2
293 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX9-NEXT: s_mov_b32 s12, s6
295 ; GFX9-NEXT: s_mov_b32 s13, s7
296 ; GFX9-NEXT: s_mov_b32 s15, s3
297 ; GFX9-NEXT: s_mov_b32 s10, s2
298 ; GFX9-NEXT: s_mov_b32 s11, s3
299 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0
300 ; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0
301 ; GFX9-NEXT: s_mov_b32 s0, s4
302 ; GFX9-NEXT: s_mov_b32 s1, s5
303 ; GFX9-NEXT: s_waitcnt vmcnt(0)
304 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
305 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
306 ; GFX9-NEXT: s_endpgm
307 ;
308 ; GFX11-LABEL: fsub_v2f16:
309 ; GFX11: ; %bb.0: ; %entry
310 ; GFX11-NEXT: s_clause 0x1
311 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
312 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
313 ; GFX11-NEXT: s_mov_b32 s10, -1
314 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
315 ; GFX11-NEXT: s_mov_b32 s14, s10
316 ; GFX11-NEXT: s_mov_b32 s15, s11
317 ; GFX11-NEXT: s_mov_b32 s2, s10
318 ; GFX11-NEXT: s_mov_b32 s3, s11
319 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
320 ; GFX11-NEXT: s_mov_b32 s12, s6
321 ; GFX11-NEXT: s_mov_b32 s13, s7
322 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
323 ; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
324 ; GFX11-NEXT: s_mov_b32 s8, s4
325 ; GFX11-NEXT: s_mov_b32 s9, s5
326 ; GFX11-NEXT: s_waitcnt vmcnt(0)
327 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
328 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
329 ; GFX11-NEXT: s_nop 0
330 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
331 ; GFX11-NEXT: s_endpgm
332 ptr addrspace(1) %r,
333 ptr addrspace(1) %a,
334 ptr addrspace(1) %b) {
335 entry:
336 %a.val = load <2 x half>, ptr addrspace(1) %a
337 %b.val = load <2 x half>, ptr addrspace(1) %b
338 %r.val = fsub <2 x half> %a.val, %b.val
339 store <2 x half> %r.val, ptr addrspace(1) %r
340 ret void
341 }
; Kernel fsub_v2f16_imm_a computes <1.0, 2.0> - %b.val on a <2 x half> vector
; and stores the result through %r.
; Expected code, per the check lines below:
;  - tahiti run: per-lane f32 math, with 2.0 and 1.0 used directly as the
;    first source of v_sub_f32 for the upper and lower lane respectively;
;  - fiji run: 0x4000 is moved into a VGPR for the v_sub_f16_sdwa on the upper
;    half, while the lower half uses v_sub_f16 with 1.0;
;  - gfx900 run: the packed constant 0x40003c00 is placed in an SGPR and added
;    to the negated loaded value (neg_lo/neg_hi on the first source);
;  - gfx1100 run: the same packed constant is the first source of
;    v_pk_add_f16 and the loaded value is negated as the second source.
343 define amdgpu_kernel void @fsub_v2f16_imm_a(
344 ; SI-LABEL: fsub_v2f16_imm_a:
345 ; SI: ; %bb.0: ; %entry
346 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
347 ; SI-NEXT: s_mov_b32 s7, 0xf000
348 ; SI-NEXT: s_mov_b32 s6, -1
349 ; SI-NEXT: s_mov_b32 s10, s6
350 ; SI-NEXT: s_mov_b32 s11, s7
351 ; SI-NEXT: s_waitcnt lgkmcnt(0)
352 ; SI-NEXT: s_mov_b32 s8, s2
353 ; SI-NEXT: s_mov_b32 s9, s3
354 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
355 ; SI-NEXT: s_mov_b32 s4, s0
356 ; SI-NEXT: s_mov_b32 s5, s1
357 ; SI-NEXT: s_waitcnt vmcnt(0)
358 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
359 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
360 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
361 ; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1
362 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
363 ; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0
364 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
365 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
366 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
367 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
368 ; SI-NEXT: s_endpgm
369 ;
370 ; VI-LABEL: fsub_v2f16_imm_a:
371 ; VI: ; %bb.0: ; %entry
372 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
373 ; VI-NEXT: s_mov_b32 s7, 0xf000
374 ; VI-NEXT: s_mov_b32 s6, -1
375 ; VI-NEXT: s_mov_b32 s10, s6
376 ; VI-NEXT: s_mov_b32 s11, s7
377 ; VI-NEXT: s_waitcnt lgkmcnt(0)
378 ; VI-NEXT: s_mov_b32 s8, s2
379 ; VI-NEXT: s_mov_b32 s9, s3
380 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
381 ; VI-NEXT: v_mov_b32_e32 v1, 0x4000
382 ; VI-NEXT: s_mov_b32 s4, s0
383 ; VI-NEXT: s_mov_b32 s5, s1
384 ; VI-NEXT: s_waitcnt vmcnt(0)
385 ; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
386 ; VI-NEXT: v_sub_f16_e32 v0, 1.0, v0
387 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
388 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
389 ; VI-NEXT: s_endpgm
390 ;
391 ; GFX9-LABEL: fsub_v2f16_imm_a:
392 ; GFX9: ; %bb.0: ; %entry
393 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
394 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
395 ; GFX9-NEXT: s_mov_b32 s6, -1
396 ; GFX9-NEXT: s_mov_b32 s10, s6
397 ; GFX9-NEXT: s_mov_b32 s11, s7
398 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX9-NEXT: s_mov_b32 s8, s2
400 ; GFX9-NEXT: s_mov_b32 s9, s3
401 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
402 ; GFX9-NEXT: s_mov_b32 s4, s0
403 ; GFX9-NEXT: s_mov_b32 s0, 0x40003c00
404 ; GFX9-NEXT: s_mov_b32 s5, s1
405 ; GFX9-NEXT: s_waitcnt vmcnt(0)
406 ; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
407 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
408 ; GFX9-NEXT: s_endpgm
409 ;
410 ; GFX11-LABEL: fsub_v2f16_imm_a:
411 ; GFX11: ; %bb.0: ; %entry
412 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
413 ; GFX11-NEXT: s_mov_b32 s6, -1
414 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
415 ; GFX11-NEXT: s_mov_b32 s10, s6
416 ; GFX11-NEXT: s_mov_b32 s11, s7
417 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
418 ; GFX11-NEXT: s_mov_b32 s8, s2
419 ; GFX11-NEXT: s_mov_b32 s9, s3
420 ; GFX11-NEXT: s_mov_b32 s4, s0
421 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
422 ; GFX11-NEXT: s_mov_b32 s5, s1
423 ; GFX11-NEXT: s_waitcnt vmcnt(0)
424 ; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
425 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
426 ; GFX11-NEXT: s_nop 0
427 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
428 ; GFX11-NEXT: s_endpgm
429 ptr addrspace(1) %r,
430 ptr addrspace(1) %b) {
431 entry:
432 %b.val = load <2 x half>, ptr addrspace(1) %b
433 %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val
434 store <2 x half> %r.val, ptr addrspace(1) %r
435 ret void
436 }
; Kernel fsub_v2f16_imm_b computes %a.val - <2.0, 1.0> on a <2 x half> vector
; and stores the result through %r.
; Expected code, per the check lines below: the subtraction of a constant
; vector is emitted as an add of the negated constants:
;  - tahiti run: per-lane f32 math with v_add_f32 using -1.0 (upper lane) and
;    -2.0 (lower lane);
;  - fiji run: 0xbc00 is moved into a VGPR for the v_add_f16_sdwa on the upper
;    half, while the lower half uses v_add_f16 with -2.0;
;  - gfx900 and gfx1100 runs: one v_pk_add_f16 with the packed constant
;    0xbc00c000 (from an SGPR on gfx900, as a literal on gfx1100) and no
;    neg modifiers.
438 define amdgpu_kernel void @fsub_v2f16_imm_b(
439 ; SI-LABEL: fsub_v2f16_imm_b:
440 ; SI: ; %bb.0: ; %entry
441 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
442 ; SI-NEXT: s_mov_b32 s7, 0xf000
443 ; SI-NEXT: s_mov_b32 s6, -1
444 ; SI-NEXT: s_mov_b32 s10, s6
445 ; SI-NEXT: s_mov_b32 s11, s7
446 ; SI-NEXT: s_waitcnt lgkmcnt(0)
447 ; SI-NEXT: s_mov_b32 s8, s2
448 ; SI-NEXT: s_mov_b32 s9, s3
449 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
450 ; SI-NEXT: s_mov_b32 s4, s0
451 ; SI-NEXT: s_mov_b32 s5, s1
452 ; SI-NEXT: s_waitcnt vmcnt(0)
453 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
454 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
455 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
456 ; SI-NEXT: v_add_f32_e32 v1, -1.0, v1
457 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
458 ; SI-NEXT: v_add_f32_e32 v0, -2.0, v0
459 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
460 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
461 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
462 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
463 ; SI-NEXT: s_endpgm
464 ;
465 ; VI-LABEL: fsub_v2f16_imm_b:
466 ; VI: ; %bb.0: ; %entry
467 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
468 ; VI-NEXT: s_mov_b32 s7, 0xf000
469 ; VI-NEXT: s_mov_b32 s6, -1
470 ; VI-NEXT: s_mov_b32 s10, s6
471 ; VI-NEXT: s_mov_b32 s11, s7
472 ; VI-NEXT: s_waitcnt lgkmcnt(0)
473 ; VI-NEXT: s_mov_b32 s8, s2
474 ; VI-NEXT: s_mov_b32 s9, s3
475 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
476 ; VI-NEXT: v_mov_b32_e32 v1, 0xbc00
477 ; VI-NEXT: s_mov_b32 s4, s0
478 ; VI-NEXT: s_mov_b32 s5, s1
479 ; VI-NEXT: s_waitcnt vmcnt(0)
480 ; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
481 ; VI-NEXT: v_add_f16_e32 v0, -2.0, v0
482 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
483 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
484 ; VI-NEXT: s_endpgm
485 ;
486 ; GFX9-LABEL: fsub_v2f16_imm_b:
487 ; GFX9: ; %bb.0: ; %entry
488 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
489 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
490 ; GFX9-NEXT: s_mov_b32 s6, -1
491 ; GFX9-NEXT: s_mov_b32 s10, s6
492 ; GFX9-NEXT: s_mov_b32 s11, s7
493 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
494 ; GFX9-NEXT: s_mov_b32 s8, s2
495 ; GFX9-NEXT: s_mov_b32 s9, s3
496 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
497 ; GFX9-NEXT: s_mov_b32 s4, s0
498 ; GFX9-NEXT: s_mov_b32 s0, 0xbc00c000
499 ; GFX9-NEXT: s_mov_b32 s5, s1
500 ; GFX9-NEXT: s_waitcnt vmcnt(0)
501 ; GFX9-NEXT: v_pk_add_f16 v0, v0, s0
502 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
503 ; GFX9-NEXT: s_endpgm
504 ;
505 ; GFX11-LABEL: fsub_v2f16_imm_b:
506 ; GFX11: ; %bb.0: ; %entry
507 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
508 ; GFX11-NEXT: s_mov_b32 s6, -1
509 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
510 ; GFX11-NEXT: s_mov_b32 s10, s6
511 ; GFX11-NEXT: s_mov_b32 s11, s7
512 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
513 ; GFX11-NEXT: s_mov_b32 s8, s2
514 ; GFX11-NEXT: s_mov_b32 s9, s3
515 ; GFX11-NEXT: s_mov_b32 s4, s0
516 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
517 ; GFX11-NEXT: s_mov_b32 s5, s1
518 ; GFX11-NEXT: s_waitcnt vmcnt(0)
519 ; GFX11-NEXT: v_pk_add_f16 v0, 0xbc00c000, v0
520 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
521 ; GFX11-NEXT: s_nop 0
522 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
523 ; GFX11-NEXT: s_endpgm
524 ptr addrspace(1) %r,
525 ptr addrspace(1) %a) {
526 entry:
527 %a.val = load <2 x half>, ptr addrspace(1) %a
528 %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0>
529 store <2 x half> %r.val, ptr addrspace(1) %r
530 ret void
531 }