1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11 %s
; Scalar half add: two volatile half loads (%a, %b) are summed with fadd and
; the result is stored through %r.
; Per the checks below, the tahiti run converts both operands to f32, adds with
; v_add_f32 and converts back to f16; the fiji and gfx1100 runs use one
; v_add_f16.
; NOTE(review): the %r and %a parameters, the entry label, the terminator and
; the LABEL checks for the tahiti/fiji prefixes are not visible in this
; excerpt -- confirm against the full file.
6 define amdgpu_kernel void @fadd_f16(
8 ; SI: ; %bb.0: ; %entry
9 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
11 ; SI-NEXT: s_mov_b32 s11, 0xf000
12 ; SI-NEXT: s_mov_b32 s10, -1
13 ; SI-NEXT: s_mov_b32 s2, s10
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_mov_b32 s8, s4
16 ; SI-NEXT: s_mov_b32 s9, s5
17 ; SI-NEXT: s_mov_b32 s4, s6
18 ; SI-NEXT: s_mov_b32 s5, s7
19 ; SI-NEXT: s_mov_b32 s6, s10
20 ; SI-NEXT: s_mov_b32 s7, s11
21 ; SI-NEXT: s_mov_b32 s3, s11
22 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
23 ; SI-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
25 ; SI-NEXT: s_waitcnt vmcnt(0)
26 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
27 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
28 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
29 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
30 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
34 ; VI: ; %bb.0: ; %entry
35 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
36 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
37 ; VI-NEXT: s_mov_b32 s11, 0xf000
38 ; VI-NEXT: s_mov_b32 s10, -1
39 ; VI-NEXT: s_mov_b32 s2, s10
40 ; VI-NEXT: s_waitcnt lgkmcnt(0)
41 ; VI-NEXT: s_mov_b32 s8, s4
42 ; VI-NEXT: s_mov_b32 s9, s5
43 ; VI-NEXT: s_mov_b32 s4, s6
44 ; VI-NEXT: s_mov_b32 s5, s7
45 ; VI-NEXT: s_mov_b32 s6, s10
46 ; VI-NEXT: s_mov_b32 s7, s11
47 ; VI-NEXT: s_mov_b32 s3, s11
48 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
49 ; VI-NEXT: s_waitcnt vmcnt(0)
50 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
51 ; VI-NEXT: s_waitcnt vmcnt(0)
52 ; VI-NEXT: v_add_f16_e32 v0, v0, v1
53 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
56 ; GFX11-LABEL: fadd_f16:
57 ; GFX11: ; %bb.0: ; %entry
58 ; GFX11-NEXT: s_clause 0x1
59 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
60 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
61 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
62 ; GFX11-NEXT: s_mov_b32 s10, -1
63 ; GFX11-NEXT: s_mov_b32 s3, s11
64 ; GFX11-NEXT: s_mov_b32 s2, s10
65 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX11-NEXT: s_mov_b32 s8, s4
67 ; GFX11-NEXT: s_mov_b32 s9, s5
68 ; GFX11-NEXT: s_mov_b32 s4, s6
69 ; GFX11-NEXT: s_mov_b32 s5, s7
70 ; GFX11-NEXT: s_mov_b32 s6, s10
71 ; GFX11-NEXT: s_mov_b32 s7, s11
72 ; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
73 ; GFX11-NEXT: s_waitcnt vmcnt(0)
74 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
75 ; GFX11-NEXT: s_waitcnt vmcnt(0)
76 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
77 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
79 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
80 ; GFX11-NEXT: s_endpgm
83 ptr addrspace(1) %b) {
; Both loads are volatile; the checks above expect glc on each load, each
; followed directly by its own s_waitcnt vmcnt(0).
85 %a.val = load volatile half, ptr addrspace(1) %a
86 %b.val = load volatile half, ptr addrspace(1) %b
87 %r.val = fadd half %a.val, %b.val
88 store half %r.val, ptr addrspace(1) %r
; Scalar half add with the constant 1.0 as the left operand: a half is loaded
; from %b, 1.0 is added, and the sum is stored through %r.
; Per the checks below, every run encodes the constant directly as the 1.0
; source operand of the add (v_add_f32 on tahiti, v_add_f16 on fiji/gfx1100).
; NOTE(review): the %r parameter, the entry label and the terminator are not
; visible in this excerpt -- confirm against the full file.
92 define amdgpu_kernel void @fadd_f16_imm_a(
93 ; SI-LABEL: fadd_f16_imm_a:
94 ; SI: ; %bb.0: ; %entry
95 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
96 ; SI-NEXT: s_mov_b32 s7, 0xf000
97 ; SI-NEXT: s_mov_b32 s6, -1
98 ; SI-NEXT: s_waitcnt lgkmcnt(0)
99 ; SI-NEXT: s_mov_b32 s4, s0
100 ; SI-NEXT: s_mov_b32 s5, s1
101 ; SI-NEXT: s_mov_b32 s0, s2
102 ; SI-NEXT: s_mov_b32 s1, s3
103 ; SI-NEXT: s_mov_b32 s2, s6
104 ; SI-NEXT: s_mov_b32 s3, s7
105 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
106 ; SI-NEXT: s_waitcnt vmcnt(0)
107 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
108 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
109 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
110 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
113 ; VI-LABEL: fadd_f16_imm_a:
114 ; VI: ; %bb.0: ; %entry
115 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
116 ; VI-NEXT: s_mov_b32 s7, 0xf000
117 ; VI-NEXT: s_mov_b32 s6, -1
118 ; VI-NEXT: s_waitcnt lgkmcnt(0)
119 ; VI-NEXT: s_mov_b32 s4, s0
120 ; VI-NEXT: s_mov_b32 s5, s1
121 ; VI-NEXT: s_mov_b32 s0, s2
122 ; VI-NEXT: s_mov_b32 s1, s3
123 ; VI-NEXT: s_mov_b32 s2, s6
124 ; VI-NEXT: s_mov_b32 s3, s7
125 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
126 ; VI-NEXT: s_waitcnt vmcnt(0)
127 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
128 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
131 ; GFX11-LABEL: fadd_f16_imm_a:
132 ; GFX11: ; %bb.0: ; %entry
133 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
134 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
135 ; GFX11-NEXT: s_mov_b32 s6, -1
136 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
137 ; GFX11-NEXT: s_mov_b32 s4, s0
138 ; GFX11-NEXT: s_mov_b32 s5, s1
139 ; GFX11-NEXT: s_mov_b32 s0, s2
140 ; GFX11-NEXT: s_mov_b32 s1, s3
141 ; GFX11-NEXT: s_mov_b32 s2, s6
142 ; GFX11-NEXT: s_mov_b32 s3, s7
143 ; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
144 ; GFX11-NEXT: s_waitcnt vmcnt(0)
145 ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0
146 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
147 ; GFX11-NEXT: s_nop 0
148 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
149 ; GFX11-NEXT: s_endpgm
151 ptr addrspace(1) %b) {
; Plain (non-volatile) load; the constant is the first fadd operand.
153 %b.val = load half, ptr addrspace(1) %b
154 %r.val = fadd half 1.0, %b.val
155 store half %r.val, ptr addrspace(1) %r
; Scalar half add with the constant 2.0 as the right operand: a half is loaded
; from %a, 2.0 is added, and the sum is stored through %r.
; Per the checks below, the constant is emitted as the first source operand of
; the add (2.0, v0) in every run, even though it is the second operand in the
; IR.
; NOTE(review): the %r parameter, the entry label and the terminator are not
; visible in this excerpt -- confirm against the full file.
159 define amdgpu_kernel void @fadd_f16_imm_b(
160 ; SI-LABEL: fadd_f16_imm_b:
161 ; SI: ; %bb.0: ; %entry
162 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
163 ; SI-NEXT: s_mov_b32 s7, 0xf000
164 ; SI-NEXT: s_mov_b32 s6, -1
165 ; SI-NEXT: s_waitcnt lgkmcnt(0)
166 ; SI-NEXT: s_mov_b32 s4, s0
167 ; SI-NEXT: s_mov_b32 s5, s1
168 ; SI-NEXT: s_mov_b32 s0, s2
169 ; SI-NEXT: s_mov_b32 s1, s3
170 ; SI-NEXT: s_mov_b32 s2, s6
171 ; SI-NEXT: s_mov_b32 s3, s7
172 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
173 ; SI-NEXT: s_waitcnt vmcnt(0)
174 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
175 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
176 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
177 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
180 ; VI-LABEL: fadd_f16_imm_b:
181 ; VI: ; %bb.0: ; %entry
182 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
183 ; VI-NEXT: s_mov_b32 s7, 0xf000
184 ; VI-NEXT: s_mov_b32 s6, -1
185 ; VI-NEXT: s_waitcnt lgkmcnt(0)
186 ; VI-NEXT: s_mov_b32 s4, s0
187 ; VI-NEXT: s_mov_b32 s5, s1
188 ; VI-NEXT: s_mov_b32 s0, s2
189 ; VI-NEXT: s_mov_b32 s1, s3
190 ; VI-NEXT: s_mov_b32 s2, s6
191 ; VI-NEXT: s_mov_b32 s3, s7
192 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
193 ; VI-NEXT: s_waitcnt vmcnt(0)
194 ; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
195 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
198 ; GFX11-LABEL: fadd_f16_imm_b:
199 ; GFX11: ; %bb.0: ; %entry
200 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
201 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
202 ; GFX11-NEXT: s_mov_b32 s6, -1
203 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
204 ; GFX11-NEXT: s_mov_b32 s4, s0
205 ; GFX11-NEXT: s_mov_b32 s5, s1
206 ; GFX11-NEXT: s_mov_b32 s0, s2
207 ; GFX11-NEXT: s_mov_b32 s1, s3
208 ; GFX11-NEXT: s_mov_b32 s2, s6
209 ; GFX11-NEXT: s_mov_b32 s3, s7
210 ; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
211 ; GFX11-NEXT: s_waitcnt vmcnt(0)
212 ; GFX11-NEXT: v_add_f16_e32 v0, 2.0, v0
213 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
214 ; GFX11-NEXT: s_nop 0
215 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
216 ; GFX11-NEXT: s_endpgm
218 ptr addrspace(1) %a) {
; Plain (non-volatile) load; the constant is the second fadd operand.
220 %a.val = load half, ptr addrspace(1) %a
221 %r.val = fadd half %a.val, 2.0
222 store half %r.val, ptr addrspace(1) %r
; Vector <2 x half> add: each workitem loads one <2 x half> element from %a and
; one from %b (both indexed by workitem id x), adds them, and stores the result
; through %r.
; Per the checks below:
;   - tahiti splits each dword into its two halves, converts all four to f32,
;     does two v_add_f32, converts back and repacks with a shift and an or;
;   - fiji adds the high halves with v_add_f16_sdwa and the low halves with
;     v_add_f16, then ors the two results together;
;   - gfx1100 uses a single v_pk_add_f16.
; NOTE(review): the %r and %a parameters, the entry label and the terminator
; are not visible in this excerpt -- confirm against the full file.
226 define amdgpu_kernel void @fadd_v2f16(
227 ; SI-LABEL: fadd_v2f16:
228 ; SI: ; %bb.0: ; %entry
229 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
230 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
231 ; SI-NEXT: s_mov_b32 s11, 0xf000
232 ; SI-NEXT: s_mov_b32 s14, 0
233 ; SI-NEXT: s_mov_b32 s15, s11
234 ; SI-NEXT: s_waitcnt lgkmcnt(0)
235 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
236 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
237 ; SI-NEXT: v_mov_b32_e32 v1, 0
238 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
239 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
240 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
241 ; SI-NEXT: s_mov_b32 s10, -1
242 ; SI-NEXT: s_mov_b32 s8, s4
243 ; SI-NEXT: s_mov_b32 s9, s5
244 ; SI-NEXT: s_waitcnt vmcnt(1)
245 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
246 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
247 ; SI-NEXT: s_waitcnt vmcnt(0)
248 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
249 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
250 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
251 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
252 ; SI-NEXT: v_add_f32_e32 v1, v3, v1
253 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
254 ; SI-NEXT: v_add_f32_e32 v0, v2, v0
255 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
256 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
257 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
258 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
261 ; VI-LABEL: fadd_v2f16:
262 ; VI: ; %bb.0: ; %entry
263 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
264 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
265 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
266 ; VI-NEXT: s_mov_b32 s3, 0xf000
267 ; VI-NEXT: s_mov_b32 s2, -1
268 ; VI-NEXT: s_waitcnt lgkmcnt(0)
269 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
270 ; VI-NEXT: v_mov_b32_e32 v1, s7
271 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
272 ; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
273 ; VI-NEXT: v_mov_b32_e32 v3, s9
274 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
275 ; VI-NEXT: flat_load_dword v0, v[0:1]
276 ; VI-NEXT: flat_load_dword v1, v[2:3]
277 ; VI-NEXT: s_mov_b32 s0, s4
278 ; VI-NEXT: s_mov_b32 s1, s5
279 ; VI-NEXT: s_waitcnt vmcnt(0)
280 ; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
281 ; VI-NEXT: v_add_f16_e32 v0, v0, v1
282 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
283 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
286 ; GFX11-LABEL: fadd_v2f16:
287 ; GFX11: ; %bb.0: ; %entry
288 ; GFX11-NEXT: s_clause 0x1
289 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
290 ; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
291 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
292 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
293 ; GFX11-NEXT: s_mov_b32 s2, -1
294 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
295 ; GFX11-NEXT: s_clause 0x1
296 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
297 ; GFX11-NEXT: global_load_b32 v0, v0, s[8:9]
298 ; GFX11-NEXT: s_mov_b32 s0, s4
299 ; GFX11-NEXT: s_mov_b32 s1, s5
300 ; GFX11-NEXT: s_waitcnt vmcnt(0)
301 ; GFX11-NEXT: v_pk_add_f16 v0, v1, v0
302 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
303 ; GFX11-NEXT: s_nop 0
304 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
305 ; GFX11-NEXT: s_endpgm
308 ptr addrspace(1) %b) {
; %tid indexes both inputs in units of <2 x half>; the checks above shift v0
; left by 2 before the loads (presumably the workitem id scaled to a 4-byte
; element stride -- confirm).
; The store goes to %r itself, not to a per-workitem element of %r.
310 %tid = call i32 @llvm.amdgcn.workitem.id.x()
311 %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
312 %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
313 %a.val = load <2 x half>, ptr addrspace(1) %gep.a
314 %b.val = load <2 x half>, ptr addrspace(1) %gep.b
315 %r.val = fadd <2 x half> %a.val, %b.val
316 store <2 x half> %r.val, ptr addrspace(1) %r
; Vector <2 x half> add with the constant vector <1.0, 2.0> as the left
; operand: each workitem loads one <2 x half> element from %b (indexed by
; workitem id x), adds the constant, and stores the result through %r.
; Per the checks below:
;   - tahiti adds 1.0 to the low half and 2.0 to the high half as two
;     v_add_f32 with inline constants;
;   - fiji adds inline 1.0 to the low half with v_add_f16, and for the high
;     half moves 0x4000 (the half encoding of 2.0) into v1 as the second
;     source of v_add_f16_sdwa;
;   - gfx1100 uses one v_pk_add_f16 with the packed literal 0x40003c00
;     (2.0 in the high half, 1.0 in the low half).
; NOTE(review): the %r parameter, the entry label and the terminator are not
; visible in this excerpt -- confirm against the full file.
320 define amdgpu_kernel void @fadd_v2f16_imm_a(
321 ; SI-LABEL: fadd_v2f16_imm_a:
322 ; SI: ; %bb.0: ; %entry
323 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
324 ; SI-NEXT: s_mov_b32 s7, 0xf000
325 ; SI-NEXT: s_mov_b32 s10, 0
326 ; SI-NEXT: s_mov_b32 s11, s7
327 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
328 ; SI-NEXT: s_waitcnt lgkmcnt(0)
329 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
330 ; SI-NEXT: v_mov_b32_e32 v1, 0
331 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
332 ; SI-NEXT: s_mov_b32 s6, -1
333 ; SI-NEXT: s_mov_b32 s4, s0
334 ; SI-NEXT: s_mov_b32 s5, s1
335 ; SI-NEXT: s_waitcnt vmcnt(0)
336 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
337 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
338 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
339 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
340 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
341 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
342 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
343 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
344 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
345 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
348 ; VI-LABEL: fadd_v2f16_imm_a:
349 ; VI: ; %bb.0: ; %entry
350 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
351 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
352 ; VI-NEXT: s_mov_b32 s7, 0xf000
353 ; VI-NEXT: s_mov_b32 s6, -1
354 ; VI-NEXT: s_waitcnt lgkmcnt(0)
355 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
356 ; VI-NEXT: v_mov_b32_e32 v1, s3
357 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
358 ; VI-NEXT: flat_load_dword v0, v[0:1]
359 ; VI-NEXT: v_mov_b32_e32 v1, 0x4000
360 ; VI-NEXT: s_mov_b32 s4, s0
361 ; VI-NEXT: s_mov_b32 s5, s1
362 ; VI-NEXT: s_waitcnt vmcnt(0)
363 ; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
364 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
365 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
366 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
369 ; GFX11-LABEL: fadd_v2f16_imm_a:
370 ; GFX11: ; %bb.0: ; %entry
371 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
372 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
374 ; GFX11-NEXT: s_mov_b32 s6, -1
375 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
376 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
377 ; GFX11-NEXT: s_mov_b32 s4, s0
378 ; GFX11-NEXT: s_mov_b32 s5, s1
379 ; GFX11-NEXT: s_waitcnt vmcnt(0)
380 ; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
381 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
382 ; GFX11-NEXT: s_nop 0
383 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
384 ; GFX11-NEXT: s_endpgm
386 ptr addrspace(1) %b) {
; Element 0 of the constant vector is 1.0 and element 1 is 2.0; the constant
; is the first fadd operand.
388 %tid = call i32 @llvm.amdgcn.workitem.id.x()
389 %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
390 %b.val = load <2 x half>, ptr addrspace(1) %gep.b
391 %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
392 store <2 x half> %r.val, ptr addrspace(1) %r
; Vector <2 x half> add with the constant vector <2.0, 1.0> as the right
; operand: each workitem loads one <2 x half> element from %a (indexed by
; workitem id x), adds the constant, and stores the result through %r.
; Per the checks below:
;   - tahiti adds 2.0 to the low half and 1.0 to the high half as two
;     v_add_f32 with inline constants;
;   - fiji adds inline 2.0 to the low half with v_add_f16, and for the high
;     half moves 0x3c00 (the half encoding of 1.0) into v1 as the second
;     source of v_add_f16_sdwa;
;   - gfx1100 uses one v_pk_add_f16 with the packed literal 0x3c004000
;     (1.0 in the high half, 2.0 in the low half).
; NOTE(review): the %r parameter, the entry label and the terminator are not
; visible in this excerpt -- confirm against the full file.
396 define amdgpu_kernel void @fadd_v2f16_imm_b(
397 ; SI-LABEL: fadd_v2f16_imm_b:
398 ; SI: ; %bb.0: ; %entry
399 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
400 ; SI-NEXT: s_mov_b32 s7, 0xf000
401 ; SI-NEXT: s_mov_b32 s10, 0
402 ; SI-NEXT: s_mov_b32 s11, s7
403 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
404 ; SI-NEXT: s_waitcnt lgkmcnt(0)
405 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
406 ; SI-NEXT: v_mov_b32_e32 v1, 0
407 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
408 ; SI-NEXT: s_mov_b32 s6, -1
409 ; SI-NEXT: s_mov_b32 s4, s0
410 ; SI-NEXT: s_mov_b32 s5, s1
411 ; SI-NEXT: s_waitcnt vmcnt(0)
412 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
413 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
414 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
415 ; SI-NEXT: v_add_f32_e32 v1, 2.0, v1
416 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
417 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
418 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
419 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
420 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
421 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
424 ; VI-LABEL: fadd_v2f16_imm_b:
425 ; VI: ; %bb.0: ; %entry
426 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
427 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
428 ; VI-NEXT: s_mov_b32 s7, 0xf000
429 ; VI-NEXT: s_mov_b32 s6, -1
430 ; VI-NEXT: s_waitcnt lgkmcnt(0)
431 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
432 ; VI-NEXT: v_mov_b32_e32 v1, s3
433 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
434 ; VI-NEXT: flat_load_dword v0, v[0:1]
435 ; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
436 ; VI-NEXT: s_mov_b32 s4, s0
437 ; VI-NEXT: s_mov_b32 s5, s1
438 ; VI-NEXT: s_waitcnt vmcnt(0)
439 ; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
440 ; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
441 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
442 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
445 ; GFX11-LABEL: fadd_v2f16_imm_b:
446 ; GFX11: ; %bb.0: ; %entry
447 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
448 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
449 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
450 ; GFX11-NEXT: s_mov_b32 s6, -1
451 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
452 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
453 ; GFX11-NEXT: s_mov_b32 s4, s0
454 ; GFX11-NEXT: s_mov_b32 s5, s1
455 ; GFX11-NEXT: s_waitcnt vmcnt(0)
456 ; GFX11-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
457 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
458 ; GFX11-NEXT: s_nop 0
459 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
460 ; GFX11-NEXT: s_endpgm
462 ptr addrspace(1) %a) {
; Element 0 of the constant vector is 2.0 and element 1 is 1.0; the constant
; is the second fadd operand.
464 %tid = call i32 @llvm.amdgcn.workitem.id.x()
465 %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
466 %a.val = load <2 x half>, ptr addrspace(1) %gep.a
467 %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
468 store <2 x half> %r.val, ptr addrspace(1) %r
; Intrinsic called by the three v2f16 kernels above to obtain the index used in
; their getelementptr instructions; it carries attribute group #1.
472 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups. #1 is attached to the intrinsic declaration above.
; NOTE(review): no visible line in this excerpt references #0 -- confirm
; whether it is still used in the full file.
474 attributes #0 = { nounwind }
475 attributes #1 = { nounwind readnone }