1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; fadd_f16: loads one half from %a and one from %b (both loads are volatile),
; adds them with a scalar fadd, and stores the sum through %r.
; Expected lowering per check prefix, as pinned by the assertions below:
;   - tahiti (SI prefix): both operands are converted to f32, added with
;     v_add_f32, and the sum is converted back to f16.
;   - fiji (VI prefix): a single v_add_f16_e32.
;   - gfx1100 with +real-true16 (GFX11 prefix): the second operand is copied
;     into v0.h and the add operates on the 16-bit halves v0.l / v0.h.
;   - gfx1100 without real-true16 (GFX11-FAKE16 prefix): v_add_f16_e32 on
;     whole VGPR names.
; NOTE(review): only the %b parameter is visible in this listing; %r and %a
; are presumably declared on lines not shown here -- confirm.
; NOTE(review): in this listing the tahiti and fiji blocks of this function
; do not begin with a label check, and the GFX11 block checks s_sendmsg
; directly after the store while every other gfx1100 block in the file checks
; an s_nop 0 in between -- confirm against a regenerated copy.
7 define amdgpu_kernel void @fadd_f16(
9 ; SI: ; %bb.0: ; %entry
10 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
11 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
12 ; SI-NEXT: s_mov_b32 s11, 0xf000
13 ; SI-NEXT: s_mov_b32 s10, -1
14 ; SI-NEXT: s_mov_b32 s2, s10
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b32 s8, s4
17 ; SI-NEXT: s_mov_b32 s9, s5
18 ; SI-NEXT: s_mov_b32 s4, s6
19 ; SI-NEXT: s_mov_b32 s5, s7
20 ; SI-NEXT: s_mov_b32 s6, s10
21 ; SI-NEXT: s_mov_b32 s7, s11
22 ; SI-NEXT: s_mov_b32 s3, s11
23 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
24 ; SI-NEXT: s_waitcnt vmcnt(0)
25 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
26 ; SI-NEXT: s_waitcnt vmcnt(0)
27 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
28 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
29 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
30 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
31 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
35 ; VI: ; %bb.0: ; %entry
36 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
37 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
38 ; VI-NEXT: s_mov_b32 s11, 0xf000
39 ; VI-NEXT: s_mov_b32 s10, -1
40 ; VI-NEXT: s_mov_b32 s2, s10
41 ; VI-NEXT: s_waitcnt lgkmcnt(0)
42 ; VI-NEXT: s_mov_b32 s8, s4
43 ; VI-NEXT: s_mov_b32 s9, s5
44 ; VI-NEXT: s_mov_b32 s4, s6
45 ; VI-NEXT: s_mov_b32 s5, s7
46 ; VI-NEXT: s_mov_b32 s6, s10
47 ; VI-NEXT: s_mov_b32 s7, s11
48 ; VI-NEXT: s_mov_b32 s3, s11
49 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
50 ; VI-NEXT: s_waitcnt vmcnt(0)
51 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
52 ; VI-NEXT: s_waitcnt vmcnt(0)
53 ; VI-NEXT: v_add_f16_e32 v0, v0, v1
54 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
57 ; GFX11-LABEL: fadd_f16:
58 ; GFX11: ; %bb.0: ; %entry
59 ; GFX11-NEXT: s_clause 0x1
60 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
61 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
62 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
63 ; GFX11-NEXT: s_mov_b32 s10, -1
64 ; GFX11-NEXT: s_mov_b32 s3, s11
65 ; GFX11-NEXT: s_mov_b32 s2, s10
66 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX11-NEXT: s_mov_b32 s8, s4
68 ; GFX11-NEXT: s_mov_b32 s9, s5
69 ; GFX11-NEXT: s_mov_b32 s4, s6
70 ; GFX11-NEXT: s_mov_b32 s5, s7
71 ; GFX11-NEXT: s_mov_b32 s6, s10
72 ; GFX11-NEXT: s_mov_b32 s7, s11
73 ; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
74 ; GFX11-NEXT: s_waitcnt vmcnt(0)
75 ; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
76 ; GFX11-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-NEXT: v_mov_b16_e32 v0.h, v1.l
78 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
79 ; GFX11-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
80 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
82 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
83 ; GFX11-NEXT: s_endpgm
85 ; GFX11-FAKE16-LABEL: fadd_f16:
86 ; GFX11-FAKE16: ; %bb.0: ; %entry
87 ; GFX11-FAKE16-NEXT: s_clause 0x1
88 ; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
89 ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
90 ; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
91 ; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
92 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11
93 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10
94 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
95 ; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4
96 ; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5
97 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s6
98 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s7
99 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
100 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
101 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
102 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
103 ; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
104 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
105 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
106 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
107 ; GFX11-FAKE16-NEXT: s_nop 0
108 ; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
109 ; GFX11-FAKE16-NEXT: s_endpgm
112 ptr addrspace(1) %b) {
114 %a.val = load volatile half, ptr addrspace(1) %a
115 %b.val = load volatile half, ptr addrspace(1) %b
116 %r.val = fadd half %a.val, %b.val
117 store half %r.val, ptr addrspace(1) %r
; fadd_f16_imm_a: adds the constant 1.0 (left-hand operand of the fadd) to a
; half loaded from %b and stores the result through %r.
; The assertions pin how the constant is encoded on each target:
;   - tahiti (SI prefix): 1.0 is an inline operand of v_add_f32 after the
;     loaded value has been converted to f32; the sum is converted back.
;   - fiji (VI prefix) and gfx1100 without real-true16 (GFX11-FAKE16 prefix):
;     1.0 is an inline operand of v_add_f16_e32.
;   - gfx1100 with +real-true16 (GFX11 prefix): the constant is first moved
;     into v0.h as the literal 0x3c00 and then added to v0.l.
; NOTE(review): the %r parameter is not visible in this listing -- presumably
; it is declared on a line not shown here; confirm.
121 define amdgpu_kernel void @fadd_f16_imm_a(
122 ; SI-LABEL: fadd_f16_imm_a:
123 ; SI: ; %bb.0: ; %entry
124 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
125 ; SI-NEXT: s_mov_b32 s7, 0xf000
126 ; SI-NEXT: s_mov_b32 s6, -1
127 ; SI-NEXT: s_waitcnt lgkmcnt(0)
128 ; SI-NEXT: s_mov_b32 s4, s0
129 ; SI-NEXT: s_mov_b32 s5, s1
130 ; SI-NEXT: s_mov_b32 s0, s2
131 ; SI-NEXT: s_mov_b32 s1, s3
132 ; SI-NEXT: s_mov_b32 s2, s6
133 ; SI-NEXT: s_mov_b32 s3, s7
134 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
135 ; SI-NEXT: s_waitcnt vmcnt(0)
136 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
137 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
138 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
139 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
142 ; VI-LABEL: fadd_f16_imm_a:
143 ; VI: ; %bb.0: ; %entry
144 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
145 ; VI-NEXT: s_mov_b32 s7, 0xf000
146 ; VI-NEXT: s_mov_b32 s6, -1
147 ; VI-NEXT: s_waitcnt lgkmcnt(0)
148 ; VI-NEXT: s_mov_b32 s4, s0
149 ; VI-NEXT: s_mov_b32 s5, s1
150 ; VI-NEXT: s_mov_b32 s0, s2
151 ; VI-NEXT: s_mov_b32 s1, s3
152 ; VI-NEXT: s_mov_b32 s2, s6
153 ; VI-NEXT: s_mov_b32 s3, s7
154 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
155 ; VI-NEXT: s_waitcnt vmcnt(0)
156 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
157 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
160 ; GFX11-LABEL: fadd_f16_imm_a:
161 ; GFX11: ; %bb.0: ; %entry
162 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
163 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
164 ; GFX11-NEXT: s_mov_b32 s6, -1
165 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
166 ; GFX11-NEXT: s_mov_b32 s4, s0
167 ; GFX11-NEXT: s_mov_b32 s5, s1
168 ; GFX11-NEXT: s_mov_b32 s0, s2
169 ; GFX11-NEXT: s_mov_b32 s1, s3
170 ; GFX11-NEXT: s_mov_b32 s2, s6
171 ; GFX11-NEXT: s_mov_b32 s3, s7
172 ; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
173 ; GFX11-NEXT: s_waitcnt vmcnt(0)
174 ; GFX11-NEXT: v_mov_b16_e32 v0.h, 0x3c00
175 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
176 ; GFX11-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
177 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
178 ; GFX11-NEXT: s_nop 0
179 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
180 ; GFX11-NEXT: s_endpgm
182 ; GFX11-FAKE16-LABEL: fadd_f16_imm_a:
183 ; GFX11-FAKE16: ; %bb.0: ; %entry
184 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
185 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
186 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
187 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
188 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
189 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
190 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, s2
191 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, s3
192 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
193 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
194 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[0:3], 0
195 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
196 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
197 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
198 ; GFX11-FAKE16-NEXT: s_nop 0
199 ; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
200 ; GFX11-FAKE16-NEXT: s_endpgm
202 ptr addrspace(1) %b) {
204 %b.val = load half, ptr addrspace(1) %b
205 %r.val = fadd half 1.0, %b.val
206 store half %r.val, ptr addrspace(1) %r
; fadd_f16_imm_b: adds the constant 2.0 (right-hand operand of the fadd) to a
; half loaded from %a and stores the result through %r.
; The assertions pin how the constant is encoded on each target:
;   - tahiti (SI prefix): 2.0 is an inline operand of v_add_f32 after the
;     loaded value has been converted to f32; the sum is converted back.
;   - fiji (VI prefix) and gfx1100 without real-true16 (GFX11-FAKE16 prefix):
;     2.0 is an inline operand of v_add_f16_e32, placed in the first source
;     slot even though it is the second IR operand.
;   - gfx1100 with +real-true16 (GFX11 prefix): the constant is first moved
;     into v0.h as the literal 0x4000 and then added to v0.l.
; NOTE(review): the %r parameter is not visible in this listing -- presumably
; it is declared on a line not shown here; confirm.
210 define amdgpu_kernel void @fadd_f16_imm_b(
211 ; SI-LABEL: fadd_f16_imm_b:
212 ; SI: ; %bb.0: ; %entry
213 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
214 ; SI-NEXT: s_mov_b32 s7, 0xf000
215 ; SI-NEXT: s_mov_b32 s6, -1
216 ; SI-NEXT: s_waitcnt lgkmcnt(0)
217 ; SI-NEXT: s_mov_b32 s4, s0
218 ; SI-NEXT: s_mov_b32 s5, s1
219 ; SI-NEXT: s_mov_b32 s0, s2
220 ; SI-NEXT: s_mov_b32 s1, s3
221 ; SI-NEXT: s_mov_b32 s2, s6
222 ; SI-NEXT: s_mov_b32 s3, s7
223 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
224 ; SI-NEXT: s_waitcnt vmcnt(0)
225 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
226 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
227 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
228 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
231 ; VI-LABEL: fadd_f16_imm_b:
232 ; VI: ; %bb.0: ; %entry
233 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
234 ; VI-NEXT: s_mov_b32 s7, 0xf000
235 ; VI-NEXT: s_mov_b32 s6, -1
236 ; VI-NEXT: s_waitcnt lgkmcnt(0)
237 ; VI-NEXT: s_mov_b32 s4, s0
238 ; VI-NEXT: s_mov_b32 s5, s1
239 ; VI-NEXT: s_mov_b32 s0, s2
240 ; VI-NEXT: s_mov_b32 s1, s3
241 ; VI-NEXT: s_mov_b32 s2, s6
242 ; VI-NEXT: s_mov_b32 s3, s7
243 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
244 ; VI-NEXT: s_waitcnt vmcnt(0)
245 ; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
246 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
249 ; GFX11-LABEL: fadd_f16_imm_b:
250 ; GFX11: ; %bb.0: ; %entry
251 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
252 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
253 ; GFX11-NEXT: s_mov_b32 s6, -1
254 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX11-NEXT: s_mov_b32 s4, s0
256 ; GFX11-NEXT: s_mov_b32 s5, s1
257 ; GFX11-NEXT: s_mov_b32 s0, s2
258 ; GFX11-NEXT: s_mov_b32 s1, s3
259 ; GFX11-NEXT: s_mov_b32 s2, s6
260 ; GFX11-NEXT: s_mov_b32 s3, s7
261 ; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
262 ; GFX11-NEXT: s_waitcnt vmcnt(0)
263 ; GFX11-NEXT: v_mov_b16_e32 v0.h, 0x4000
264 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
265 ; GFX11-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
266 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
267 ; GFX11-NEXT: s_nop 0
268 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
269 ; GFX11-NEXT: s_endpgm
271 ; GFX11-FAKE16-LABEL: fadd_f16_imm_b:
272 ; GFX11-FAKE16: ; %bb.0: ; %entry
273 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
274 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
275 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
276 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
277 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
278 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
279 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, s2
280 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, s3
281 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
282 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
283 ; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[0:3], 0
284 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
285 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0
286 ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
287 ; GFX11-FAKE16-NEXT: s_nop 0
288 ; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
289 ; GFX11-FAKE16-NEXT: s_endpgm
291 ptr addrspace(1) %a) {
293 %a.val = load half, ptr addrspace(1) %a
294 %r.val = fadd half %a.val, 2.0
295 store half %r.val, ptr addrspace(1) %r
; fadd_v2f16: each work item uses its workitem.id.x value (%tid) to index the
; <2 x half> arrays at %a and %b, adds the two vectors, and stores the sum
; through %r (the store address is %r itself, not indexed by %tid).
; Expected lowering per check prefix, as pinned by the assertions below:
;   - tahiti (SI prefix): each loaded dword is split into its low and high
;     16 bits, every half is converted to f32 and added with v_add_f32, and
;     the two converted-back results are repacked with a shift and an OR.
;   - fiji (VI prefix): the high halves are added with v_add_f16_sdwa
;     (WORD_1 selects), the low halves with v_add_f16_e32, and the two
;     results are combined with an OR.
;   - gfx1100, both configurations: a single v_pk_add_f16.
; NOTE(review): only the %b parameter is visible in this listing; %r and %a
; are presumably declared on lines not shown here -- confirm.
299 define amdgpu_kernel void @fadd_v2f16(
300 ; SI-LABEL: fadd_v2f16:
301 ; SI: ; %bb.0: ; %entry
302 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
303 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
304 ; SI-NEXT: s_mov_b32 s11, 0xf000
305 ; SI-NEXT: s_mov_b32 s14, 0
306 ; SI-NEXT: s_mov_b32 s15, s11
307 ; SI-NEXT: s_waitcnt lgkmcnt(0)
308 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
309 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
310 ; SI-NEXT: v_mov_b32_e32 v1, 0
311 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
312 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
313 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
314 ; SI-NEXT: s_mov_b32 s10, -1
315 ; SI-NEXT: s_mov_b32 s8, s4
316 ; SI-NEXT: s_mov_b32 s9, s5
317 ; SI-NEXT: s_waitcnt vmcnt(1)
318 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
319 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
320 ; SI-NEXT: s_waitcnt vmcnt(0)
321 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
322 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
323 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
324 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
325 ; SI-NEXT: v_add_f32_e32 v1, v3, v1
326 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
327 ; SI-NEXT: v_add_f32_e32 v0, v2, v0
328 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
329 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
330 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
331 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
334 ; VI-LABEL: fadd_v2f16:
335 ; VI: ; %bb.0: ; %entry
336 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
337 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
338 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
339 ; VI-NEXT: s_mov_b32 s3, 0xf000
340 ; VI-NEXT: s_mov_b32 s2, -1
341 ; VI-NEXT: s_waitcnt lgkmcnt(0)
342 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
343 ; VI-NEXT: v_mov_b32_e32 v1, s7
344 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
345 ; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
346 ; VI-NEXT: v_mov_b32_e32 v3, s9
347 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
348 ; VI-NEXT: flat_load_dword v0, v[0:1]
349 ; VI-NEXT: flat_load_dword v1, v[2:3]
350 ; VI-NEXT: s_mov_b32 s0, s4
351 ; VI-NEXT: s_mov_b32 s1, s5
352 ; VI-NEXT: s_waitcnt vmcnt(0)
353 ; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
354 ; VI-NEXT: v_add_f16_e32 v0, v0, v1
355 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
356 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
359 ; GFX11-LABEL: fadd_v2f16:
360 ; GFX11: ; %bb.0: ; %entry
361 ; GFX11-NEXT: s_clause 0x1
362 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
363 ; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
364 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
365 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
366 ; GFX11-NEXT: s_mov_b32 s2, -1
367 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
368 ; GFX11-NEXT: s_clause 0x1
369 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
370 ; GFX11-NEXT: global_load_b32 v0, v0, s[8:9]
371 ; GFX11-NEXT: s_mov_b32 s0, s4
372 ; GFX11-NEXT: s_mov_b32 s1, s5
373 ; GFX11-NEXT: s_waitcnt vmcnt(0)
374 ; GFX11-NEXT: v_pk_add_f16 v0, v1, v0
375 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
376 ; GFX11-NEXT: s_nop 0
377 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
378 ; GFX11-NEXT: s_endpgm
380 ; GFX11-FAKE16-LABEL: fadd_v2f16:
381 ; GFX11-FAKE16: ; %bb.0: ; %entry
382 ; GFX11-FAKE16-NEXT: s_clause 0x1
383 ; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
384 ; GFX11-FAKE16-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
385 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
386 ; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
387 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
388 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
389 ; GFX11-FAKE16-NEXT: s_clause 0x1
390 ; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[6:7]
391 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[8:9]
392 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
393 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
394 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
395 ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, v1, v0
396 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
397 ; GFX11-FAKE16-NEXT: s_nop 0
398 ; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
399 ; GFX11-FAKE16-NEXT: s_endpgm
402 ptr addrspace(1) %b) {
404 %tid = call i32 @llvm.amdgcn.workitem.id.x()
405 %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
406 %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
407 %a.val = load <2 x half>, ptr addrspace(1) %gep.a
408 %b.val = load <2 x half>, ptr addrspace(1) %gep.b
409 %r.val = fadd <2 x half> %a.val, %b.val
410 store <2 x half> %r.val, ptr addrspace(1) %r
; fadd_v2f16_imm_a: adds the constant vector <1.0, 2.0> (left-hand operand of
; the fadd) to the <2 x half> element of %b selected by workitem.id.x, and
; stores the sum through %r.
; The assertions pin how the two constants are encoded on each target:
;   - tahiti (SI prefix): after splitting and converting to f32, 1.0 is added
;     to the low element and 2.0 to the high element, both as inline
;     operands of v_add_f32.
;   - fiji (VI prefix): 0x4000 is materialized in v1 for the v_add_f16_sdwa
;     that produces the high half; 1.0 is an inline operand of the
;     v_add_f16_e32 that produces the low half.
;   - gfx1100, both configurations: both constants travel as one 32-bit
;     literal, 0x40003c00, on v_pk_add_f16.
; NOTE(review): the %r parameter is not visible in this listing -- presumably
; it is declared on a line not shown here; confirm.
414 define amdgpu_kernel void @fadd_v2f16_imm_a(
415 ; SI-LABEL: fadd_v2f16_imm_a:
416 ; SI: ; %bb.0: ; %entry
417 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
418 ; SI-NEXT: s_mov_b32 s7, 0xf000
419 ; SI-NEXT: s_mov_b32 s10, 0
420 ; SI-NEXT: s_mov_b32 s11, s7
421 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
422 ; SI-NEXT: s_waitcnt lgkmcnt(0)
423 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
424 ; SI-NEXT: v_mov_b32_e32 v1, 0
425 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
426 ; SI-NEXT: s_mov_b32 s6, -1
427 ; SI-NEXT: s_mov_b32 s4, s0
428 ; SI-NEXT: s_mov_b32 s5, s1
429 ; SI-NEXT: s_waitcnt vmcnt(0)
430 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
431 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
432 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
433 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
434 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
435 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
436 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
437 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
438 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
439 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
442 ; VI-LABEL: fadd_v2f16_imm_a:
443 ; VI: ; %bb.0: ; %entry
444 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
445 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
446 ; VI-NEXT: s_mov_b32 s7, 0xf000
447 ; VI-NEXT: s_mov_b32 s6, -1
448 ; VI-NEXT: s_waitcnt lgkmcnt(0)
449 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
450 ; VI-NEXT: v_mov_b32_e32 v1, s3
451 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
452 ; VI-NEXT: flat_load_dword v0, v[0:1]
453 ; VI-NEXT: v_mov_b32_e32 v1, 0x4000
454 ; VI-NEXT: s_mov_b32 s4, s0
455 ; VI-NEXT: s_mov_b32 s5, s1
456 ; VI-NEXT: s_waitcnt vmcnt(0)
457 ; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
458 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
459 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
460 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
463 ; GFX11-LABEL: fadd_v2f16_imm_a:
464 ; GFX11: ; %bb.0: ; %entry
465 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
466 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
467 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
468 ; GFX11-NEXT: s_mov_b32 s6, -1
469 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
470 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
471 ; GFX11-NEXT: s_mov_b32 s4, s0
472 ; GFX11-NEXT: s_mov_b32 s5, s1
473 ; GFX11-NEXT: s_waitcnt vmcnt(0)
474 ; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
475 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
476 ; GFX11-NEXT: s_nop 0
477 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
478 ; GFX11-NEXT: s_endpgm
480 ; GFX11-FAKE16-LABEL: fadd_v2f16_imm_a:
481 ; GFX11-FAKE16: ; %bb.0: ; %entry
482 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
483 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
484 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
485 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
486 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
488 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
489 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
490 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
491 ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
492 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
493 ; GFX11-FAKE16-NEXT: s_nop 0
494 ; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
495 ; GFX11-FAKE16-NEXT: s_endpgm
497 ptr addrspace(1) %b) {
499 %tid = call i32 @llvm.amdgcn.workitem.id.x()
500 %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
501 %b.val = load <2 x half>, ptr addrspace(1) %gep.b
502 %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
503 store <2 x half> %r.val, ptr addrspace(1) %r
; fadd_v2f16_imm_b: adds the constant vector <2.0, 1.0> (right-hand operand of
; the fadd) to the <2 x half> element of %a selected by workitem.id.x, and
; stores the sum through %r.
; The assertions pin how the two constants are encoded on each target:
;   - tahiti (SI prefix): after splitting and converting to f32, 2.0 is added
;     to the low element and 1.0 to the high element, both as inline
;     operands of v_add_f32.
;   - fiji (VI prefix): 0x3c00 is materialized in v1 for the v_add_f16_sdwa
;     that produces the high half; 2.0 is an inline operand of the
;     v_add_f16_e32 that produces the low half.
;   - gfx1100, both configurations: both constants travel as one 32-bit
;     literal, 0x3c004000, on v_pk_add_f16.
; NOTE(review): the %r parameter is not visible in this listing -- presumably
; it is declared on a line not shown here; confirm.
507 define amdgpu_kernel void @fadd_v2f16_imm_b(
508 ; SI-LABEL: fadd_v2f16_imm_b:
509 ; SI: ; %bb.0: ; %entry
510 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
511 ; SI-NEXT: s_mov_b32 s7, 0xf000
512 ; SI-NEXT: s_mov_b32 s10, 0
513 ; SI-NEXT: s_mov_b32 s11, s7
514 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
515 ; SI-NEXT: s_waitcnt lgkmcnt(0)
516 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
517 ; SI-NEXT: v_mov_b32_e32 v1, 0
518 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
519 ; SI-NEXT: s_mov_b32 s6, -1
520 ; SI-NEXT: s_mov_b32 s4, s0
521 ; SI-NEXT: s_mov_b32 s5, s1
522 ; SI-NEXT: s_waitcnt vmcnt(0)
523 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
524 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
525 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
526 ; SI-NEXT: v_add_f32_e32 v1, 2.0, v1
527 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
528 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
529 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
530 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
531 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
532 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
535 ; VI-LABEL: fadd_v2f16_imm_b:
536 ; VI: ; %bb.0: ; %entry
537 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
538 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
539 ; VI-NEXT: s_mov_b32 s7, 0xf000
540 ; VI-NEXT: s_mov_b32 s6, -1
541 ; VI-NEXT: s_waitcnt lgkmcnt(0)
542 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
543 ; VI-NEXT: v_mov_b32_e32 v1, s3
544 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
545 ; VI-NEXT: flat_load_dword v0, v[0:1]
546 ; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
547 ; VI-NEXT: s_mov_b32 s4, s0
548 ; VI-NEXT: s_mov_b32 s5, s1
549 ; VI-NEXT: s_waitcnt vmcnt(0)
550 ; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
551 ; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
552 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
553 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
556 ; GFX11-LABEL: fadd_v2f16_imm_b:
557 ; GFX11: ; %bb.0: ; %entry
558 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
559 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
560 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
561 ; GFX11-NEXT: s_mov_b32 s6, -1
562 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
563 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
564 ; GFX11-NEXT: s_mov_b32 s4, s0
565 ; GFX11-NEXT: s_mov_b32 s5, s1
566 ; GFX11-NEXT: s_waitcnt vmcnt(0)
567 ; GFX11-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
568 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
569 ; GFX11-NEXT: s_nop 0
570 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
571 ; GFX11-NEXT: s_endpgm
573 ; GFX11-FAKE16-LABEL: fadd_v2f16_imm_b:
574 ; GFX11-FAKE16: ; %bb.0: ; %entry
575 ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
576 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
577 ; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
578 ; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
579 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
581 ; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
582 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
583 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
584 ; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
585 ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
586 ; GFX11-FAKE16-NEXT: s_nop 0
587 ; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
588 ; GFX11-FAKE16-NEXT: s_endpgm
590 ptr addrspace(1) %a) {
592 %tid = call i32 @llvm.amdgcn.workitem.id.x()
593 %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
594 %a.val = load <2 x half>, ptr addrspace(1) %gep.a
595 %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
596 store <2 x half> %r.val, ptr addrspace(1) %r
; Intrinsic called by the <2 x half> kernels in this file to obtain the %tid
; value they use as an element index.
600 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups. #1 is attached to the intrinsic declaration.
; NOTE(review): no function signature visible in this listing references
; #0 -- confirm whether it is still used before keeping or removing it.
602 attributes #0 = { nounwind }
603 attributes #1 = { nounwind readnone }