1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-SDAG %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-GISEL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-SDAG %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-GISEL %s
; Scalar half-precision fadd. Both operands are read with volatile loads
; (from %a and %b) and the sum is stored through %r.
9 define amdgpu_kernel void @fadd_f16(
; SI-LABEL: fadd_f16:
11 ; SI: ; %bb.0: ; %entry
12 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
13 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
14 ; SI-NEXT: s_mov_b32 s11, 0xf000
15 ; SI-NEXT: s_mov_b32 s10, -1
16 ; SI-NEXT: s_mov_b32 s6, s10
17 ; SI-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NEXT: s_mov_b32 s8, s0
19 ; SI-NEXT: s_mov_b32 s9, s1
20 ; SI-NEXT: s_mov_b32 s0, s2
21 ; SI-NEXT: s_mov_b32 s1, s3
22 ; SI-NEXT: s_mov_b32 s2, s10
23 ; SI-NEXT: s_mov_b32 s3, s11
24 ; SI-NEXT: s_mov_b32 s7, s11
25 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc
26 ; SI-NEXT: s_waitcnt vmcnt(0)
27 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
28 ; SI-NEXT: s_waitcnt vmcnt(0)
29 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
30 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
31 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
32 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
33 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-LABEL: fadd_f16:
37 ; VI: ; %bb.0: ; %entry
38 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
39 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
40 ; VI-NEXT: s_mov_b32 s11, 0xf000
41 ; VI-NEXT: s_mov_b32 s10, -1
42 ; VI-NEXT: s_mov_b32 s6, s10
43 ; VI-NEXT: s_waitcnt lgkmcnt(0)
44 ; VI-NEXT: s_mov_b32 s8, s0
45 ; VI-NEXT: s_mov_b32 s9, s1
46 ; VI-NEXT: s_mov_b32 s0, s2
47 ; VI-NEXT: s_mov_b32 s1, s3
48 ; VI-NEXT: s_mov_b32 s2, s10
49 ; VI-NEXT: s_mov_b32 s3, s11
50 ; VI-NEXT: s_mov_b32 s7, s11
51 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc
52 ; VI-NEXT: s_waitcnt vmcnt(0)
53 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc
54 ; VI-NEXT: s_waitcnt vmcnt(0)
55 ; VI-NEXT: v_add_f16_e32 v0, v0, v1
56 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
59 ; GFX11-SDAG-LABEL: fadd_f16:
60 ; GFX11-SDAG: ; %bb.0: ; %entry
61 ; GFX11-SDAG-NEXT: s_clause 0x1
62 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
63 ; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
64 ; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000
65 ; GFX11-SDAG-NEXT: s_mov_b32 s10, -1
66 ; GFX11-SDAG-NEXT: s_mov_b32 s7, s11
67 ; GFX11-SDAG-NEXT: s_mov_b32 s6, s10
68 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s0
70 ; GFX11-SDAG-NEXT: s_mov_b32 s9, s1
71 ; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
72 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
73 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s10
74 ; GFX11-SDAG-NEXT: s_mov_b32 s3, s11
75 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
76 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
78 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
79 ; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
80 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
81 ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
82 ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
83 ; GFX11-SDAG-NEXT: s_endpgm
85 ; GFX11-GISEL-LABEL: fadd_f16:
86 ; GFX11-GISEL: ; %bb.0: ; %entry
87 ; GFX11-GISEL-NEXT: s_clause 0x1
88 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
89 ; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
90 ; GFX11-GISEL-NEXT: s_mov_b32 s10, -1
91 ; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000
92 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
93 ; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
94 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
95 ; GFX11-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
96 ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
97 ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
98 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
99 ; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
100 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
101 ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
102 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
103 ; GFX11-GISEL-NEXT: s_endpgm
105 ; GFX11-FAKE16-SDAG-LABEL: fadd_f16:
106 ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
107 ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1
108 ; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
109 ; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
110 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000
111 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1
112 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s11
113 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s10
114 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s8, s0
116 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s9, s1
117 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
118 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
119 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s10
120 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11
121 ; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
122 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
123 ; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
124 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
125 ; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
126 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
127 ; GFX11-FAKE16-SDAG-NEXT: s_endpgm
129 ; GFX11-FAKE16-GISEL-LABEL: fadd_f16:
130 ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
131 ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
132 ; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
133 ; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
134 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1
135 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000
136 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
137 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
138 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
140 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
141 ; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
142 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
143 ; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
144 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
145 ; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
146 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
147 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm
177 ptr addrspace(1) %b) {
179 %a.val = load volatile half, ptr addrspace(1) %a
180 %b.val = load volatile half, ptr addrspace(1) %b
181 %r.val = fadd half %a.val, %b.val
182 store half %r.val, ptr addrspace(1) %r
; Scalar half-precision fadd with the constant 1.0 as the left-hand operand.
; The right-hand operand is loaded from %b and the sum is stored through %r.
186 define amdgpu_kernel void @fadd_f16_imm_a(
187 ; SI-LABEL: fadd_f16_imm_a:
188 ; SI: ; %bb.0: ; %entry
189 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
190 ; SI-NEXT: s_mov_b32 s7, 0xf000
191 ; SI-NEXT: s_mov_b32 s6, -1
192 ; SI-NEXT: s_waitcnt lgkmcnt(0)
193 ; SI-NEXT: s_mov_b32 s4, s0
194 ; SI-NEXT: s_mov_b32 s5, s1
195 ; SI-NEXT: s_mov_b32 s0, s2
196 ; SI-NEXT: s_mov_b32 s1, s3
197 ; SI-NEXT: s_mov_b32 s2, s6
198 ; SI-NEXT: s_mov_b32 s3, s7
199 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
200 ; SI-NEXT: s_waitcnt vmcnt(0)
201 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
202 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
203 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
204 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
207 ; VI-LABEL: fadd_f16_imm_a:
208 ; VI: ; %bb.0: ; %entry
209 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
210 ; VI-NEXT: s_mov_b32 s7, 0xf000
211 ; VI-NEXT: s_mov_b32 s6, -1
212 ; VI-NEXT: s_waitcnt lgkmcnt(0)
213 ; VI-NEXT: s_mov_b32 s4, s0
214 ; VI-NEXT: s_mov_b32 s5, s1
215 ; VI-NEXT: s_mov_b32 s0, s2
216 ; VI-NEXT: s_mov_b32 s1, s3
217 ; VI-NEXT: s_mov_b32 s2, s6
218 ; VI-NEXT: s_mov_b32 s3, s7
219 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
220 ; VI-NEXT: s_waitcnt vmcnt(0)
221 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
222 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
225 ; GFX11-SDAG-LABEL: fadd_f16_imm_a:
226 ; GFX11-SDAG: ; %bb.0: ; %entry
227 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
228 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
229 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
230 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
232 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
233 ; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
234 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
235 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
236 ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
237 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
238 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
239 ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
240 ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
241 ; GFX11-SDAG-NEXT: s_endpgm
243 ; GFX11-GISEL-LABEL: fadd_f16_imm_a:
244 ; GFX11-GISEL: ; %bb.0: ; %entry
245 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
246 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
247 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
248 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
250 ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
251 ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
252 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
253 ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
254 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
255 ; GFX11-GISEL-NEXT: s_endpgm
257 ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a:
258 ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
259 ; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
260 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
261 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
262 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
264 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
265 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
266 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
267 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
268 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
269 ; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
270 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
271 ; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
272 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
273 ; GFX11-FAKE16-SDAG-NEXT: s_endpgm
275 ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a:
276 ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
277 ; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
278 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
279 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
280 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
282 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
283 ; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
284 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
285 ; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
286 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
287 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm
310 ptr addrspace(1) %b) {
312 %b.val = load half, ptr addrspace(1) %b
313 %r.val = fadd half 1.0, %b.val
314 store half %r.val, ptr addrspace(1) %r
; Scalar half-precision fadd with the constant 2.0 as the right-hand operand.
; The left-hand operand is loaded from %a and the sum is stored through %r.
318 define amdgpu_kernel void @fadd_f16_imm_b(
319 ; SI-LABEL: fadd_f16_imm_b:
320 ; SI: ; %bb.0: ; %entry
321 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
322 ; SI-NEXT: s_mov_b32 s7, 0xf000
323 ; SI-NEXT: s_mov_b32 s6, -1
324 ; SI-NEXT: s_waitcnt lgkmcnt(0)
325 ; SI-NEXT: s_mov_b32 s4, s0
326 ; SI-NEXT: s_mov_b32 s5, s1
327 ; SI-NEXT: s_mov_b32 s0, s2
328 ; SI-NEXT: s_mov_b32 s1, s3
329 ; SI-NEXT: s_mov_b32 s2, s6
330 ; SI-NEXT: s_mov_b32 s3, s7
331 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
332 ; SI-NEXT: s_waitcnt vmcnt(0)
333 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
334 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
335 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
336 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
339 ; VI-LABEL: fadd_f16_imm_b:
340 ; VI: ; %bb.0: ; %entry
341 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
342 ; VI-NEXT: s_mov_b32 s7, 0xf000
343 ; VI-NEXT: s_mov_b32 s6, -1
344 ; VI-NEXT: s_waitcnt lgkmcnt(0)
345 ; VI-NEXT: s_mov_b32 s4, s0
346 ; VI-NEXT: s_mov_b32 s5, s1
347 ; VI-NEXT: s_mov_b32 s0, s2
348 ; VI-NEXT: s_mov_b32 s1, s3
349 ; VI-NEXT: s_mov_b32 s2, s6
350 ; VI-NEXT: s_mov_b32 s3, s7
351 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
352 ; VI-NEXT: s_waitcnt vmcnt(0)
353 ; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
354 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
357 ; GFX11-SDAG-LABEL: fadd_f16_imm_b:
358 ; GFX11-SDAG: ; %bb.0: ; %entry
359 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
360 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
361 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
362 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
364 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
365 ; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
366 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
367 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
368 ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
369 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
370 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
371 ; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
372 ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
373 ; GFX11-SDAG-NEXT: s_endpgm
375 ; GFX11-GISEL-LABEL: fadd_f16_imm_b:
376 ; GFX11-GISEL: ; %bb.0: ; %entry
377 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
378 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
379 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
380 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
382 ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
383 ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
384 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
385 ; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
386 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
387 ; GFX11-GISEL-NEXT: s_endpgm
389 ; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b:
390 ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
391 ; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
392 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
393 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
394 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
395 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
396 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
397 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
398 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
399 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
400 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
401 ; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
402 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
403 ; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 2.0, v0
404 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
405 ; GFX11-FAKE16-SDAG-NEXT: s_endpgm
407 ; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b:
408 ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
409 ; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
410 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
411 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
412 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
414 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
415 ; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
416 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
417 ; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 2.0, v0
418 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
419 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm
442 ptr addrspace(1) %a) {
444 %a.val = load half, ptr addrspace(1) %a
445 %r.val = fadd half %a.val, 2.0
446 store half %r.val, ptr addrspace(1) %r
; <2 x half> fadd. Both operands are loaded from %a and %b at an element
; offset given by the workitem id (x); the sum is stored through %r, which is
; not offset by the workitem id.
450 define amdgpu_kernel void @fadd_v2f16(
451 ; SI-LABEL: fadd_v2f16:
452 ; SI: ; %bb.0: ; %entry
453 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
454 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
455 ; SI-NEXT: s_mov_b32 s11, 0xf000
456 ; SI-NEXT: s_mov_b32 s14, 0
457 ; SI-NEXT: s_mov_b32 s15, s11
458 ; SI-NEXT: s_waitcnt lgkmcnt(0)
459 ; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
460 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
461 ; SI-NEXT: v_mov_b32_e32 v1, 0
462 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
463 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
464 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
465 ; SI-NEXT: s_mov_b32 s10, -1
466 ; SI-NEXT: s_mov_b32 s8, s0
467 ; SI-NEXT: s_mov_b32 s9, s1
468 ; SI-NEXT: s_waitcnt vmcnt(1)
469 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
470 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
471 ; SI-NEXT: s_waitcnt vmcnt(0)
472 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
473 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
474 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
475 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
476 ; SI-NEXT: v_add_f32_e32 v1, v3, v1
477 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
478 ; SI-NEXT: v_add_f32_e32 v0, v2, v0
479 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
480 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
481 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
482 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
485 ; VI-LABEL: fadd_v2f16:
486 ; VI: ; %bb.0: ; %entry
487 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
488 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
489 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
490 ; VI-NEXT: s_mov_b32 s7, 0xf000
491 ; VI-NEXT: s_mov_b32 s6, -1
492 ; VI-NEXT: s_waitcnt lgkmcnt(0)
493 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
494 ; VI-NEXT: v_mov_b32_e32 v1, s3
495 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
496 ; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
497 ; VI-NEXT: v_mov_b32_e32 v3, s9
498 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
499 ; VI-NEXT: flat_load_dword v0, v[0:1]
500 ; VI-NEXT: flat_load_dword v1, v[2:3]
501 ; VI-NEXT: s_mov_b32 s4, s0
502 ; VI-NEXT: s_mov_b32 s5, s1
503 ; VI-NEXT: s_waitcnt vmcnt(0)
504 ; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
505 ; VI-NEXT: v_add_f16_e32 v0, v0, v1
506 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
507 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
510 ; GFX11-SDAG-LABEL: fadd_v2f16:
511 ; GFX11-SDAG: ; %bb.0: ; %entry
512 ; GFX11-SDAG-NEXT: s_clause 0x1
513 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
514 ; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
515 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
516 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
517 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
518 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
519 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
520 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX11-SDAG-NEXT: s_clause 0x1
522 ; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
523 ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[8:9]
524 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
525 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
526 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
527 ; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v1, v0
528 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
529 ; GFX11-SDAG-NEXT: s_endpgm
531 ; GFX11-GISEL-LABEL: fadd_v2f16:
532 ; GFX11-GISEL: ; %bb.0: ; %entry
533 ; GFX11-GISEL-NEXT: s_clause 0x1
534 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
535 ; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
536 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
537 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
538 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
539 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
540 ; GFX11-GISEL-NEXT: s_clause 0x1
541 ; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
542 ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[4:5]
543 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
544 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
545 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
546 ; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v1, v0
547 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
548 ; GFX11-GISEL-NEXT: s_endpgm
550 ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16:
551 ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
552 ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1
553 ; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
554 ; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
555 ; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
556 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
557 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
558 ; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
559 ; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
560 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
561 ; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1
562 ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
563 ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[8:9]
564 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
565 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
566 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
567 ; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, v1, v0
568 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
569 ; GFX11-FAKE16-SDAG-NEXT: s_endpgm
571 ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16:
572 ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
573 ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
574 ; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
575 ; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
576 ; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
577 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
578 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
579 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
581 ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
582 ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[4:5]
583 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
584 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
585 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
586 ; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, v1, v0
587 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
588 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm
611 ptr addrspace(1) %b) {
613 %tid = call i32 @llvm.amdgcn.workitem.id.x()
614 %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
615 %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
616 %a.val = load <2 x half>, ptr addrspace(1) %gep.a
617 %b.val = load <2 x half>, ptr addrspace(1) %gep.b
618 %r.val = fadd <2 x half> %a.val, %b.val
619 store <2 x half> %r.val, ptr addrspace(1) %r
; <2 x half> fadd with the constant vector <1.0, 2.0> as the left-hand
; operand. The right-hand operand is loaded from %b at an element offset given
; by the workitem id (x); the sum is stored through %r.
623 define amdgpu_kernel void @fadd_v2f16_imm_a(
624 ; SI-LABEL: fadd_v2f16_imm_a:
625 ; SI: ; %bb.0: ; %entry
626 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
627 ; SI-NEXT: s_mov_b32 s7, 0xf000
628 ; SI-NEXT: s_mov_b32 s10, 0
629 ; SI-NEXT: s_mov_b32 s11, s7
630 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
631 ; SI-NEXT: s_waitcnt lgkmcnt(0)
632 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
633 ; SI-NEXT: v_mov_b32_e32 v1, 0
634 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
635 ; SI-NEXT: s_mov_b32 s6, -1
636 ; SI-NEXT: s_mov_b32 s4, s0
637 ; SI-NEXT: s_mov_b32 s5, s1
638 ; SI-NEXT: s_waitcnt vmcnt(0)
639 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
640 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
641 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
642 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1
643 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
644 ; SI-NEXT: v_add_f32_e32 v0, 2.0, v0
645 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
646 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
647 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
648 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
651 ; VI-LABEL: fadd_v2f16_imm_a:
652 ; VI: ; %bb.0: ; %entry
653 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
654 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
655 ; VI-NEXT: s_mov_b32 s7, 0xf000
656 ; VI-NEXT: s_mov_b32 s6, -1
657 ; VI-NEXT: s_waitcnt lgkmcnt(0)
658 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
659 ; VI-NEXT: v_mov_b32_e32 v1, s3
660 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
661 ; VI-NEXT: flat_load_dword v0, v[0:1]
662 ; VI-NEXT: v_mov_b32_e32 v1, 0x4000
663 ; VI-NEXT: s_mov_b32 s4, s0
664 ; VI-NEXT: s_mov_b32 s5, s1
665 ; VI-NEXT: s_waitcnt vmcnt(0)
666 ; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
667 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
668 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
669 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
672 ; GFX11-SDAG-LABEL: fadd_v2f16_imm_a:
673 ; GFX11-SDAG: ; %bb.0: ; %entry
674 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
675 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
676 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
677 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
678 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
679 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
680 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
681 ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
682 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
683 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
684 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
685 ; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
686 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
687 ; GFX11-SDAG-NEXT: s_endpgm
689 ; GFX11-GISEL-LABEL: fadd_v2f16_imm_a:
690 ; GFX11-GISEL: ; %bb.0: ; %entry
691 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
692 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
693 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
694 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
695 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
696 ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
697 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
698 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
699 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
700 ; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
701 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
702 ; GFX11-GISEL-NEXT: s_endpgm
704 ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a:
705 ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
706 ; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
707 ; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
708 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
709 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
710 ; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
711 ; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
712 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
713 ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
714 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
715 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
716 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
717 ; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
718 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
719 ; GFX11-FAKE16-SDAG-NEXT: s_endpgm
721 ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a:
722 ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
723 ; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
724 ; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
725 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
726 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
727 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
728 ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
729 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
730 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
731 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
732 ; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
733 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
734 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm
752 ptr addrspace(1) %b) {
754 %tid = call i32 @llvm.amdgcn.workitem.id.x()
755 %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
756 %b.val = load <2 x half>, ptr addrspace(1) %gep.b
757 %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
758 store <2 x half> %r.val, ptr addrspace(1) %r
; fadd_v2f16_imm_b: packed-half add with a constant right-hand operand.
; Each work-item loads the <2 x half> at %a[tid], adds <half 2.0, half 1.0>,
; and stores the sum through %r (the store address is not indexed by tid).
; Expected lowering, per the checks below:
;   - tahiti: each half is converted to f32, added as f32, converted back
;     and re-packed with a shift and an or.
;   - fiji: v_add_f16 for the low half, v_add_f16_sdwa for the high half.
;   - gfx1100 (all four run configurations): a single v_pk_add_f16 with the
;     literal 0x3c004000 (2.0 in the low half, 1.0 in the high half).
; Only check prefixes enabled by a RUN line at the top of the file are kept.
762 define amdgpu_kernel void @fadd_v2f16_imm_b(
763 ; SI-LABEL: fadd_v2f16_imm_b:
764 ; SI: ; %bb.0: ; %entry
765 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
766 ; SI-NEXT: s_mov_b32 s7, 0xf000
767 ; SI-NEXT: s_mov_b32 s10, 0
768 ; SI-NEXT: s_mov_b32 s11, s7
769 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
770 ; SI-NEXT: s_waitcnt lgkmcnt(0)
771 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
772 ; SI-NEXT: v_mov_b32_e32 v1, 0
773 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
774 ; SI-NEXT: s_mov_b32 s6, -1
775 ; SI-NEXT: s_mov_b32 s4, s0
776 ; SI-NEXT: s_mov_b32 s5, s1
777 ; SI-NEXT: s_waitcnt vmcnt(0)
778 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
779 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
780 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
781 ; SI-NEXT: v_add_f32_e32 v1, 2.0, v1
782 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
783 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
784 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
785 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
786 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
787 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
790 ; VI-LABEL: fadd_v2f16_imm_b:
791 ; VI: ; %bb.0: ; %entry
792 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
793 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
794 ; VI-NEXT: s_mov_b32 s7, 0xf000
795 ; VI-NEXT: s_mov_b32 s6, -1
796 ; VI-NEXT: s_waitcnt lgkmcnt(0)
797 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
798 ; VI-NEXT: v_mov_b32_e32 v1, s3
799 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
800 ; VI-NEXT: flat_load_dword v0, v[0:1]
801 ; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
802 ; VI-NEXT: s_mov_b32 s4, s0
803 ; VI-NEXT: s_mov_b32 s5, s1
804 ; VI-NEXT: s_waitcnt vmcnt(0)
805 ; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
806 ; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
807 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
808 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
811 ; GFX11-SDAG-LABEL: fadd_v2f16_imm_b:
812 ; GFX11-SDAG: ; %bb.0: ; %entry
813 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
814 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
815 ; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
816 ; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
817 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
818 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
819 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
820 ; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
821 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
822 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
823 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
824 ; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
825 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
826 ; GFX11-SDAG-NEXT: s_endpgm
828 ; GFX11-GISEL-LABEL: fadd_v2f16_imm_b:
829 ; GFX11-GISEL: ; %bb.0: ; %entry
830 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
831 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
832 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
833 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
834 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
835 ; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
836 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
837 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
838 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
839 ; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
840 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
841 ; GFX11-GISEL-NEXT: s_endpgm
843 ; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b:
844 ; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
845 ; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
846 ; GFX11-FAKE16-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
847 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
848 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
849 ; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
850 ; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
851 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
852 ; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
853 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
854 ; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
855 ; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
856 ; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
857 ; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
858 ; GFX11-FAKE16-SDAG-NEXT: s_endpgm
860 ; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b:
861 ; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
862 ; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
863 ; GFX11-FAKE16-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
864 ; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
865 ; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
866 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
867 ; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
868 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
869 ; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
870 ; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
871 ; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
872 ; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
873 ; GFX11-FAKE16-GISEL-NEXT: s_endpgm
890 ptr addrspace(1) %r,
891 ptr addrspace(1) %a) {
892 entry:
893 %tid = call i32 @llvm.amdgcn.workitem.id.x()
894 %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
895 %a.val = load <2 x half>, ptr addrspace(1) %gep.a
896 %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
897 store <2 x half> %r.val, ptr addrspace(1) %r
898 ret void
899 }
; Intrinsic the kernels above call to obtain %tid, which they use as the
; getelementptr index into their <2 x half> input buffer.
901 declare i32 @llvm.amdgcn.workitem.id.x() #1
; NOTE(review): attribute group #0 is not referenced by any line visible in
; this part of the file; confirm it is still used elsewhere before keeping it.
903 attributes #0 = { nounwind }
; Attribute group #1 is the one attached to the workitem.id.x declaration.
904 attributes #1 = { nounwind readnone }