1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
4 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
; select_f16: scalar half select with all operands loaded from memory.
; The kernel loads %a, %b, %c and %d with volatile loads, evaluates
; select(%a.val olt %b.val, %c.val, %d.val) and stores the result to %r.
; Expected code, per the assertions below:
;   - tahiti run: the loaded halves are widened with v_cvt_f32_f16, compared
;     with v_cmp_lt_f32, and the selected value is narrowed again with
;     v_cvt_f16_f32 before the store.
;   - fiji and gfx1100 runs: the compare is done directly with v_cmp_lt_f16
;     and no conversions are emitted.
; In all three outputs every load is followed by its own s_waitcnt vmcnt(0).
6 define amdgpu_kernel void @select_f16(
7 ; SI-LABEL: select_f16:
8 ; SI: ; %bb.0: ; %entry
9 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
10 ; SI-NEXT: s_mov_b32 s3, 0xf000
11 ; SI-NEXT: s_mov_b32 s2, -1
12 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
13 ; SI-NEXT: s_mov_b32 s18, s2
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_mov_b32 s16, s6
16 ; SI-NEXT: s_mov_b32 s17, s7
17 ; SI-NEXT: s_mov_b32 s19, s3
18 ; SI-NEXT: s_mov_b32 s20, s8
19 ; SI-NEXT: s_mov_b32 s21, s9
20 ; SI-NEXT: s_mov_b32 s22, s2
21 ; SI-NEXT: s_mov_b32 s23, s3
22 ; SI-NEXT: s_mov_b32 s8, s10
23 ; SI-NEXT: s_mov_b32 s9, s11
24 ; SI-NEXT: s_mov_b32 s10, s2
25 ; SI-NEXT: s_mov_b32 s11, s3
26 ; SI-NEXT: s_mov_b32 s14, s2
27 ; SI-NEXT: s_mov_b32 s15, s3
28 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
31 ; SI-NEXT: s_waitcnt vmcnt(0)
32 ; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
33 ; SI-NEXT: s_waitcnt vmcnt(0)
34 ; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
35 ; SI-NEXT: s_waitcnt vmcnt(0)
36 ; SI-NEXT: s_mov_b32 s0, s4
37 ; SI-NEXT: s_mov_b32 s1, s5
38 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
39 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
40 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
41 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
42 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
43 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
44 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
45 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
48 ; VI-LABEL: select_f16:
49 ; VI: ; %bb.0: ; %entry
50 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
51 ; VI-NEXT: s_mov_b32 s3, 0xf000
52 ; VI-NEXT: s_mov_b32 s2, -1
53 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
54 ; VI-NEXT: s_mov_b32 s18, s2
55 ; VI-NEXT: s_waitcnt lgkmcnt(0)
56 ; VI-NEXT: s_mov_b32 s16, s6
57 ; VI-NEXT: s_mov_b32 s17, s7
58 ; VI-NEXT: s_mov_b32 s19, s3
59 ; VI-NEXT: s_mov_b32 s20, s8
60 ; VI-NEXT: s_mov_b32 s21, s9
61 ; VI-NEXT: s_mov_b32 s22, s2
62 ; VI-NEXT: s_mov_b32 s23, s3
63 ; VI-NEXT: s_mov_b32 s8, s10
64 ; VI-NEXT: s_mov_b32 s9, s11
65 ; VI-NEXT: s_mov_b32 s10, s2
66 ; VI-NEXT: s_mov_b32 s11, s3
67 ; VI-NEXT: s_mov_b32 s14, s2
68 ; VI-NEXT: s_mov_b32 s15, s3
69 ; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
70 ; VI-NEXT: s_waitcnt vmcnt(0)
71 ; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
72 ; VI-NEXT: s_waitcnt vmcnt(0)
73 ; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
74 ; VI-NEXT: s_waitcnt vmcnt(0)
75 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
76 ; VI-NEXT: s_waitcnt vmcnt(0)
77 ; VI-NEXT: s_mov_b32 s0, s4
78 ; VI-NEXT: s_mov_b32 s1, s5
79 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
80 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
81 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
84 ; GFX11-LABEL: select_f16:
85 ; GFX11: ; %bb.0: ; %entry
86 ; GFX11-NEXT: s_clause 0x1
87 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
88 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
89 ; GFX11-NEXT: s_mov_b32 s14, -1
90 ; GFX11-NEXT: s_mov_b32 s15, 0x31016000
91 ; GFX11-NEXT: s_mov_b32 s18, s14
92 ; GFX11-NEXT: s_mov_b32 s19, s15
93 ; GFX11-NEXT: s_mov_b32 s22, s14
94 ; GFX11-NEXT: s_mov_b32 s23, s15
95 ; GFX11-NEXT: s_mov_b32 s26, s14
96 ; GFX11-NEXT: s_mov_b32 s27, s15
97 ; GFX11-NEXT: s_mov_b32 s2, s14
98 ; GFX11-NEXT: s_mov_b32 s3, s15
99 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX11-NEXT: s_mov_b32 s16, s6
101 ; GFX11-NEXT: s_mov_b32 s17, s7
102 ; GFX11-NEXT: s_mov_b32 s20, s8
103 ; GFX11-NEXT: s_mov_b32 s21, s9
104 ; GFX11-NEXT: s_mov_b32 s24, s10
105 ; GFX11-NEXT: s_mov_b32 s25, s11
106 ; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc
107 ; GFX11-NEXT: s_waitcnt vmcnt(0)
108 ; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
109 ; GFX11-NEXT: s_waitcnt vmcnt(0)
110 ; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc
111 ; GFX11-NEXT: s_waitcnt vmcnt(0)
112 ; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc
113 ; GFX11-NEXT: s_waitcnt vmcnt(0)
114 ; GFX11-NEXT: s_mov_b32 s12, s4
115 ; GFX11-NEXT: s_mov_b32 s13, s5
116 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
117 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
118 ; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0
119 ; GFX11-NEXT: s_nop 0
120 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
121 ; GFX11-NEXT: s_endpgm
126 ptr addrspace(1) %d) {
; All four inputs are read with volatile loads from address space 1.
128 %a.val = load volatile half, ptr addrspace(1) %a
129 %b.val = load volatile half, ptr addrspace(1) %b
130 %c.val = load volatile half, ptr addrspace(1) %c
131 %d.val = load volatile half, ptr addrspace(1) %d
; olt is an ordered compare: it yields false if either operand is NaN.
132 %fcmp = fcmp olt half %a.val, %b.val
; Pick %c.val when the compare is true, %d.val otherwise.
133 %r.val = select i1 %fcmp, half %c.val, half %d.val
134 store half %r.val, ptr addrspace(1) %r
; select_f16_imm_a: scalar half select where the first compare operand is the
; constant 0xH3800. The kernel evaluates
; select(0xH3800 olt %b.val, %c.val, %d.val) and stores the result to %r.
; The assertions below show the constant as the inline operand 0.5 in the
; first source position of the compare (v_cmp_lt_f32 in the tahiti run,
; v_cmp_lt_f16 in the fiji and gfx1100 runs); no separate instruction is
; expected to materialize it.
138 define amdgpu_kernel void @select_f16_imm_a(
139 ; SI-LABEL: select_f16_imm_a:
140 ; SI: ; %bb.0: ; %entry
141 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
142 ; SI-NEXT: s_mov_b32 s11, 0xf000
143 ; SI-NEXT: s_mov_b32 s10, -1
144 ; SI-NEXT: s_mov_b32 s14, s10
145 ; SI-NEXT: s_mov_b32 s15, s11
146 ; SI-NEXT: s_waitcnt lgkmcnt(0)
147 ; SI-NEXT: s_mov_b32 s12, s2
148 ; SI-NEXT: s_mov_b32 s13, s3
149 ; SI-NEXT: s_mov_b32 s16, s4
150 ; SI-NEXT: s_mov_b32 s17, s5
151 ; SI-NEXT: s_mov_b32 s18, s10
152 ; SI-NEXT: s_mov_b32 s19, s11
153 ; SI-NEXT: s_mov_b32 s4, s6
154 ; SI-NEXT: s_mov_b32 s5, s7
155 ; SI-NEXT: s_mov_b32 s6, s10
156 ; SI-NEXT: s_mov_b32 s7, s11
157 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
158 ; SI-NEXT: s_waitcnt vmcnt(0)
159 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
160 ; SI-NEXT: s_waitcnt vmcnt(0)
161 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
162 ; SI-NEXT: s_waitcnt vmcnt(0)
163 ; SI-NEXT: s_mov_b32 s8, s0
164 ; SI-NEXT: s_mov_b32 s9, s1
165 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
166 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
167 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
168 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
169 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
170 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
171 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
174 ; VI-LABEL: select_f16_imm_a:
175 ; VI: ; %bb.0: ; %entry
176 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
177 ; VI-NEXT: s_mov_b32 s11, 0xf000
178 ; VI-NEXT: s_mov_b32 s10, -1
179 ; VI-NEXT: s_mov_b32 s14, s10
180 ; VI-NEXT: s_mov_b32 s15, s11
181 ; VI-NEXT: s_waitcnt lgkmcnt(0)
182 ; VI-NEXT: s_mov_b32 s12, s2
183 ; VI-NEXT: s_mov_b32 s13, s3
184 ; VI-NEXT: s_mov_b32 s16, s4
185 ; VI-NEXT: s_mov_b32 s17, s5
186 ; VI-NEXT: s_mov_b32 s18, s10
187 ; VI-NEXT: s_mov_b32 s19, s11
188 ; VI-NEXT: s_mov_b32 s4, s6
189 ; VI-NEXT: s_mov_b32 s5, s7
190 ; VI-NEXT: s_mov_b32 s6, s10
191 ; VI-NEXT: s_mov_b32 s7, s11
192 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
193 ; VI-NEXT: s_waitcnt vmcnt(0)
194 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
195 ; VI-NEXT: s_waitcnt vmcnt(0)
196 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
197 ; VI-NEXT: s_waitcnt vmcnt(0)
198 ; VI-NEXT: s_mov_b32 s8, s0
199 ; VI-NEXT: s_mov_b32 s9, s1
200 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
201 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
202 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
205 ; GFX11-LABEL: select_f16_imm_a:
206 ; GFX11: ; %bb.0: ; %entry
207 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
208 ; GFX11-NEXT: s_mov_b32 s10, -1
209 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
210 ; GFX11-NEXT: s_mov_b32 s14, s10
211 ; GFX11-NEXT: s_mov_b32 s15, s11
212 ; GFX11-NEXT: s_mov_b32 s18, s10
213 ; GFX11-NEXT: s_mov_b32 s19, s11
214 ; GFX11-NEXT: s_mov_b32 s22, s10
215 ; GFX11-NEXT: s_mov_b32 s23, s11
216 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
217 ; GFX11-NEXT: s_mov_b32 s12, s2
218 ; GFX11-NEXT: s_mov_b32 s13, s3
219 ; GFX11-NEXT: s_mov_b32 s16, s4
220 ; GFX11-NEXT: s_mov_b32 s17, s5
221 ; GFX11-NEXT: s_mov_b32 s20, s6
222 ; GFX11-NEXT: s_mov_b32 s21, s7
223 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
224 ; GFX11-NEXT: s_waitcnt vmcnt(0)
225 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
226 ; GFX11-NEXT: s_waitcnt vmcnt(0)
227 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
228 ; GFX11-NEXT: s_waitcnt vmcnt(0)
229 ; GFX11-NEXT: s_mov_b32 s8, s0
230 ; GFX11-NEXT: s_mov_b32 s9, s1
231 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
232 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
233 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
234 ; GFX11-NEXT: s_nop 0
235 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
236 ; GFX11-NEXT: s_endpgm
240 ptr addrspace(1) %d) {
; Only three values are loaded; the left compare operand is a constant.
242 %b.val = load volatile half, ptr addrspace(1) %b
243 %c.val = load volatile half, ptr addrspace(1) %c
244 %d.val = load volatile half, ptr addrspace(1) %d
; Constant on the left-hand side: true when 0xH3800 is less than %b.val.
245 %fcmp = fcmp olt half 0xH3800, %b.val
246 %r.val = select i1 %fcmp, half %c.val, half %d.val
247 store half %r.val, ptr addrspace(1) %r
; select_f16_imm_b: scalar half select where the second compare operand is
; the constant 0xH3800. The kernel evaluates
; select(%a.val olt 0xH3800, %c.val, %d.val) and stores the result to %r.
; The assertions below expect the compare to be emitted in commuted form:
; the constant appears as the inline operand 0.5 in the first source position
; and the opcode is the greater-than variant (v_cmp_gt_f32 in the tahiti run,
; v_cmp_gt_f16 in the fiji and gfx1100 runs).
251 define amdgpu_kernel void @select_f16_imm_b(
252 ; SI-LABEL: select_f16_imm_b:
253 ; SI: ; %bb.0: ; %entry
254 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
255 ; SI-NEXT: s_mov_b32 s11, 0xf000
256 ; SI-NEXT: s_mov_b32 s10, -1
257 ; SI-NEXT: s_mov_b32 s14, s10
258 ; SI-NEXT: s_mov_b32 s15, s11
259 ; SI-NEXT: s_waitcnt lgkmcnt(0)
260 ; SI-NEXT: s_mov_b32 s12, s2
261 ; SI-NEXT: s_mov_b32 s13, s3
262 ; SI-NEXT: s_mov_b32 s16, s4
263 ; SI-NEXT: s_mov_b32 s17, s5
264 ; SI-NEXT: s_mov_b32 s18, s10
265 ; SI-NEXT: s_mov_b32 s19, s11
266 ; SI-NEXT: s_mov_b32 s4, s6
267 ; SI-NEXT: s_mov_b32 s5, s7
268 ; SI-NEXT: s_mov_b32 s6, s10
269 ; SI-NEXT: s_mov_b32 s7, s11
270 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
271 ; SI-NEXT: s_waitcnt vmcnt(0)
272 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
273 ; SI-NEXT: s_waitcnt vmcnt(0)
274 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
275 ; SI-NEXT: s_waitcnt vmcnt(0)
276 ; SI-NEXT: s_mov_b32 s8, s0
277 ; SI-NEXT: s_mov_b32 s9, s1
278 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
279 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
280 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
281 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
282 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
283 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
284 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
287 ; VI-LABEL: select_f16_imm_b:
288 ; VI: ; %bb.0: ; %entry
289 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
290 ; VI-NEXT: s_mov_b32 s11, 0xf000
291 ; VI-NEXT: s_mov_b32 s10, -1
292 ; VI-NEXT: s_mov_b32 s14, s10
293 ; VI-NEXT: s_mov_b32 s15, s11
294 ; VI-NEXT: s_waitcnt lgkmcnt(0)
295 ; VI-NEXT: s_mov_b32 s12, s2
296 ; VI-NEXT: s_mov_b32 s13, s3
297 ; VI-NEXT: s_mov_b32 s16, s4
298 ; VI-NEXT: s_mov_b32 s17, s5
299 ; VI-NEXT: s_mov_b32 s18, s10
300 ; VI-NEXT: s_mov_b32 s19, s11
301 ; VI-NEXT: s_mov_b32 s4, s6
302 ; VI-NEXT: s_mov_b32 s5, s7
303 ; VI-NEXT: s_mov_b32 s6, s10
304 ; VI-NEXT: s_mov_b32 s7, s11
305 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
306 ; VI-NEXT: s_waitcnt vmcnt(0)
307 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
308 ; VI-NEXT: s_waitcnt vmcnt(0)
309 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
310 ; VI-NEXT: s_waitcnt vmcnt(0)
311 ; VI-NEXT: s_mov_b32 s8, s0
312 ; VI-NEXT: s_mov_b32 s9, s1
313 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
314 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
315 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
318 ; GFX11-LABEL: select_f16_imm_b:
319 ; GFX11: ; %bb.0: ; %entry
320 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
321 ; GFX11-NEXT: s_mov_b32 s10, -1
322 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
323 ; GFX11-NEXT: s_mov_b32 s14, s10
324 ; GFX11-NEXT: s_mov_b32 s15, s11
325 ; GFX11-NEXT: s_mov_b32 s18, s10
326 ; GFX11-NEXT: s_mov_b32 s19, s11
327 ; GFX11-NEXT: s_mov_b32 s22, s10
328 ; GFX11-NEXT: s_mov_b32 s23, s11
329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX11-NEXT: s_mov_b32 s12, s2
331 ; GFX11-NEXT: s_mov_b32 s13, s3
332 ; GFX11-NEXT: s_mov_b32 s16, s4
333 ; GFX11-NEXT: s_mov_b32 s17, s5
334 ; GFX11-NEXT: s_mov_b32 s20, s6
335 ; GFX11-NEXT: s_mov_b32 s21, s7
336 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
337 ; GFX11-NEXT: s_waitcnt vmcnt(0)
338 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
339 ; GFX11-NEXT: s_waitcnt vmcnt(0)
340 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
341 ; GFX11-NEXT: s_waitcnt vmcnt(0)
342 ; GFX11-NEXT: s_mov_b32 s8, s0
343 ; GFX11-NEXT: s_mov_b32 s9, s1
344 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
345 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
346 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
347 ; GFX11-NEXT: s_nop 0
348 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
349 ; GFX11-NEXT: s_endpgm
353 ptr addrspace(1) %d) {
; Only three values are loaded; the right compare operand is a constant.
355 %a.val = load volatile half, ptr addrspace(1) %a
356 %c.val = load volatile half, ptr addrspace(1) %c
357 %d.val = load volatile half, ptr addrspace(1) %d
; Constant on the right-hand side: true when %a.val is less than 0xH3800.
358 %fcmp = fcmp olt half %a.val, 0xH3800
359 %r.val = select i1 %fcmp, half %c.val, half %d.val
360 store half %r.val, ptr addrspace(1) %r
; select_f16_imm_c: scalar half select where the true-value operand is the
; constant 0xH3800. The kernel evaluates
; select(%a.val olt %b.val, 0xH3800, %d.val) and stores the result to %r.
; The assertions below expect an inverted compare (v_cmp_nlt) with the
; constant as the first v_cndmask source and the loaded %d value as the
; second. The constant is provided differently per target:
;   - tahiti run: inline operand 0.5 on the f32 v_cndmask.
;   - fiji run: materialized into v3 with v_mov_b32 v3, 0x3800.
;   - gfx1100 run: literal 0x3800 directly on v_cndmask.
364 define amdgpu_kernel void @select_f16_imm_c(
365 ; SI-LABEL: select_f16_imm_c:
366 ; SI: ; %bb.0: ; %entry
367 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
368 ; SI-NEXT: s_mov_b32 s11, 0xf000
369 ; SI-NEXT: s_mov_b32 s10, -1
370 ; SI-NEXT: s_mov_b32 s14, s10
371 ; SI-NEXT: s_mov_b32 s15, s11
372 ; SI-NEXT: s_waitcnt lgkmcnt(0)
373 ; SI-NEXT: s_mov_b32 s12, s2
374 ; SI-NEXT: s_mov_b32 s13, s3
375 ; SI-NEXT: s_mov_b32 s16, s4
376 ; SI-NEXT: s_mov_b32 s17, s5
377 ; SI-NEXT: s_mov_b32 s18, s10
378 ; SI-NEXT: s_mov_b32 s19, s11
379 ; SI-NEXT: s_mov_b32 s4, s6
380 ; SI-NEXT: s_mov_b32 s5, s7
381 ; SI-NEXT: s_mov_b32 s6, s10
382 ; SI-NEXT: s_mov_b32 s7, s11
383 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
384 ; SI-NEXT: s_waitcnt vmcnt(0)
385 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
386 ; SI-NEXT: s_waitcnt vmcnt(0)
387 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
388 ; SI-NEXT: s_waitcnt vmcnt(0)
389 ; SI-NEXT: s_mov_b32 s8, s0
390 ; SI-NEXT: s_mov_b32 s9, s1
391 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
392 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
393 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
394 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
395 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
396 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
397 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
400 ; VI-LABEL: select_f16_imm_c:
401 ; VI: ; %bb.0: ; %entry
402 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
403 ; VI-NEXT: s_mov_b32 s11, 0xf000
404 ; VI-NEXT: s_mov_b32 s10, -1
405 ; VI-NEXT: s_mov_b32 s14, s10
406 ; VI-NEXT: s_mov_b32 s15, s11
407 ; VI-NEXT: s_waitcnt lgkmcnt(0)
408 ; VI-NEXT: s_mov_b32 s12, s2
409 ; VI-NEXT: s_mov_b32 s13, s3
410 ; VI-NEXT: s_mov_b32 s16, s4
411 ; VI-NEXT: s_mov_b32 s17, s5
412 ; VI-NEXT: s_mov_b32 s18, s10
413 ; VI-NEXT: s_mov_b32 s19, s11
414 ; VI-NEXT: s_mov_b32 s4, s6
415 ; VI-NEXT: s_mov_b32 s5, s7
416 ; VI-NEXT: s_mov_b32 s6, s10
417 ; VI-NEXT: s_mov_b32 s7, s11
418 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
419 ; VI-NEXT: s_waitcnt vmcnt(0)
420 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
421 ; VI-NEXT: s_waitcnt vmcnt(0)
422 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
423 ; VI-NEXT: s_waitcnt vmcnt(0)
424 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800
425 ; VI-NEXT: s_mov_b32 s8, s0
426 ; VI-NEXT: s_mov_b32 s9, s1
427 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
428 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
429 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
432 ; GFX11-LABEL: select_f16_imm_c:
433 ; GFX11: ; %bb.0: ; %entry
434 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
435 ; GFX11-NEXT: s_mov_b32 s10, -1
436 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
437 ; GFX11-NEXT: s_mov_b32 s14, s10
438 ; GFX11-NEXT: s_mov_b32 s15, s11
439 ; GFX11-NEXT: s_mov_b32 s18, s10
440 ; GFX11-NEXT: s_mov_b32 s19, s11
441 ; GFX11-NEXT: s_mov_b32 s22, s10
442 ; GFX11-NEXT: s_mov_b32 s23, s11
443 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX11-NEXT: s_mov_b32 s12, s2
445 ; GFX11-NEXT: s_mov_b32 s13, s3
446 ; GFX11-NEXT: s_mov_b32 s16, s4
447 ; GFX11-NEXT: s_mov_b32 s17, s5
448 ; GFX11-NEXT: s_mov_b32 s20, s6
449 ; GFX11-NEXT: s_mov_b32 s21, s7
450 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
451 ; GFX11-NEXT: s_waitcnt vmcnt(0)
452 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
453 ; GFX11-NEXT: s_waitcnt vmcnt(0)
454 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
455 ; GFX11-NEXT: s_waitcnt vmcnt(0)
456 ; GFX11-NEXT: s_mov_b32 s8, s0
457 ; GFX11-NEXT: s_mov_b32 s9, s1
458 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
459 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
460 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
461 ; GFX11-NEXT: s_nop 0
462 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
463 ; GFX11-NEXT: s_endpgm
467 ptr addrspace(1) %d) {
; Only three values are loaded; the select's true value is a constant.
469 %a.val = load volatile half, ptr addrspace(1) %a
470 %b.val = load volatile half, ptr addrspace(1) %b
471 %d.val = load volatile half, ptr addrspace(1) %d
472 %fcmp = fcmp olt half %a.val, %b.val
; Constant in the true position: 0xH3800 when the compare holds, else %d.val.
473 %r.val = select i1 %fcmp, half 0xH3800, half %d.val
474 store half %r.val, ptr addrspace(1) %r
; select_f16_imm_d: scalar half select where the false-value operand is the
; constant 0xH3800. The kernel evaluates
; select(%a.val olt %b.val, %c.val, 0xH3800) and stores the result to %r.
; The assertions below expect a plain v_cmp_lt compare with the constant as
; the first v_cndmask source and the loaded %c value as the second. The
; constant is provided differently per target:
;   - tahiti run: inline operand 0.5 on the f32 v_cndmask.
;   - fiji run: materialized into v3 with v_mov_b32 v3, 0x3800.
;   - gfx1100 run: literal 0x3800 directly on v_cndmask.
478 define amdgpu_kernel void @select_f16_imm_d(
479 ; SI-LABEL: select_f16_imm_d:
480 ; SI: ; %bb.0: ; %entry
481 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
482 ; SI-NEXT: s_mov_b32 s11, 0xf000
483 ; SI-NEXT: s_mov_b32 s10, -1
484 ; SI-NEXT: s_mov_b32 s14, s10
485 ; SI-NEXT: s_mov_b32 s15, s11
486 ; SI-NEXT: s_waitcnt lgkmcnt(0)
487 ; SI-NEXT: s_mov_b32 s12, s2
488 ; SI-NEXT: s_mov_b32 s13, s3
489 ; SI-NEXT: s_mov_b32 s16, s4
490 ; SI-NEXT: s_mov_b32 s17, s5
491 ; SI-NEXT: s_mov_b32 s18, s10
492 ; SI-NEXT: s_mov_b32 s19, s11
493 ; SI-NEXT: s_mov_b32 s4, s6
494 ; SI-NEXT: s_mov_b32 s5, s7
495 ; SI-NEXT: s_mov_b32 s6, s10
496 ; SI-NEXT: s_mov_b32 s7, s11
497 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
498 ; SI-NEXT: s_waitcnt vmcnt(0)
499 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
500 ; SI-NEXT: s_waitcnt vmcnt(0)
501 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
502 ; SI-NEXT: s_waitcnt vmcnt(0)
503 ; SI-NEXT: s_mov_b32 s8, s0
504 ; SI-NEXT: s_mov_b32 s9, s1
505 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
506 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
507 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
508 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
509 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
510 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
511 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
514 ; VI-LABEL: select_f16_imm_d:
515 ; VI: ; %bb.0: ; %entry
516 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
517 ; VI-NEXT: s_mov_b32 s11, 0xf000
518 ; VI-NEXT: s_mov_b32 s10, -1
519 ; VI-NEXT: s_mov_b32 s14, s10
520 ; VI-NEXT: s_mov_b32 s15, s11
521 ; VI-NEXT: s_waitcnt lgkmcnt(0)
522 ; VI-NEXT: s_mov_b32 s12, s2
523 ; VI-NEXT: s_mov_b32 s13, s3
524 ; VI-NEXT: s_mov_b32 s16, s4
525 ; VI-NEXT: s_mov_b32 s17, s5
526 ; VI-NEXT: s_mov_b32 s18, s10
527 ; VI-NEXT: s_mov_b32 s19, s11
528 ; VI-NEXT: s_mov_b32 s4, s6
529 ; VI-NEXT: s_mov_b32 s5, s7
530 ; VI-NEXT: s_mov_b32 s6, s10
531 ; VI-NEXT: s_mov_b32 s7, s11
532 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
533 ; VI-NEXT: s_waitcnt vmcnt(0)
534 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
535 ; VI-NEXT: s_waitcnt vmcnt(0)
536 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
537 ; VI-NEXT: s_waitcnt vmcnt(0)
538 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800
539 ; VI-NEXT: s_mov_b32 s8, s0
540 ; VI-NEXT: s_mov_b32 s9, s1
541 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
542 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
543 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
546 ; GFX11-LABEL: select_f16_imm_d:
547 ; GFX11: ; %bb.0: ; %entry
548 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
549 ; GFX11-NEXT: s_mov_b32 s10, -1
550 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
551 ; GFX11-NEXT: s_mov_b32 s14, s10
552 ; GFX11-NEXT: s_mov_b32 s15, s11
553 ; GFX11-NEXT: s_mov_b32 s18, s10
554 ; GFX11-NEXT: s_mov_b32 s19, s11
555 ; GFX11-NEXT: s_mov_b32 s22, s10
556 ; GFX11-NEXT: s_mov_b32 s23, s11
557 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
558 ; GFX11-NEXT: s_mov_b32 s12, s2
559 ; GFX11-NEXT: s_mov_b32 s13, s3
560 ; GFX11-NEXT: s_mov_b32 s16, s4
561 ; GFX11-NEXT: s_mov_b32 s17, s5
562 ; GFX11-NEXT: s_mov_b32 s20, s6
563 ; GFX11-NEXT: s_mov_b32 s21, s7
564 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
565 ; GFX11-NEXT: s_waitcnt vmcnt(0)
566 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
567 ; GFX11-NEXT: s_waitcnt vmcnt(0)
568 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
569 ; GFX11-NEXT: s_waitcnt vmcnt(0)
570 ; GFX11-NEXT: s_mov_b32 s8, s0
571 ; GFX11-NEXT: s_mov_b32 s9, s1
572 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
573 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
574 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
575 ; GFX11-NEXT: s_nop 0
576 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
577 ; GFX11-NEXT: s_endpgm
581 ptr addrspace(1) %c) {
; Only three values are loaded; the select's false value is a constant.
583 %a.val = load volatile half, ptr addrspace(1) %a
584 %b.val = load volatile half, ptr addrspace(1) %b
585 %c.val = load volatile half, ptr addrspace(1) %c
586 %fcmp = fcmp olt half %a.val, %b.val
; Constant in the false position: %c.val when the compare holds, else 0xH3800.
587 %r.val = select i1 %fcmp, half %c.val, half 0xH3800
588 store half %r.val, ptr addrspace(1) %r
; select_v2f16: element-wise select on <2 x half> vectors. The kernel loads
; %a, %b, %c and %d (plain, non-volatile loads), evaluates
; select(%a.val olt %b.val, %c.val, %d.val) per element and stores the
; <2 x half> result to %r.
; Expected code, per the assertions below:
;   - Each vector is fetched with a single dword load, and the four loads are
;     issued before a descending s_waitcnt vmcnt(3..0) sequence.
;   - The high element is extracted with v_lshrrev_b32 by 16; each element
;     gets its own compare and v_cndmask.
;   - tahiti run: every element is widened with v_cvt_f32_f16, compared with
;     v_cmp_lt_f32, narrowed with v_cvt_f16_f32, and the halves are repacked
;     with v_lshlrev_b32 + v_or_b32.
;   - fiji run: v_cmp_lt_f16 compares; repacked with v_lshlrev_b32 and an
;     SDWA v_or_b32 that takes WORD_0 of the low result.
;   - gfx1100 run: v_cmp_lt_f16 compares; the second select is fused with the
;     0xffff mask in a v_dual instruction, then v_lshl_or_b32 repacks.
592 define amdgpu_kernel void @select_v2f16(
593 ; SI-LABEL: select_v2f16:
594 ; SI: ; %bb.0: ; %entry
595 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
596 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
597 ; SI-NEXT: s_mov_b32 s3, 0xf000
598 ; SI-NEXT: s_mov_b32 s2, -1
599 ; SI-NEXT: s_mov_b32 s18, s2
600 ; SI-NEXT: s_waitcnt lgkmcnt(0)
601 ; SI-NEXT: s_mov_b32 s16, s6
602 ; SI-NEXT: s_mov_b32 s17, s7
603 ; SI-NEXT: s_mov_b32 s19, s3
604 ; SI-NEXT: s_mov_b32 s20, s8
605 ; SI-NEXT: s_mov_b32 s21, s9
606 ; SI-NEXT: s_mov_b32 s22, s2
607 ; SI-NEXT: s_mov_b32 s23, s3
608 ; SI-NEXT: s_mov_b32 s14, s2
609 ; SI-NEXT: s_mov_b32 s15, s3
610 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
611 ; SI-NEXT: s_mov_b32 s8, s10
612 ; SI-NEXT: s_mov_b32 s9, s11
613 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
614 ; SI-NEXT: s_mov_b32 s10, s2
615 ; SI-NEXT: s_mov_b32 s11, s3
616 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0
617 ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0
618 ; SI-NEXT: s_mov_b32 s0, s4
619 ; SI-NEXT: s_mov_b32 s1, s5
620 ; SI-NEXT: s_waitcnt vmcnt(3)
621 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
622 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
623 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
624 ; SI-NEXT: s_waitcnt vmcnt(2)
625 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
626 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
627 ; SI-NEXT: s_waitcnt vmcnt(1)
628 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
629 ; SI-NEXT: s_waitcnt vmcnt(0)
630 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
631 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
632 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
633 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
634 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
635 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
636 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6
637 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
638 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v1, v2
639 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
640 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
641 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
642 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
643 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
644 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
647 ; VI-LABEL: select_v2f16:
648 ; VI: ; %bb.0: ; %entry
649 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
650 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
651 ; VI-NEXT: s_mov_b32 s3, 0xf000
652 ; VI-NEXT: s_mov_b32 s2, -1
653 ; VI-NEXT: s_mov_b32 s14, s2
654 ; VI-NEXT: s_mov_b32 s15, s3
655 ; VI-NEXT: s_waitcnt lgkmcnt(0)
656 ; VI-NEXT: s_mov_b32 s16, s6
657 ; VI-NEXT: s_mov_b32 s17, s7
658 ; VI-NEXT: s_mov_b32 s18, s2
659 ; VI-NEXT: s_mov_b32 s19, s3
660 ; VI-NEXT: s_mov_b32 s20, s8
661 ; VI-NEXT: s_mov_b32 s21, s9
662 ; VI-NEXT: s_mov_b32 s22, s2
663 ; VI-NEXT: s_mov_b32 s23, s3
664 ; VI-NEXT: s_mov_b32 s8, s10
665 ; VI-NEXT: s_mov_b32 s9, s11
666 ; VI-NEXT: s_mov_b32 s10, s2
667 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
668 ; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0
669 ; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0
670 ; VI-NEXT: s_mov_b32 s11, s3
671 ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0
672 ; VI-NEXT: s_mov_b32 s0, s4
673 ; VI-NEXT: s_mov_b32 s1, s5
674 ; VI-NEXT: s_waitcnt vmcnt(3)
675 ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
676 ; VI-NEXT: s_waitcnt vmcnt(2)
677 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
678 ; VI-NEXT: s_waitcnt vmcnt(1)
679 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
680 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v2, v1
681 ; VI-NEXT: s_waitcnt vmcnt(0)
682 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
683 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
684 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
685 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
686 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
687 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
688 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
691 ; GFX11-LABEL: select_v2f16:
692 ; GFX11: ; %bb.0: ; %entry
693 ; GFX11-NEXT: s_clause 0x1
694 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
695 ; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
696 ; GFX11-NEXT: s_mov_b32 s2, -1
697 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
698 ; GFX11-NEXT: s_mov_b32 s14, s2
699 ; GFX11-NEXT: s_mov_b32 s15, s3
700 ; GFX11-NEXT: s_mov_b32 s22, s2
701 ; GFX11-NEXT: s_mov_b32 s23, s3
702 ; GFX11-NEXT: s_mov_b32 s18, s2
703 ; GFX11-NEXT: s_mov_b32 s19, s3
704 ; GFX11-NEXT: s_mov_b32 s26, s2
705 ; GFX11-NEXT: s_mov_b32 s27, s3
706 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
707 ; GFX11-NEXT: s_mov_b32 s20, s8
708 ; GFX11-NEXT: s_mov_b32 s21, s9
709 ; GFX11-NEXT: s_mov_b32 s16, s6
710 ; GFX11-NEXT: s_mov_b32 s17, s7
711 ; GFX11-NEXT: s_mov_b32 s24, s10
712 ; GFX11-NEXT: s_mov_b32 s25, s11
713 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
714 ; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0
715 ; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0
716 ; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0
717 ; GFX11-NEXT: s_mov_b32 s0, s4
718 ; GFX11-NEXT: s_mov_b32 s1, s5
719 ; GFX11-NEXT: s_waitcnt vmcnt(3)
720 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
721 ; GFX11-NEXT: s_waitcnt vmcnt(2)
722 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
723 ; GFX11-NEXT: s_waitcnt vmcnt(1)
724 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
725 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1
726 ; GFX11-NEXT: s_waitcnt vmcnt(0)
727 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
728 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
729 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5
730 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
731 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
732 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
733 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
734 ; GFX11-NEXT: s_nop 0
735 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
736 ; GFX11-NEXT: s_endpgm
741 ptr addrspace(1) %d) {
; Unlike the scalar tests in this file, these loads are not volatile.
743 %a.val = load <2 x half>, ptr addrspace(1) %a
744 %b.val = load <2 x half>, ptr addrspace(1) %b
745 %c.val = load <2 x half>, ptr addrspace(1) %c
746 %d.val = load <2 x half>, ptr addrspace(1) %d
; Vector compare produces one i1 per element.
747 %fcmp = fcmp olt <2 x half> %a.val, %b.val
; Each result element comes from %c.val or %d.val according to its own i1.
748 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
749 store <2 x half> %r.val, ptr addrspace(1) %r
; select_v2f16_imm_a: element-wise <2 x half> select where the first compare
; operand is the constant vector <0xH3800, 0xH3900>. The kernel evaluates
; select(<0xH3800, 0xH3900> olt %b.val, %c.val, %d.val) per element and
; stores the <2 x half> result to %r.
; Expected code, per the assertions below:
;   - The low element is compared against the inline operand 0.5.
;   - The high element (extracted with v_lshrrev_b32 by 16) is compared
;     against the second constant, which is provided per target as:
;       tahiti run:  s_mov_b32 s2, 0x3f200000 (used by the f32 compare)
;       fiji run:    s_movk_i32 s2, 0x3900
;       gfx1100 run: literal 0x3900 directly on v_cmp_lt_f16
;   - The two selected halves are repacked into one dword before the store.
753 define amdgpu_kernel void @select_v2f16_imm_a(
754 ; SI-LABEL: select_v2f16_imm_a:
755 ; SI: ; %bb.0: ; %entry
756 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
757 ; SI-NEXT: s_mov_b32 s11, 0xf000
758 ; SI-NEXT: s_mov_b32 s10, -1
759 ; SI-NEXT: s_mov_b32 s14, s10
760 ; SI-NEXT: s_mov_b32 s15, s11
761 ; SI-NEXT: s_waitcnt lgkmcnt(0)
762 ; SI-NEXT: s_mov_b32 s12, s2
763 ; SI-NEXT: s_mov_b32 s13, s3
764 ; SI-NEXT: s_mov_b32 s16, s4
765 ; SI-NEXT: s_mov_b32 s17, s5
766 ; SI-NEXT: s_mov_b32 s18, s10
767 ; SI-NEXT: s_mov_b32 s19, s11
768 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
769 ; SI-NEXT: s_mov_b32 s4, s6
770 ; SI-NEXT: s_mov_b32 s5, s7
771 ; SI-NEXT: s_mov_b32 s6, s10
772 ; SI-NEXT: s_mov_b32 s7, s11
773 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
774 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
775 ; SI-NEXT: s_mov_b32 s2, 0x3f200000
776 ; SI-NEXT: s_mov_b32 s8, s0
777 ; SI-NEXT: s_mov_b32 s9, s1
778 ; SI-NEXT: s_waitcnt vmcnt(2)
779 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
780 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
781 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
782 ; SI-NEXT: s_waitcnt vmcnt(1)
783 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
784 ; SI-NEXT: s_waitcnt vmcnt(0)
785 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
786 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
787 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
788 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
789 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
790 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3
791 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
792 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
793 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
794 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
795 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
796 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
797 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
798 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
801 ; VI-LABEL: select_v2f16_imm_a:
802 ; VI: ; %bb.0: ; %entry
803 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
804 ; VI-NEXT: s_mov_b32 s11, 0xf000
805 ; VI-NEXT: s_mov_b32 s10, -1
806 ; VI-NEXT: s_mov_b32 s14, s10
807 ; VI-NEXT: s_mov_b32 s15, s11
808 ; VI-NEXT: s_waitcnt lgkmcnt(0)
809 ; VI-NEXT: s_mov_b32 s12, s2
810 ; VI-NEXT: s_mov_b32 s13, s3
811 ; VI-NEXT: s_mov_b32 s16, s4
812 ; VI-NEXT: s_mov_b32 s17, s5
813 ; VI-NEXT: s_mov_b32 s18, s10
814 ; VI-NEXT: s_mov_b32 s19, s11
815 ; VI-NEXT: s_mov_b32 s4, s6
816 ; VI-NEXT: s_mov_b32 s5, s7
817 ; VI-NEXT: s_mov_b32 s6, s10
818 ; VI-NEXT: s_mov_b32 s7, s11
819 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
820 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
821 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
822 ; VI-NEXT: s_movk_i32 s2, 0x3900
823 ; VI-NEXT: s_mov_b32 s8, s0
824 ; VI-NEXT: s_mov_b32 s9, s1
825 ; VI-NEXT: s_waitcnt vmcnt(2)
826 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
827 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
828 ; VI-NEXT: s_waitcnt vmcnt(0)
829 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
830 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
831 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
832 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
833 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
834 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
835 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
836 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
839 ; GFX11-LABEL: select_v2f16_imm_a:
840 ; GFX11: ; %bb.0: ; %entry
841 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
842 ; GFX11-NEXT: s_mov_b32 s10, -1
843 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
844 ; GFX11-NEXT: s_mov_b32 s14, s10
845 ; GFX11-NEXT: s_mov_b32 s15, s11
846 ; GFX11-NEXT: s_mov_b32 s18, s10
847 ; GFX11-NEXT: s_mov_b32 s19, s11
848 ; GFX11-NEXT: s_mov_b32 s22, s10
849 ; GFX11-NEXT: s_mov_b32 s23, s11
850 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
851 ; GFX11-NEXT: s_mov_b32 s12, s2
852 ; GFX11-NEXT: s_mov_b32 s13, s3
853 ; GFX11-NEXT: s_mov_b32 s16, s4
854 ; GFX11-NEXT: s_mov_b32 s17, s5
855 ; GFX11-NEXT: s_mov_b32 s20, s6
856 ; GFX11-NEXT: s_mov_b32 s21, s7
857 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
858 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
859 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
860 ; GFX11-NEXT: s_mov_b32 s8, s0
861 ; GFX11-NEXT: s_mov_b32 s9, s1
862 ; GFX11-NEXT: s_waitcnt vmcnt(2)
863 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
864 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
865 ; GFX11-NEXT: s_waitcnt vmcnt(1)
866 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
867 ; GFX11-NEXT: s_waitcnt vmcnt(0)
868 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
869 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
870 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3
871 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
872 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
873 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
874 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
875 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
876 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
877 ; GFX11-NEXT: s_nop 0
878 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
879 ; GFX11-NEXT: s_endpgm
883 ptr addrspace(1) %d) {
; Only three vectors are loaded (non-volatile); the left compare operand is
; a constant vector.
885 %b.val = load <2 x half>, ptr addrspace(1) %b
886 %c.val = load <2 x half>, ptr addrspace(1) %c
887 %d.val = load <2 x half>, ptr addrspace(1) %d
; The two elements use different constants, so each element gets its own
; immediate in the expected output.
888 %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
889 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
890 store <2 x half> %r.val, ptr addrspace(1) %r
; select_v2f16_imm_b - packed-half select where the second compare operand is
; a constant vector. The kernel loads <2 x half> values from %a, %c and %d,
; evaluates fcmp olt %a.val, <0xH3800, 0xH3900> (0.5 and 0.625 as IEEE half),
; picks %c.val or %d.val per lane and stores the result to %r.
; The autogenerated assertions below expect the constant as the first source
; of a v_cmp_gt, i.e. the olt compare with its operands commuted. On the
; tahiti run the compare is done in f32, so 0xH3900 shows up as 0x3f200000.
894 define amdgpu_kernel void @select_v2f16_imm_b(
895 ; SI-LABEL: select_v2f16_imm_b:
896 ; SI: ; %bb.0: ; %entry
897 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
898 ; SI-NEXT: s_mov_b32 s11, 0xf000
899 ; SI-NEXT: s_mov_b32 s10, -1
900 ; SI-NEXT: s_mov_b32 s14, s10
901 ; SI-NEXT: s_mov_b32 s15, s11
902 ; SI-NEXT: s_waitcnt lgkmcnt(0)
903 ; SI-NEXT: s_mov_b32 s12, s2
904 ; SI-NEXT: s_mov_b32 s13, s3
905 ; SI-NEXT: s_mov_b32 s16, s4
906 ; SI-NEXT: s_mov_b32 s17, s5
907 ; SI-NEXT: s_mov_b32 s18, s10
908 ; SI-NEXT: s_mov_b32 s19, s11
909 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
910 ; SI-NEXT: s_mov_b32 s4, s6
911 ; SI-NEXT: s_mov_b32 s5, s7
912 ; SI-NEXT: s_mov_b32 s6, s10
913 ; SI-NEXT: s_mov_b32 s7, s11
914 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
915 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
916 ; SI-NEXT: s_mov_b32 s2, 0x3f200000
917 ; SI-NEXT: s_mov_b32 s8, s0
918 ; SI-NEXT: s_mov_b32 s9, s1
919 ; SI-NEXT: s_waitcnt vmcnt(2)
920 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
921 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
922 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
923 ; SI-NEXT: s_waitcnt vmcnt(1)
924 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
925 ; SI-NEXT: s_waitcnt vmcnt(0)
926 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
927 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
928 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
929 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
930 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
931 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3
932 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
933 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
934 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
935 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
936 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
937 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
938 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
939 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
942 ; VI-LABEL: select_v2f16_imm_b:
943 ; VI: ; %bb.0: ; %entry
944 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
945 ; VI-NEXT: s_mov_b32 s11, 0xf000
946 ; VI-NEXT: s_mov_b32 s10, -1
947 ; VI-NEXT: s_mov_b32 s14, s10
948 ; VI-NEXT: s_mov_b32 s15, s11
949 ; VI-NEXT: s_waitcnt lgkmcnt(0)
950 ; VI-NEXT: s_mov_b32 s12, s2
951 ; VI-NEXT: s_mov_b32 s13, s3
952 ; VI-NEXT: s_mov_b32 s16, s4
953 ; VI-NEXT: s_mov_b32 s17, s5
954 ; VI-NEXT: s_mov_b32 s18, s10
955 ; VI-NEXT: s_mov_b32 s19, s11
956 ; VI-NEXT: s_mov_b32 s4, s6
957 ; VI-NEXT: s_mov_b32 s5, s7
958 ; VI-NEXT: s_mov_b32 s6, s10
959 ; VI-NEXT: s_mov_b32 s7, s11
960 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
961 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
962 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
963 ; VI-NEXT: s_movk_i32 s2, 0x3900
964 ; VI-NEXT: s_mov_b32 s8, s0
965 ; VI-NEXT: s_mov_b32 s9, s1
966 ; VI-NEXT: s_waitcnt vmcnt(2)
967 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
968 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
969 ; VI-NEXT: s_waitcnt vmcnt(0)
970 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
971 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
972 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
973 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
974 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
975 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
976 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
977 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
980 ; GFX11-LABEL: select_v2f16_imm_b:
981 ; GFX11: ; %bb.0: ; %entry
982 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
983 ; GFX11-NEXT: s_mov_b32 s10, -1
984 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
985 ; GFX11-NEXT: s_mov_b32 s14, s10
986 ; GFX11-NEXT: s_mov_b32 s15, s11
987 ; GFX11-NEXT: s_mov_b32 s18, s10
988 ; GFX11-NEXT: s_mov_b32 s19, s11
989 ; GFX11-NEXT: s_mov_b32 s22, s10
990 ; GFX11-NEXT: s_mov_b32 s23, s11
991 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
992 ; GFX11-NEXT: s_mov_b32 s12, s2
993 ; GFX11-NEXT: s_mov_b32 s13, s3
994 ; GFX11-NEXT: s_mov_b32 s16, s4
995 ; GFX11-NEXT: s_mov_b32 s17, s5
996 ; GFX11-NEXT: s_mov_b32 s20, s6
997 ; GFX11-NEXT: s_mov_b32 s21, s7
998 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
999 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
1000 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
1001 ; GFX11-NEXT: s_mov_b32 s8, s0
1002 ; GFX11-NEXT: s_mov_b32 s9, s1
1003 ; GFX11-NEXT: s_waitcnt vmcnt(2)
1004 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
1005 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
1006 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1007 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1008 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1009 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1010 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
1011 ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3
1012 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1013 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
1014 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1015 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1016 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1017 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1018 ; GFX11-NEXT: s_nop 0
1019 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1020 ; GFX11-NEXT: s_endpgm
1021 ptr addrspace(1) %r,
1022 ptr addrspace(1) %a,
1023 ptr addrspace(1) %c,
1024 ptr addrspace(1) %d) {
; Inputs - %a feeds the compare, %c and %d are the two select candidates.
1026 %a.val = load <2 x half>, ptr addrspace(1) %a
1027 %c.val = load <2 x half>, ptr addrspace(1) %c
1028 %d.val = load <2 x half>, ptr addrspace(1) %d
; Lane 0 is compared against 0xH3800 and lane 1 against 0xH3900; the constant
; vector is the right-hand operand of the ordered less-than.
1029 %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
1030 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
1031 store <2 x half> %r.val, ptr addrspace(1) %r
; select_v2f16_imm_c - packed-half select where the true value is a constant
; vector. The kernel loads <2 x half> values from %a, %b and %d, evaluates
; fcmp olt %a.val, %b.val, and stores <0xH3800, 0xH3900> (0.5 and 0.625 as
; IEEE half) in lanes where the compare holds and %d.val in the other lanes.
; The autogenerated assertions below expect an inverted compare (v_cmp_nlt)
; with the constant as the first source operand of v_cndmask, which is the
; value taken when the condition bit is clear.
1035 define amdgpu_kernel void @select_v2f16_imm_c(
1036 ; SI-LABEL: select_v2f16_imm_c:
1037 ; SI: ; %bb.0: ; %entry
1038 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
1039 ; SI-NEXT: s_mov_b32 s11, 0xf000
1040 ; SI-NEXT: s_mov_b32 s10, -1
1041 ; SI-NEXT: s_mov_b32 s14, s10
1042 ; SI-NEXT: s_mov_b32 s15, s11
1043 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1044 ; SI-NEXT: s_mov_b32 s12, s2
1045 ; SI-NEXT: s_mov_b32 s13, s3
1046 ; SI-NEXT: s_mov_b32 s16, s4
1047 ; SI-NEXT: s_mov_b32 s17, s5
1048 ; SI-NEXT: s_mov_b32 s18, s10
1049 ; SI-NEXT: s_mov_b32 s19, s11
1050 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1051 ; SI-NEXT: s_mov_b32 s4, s6
1052 ; SI-NEXT: s_mov_b32 s5, s7
1053 ; SI-NEXT: s_mov_b32 s6, s10
1054 ; SI-NEXT: s_mov_b32 s7, s11
1055 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
1056 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
1057 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
1058 ; SI-NEXT: s_mov_b32 s8, s0
1059 ; SI-NEXT: s_mov_b32 s9, s1
1060 ; SI-NEXT: s_waitcnt vmcnt(2)
1061 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
1062 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1063 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1064 ; SI-NEXT: s_waitcnt vmcnt(1)
1065 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
1066 ; SI-NEXT: s_waitcnt vmcnt(0)
1067 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
1068 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
1069 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
1070 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1071 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1072 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5
1073 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
1074 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
1075 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1076 ; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc
1077 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1078 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1079 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1080 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1083 ; VI-LABEL: select_v2f16_imm_c:
1084 ; VI: ; %bb.0: ; %entry
1085 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1086 ; VI-NEXT: s_mov_b32 s11, 0xf000
1087 ; VI-NEXT: s_mov_b32 s10, -1
1088 ; VI-NEXT: s_mov_b32 s18, s10
1089 ; VI-NEXT: s_mov_b32 s19, s11
1090 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1091 ; VI-NEXT: s_mov_b32 s16, s4
1092 ; VI-NEXT: s_mov_b32 s17, s5
1093 ; VI-NEXT: s_mov_b32 s14, s10
1094 ; VI-NEXT: s_mov_b32 s12, s2
1095 ; VI-NEXT: s_mov_b32 s13, s3
1096 ; VI-NEXT: s_mov_b32 s15, s11
1097 ; VI-NEXT: s_mov_b32 s4, s6
1098 ; VI-NEXT: s_mov_b32 s5, s7
1099 ; VI-NEXT: s_mov_b32 s6, s10
1100 ; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
1101 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1102 ; VI-NEXT: s_mov_b32 s7, s11
1103 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
1104 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800
1105 ; VI-NEXT: v_mov_b32_e32 v4, 0x3900
1106 ; VI-NEXT: s_mov_b32 s8, s0
1107 ; VI-NEXT: s_mov_b32 s9, s1
1108 ; VI-NEXT: s_waitcnt vmcnt(2)
1109 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
1110 ; VI-NEXT: s_waitcnt vmcnt(1)
1111 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
1112 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
1113 ; VI-NEXT: s_waitcnt vmcnt(0)
1114 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
1115 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1116 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5
1117 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
1118 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1119 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1120 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1123 ; GFX11-LABEL: select_v2f16_imm_c:
1124 ; GFX11: ; %bb.0: ; %entry
1125 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
1126 ; GFX11-NEXT: s_mov_b32 s10, -1
1127 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1128 ; GFX11-NEXT: s_mov_b32 s18, s10
1129 ; GFX11-NEXT: s_mov_b32 s19, s11
1130 ; GFX11-NEXT: s_mov_b32 s14, s10
1131 ; GFX11-NEXT: s_mov_b32 s15, s11
1132 ; GFX11-NEXT: s_mov_b32 s22, s10
1133 ; GFX11-NEXT: s_mov_b32 s23, s11
1134 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1135 ; GFX11-NEXT: s_mov_b32 s16, s4
1136 ; GFX11-NEXT: s_mov_b32 s17, s5
1137 ; GFX11-NEXT: s_mov_b32 s12, s2
1138 ; GFX11-NEXT: s_mov_b32 s13, s3
1139 ; GFX11-NEXT: s_mov_b32 s20, s6
1140 ; GFX11-NEXT: s_mov_b32 s21, s7
1141 ; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
1142 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1143 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
1144 ; GFX11-NEXT: s_mov_b32 s8, s0
1145 ; GFX11-NEXT: s_mov_b32 s9, s1
1146 ; GFX11-NEXT: s_waitcnt vmcnt(2)
1147 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
1148 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1149 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1150 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
1151 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1152 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1153 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
1154 ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3
1155 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
1156 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1157 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
1158 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1159 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1160 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1161 ; GFX11-NEXT: s_nop 0
1162 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1163 ; GFX11-NEXT: s_endpgm
1164 ptr addrspace(1) %r,
1165 ptr addrspace(1) %a,
1166 ptr addrspace(1) %b,
1167 ptr addrspace(1) %d) {
; Inputs - %a and %b feed the compare, %d is the value used when it fails.
1169 %a.val = load <2 x half>, ptr addrspace(1) %a
1170 %b.val = load <2 x half>, ptr addrspace(1) %b
1171 %d.val = load <2 x half>, ptr addrspace(1) %d
1172 %fcmp = fcmp olt <2 x half> %a.val, %b.val
; The constant vector is the select's true operand - lane 0 gets 0xH3800 and
; lane 1 gets 0xH3900 wherever %a.val is ordered-less-than %b.val.
1173 %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
1174 store <2 x half> %r.val, ptr addrspace(1) %r
; select_v2f16_imm_d - packed-half select where the false value is a constant
; vector. The kernel loads <2 x half> values from %a, %b and %c, evaluates
; fcmp olt %a.val, %b.val, and stores %c.val in lanes where the compare holds
; and <0xH3800, 0xH3900> (0.5 and 0.625 as IEEE half) in the other lanes.
; The autogenerated assertions below expect a direct v_cmp_lt with the
; constant as the first source operand of v_cndmask, which is the value taken
; when the condition bit is clear.
1178 define amdgpu_kernel void @select_v2f16_imm_d(
1179 ; SI-LABEL: select_v2f16_imm_d:
1180 ; SI: ; %bb.0: ; %entry
1181 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
1182 ; SI-NEXT: s_mov_b32 s11, 0xf000
1183 ; SI-NEXT: s_mov_b32 s10, -1
1184 ; SI-NEXT: s_mov_b32 s14, s10
1185 ; SI-NEXT: s_mov_b32 s15, s11
1186 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1187 ; SI-NEXT: s_mov_b32 s12, s2
1188 ; SI-NEXT: s_mov_b32 s13, s3
1189 ; SI-NEXT: s_mov_b32 s16, s4
1190 ; SI-NEXT: s_mov_b32 s17, s5
1191 ; SI-NEXT: s_mov_b32 s18, s10
1192 ; SI-NEXT: s_mov_b32 s19, s11
1193 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
1194 ; SI-NEXT: s_mov_b32 s4, s6
1195 ; SI-NEXT: s_mov_b32 s5, s7
1196 ; SI-NEXT: s_mov_b32 s6, s10
1197 ; SI-NEXT: s_mov_b32 s7, s11
1198 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
1199 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
1200 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
1201 ; SI-NEXT: s_mov_b32 s8, s0
1202 ; SI-NEXT: s_mov_b32 s9, s1
1203 ; SI-NEXT: s_waitcnt vmcnt(2)
1204 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
1205 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
1206 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1207 ; SI-NEXT: s_waitcnt vmcnt(1)
1208 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
1209 ; SI-NEXT: s_waitcnt vmcnt(0)
1210 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
1211 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
1212 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
1213 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1214 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1215 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
1216 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
1217 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
1218 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
1219 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
1220 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1221 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
1222 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
1223 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1226 ; VI-LABEL: select_v2f16_imm_d:
1227 ; VI: ; %bb.0: ; %entry
1228 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1229 ; VI-NEXT: s_mov_b32 s11, 0xf000
1230 ; VI-NEXT: s_mov_b32 s10, -1
1231 ; VI-NEXT: s_mov_b32 s18, s10
1232 ; VI-NEXT: s_mov_b32 s19, s11
1233 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1234 ; VI-NEXT: s_mov_b32 s16, s4
1235 ; VI-NEXT: s_mov_b32 s17, s5
1236 ; VI-NEXT: s_mov_b32 s14, s10
1237 ; VI-NEXT: s_mov_b32 s12, s2
1238 ; VI-NEXT: s_mov_b32 s13, s3
1239 ; VI-NEXT: s_mov_b32 s15, s11
1240 ; VI-NEXT: s_mov_b32 s4, s6
1241 ; VI-NEXT: s_mov_b32 s5, s7
1242 ; VI-NEXT: s_mov_b32 s6, s10
1243 ; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
1244 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
1245 ; VI-NEXT: s_mov_b32 s7, s11
1246 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
1247 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800
1248 ; VI-NEXT: v_mov_b32_e32 v4, 0x3900
1249 ; VI-NEXT: s_mov_b32 s8, s0
1250 ; VI-NEXT: s_mov_b32 s9, s1
1251 ; VI-NEXT: s_waitcnt vmcnt(2)
1252 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
1253 ; VI-NEXT: s_waitcnt vmcnt(1)
1254 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
1255 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0
1256 ; VI-NEXT: s_waitcnt vmcnt(0)
1257 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
1258 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1259 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
1260 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
1261 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1262 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1263 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1266 ; GFX11-LABEL: select_v2f16_imm_d:
1267 ; GFX11: ; %bb.0: ; %entry
1268 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
1269 ; GFX11-NEXT: s_mov_b32 s10, -1
1270 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
1271 ; GFX11-NEXT: s_mov_b32 s18, s10
1272 ; GFX11-NEXT: s_mov_b32 s19, s11
1273 ; GFX11-NEXT: s_mov_b32 s14, s10
1274 ; GFX11-NEXT: s_mov_b32 s15, s11
1275 ; GFX11-NEXT: s_mov_b32 s22, s10
1276 ; GFX11-NEXT: s_mov_b32 s23, s11
1277 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1278 ; GFX11-NEXT: s_mov_b32 s16, s4
1279 ; GFX11-NEXT: s_mov_b32 s17, s5
1280 ; GFX11-NEXT: s_mov_b32 s12, s2
1281 ; GFX11-NEXT: s_mov_b32 s13, s3
1282 ; GFX11-NEXT: s_mov_b32 s20, s6
1283 ; GFX11-NEXT: s_mov_b32 s21, s7
1284 ; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
1285 ; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
1286 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
1287 ; GFX11-NEXT: s_mov_b32 s8, s0
1288 ; GFX11-NEXT: s_mov_b32 s9, s1
1289 ; GFX11-NEXT: s_waitcnt vmcnt(2)
1290 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
1291 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1292 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
1293 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
1294 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1295 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1296 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
1297 ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3
1298 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
1299 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1300 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
1301 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1302 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1303 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
1304 ; GFX11-NEXT: s_nop 0
1305 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1306 ; GFX11-NEXT: s_endpgm
1307 ptr addrspace(1) %r,
1308 ptr addrspace(1) %a,
1309 ptr addrspace(1) %b,
1310 ptr addrspace(1) %c) {
; Inputs - %a and %b feed the compare, %c is the value used when it holds.
1312 %a.val = load <2 x half>, ptr addrspace(1) %a
1313 %b.val = load <2 x half>, ptr addrspace(1) %b
1314 %c.val = load <2 x half>, ptr addrspace(1) %c
1315 %fcmp = fcmp olt <2 x half> %a.val, %b.val
; The constant vector is the select's false operand - lane 0 falls back to
; 0xH3800 and lane 1 to 0xH3900 wherever the ordered less-than does not hold.
1316 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
1317 store <2 x half> %r.val, ptr addrspace(1) %r