1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
3 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
; Test: scalar half select driven by a less-than compare.
; The IR computes r = (a < b) ? c : d. All four operands are read through
; volatile loads from addrspace(1) pointers and the result is stored to %r.
; Expected tahiti output widens every operand with v_cvt_f32_f16, compares and
; selects in f32, then narrows with v_cvt_f16_f32. Expected fiji output compares
; directly with v_cmp_lt_f16 and selects without any conversion.
5 define amdgpu_kernel void @select_f16(
6 ; SI-LABEL: select_f16:
7 ; SI: ; %bb.0: ; %entry
8 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11
10 ; SI-NEXT: s_mov_b32 s15, 0xf000
11 ; SI-NEXT: s_mov_b32 s14, -1
12 ; SI-NEXT: s_mov_b32 s22, s14
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: s_mov_b32 s16, s10
15 ; SI-NEXT: s_mov_b32 s17, s11
16 ; SI-NEXT: s_mov_b32 s10, s14
17 ; SI-NEXT: s_mov_b32 s11, s15
18 ; SI-NEXT: s_mov_b32 s20, s6
19 ; SI-NEXT: s_mov_b32 s21, s7
20 ; SI-NEXT: s_mov_b32 s23, s15
21 ; SI-NEXT: s_mov_b32 s2, s14
22 ; SI-NEXT: s_mov_b32 s3, s15
23 ; SI-NEXT: s_mov_b32 s18, s14
24 ; SI-NEXT: s_mov_b32 s19, s15
25 ; SI-NEXT: buffer_load_ushort v0, off, s[20:23], 0
26 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
27 ; SI-NEXT: buffer_load_ushort v2, off, s[16:19], 0
28 ; SI-NEXT: buffer_load_ushort v3, off, s[0:3], 0
29 ; SI-NEXT: s_mov_b32 s12, s4
30 ; SI-NEXT: s_mov_b32 s13, s5
31 ; SI-NEXT: s_waitcnt vmcnt(3)
32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
33 ; SI-NEXT: s_waitcnt vmcnt(2)
34 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
35 ; SI-NEXT: s_waitcnt vmcnt(1)
36 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
37 ; SI-NEXT: s_waitcnt vmcnt(0)
38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
39 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
40 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
41 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
42 ; SI-NEXT: buffer_store_short v0, off, s[12:15], 0
45 ; VI-LABEL: select_f16:
46 ; VI: ; %bb.0: ; %entry
47 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
48 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
49 ; VI-NEXT: s_mov_b32 s3, 0xf000
50 ; VI-NEXT: s_mov_b32 s2, -1
51 ; VI-NEXT: s_mov_b32 s14, s2
52 ; VI-NEXT: s_waitcnt lgkmcnt(0)
53 ; VI-NEXT: s_mov_b32 s0, s4
54 ; VI-NEXT: s_mov_b32 s1, s5
55 ; VI-NEXT: s_mov_b32 s16, s10
56 ; VI-NEXT: s_mov_b32 s17, s11
57 ; VI-NEXT: s_mov_b32 s4, s6
58 ; VI-NEXT: s_mov_b32 s5, s7
59 ; VI-NEXT: s_mov_b32 s10, s2
60 ; VI-NEXT: s_mov_b32 s11, s3
61 ; VI-NEXT: s_mov_b32 s6, s2
62 ; VI-NEXT: s_mov_b32 s7, s3
63 ; VI-NEXT: s_mov_b32 s15, s3
64 ; VI-NEXT: s_mov_b32 s18, s2
65 ; VI-NEXT: s_mov_b32 s19, s3
66 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
67 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
68 ; VI-NEXT: buffer_load_ushort v2, off, s[16:19], 0
69 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
70 ; VI-NEXT: s_waitcnt vmcnt(2)
71 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
72 ; VI-NEXT: s_waitcnt vmcnt(0)
73 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
74 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
76 half addrspace(1)* %r,
77 half addrspace(1)* %a,
78 half addrspace(1)* %b,
79 half addrspace(1)* %c,
80 half addrspace(1)* %d) {
82 %a.val = load volatile half, half addrspace(1)* %a
83 %b.val = load volatile half, half addrspace(1)* %b
84 %c.val = load volatile half, half addrspace(1)* %c
85 %d.val = load volatile half, half addrspace(1)* %d
; Condition a < b picks %c.val when true and %d.val otherwise.
86 %fcmp = fcmp olt half %a.val, %b.val
87 %r.val = select i1 %fcmp, half %c.val, half %d.val
88 store half %r.val, half addrspace(1)* %r
; Test: scalar half select where the compare's left operand is a constant.
; The IR computes r = (0xH3800 < b) ? c : d with b, c and d read through volatile
; loads. The expected output for both targets keeps the constant as the first
; compare source and prints it as the inline constant 0.5 (v_cmp_lt_f32 on
; tahiti after f32 widening, v_cmp_lt_f16 on fiji).
92 define amdgpu_kernel void @select_f16_imm_a(
93 ; SI-LABEL: select_f16_imm_a:
94 ; SI: ; %bb.0: ; %entry
95 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
96 ; SI-NEXT: s_mov_b32 s11, 0xf000
97 ; SI-NEXT: s_mov_b32 s10, -1
98 ; SI-NEXT: s_mov_b32 s18, s10
99 ; SI-NEXT: s_mov_b32 s19, s11
100 ; SI-NEXT: s_waitcnt lgkmcnt(0)
101 ; SI-NEXT: s_mov_b32 s16, s2
102 ; SI-NEXT: s_mov_b32 s17, s3
103 ; SI-NEXT: s_mov_b32 s12, s6
104 ; SI-NEXT: s_mov_b32 s13, s7
105 ; SI-NEXT: s_mov_b32 s14, s10
106 ; SI-NEXT: s_mov_b32 s15, s11
107 ; SI-NEXT: s_mov_b32 s6, s10
108 ; SI-NEXT: s_mov_b32 s7, s11
109 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
110 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
111 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
112 ; SI-NEXT: s_mov_b32 s8, s0
113 ; SI-NEXT: s_mov_b32 s9, s1
114 ; SI-NEXT: s_waitcnt vmcnt(2)
115 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
116 ; SI-NEXT: s_waitcnt vmcnt(1)
117 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
118 ; SI-NEXT: s_waitcnt vmcnt(0)
119 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
120 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
121 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
122 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
123 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
126 ; VI-LABEL: select_f16_imm_a:
127 ; VI: ; %bb.0: ; %entry
128 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
129 ; VI-NEXT: s_mov_b32 s11, 0xf000
130 ; VI-NEXT: s_mov_b32 s10, -1
131 ; VI-NEXT: s_mov_b32 s14, s10
132 ; VI-NEXT: s_mov_b32 s15, s11
133 ; VI-NEXT: s_waitcnt lgkmcnt(0)
134 ; VI-NEXT: s_mov_b32 s8, s0
135 ; VI-NEXT: s_mov_b32 s9, s1
136 ; VI-NEXT: s_mov_b32 s0, s2
137 ; VI-NEXT: s_mov_b32 s1, s3
138 ; VI-NEXT: s_mov_b32 s2, s10
139 ; VI-NEXT: s_mov_b32 s3, s11
140 ; VI-NEXT: s_mov_b32 s12, s6
141 ; VI-NEXT: s_mov_b32 s13, s7
142 ; VI-NEXT: s_mov_b32 s6, s10
143 ; VI-NEXT: s_mov_b32 s7, s11
144 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
145 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
146 ; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
147 ; VI-NEXT: s_waitcnt vmcnt(2)
148 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
149 ; VI-NEXT: s_waitcnt vmcnt(0)
150 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
151 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
153 half addrspace(1)* %r,
154 half addrspace(1)* %b,
155 half addrspace(1)* %c,
156 half addrspace(1)* %d) {
158 %b.val = load volatile half, half addrspace(1)* %b
159 %c.val = load volatile half, half addrspace(1)* %c
160 %d.val = load volatile half, half addrspace(1)* %d
; 0xH3800 is the half bit pattern that the expected assembly shows as 0.5.
161 %fcmp = fcmp olt half 0xH3800, %b.val
162 %r.val = select i1 %fcmp, half %c.val, half %d.val
163 store half %r.val, half addrspace(1)* %r
; Test: scalar half select where the compare's right operand is a constant.
; The IR computes r = (a < 0xH3800) ? c : d with a, c and d read through volatile
; loads. The expected output commutes the compare so the constant becomes the
; first source: v_cmp_gt with 0.5 and the loaded value (f32 form on tahiti
; after widening, f16 form on fiji).
167 define amdgpu_kernel void @select_f16_imm_b(
168 ; SI-LABEL: select_f16_imm_b:
169 ; SI: ; %bb.0: ; %entry
170 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
171 ; SI-NEXT: s_mov_b32 s11, 0xf000
172 ; SI-NEXT: s_mov_b32 s10, -1
173 ; SI-NEXT: s_mov_b32 s18, s10
174 ; SI-NEXT: s_mov_b32 s19, s11
175 ; SI-NEXT: s_waitcnt lgkmcnt(0)
176 ; SI-NEXT: s_mov_b32 s16, s2
177 ; SI-NEXT: s_mov_b32 s17, s3
178 ; SI-NEXT: s_mov_b32 s12, s6
179 ; SI-NEXT: s_mov_b32 s13, s7
180 ; SI-NEXT: s_mov_b32 s14, s10
181 ; SI-NEXT: s_mov_b32 s15, s11
182 ; SI-NEXT: s_mov_b32 s6, s10
183 ; SI-NEXT: s_mov_b32 s7, s11
184 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
185 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
186 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
187 ; SI-NEXT: s_mov_b32 s8, s0
188 ; SI-NEXT: s_mov_b32 s9, s1
189 ; SI-NEXT: s_waitcnt vmcnt(2)
190 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
191 ; SI-NEXT: s_waitcnt vmcnt(1)
192 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
193 ; SI-NEXT: s_waitcnt vmcnt(0)
194 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
195 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
196 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
197 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
198 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
201 ; VI-LABEL: select_f16_imm_b:
202 ; VI: ; %bb.0: ; %entry
203 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
204 ; VI-NEXT: s_mov_b32 s11, 0xf000
205 ; VI-NEXT: s_mov_b32 s10, -1
206 ; VI-NEXT: s_mov_b32 s14, s10
207 ; VI-NEXT: s_mov_b32 s15, s11
208 ; VI-NEXT: s_waitcnt lgkmcnt(0)
209 ; VI-NEXT: s_mov_b32 s8, s0
210 ; VI-NEXT: s_mov_b32 s9, s1
211 ; VI-NEXT: s_mov_b32 s0, s2
212 ; VI-NEXT: s_mov_b32 s1, s3
213 ; VI-NEXT: s_mov_b32 s2, s10
214 ; VI-NEXT: s_mov_b32 s3, s11
215 ; VI-NEXT: s_mov_b32 s12, s6
216 ; VI-NEXT: s_mov_b32 s13, s7
217 ; VI-NEXT: s_mov_b32 s6, s10
218 ; VI-NEXT: s_mov_b32 s7, s11
219 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
220 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
221 ; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
222 ; VI-NEXT: s_waitcnt vmcnt(2)
223 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
224 ; VI-NEXT: s_waitcnt vmcnt(0)
225 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
226 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
228 half addrspace(1)* %r,
229 half addrspace(1)* %a,
230 half addrspace(1)* %c,
231 half addrspace(1)* %d) {
233 %a.val = load volatile half, half addrspace(1)* %a
234 %c.val = load volatile half, half addrspace(1)* %c
235 %d.val = load volatile half, half addrspace(1)* %d
; Constant on the right-hand side; the expected assembly shows it as 0.5 > a.
236 %fcmp = fcmp olt half %a.val, 0xH3800
237 %r.val = select i1 %fcmp, half %c.val, half %d.val
238 store half %r.val, half addrspace(1)* %r
; Test: scalar half select whose true value is a constant.
; The IR computes r = (a < b) ? 0xH3800 : d with a, b and d read through volatile
; loads. The expected output uses the negated compare v_cmp_nlt and passes the
; constant as the first v_cndmask source: the inline constant 0.5 on tahiti
; (select done in f32), and a register holding 0x3800 set up by v_mov_b32 on
; fiji.
242 define amdgpu_kernel void @select_f16_imm_c(
243 ; SI-LABEL: select_f16_imm_c:
244 ; SI: ; %bb.0: ; %entry
245 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
246 ; SI-NEXT: s_mov_b32 s11, 0xf000
247 ; SI-NEXT: s_mov_b32 s10, -1
248 ; SI-NEXT: s_mov_b32 s18, s10
249 ; SI-NEXT: s_mov_b32 s19, s11
250 ; SI-NEXT: s_waitcnt lgkmcnt(0)
251 ; SI-NEXT: s_mov_b32 s12, s6
252 ; SI-NEXT: s_mov_b32 s13, s7
253 ; SI-NEXT: s_mov_b32 s6, s10
254 ; SI-NEXT: s_mov_b32 s7, s11
255 ; SI-NEXT: s_mov_b32 s16, s2
256 ; SI-NEXT: s_mov_b32 s17, s3
257 ; SI-NEXT: s_mov_b32 s14, s10
258 ; SI-NEXT: s_mov_b32 s15, s11
259 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
260 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
261 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
262 ; SI-NEXT: s_mov_b32 s8, s0
263 ; SI-NEXT: s_mov_b32 s9, s1
264 ; SI-NEXT: s_waitcnt vmcnt(2)
265 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
266 ; SI-NEXT: s_waitcnt vmcnt(1)
267 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
268 ; SI-NEXT: s_waitcnt vmcnt(0)
269 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
270 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
271 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
272 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
273 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
276 ; VI-LABEL: select_f16_imm_c:
277 ; VI: ; %bb.0: ; %entry
278 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
279 ; VI-NEXT: s_mov_b32 s11, 0xf000
280 ; VI-NEXT: s_mov_b32 s10, -1
281 ; VI-NEXT: s_mov_b32 s14, s10
282 ; VI-NEXT: s_mov_b32 s15, s11
283 ; VI-NEXT: s_waitcnt lgkmcnt(0)
284 ; VI-NEXT: s_mov_b32 s8, s0
285 ; VI-NEXT: s_mov_b32 s9, s1
286 ; VI-NEXT: s_mov_b32 s12, s6
287 ; VI-NEXT: s_mov_b32 s13, s7
288 ; VI-NEXT: s_mov_b32 s0, s2
289 ; VI-NEXT: s_mov_b32 s1, s3
290 ; VI-NEXT: s_mov_b32 s6, s10
291 ; VI-NEXT: s_mov_b32 s7, s11
292 ; VI-NEXT: s_mov_b32 s2, s10
293 ; VI-NEXT: s_mov_b32 s3, s11
294 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
295 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
296 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
297 ; VI-NEXT: v_mov_b32_e32 v2, 0x3800
298 ; VI-NEXT: s_waitcnt vmcnt(1)
299 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
300 ; VI-NEXT: s_waitcnt vmcnt(0)
301 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
302 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
304 half addrspace(1)* %r,
305 half addrspace(1)* %a,
306 half addrspace(1)* %b,
307 half addrspace(1)* %d) {
309 %a.val = load volatile half, half addrspace(1)* %a
310 %b.val = load volatile half, half addrspace(1)* %b
311 %d.val = load volatile half, half addrspace(1)* %d
312 %fcmp = fcmp olt half %a.val, %b.val
; The constant sits in the true position of the select; %d.val is the fallback.
313 %r.val = select i1 %fcmp, half 0xH3800, half %d.val
314 store half %r.val, half addrspace(1)* %r
; Test: scalar half select whose false value is a constant.
; The IR computes r = (a < b) ? c : 0xH3800 with a, b and c read through volatile
; loads. The expected output keeps the plain v_cmp_lt compare and passes the
; constant as the first v_cndmask source: the inline constant 0.5 on tahiti
; (select done in f32), and a register holding 0x3800 set up by v_mov_b32 on
; fiji.
318 define amdgpu_kernel void @select_f16_imm_d(
319 ; SI-LABEL: select_f16_imm_d:
320 ; SI: ; %bb.0: ; %entry
321 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
322 ; SI-NEXT: s_mov_b32 s11, 0xf000
323 ; SI-NEXT: s_mov_b32 s10, -1
324 ; SI-NEXT: s_mov_b32 s18, s10
325 ; SI-NEXT: s_mov_b32 s19, s11
326 ; SI-NEXT: s_waitcnt lgkmcnt(0)
327 ; SI-NEXT: s_mov_b32 s12, s6
328 ; SI-NEXT: s_mov_b32 s13, s7
329 ; SI-NEXT: s_mov_b32 s6, s10
330 ; SI-NEXT: s_mov_b32 s7, s11
331 ; SI-NEXT: s_mov_b32 s16, s2
332 ; SI-NEXT: s_mov_b32 s17, s3
333 ; SI-NEXT: s_mov_b32 s14, s10
334 ; SI-NEXT: s_mov_b32 s15, s11
335 ; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
336 ; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
337 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
338 ; SI-NEXT: s_mov_b32 s8, s0
339 ; SI-NEXT: s_mov_b32 s9, s1
340 ; SI-NEXT: s_waitcnt vmcnt(2)
341 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
342 ; SI-NEXT: s_waitcnt vmcnt(1)
343 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
344 ; SI-NEXT: s_waitcnt vmcnt(0)
345 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
346 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
347 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
348 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
349 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
352 ; VI-LABEL: select_f16_imm_d:
353 ; VI: ; %bb.0: ; %entry
354 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
355 ; VI-NEXT: s_mov_b32 s11, 0xf000
356 ; VI-NEXT: s_mov_b32 s10, -1
357 ; VI-NEXT: s_mov_b32 s14, s10
358 ; VI-NEXT: s_mov_b32 s15, s11
359 ; VI-NEXT: s_waitcnt lgkmcnt(0)
360 ; VI-NEXT: s_mov_b32 s8, s0
361 ; VI-NEXT: s_mov_b32 s9, s1
362 ; VI-NEXT: s_mov_b32 s12, s6
363 ; VI-NEXT: s_mov_b32 s13, s7
364 ; VI-NEXT: s_mov_b32 s0, s2
365 ; VI-NEXT: s_mov_b32 s1, s3
366 ; VI-NEXT: s_mov_b32 s6, s10
367 ; VI-NEXT: s_mov_b32 s7, s11
368 ; VI-NEXT: s_mov_b32 s2, s10
369 ; VI-NEXT: s_mov_b32 s3, s11
370 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
371 ; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
372 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
373 ; VI-NEXT: v_mov_b32_e32 v2, 0x3800
374 ; VI-NEXT: s_waitcnt vmcnt(1)
375 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
376 ; VI-NEXT: s_waitcnt vmcnt(0)
377 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
378 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
380 half addrspace(1)* %r,
381 half addrspace(1)* %a,
382 half addrspace(1)* %b,
383 half addrspace(1)* %c) {
385 %a.val = load volatile half, half addrspace(1)* %a
386 %b.val = load volatile half, half addrspace(1)* %b
387 %c.val = load volatile half, half addrspace(1)* %c
388 %fcmp = fcmp olt half %a.val, %b.val
; The constant sits in the false position of the select; %c.val is chosen when a < b.
389 %r.val = select i1 %fcmp, half %c.val, half 0xH3800
390 store half %r.val, half addrspace(1)* %r
; Test: two-element half vector select driven by a lane-wise less-than compare.
; The IR computes r = (a < b) ? c : d per lane on <2 x half> values; the loads
; here are plain (not volatile). In the expected output each vector is fetched
; as one dword, the high lane is extracted with a 16-bit right shift, every
; lane gets its own compare and v_cndmask, and the two results are merged with
; a 16-bit left shift plus an or (the SDWA form of v_or_b32 on fiji). Tahiti
; additionally widens each lane to f32 and narrows the selected values back.
394 define amdgpu_kernel void @select_v2f16(
395 ; SI-LABEL: select_v2f16:
396 ; SI: ; %bb.0: ; %entry
397 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
398 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11
399 ; SI-NEXT: s_mov_b32 s15, 0xf000
400 ; SI-NEXT: s_mov_b32 s14, -1
401 ; SI-NEXT: s_mov_b32 s22, s14
402 ; SI-NEXT: s_waitcnt lgkmcnt(0)
403 ; SI-NEXT: s_mov_b32 s16, s10
404 ; SI-NEXT: s_mov_b32 s17, s11
405 ; SI-NEXT: s_mov_b32 s10, s14
406 ; SI-NEXT: s_mov_b32 s11, s15
407 ; SI-NEXT: s_mov_b32 s20, s6
408 ; SI-NEXT: s_mov_b32 s21, s7
409 ; SI-NEXT: s_mov_b32 s23, s15
410 ; SI-NEXT: s_mov_b32 s2, s14
411 ; SI-NEXT: s_mov_b32 s3, s15
412 ; SI-NEXT: buffer_load_dword v0, off, s[20:23], 0
413 ; SI-NEXT: s_mov_b32 s18, s14
414 ; SI-NEXT: s_mov_b32 s19, s15
415 ; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
416 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], 0
417 ; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0
418 ; SI-NEXT: s_mov_b32 s12, s4
419 ; SI-NEXT: s_mov_b32 s13, s5
420 ; SI-NEXT: s_waitcnt vmcnt(3)
421 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
422 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
423 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
424 ; SI-NEXT: s_waitcnt vmcnt(2)
425 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
426 ; SI-NEXT: s_waitcnt vmcnt(1)
427 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2
428 ; SI-NEXT: s_waitcnt vmcnt(0)
429 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
430 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
431 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
432 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
433 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
434 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
435 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
436 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6
437 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
438 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
439 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
440 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
441 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
442 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
443 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
444 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
447 ; VI-LABEL: select_v2f16:
448 ; VI: ; %bb.0: ; %entry
449 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
450 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
451 ; VI-NEXT: s_mov_b32 s3, 0xf000
452 ; VI-NEXT: s_mov_b32 s2, -1
453 ; VI-NEXT: s_mov_b32 s14, s2
454 ; VI-NEXT: s_waitcnt lgkmcnt(0)
455 ; VI-NEXT: s_mov_b32 s0, s4
456 ; VI-NEXT: s_mov_b32 s1, s5
457 ; VI-NEXT: s_mov_b32 s16, s10
458 ; VI-NEXT: s_mov_b32 s17, s11
459 ; VI-NEXT: s_mov_b32 s4, s6
460 ; VI-NEXT: s_mov_b32 s5, s7
461 ; VI-NEXT: s_mov_b32 s10, s2
462 ; VI-NEXT: s_mov_b32 s11, s3
463 ; VI-NEXT: s_mov_b32 s6, s2
464 ; VI-NEXT: s_mov_b32 s7, s3
465 ; VI-NEXT: s_mov_b32 s15, s3
466 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
467 ; VI-NEXT: s_mov_b32 s18, s2
468 ; VI-NEXT: s_mov_b32 s19, s3
469 ; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
470 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
471 ; VI-NEXT: buffer_load_dword v3, off, s[16:19], 0
472 ; VI-NEXT: s_waitcnt vmcnt(3)
473 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
474 ; VI-NEXT: s_waitcnt vmcnt(2)
475 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
476 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
477 ; VI-NEXT: s_waitcnt vmcnt(0)
478 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
479 ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
480 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
481 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
482 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
483 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
484 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
485 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
487 <2 x half> addrspace(1)* %r,
488 <2 x half> addrspace(1)* %a,
489 <2 x half> addrspace(1)* %b,
490 <2 x half> addrspace(1)* %c,
491 <2 x half> addrspace(1)* %d) {
493 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
494 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
495 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
496 %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
; The compare yields a <2 x i1> mask, so each lane selects independently.
497 %fcmp = fcmp olt <2 x half> %a.val, %b.val
498 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
499 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; Test: <2 x half> select where the compare's left operand is a constant vector.
; The IR computes r = (<0xH3800, 0xH3900> < b) ? c : d per lane. In the expected
; output the low-lane constant appears as the inline constant 0.5, while the
; high-lane constant is first placed in a scalar register: 0x3f200000 via
; s_mov_b32 on tahiti (compare done in f32) and 0x3900 via s_movk_i32 on fiji.
; Both targets keep v_cmp_lt with the constant as the first source.
503 define amdgpu_kernel void @select_v2f16_imm_a(
504 ; SI-LABEL: select_v2f16_imm_a:
505 ; SI: ; %bb.0: ; %entry
506 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
507 ; SI-NEXT: s_mov_b32 s11, 0xf000
508 ; SI-NEXT: s_mov_b32 s10, -1
509 ; SI-NEXT: s_mov_b32 s18, s10
510 ; SI-NEXT: s_mov_b32 s19, s11
511 ; SI-NEXT: s_waitcnt lgkmcnt(0)
512 ; SI-NEXT: s_mov_b32 s16, s2
513 ; SI-NEXT: s_mov_b32 s17, s3
514 ; SI-NEXT: s_mov_b32 s12, s6
515 ; SI-NEXT: s_mov_b32 s13, s7
516 ; SI-NEXT: s_mov_b32 s14, s10
517 ; SI-NEXT: s_mov_b32 s15, s11
518 ; SI-NEXT: s_mov_b32 s6, s10
519 ; SI-NEXT: s_mov_b32 s7, s11
520 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0
521 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
522 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
523 ; SI-NEXT: s_mov_b32 s2, 0x3f200000
524 ; SI-NEXT: s_mov_b32 s8, s0
525 ; SI-NEXT: s_mov_b32 s9, s1
526 ; SI-NEXT: s_waitcnt vmcnt(2)
527 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
528 ; SI-NEXT: s_waitcnt vmcnt(1)
529 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
530 ; SI-NEXT: s_waitcnt vmcnt(0)
531 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
532 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
533 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
534 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
535 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
536 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
537 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
538 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3
539 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
540 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
541 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
542 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
543 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
544 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
545 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
546 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
549 ; VI-LABEL: select_v2f16_imm_a:
550 ; VI: ; %bb.0: ; %entry
551 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
552 ; VI-NEXT: s_mov_b32 s11, 0xf000
553 ; VI-NEXT: s_mov_b32 s10, -1
554 ; VI-NEXT: s_mov_b32 s14, s10
555 ; VI-NEXT: s_mov_b32 s15, s11
556 ; VI-NEXT: s_waitcnt lgkmcnt(0)
557 ; VI-NEXT: s_mov_b32 s8, s0
558 ; VI-NEXT: s_mov_b32 s9, s1
559 ; VI-NEXT: s_mov_b32 s0, s2
560 ; VI-NEXT: s_mov_b32 s1, s3
561 ; VI-NEXT: s_mov_b32 s2, s10
562 ; VI-NEXT: s_mov_b32 s3, s11
563 ; VI-NEXT: s_mov_b32 s12, s6
564 ; VI-NEXT: s_mov_b32 s13, s7
565 ; VI-NEXT: s_mov_b32 s6, s10
566 ; VI-NEXT: s_mov_b32 s7, s11
567 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
568 ; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
569 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
570 ; VI-NEXT: s_movk_i32 s0, 0x3900
571 ; VI-NEXT: s_waitcnt vmcnt(2)
572 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
573 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
574 ; VI-NEXT: s_waitcnt vmcnt(0)
575 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
576 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
577 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
578 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, s0, v3
579 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
580 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
581 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
582 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
584 <2 x half> addrspace(1)* %r,
585 <2 x half> addrspace(1)* %b,
586 <2 x half> addrspace(1)* %c,
587 <2 x half> addrspace(1)* %d) {
589 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
590 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
591 %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
; Lane 0 constant 0xH3800 is 0.5; lane 1 constant 0xH3900 is 0.625 in IEEE half.
592 %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
593 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
594 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; Test: <2 x half> select where the compare's right operand is a constant vector.
; The IR computes r = (a < <0xH3800, 0xH3900>) ? c : d per lane. The expected
; output commutes both lane compares into v_cmp_gt with the constant as the
; first source: the inline constant 0.5 for the low lane, and a scalar register
; for the high lane (0x3f200000 via s_mov_b32 on tahiti, 0x3900 via s_movk_i32
; on fiji).
598 define amdgpu_kernel void @select_v2f16_imm_b(
599 ; SI-LABEL: select_v2f16_imm_b:
600 ; SI: ; %bb.0: ; %entry
601 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
602 ; SI-NEXT: s_mov_b32 s11, 0xf000
603 ; SI-NEXT: s_mov_b32 s10, -1
604 ; SI-NEXT: s_mov_b32 s18, s10
605 ; SI-NEXT: s_mov_b32 s19, s11
606 ; SI-NEXT: s_waitcnt lgkmcnt(0)
607 ; SI-NEXT: s_mov_b32 s16, s2
608 ; SI-NEXT: s_mov_b32 s17, s3
609 ; SI-NEXT: s_mov_b32 s12, s6
610 ; SI-NEXT: s_mov_b32 s13, s7
611 ; SI-NEXT: s_mov_b32 s14, s10
612 ; SI-NEXT: s_mov_b32 s15, s11
613 ; SI-NEXT: s_mov_b32 s6, s10
614 ; SI-NEXT: s_mov_b32 s7, s11
615 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0
616 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
617 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
618 ; SI-NEXT: s_mov_b32 s2, 0x3f200000
619 ; SI-NEXT: s_mov_b32 s8, s0
620 ; SI-NEXT: s_mov_b32 s9, s1
621 ; SI-NEXT: s_waitcnt vmcnt(2)
622 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
623 ; SI-NEXT: s_waitcnt vmcnt(1)
624 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
625 ; SI-NEXT: s_waitcnt vmcnt(0)
626 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
627 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
628 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
629 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
630 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
631 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
632 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
633 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3
634 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
635 ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
636 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
637 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
638 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
639 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
640 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
641 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
644 ; VI-LABEL: select_v2f16_imm_b:
645 ; VI: ; %bb.0: ; %entry
646 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
647 ; VI-NEXT: s_mov_b32 s11, 0xf000
648 ; VI-NEXT: s_mov_b32 s10, -1
649 ; VI-NEXT: s_mov_b32 s14, s10
650 ; VI-NEXT: s_mov_b32 s15, s11
651 ; VI-NEXT: s_waitcnt lgkmcnt(0)
652 ; VI-NEXT: s_mov_b32 s8, s0
653 ; VI-NEXT: s_mov_b32 s9, s1
654 ; VI-NEXT: s_mov_b32 s0, s2
655 ; VI-NEXT: s_mov_b32 s1, s3
656 ; VI-NEXT: s_mov_b32 s2, s10
657 ; VI-NEXT: s_mov_b32 s3, s11
658 ; VI-NEXT: s_mov_b32 s12, s6
659 ; VI-NEXT: s_mov_b32 s13, s7
660 ; VI-NEXT: s_mov_b32 s6, s10
661 ; VI-NEXT: s_mov_b32 s7, s11
662 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
663 ; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
664 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
665 ; VI-NEXT: s_movk_i32 s0, 0x3900
666 ; VI-NEXT: s_waitcnt vmcnt(2)
667 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
668 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
669 ; VI-NEXT: s_waitcnt vmcnt(0)
670 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
671 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
672 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
673 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, s0, v3
674 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
675 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
676 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
677 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
679 <2 x half> addrspace(1)* %r,
680 <2 x half> addrspace(1)* %a,
681 <2 x half> addrspace(1)* %c,
682 <2 x half> addrspace(1)* %d) {
684 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
685 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
686 %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
; Constant vector on the right-hand side; lane 0 is 0.5 and lane 1 is 0.625 in IEEE half.
687 %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
688 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
689 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; Test: <2 x half> select whose true value is a constant vector.
; The IR computes r = (a < b) ? <0xH3800, 0xH3900> : d per lane. The expected
; output uses the negated compare v_cmp_nlt for both lanes and feeds the
; constant as the first v_cndmask source. On tahiti the low lane uses the
; inline constant 0.5 and the high lane a register holding 0x3f200000; on fiji
; both constants (0x3800 and 0x3900) are placed in registers with v_mov_b32.
693 define amdgpu_kernel void @select_v2f16_imm_c(
694 ; SI-LABEL: select_v2f16_imm_c:
695 ; SI: ; %bb.0: ; %entry
696 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
697 ; SI-NEXT: s_mov_b32 s11, 0xf000
698 ; SI-NEXT: s_mov_b32 s10, -1
699 ; SI-NEXT: s_mov_b32 s18, s10
700 ; SI-NEXT: s_mov_b32 s19, s11
701 ; SI-NEXT: s_waitcnt lgkmcnt(0)
702 ; SI-NEXT: s_mov_b32 s12, s6
703 ; SI-NEXT: s_mov_b32 s13, s7
704 ; SI-NEXT: s_mov_b32 s6, s10
705 ; SI-NEXT: s_mov_b32 s7, s11
706 ; SI-NEXT: s_mov_b32 s16, s2
707 ; SI-NEXT: s_mov_b32 s17, s3
708 ; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0
709 ; SI-NEXT: s_mov_b32 s14, s10
710 ; SI-NEXT: s_mov_b32 s15, s11
711 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0
712 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
713 ; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000
714 ; SI-NEXT: s_mov_b32 s8, s0
715 ; SI-NEXT: s_mov_b32 s9, s1
716 ; SI-NEXT: s_waitcnt vmcnt(2)
717 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
718 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
719 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
720 ; SI-NEXT: s_waitcnt vmcnt(1)
721 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
722 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
723 ; SI-NEXT: s_waitcnt vmcnt(0)
724 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
725 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
726 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
727 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
728 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5
729 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
730 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v3
731 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
732 ; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v1, vcc
733 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
734 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
735 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
736 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
739 ; VI-LABEL: select_v2f16_imm_c:
740 ; VI: ; %bb.0: ; %entry
741 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
742 ; VI-NEXT: s_mov_b32 s11, 0xf000
743 ; VI-NEXT: s_mov_b32 s10, -1
744 ; VI-NEXT: s_mov_b32 s14, s10
745 ; VI-NEXT: s_mov_b32 s15, s11
746 ; VI-NEXT: s_waitcnt lgkmcnt(0)
747 ; VI-NEXT: s_mov_b32 s8, s0
748 ; VI-NEXT: s_mov_b32 s9, s1
749 ; VI-NEXT: s_mov_b32 s12, s6
750 ; VI-NEXT: s_mov_b32 s13, s7
751 ; VI-NEXT: s_mov_b32 s0, s2
752 ; VI-NEXT: s_mov_b32 s1, s3
753 ; VI-NEXT: s_mov_b32 s6, s10
754 ; VI-NEXT: s_mov_b32 s7, s11
755 ; VI-NEXT: s_mov_b32 s2, s10
756 ; VI-NEXT: s_mov_b32 s3, s11
757 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
758 ; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0
759 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
760 ; VI-NEXT: v_mov_b32_e32 v2, 0x3800
761 ; VI-NEXT: v_mov_b32_e32 v3, 0x3900
762 ; VI-NEXT: s_waitcnt vmcnt(2)
763 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
764 ; VI-NEXT: s_waitcnt vmcnt(1)
765 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v4
766 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
767 ; VI-NEXT: s_waitcnt vmcnt(0)
768 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
769 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
770 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5
771 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
772 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
773 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
774 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
776 <2 x half> addrspace(1)* %r,
777 <2 x half> addrspace(1)* %a,
778 <2 x half> addrspace(1)* %b,
779 <2 x half> addrspace(1)* %d) {
781 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
782 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
783 %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
784 %fcmp = fcmp olt <2 x half> %a.val, %b.val
; Constant vector in the true position: lanes where a < b get 0.5 / 0.625, others take %d.val.
785 %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
786 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; Test: <2 x half> select whose false value is a constant vector.
; The IR computes r = (a < b) ? c : <0xH3800, 0xH3900> per lane. The expected
; output keeps the plain v_cmp_lt compare for both lanes and feeds the constant
; as the first v_cndmask source. On tahiti the low lane uses the inline
; constant 0.5 and the high lane a register holding 0x3f200000; on fiji both
; constants (0x3800 and 0x3900) are placed in registers with v_mov_b32.
790 define amdgpu_kernel void @select_v2f16_imm_d(
791 ; SI-LABEL: select_v2f16_imm_d:
792 ; SI: ; %bb.0: ; %entry
793 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
794 ; SI-NEXT: s_mov_b32 s11, 0xf000
795 ; SI-NEXT: s_mov_b32 s10, -1
796 ; SI-NEXT: s_mov_b32 s18, s10
797 ; SI-NEXT: s_mov_b32 s19, s11
798 ; SI-NEXT: s_waitcnt lgkmcnt(0)
799 ; SI-NEXT: s_mov_b32 s12, s6
800 ; SI-NEXT: s_mov_b32 s13, s7
801 ; SI-NEXT: s_mov_b32 s6, s10
802 ; SI-NEXT: s_mov_b32 s7, s11
803 ; SI-NEXT: s_mov_b32 s16, s2
804 ; SI-NEXT: s_mov_b32 s17, s3
805 ; SI-NEXT: buffer_load_dword v3, off, s[4:7], 0
806 ; SI-NEXT: s_mov_b32 s14, s10
807 ; SI-NEXT: s_mov_b32 s15, s11
808 ; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0
809 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
810 ; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000
811 ; SI-NEXT: s_mov_b32 s8, s0
812 ; SI-NEXT: s_mov_b32 s9, s1
813 ; SI-NEXT: s_waitcnt vmcnt(2)
814 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
815 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
816 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
817 ; SI-NEXT: s_waitcnt vmcnt(1)
818 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
819 ; SI-NEXT: s_waitcnt vmcnt(0)
820 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
821 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
822 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
823 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
824 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
825 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
826 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
827 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
828 ; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v1, vcc
829 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
830 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
831 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
832 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
833 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
836 ; VI-LABEL: select_v2f16_imm_d:
837 ; VI: ; %bb.0: ; %entry
838 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
839 ; VI-NEXT: s_mov_b32 s11, 0xf000
840 ; VI-NEXT: s_mov_b32 s10, -1
841 ; VI-NEXT: s_mov_b32 s14, s10
842 ; VI-NEXT: s_mov_b32 s15, s11
843 ; VI-NEXT: s_waitcnt lgkmcnt(0)
844 ; VI-NEXT: s_mov_b32 s8, s0
845 ; VI-NEXT: s_mov_b32 s9, s1
846 ; VI-NEXT: s_mov_b32 s12, s6
847 ; VI-NEXT: s_mov_b32 s13, s7
848 ; VI-NEXT: s_mov_b32 s0, s2
849 ; VI-NEXT: s_mov_b32 s1, s3
850 ; VI-NEXT: s_mov_b32 s6, s10
851 ; VI-NEXT: s_mov_b32 s7, s11
852 ; VI-NEXT: s_mov_b32 s2, s10
853 ; VI-NEXT: s_mov_b32 s3, s11
854 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
855 ; VI-NEXT: buffer_load_dword v4, off, s[4:7], 0
856 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
857 ; VI-NEXT: v_mov_b32_e32 v2, 0x3800
858 ; VI-NEXT: v_mov_b32_e32 v3, 0x3900
859 ; VI-NEXT: s_waitcnt vmcnt(2)
860 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
861 ; VI-NEXT: s_waitcnt vmcnt(1)
862 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v4
863 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
864 ; VI-NEXT: s_waitcnt vmcnt(0)
865 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
866 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
867 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
868 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
869 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
870 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
871 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
873 <2 x half> addrspace(1)* %r,
874 <2 x half> addrspace(1)* %a,
875 <2 x half> addrspace(1)* %b,
876 <2 x half> addrspace(1)* %c) {
878 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
879 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
880 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
881 %fcmp = fcmp olt <2 x half> %a.val, %b.val
; Constant vector in the false position: lanes where a < b take %c.val, others get 0.5 / 0.625.
882 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
883 store <2 x half> %r.val, <2 x half> addrspace(1)* %r