; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI

define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s8
; SI-NEXT: s_mov_b32 s21, s9
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: s_mov_b32 s22, s2
; SI-NEXT: s_mov_b32 s23, s3
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s6
; VI-NEXT: s_mov_b32 s17, s7
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s20, s8
; VI-NEXT: s_mov_b32 s21, s9
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s22, s2
; VI-NEXT: s_mov_b32 s23, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s20, s8
; SI-NEXT: s_mov_b32 s21, s9
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: s_mov_b32 s22, s2
; SI-NEXT: s_mov_b32 s23, s3
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0
; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s6
; VI-NEXT: s_mov_b32 s17, s7
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s20, s8
; VI-NEXT: s_mov_b32 s21, s9
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s22, s2
; VI-NEXT: s_mov_b32 s23, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s2, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_movk_i32 s2, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s2, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_movk_i32 s2, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}