1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s
; Declarations of the llvm.minnum intrinsic overloads for half and for
; <2 x half>, <3 x half> and <4 x half> operands.
8 declare half @llvm.minnum.f16(half %a, half %b)
9 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
10 declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
11 declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
; Kernel test: loads two volatile half values from %a and %b, takes their
; llvm.minnum.f16 and stores the result through %r.
; Expected codegen per the checks below: SI converts both operands to f32,
; uses v_min_f32 and converts the result back to f16; VI, GFX9, GFX10 and
; GFX11 pass each operand through "v_max_f16 x, x" and then use v_min_f16.
; NOTE(review): the self-max looks like NaN canonicalization tied to IEEE mode
; (the checks of minnum_f16_no_ieee do not have it) - confirm.
13 define amdgpu_kernel void @minnum_f16_ieee(
14 ; SI-LABEL: minnum_f16_ieee:
15 ; SI: ; %bb.0: ; %entry
16 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
17 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
18 ; SI-NEXT: s_mov_b32 s7, 0xf000
19 ; SI-NEXT: s_mov_b32 s6, -1
20 ; SI-NEXT: s_mov_b32 s14, s6
21 ; SI-NEXT: s_waitcnt lgkmcnt(0)
22 ; SI-NEXT: s_mov_b32 s12, s2
23 ; SI-NEXT: s_mov_b32 s13, s3
24 ; SI-NEXT: s_mov_b32 s15, s7
25 ; SI-NEXT: s_mov_b32 s10, s6
26 ; SI-NEXT: s_mov_b32 s11, s7
27 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
28 ; SI-NEXT: s_waitcnt vmcnt(0)
29 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
30 ; SI-NEXT: s_waitcnt vmcnt(0)
31 ; SI-NEXT: s_mov_b32 s4, s0
32 ; SI-NEXT: s_mov_b32 s5, s1
33 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
34 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
35 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
36 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
37 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
40 ; VI-LABEL: minnum_f16_ieee:
41 ; VI: ; %bb.0: ; %entry
42 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
43 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
44 ; VI-NEXT: s_mov_b32 s7, 0xf000
45 ; VI-NEXT: s_mov_b32 s6, -1
46 ; VI-NEXT: s_mov_b32 s14, s6
47 ; VI-NEXT: s_waitcnt lgkmcnt(0)
48 ; VI-NEXT: s_mov_b32 s12, s2
49 ; VI-NEXT: s_mov_b32 s13, s3
50 ; VI-NEXT: s_mov_b32 s15, s7
51 ; VI-NEXT: s_mov_b32 s10, s6
52 ; VI-NEXT: s_mov_b32 s11, s7
53 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
54 ; VI-NEXT: s_waitcnt vmcnt(0)
55 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
56 ; VI-NEXT: s_waitcnt vmcnt(0)
57 ; VI-NEXT: s_mov_b32 s4, s0
58 ; VI-NEXT: s_mov_b32 s5, s1
59 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
60 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
61 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
62 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
65 ; GFX9-LABEL: minnum_f16_ieee:
66 ; GFX9: ; %bb.0: ; %entry
67 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
68 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
69 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
70 ; GFX9-NEXT: s_mov_b32 s6, -1
71 ; GFX9-NEXT: s_mov_b32 s14, s6
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX9-NEXT: s_mov_b32 s12, s2
74 ; GFX9-NEXT: s_mov_b32 s13, s3
75 ; GFX9-NEXT: s_mov_b32 s15, s7
76 ; GFX9-NEXT: s_mov_b32 s10, s6
77 ; GFX9-NEXT: s_mov_b32 s11, s7
78 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
79 ; GFX9-NEXT: s_waitcnt vmcnt(0)
80 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
81 ; GFX9-NEXT: s_waitcnt vmcnt(0)
82 ; GFX9-NEXT: s_mov_b32 s4, s0
83 ; GFX9-NEXT: s_mov_b32 s5, s1
84 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
85 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
86 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
87 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
90 ; GFX10-LABEL: minnum_f16_ieee:
91 ; GFX10: ; %bb.0: ; %entry
92 ; GFX10-NEXT: s_clause 0x1
93 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
94 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
95 ; GFX10-NEXT: s_mov_b32 s6, -1
96 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
97 ; GFX10-NEXT: s_mov_b32 s14, s6
98 ; GFX10-NEXT: s_mov_b32 s15, s7
99 ; GFX10-NEXT: s_mov_b32 s10, s6
100 ; GFX10-NEXT: s_mov_b32 s11, s7
101 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX10-NEXT: s_mov_b32 s12, s2
103 ; GFX10-NEXT: s_mov_b32 s13, s3
104 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
105 ; GFX10-NEXT: s_waitcnt vmcnt(0)
106 ; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
107 ; GFX10-NEXT: s_waitcnt vmcnt(0)
108 ; GFX10-NEXT: s_mov_b32 s4, s0
109 ; GFX10-NEXT: s_mov_b32 s5, s1
110 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
111 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
112 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
113 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
114 ; GFX10-NEXT: s_endpgm
116 ; GFX11-LABEL: minnum_f16_ieee:
117 ; GFX11: ; %bb.0: ; %entry
118 ; GFX11-NEXT: s_clause 0x1
119 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
120 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
121 ; GFX11-NEXT: s_mov_b32 s10, -1
122 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
123 ; GFX11-NEXT: s_mov_b32 s14, s10
124 ; GFX11-NEXT: s_mov_b32 s15, s11
125 ; GFX11-NEXT: s_mov_b32 s6, s10
126 ; GFX11-NEXT: s_mov_b32 s7, s11
127 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
128 ; GFX11-NEXT: s_mov_b32 s12, s2
129 ; GFX11-NEXT: s_mov_b32 s13, s3
130 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
131 ; GFX11-NEXT: s_waitcnt vmcnt(0)
132 ; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
133 ; GFX11-NEXT: s_waitcnt vmcnt(0)
134 ; GFX11-NEXT: s_mov_b32 s8, s0
135 ; GFX11-NEXT: s_mov_b32 s9, s1
136 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
137 ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
138 ; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
139 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
140 ; GFX11-NEXT: s_endpgm
143 ptr addrspace(1) %b) #0 {
145 %a.val = load volatile half, ptr addrspace(1) %a
146 %b.val = load volatile half, ptr addrspace(1) %b
147 %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
148 store half %r.val, ptr addrspace(1) %r
; amdgpu_ps test: llvm.minnum.f16 applied directly to the two half arguments.
; Expected codegen per the checks below: VI, GFX9 and GFX10PLUS emit a single
; v_min_f16 with no preceding "v_max_f16 x, x"; SI round-trips both inputs
; through v_cvt_f16_f32 / v_cvt_f32_f16 and then uses v_min_f32.
152 define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
153 ; SI-LABEL: minnum_f16_no_ieee:
155 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
156 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
157 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
158 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
159 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
160 ; SI-NEXT: ; return to shader part epilog
162 ; VI-LABEL: minnum_f16_no_ieee:
164 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
165 ; VI-NEXT: ; return to shader part epilog
167 ; GFX9-LABEL: minnum_f16_no_ieee:
169 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
170 ; GFX9-NEXT: ; return to shader part epilog
172 ; GFX10PLUS-LABEL: minnum_f16_no_ieee:
173 ; GFX10PLUS: ; %bb.0:
174 ; GFX10PLUS-NEXT: v_min_f16_e32 v0, v0, v1
175 ; GFX10PLUS-NEXT: ; return to shader part epilog
176 %r.val = call half @llvm.minnum.f16(half %a, half %b)
; Kernel test: llvm.minnum.f16 with the constant 3.0 as the first operand and
; a half loaded from %b as the second; the result is stored through %r.
; Expected codegen per the checks below: the constant is folded into the min
; instruction as a literal, 0x4200 for v_min_f16 on VI and newer and
; 0x40400000 for v_min_f32 on SI. Only the loaded operand gets the
; "v_max_f16 x, x" on VI and newer.
180 define amdgpu_kernel void @minnum_f16_imm_a(
181 ; SI-LABEL: minnum_f16_imm_a:
182 ; SI: ; %bb.0: ; %entry
183 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
184 ; SI-NEXT: s_mov_b32 s7, 0xf000
185 ; SI-NEXT: s_mov_b32 s6, -1
186 ; SI-NEXT: s_mov_b32 s10, s6
187 ; SI-NEXT: s_mov_b32 s11, s7
188 ; SI-NEXT: s_waitcnt lgkmcnt(0)
189 ; SI-NEXT: s_mov_b32 s8, s2
190 ; SI-NEXT: s_mov_b32 s9, s3
191 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
192 ; SI-NEXT: s_mov_b32 s4, s0
193 ; SI-NEXT: s_mov_b32 s5, s1
194 ; SI-NEXT: s_waitcnt vmcnt(0)
195 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
196 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
197 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
198 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
201 ; VI-LABEL: minnum_f16_imm_a:
202 ; VI: ; %bb.0: ; %entry
203 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
204 ; VI-NEXT: s_mov_b32 s7, 0xf000
205 ; VI-NEXT: s_mov_b32 s6, -1
206 ; VI-NEXT: s_mov_b32 s10, s6
207 ; VI-NEXT: s_mov_b32 s11, s7
208 ; VI-NEXT: s_waitcnt lgkmcnt(0)
209 ; VI-NEXT: s_mov_b32 s8, s2
210 ; VI-NEXT: s_mov_b32 s9, s3
211 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
212 ; VI-NEXT: s_mov_b32 s4, s0
213 ; VI-NEXT: s_mov_b32 s5, s1
214 ; VI-NEXT: s_waitcnt vmcnt(0)
215 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
216 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
217 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
220 ; GFX9-LABEL: minnum_f16_imm_a:
221 ; GFX9: ; %bb.0: ; %entry
222 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
223 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
224 ; GFX9-NEXT: s_mov_b32 s6, -1
225 ; GFX9-NEXT: s_mov_b32 s10, s6
226 ; GFX9-NEXT: s_mov_b32 s11, s7
227 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
228 ; GFX9-NEXT: s_mov_b32 s8, s2
229 ; GFX9-NEXT: s_mov_b32 s9, s3
230 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
231 ; GFX9-NEXT: s_mov_b32 s4, s0
232 ; GFX9-NEXT: s_mov_b32 s5, s1
233 ; GFX9-NEXT: s_waitcnt vmcnt(0)
234 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
235 ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0
236 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
237 ; GFX9-NEXT: s_endpgm
239 ; GFX10-LABEL: minnum_f16_imm_a:
240 ; GFX10: ; %bb.0: ; %entry
241 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
242 ; GFX10-NEXT: s_mov_b32 s6, -1
243 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
244 ; GFX10-NEXT: s_mov_b32 s10, s6
245 ; GFX10-NEXT: s_mov_b32 s11, s7
246 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
247 ; GFX10-NEXT: s_mov_b32 s8, s2
248 ; GFX10-NEXT: s_mov_b32 s9, s3
249 ; GFX10-NEXT: s_mov_b32 s4, s0
250 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
251 ; GFX10-NEXT: s_mov_b32 s5, s1
252 ; GFX10-NEXT: s_waitcnt vmcnt(0)
253 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
254 ; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0
255 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
256 ; GFX10-NEXT: s_endpgm
258 ; GFX11-LABEL: minnum_f16_imm_a:
259 ; GFX11: ; %bb.0: ; %entry
260 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
261 ; GFX11-NEXT: s_mov_b32 s6, -1
262 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
263 ; GFX11-NEXT: s_mov_b32 s10, s6
264 ; GFX11-NEXT: s_mov_b32 s11, s7
265 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX11-NEXT: s_mov_b32 s8, s2
267 ; GFX11-NEXT: s_mov_b32 s9, s3
268 ; GFX11-NEXT: s_mov_b32 s4, s0
269 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
270 ; GFX11-NEXT: s_mov_b32 s5, s1
271 ; GFX11-NEXT: s_waitcnt vmcnt(0)
272 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
273 ; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0
274 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
275 ; GFX11-NEXT: s_endpgm
277 ptr addrspace(1) %b) #0 {
279 %b.val = load half, ptr addrspace(1) %b
280 %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
281 store half %r.val, ptr addrspace(1) %r
; Kernel test: llvm.minnum.f16 with a half loaded from %a as the first operand
; and the constant 4.0 as the second; the result is stored through %r.
; Expected codegen per the checks below: on every target the constant appears
; as the operand "4.0" of the min instruction (v_min_f32 on SI, v_min_f16 on
; VI and newer), and it is placed first even though it is the second IR
; operand.
285 define amdgpu_kernel void @minnum_f16_imm_b(
286 ; SI-LABEL: minnum_f16_imm_b:
287 ; SI: ; %bb.0: ; %entry
288 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
289 ; SI-NEXT: s_mov_b32 s7, 0xf000
290 ; SI-NEXT: s_mov_b32 s6, -1
291 ; SI-NEXT: s_mov_b32 s10, s6
292 ; SI-NEXT: s_mov_b32 s11, s7
293 ; SI-NEXT: s_waitcnt lgkmcnt(0)
294 ; SI-NEXT: s_mov_b32 s8, s2
295 ; SI-NEXT: s_mov_b32 s9, s3
296 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
297 ; SI-NEXT: s_mov_b32 s4, s0
298 ; SI-NEXT: s_mov_b32 s5, s1
299 ; SI-NEXT: s_waitcnt vmcnt(0)
300 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
301 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
302 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
303 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
306 ; VI-LABEL: minnum_f16_imm_b:
307 ; VI: ; %bb.0: ; %entry
308 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
309 ; VI-NEXT: s_mov_b32 s7, 0xf000
310 ; VI-NEXT: s_mov_b32 s6, -1
311 ; VI-NEXT: s_mov_b32 s10, s6
312 ; VI-NEXT: s_mov_b32 s11, s7
313 ; VI-NEXT: s_waitcnt lgkmcnt(0)
314 ; VI-NEXT: s_mov_b32 s8, s2
315 ; VI-NEXT: s_mov_b32 s9, s3
316 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
317 ; VI-NEXT: s_mov_b32 s4, s0
318 ; VI-NEXT: s_mov_b32 s5, s1
319 ; VI-NEXT: s_waitcnt vmcnt(0)
320 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
321 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
322 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
325 ; GFX9-LABEL: minnum_f16_imm_b:
326 ; GFX9: ; %bb.0: ; %entry
327 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
328 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
329 ; GFX9-NEXT: s_mov_b32 s6, -1
330 ; GFX9-NEXT: s_mov_b32 s10, s6
331 ; GFX9-NEXT: s_mov_b32 s11, s7
332 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
333 ; GFX9-NEXT: s_mov_b32 s8, s2
334 ; GFX9-NEXT: s_mov_b32 s9, s3
335 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
336 ; GFX9-NEXT: s_mov_b32 s4, s0
337 ; GFX9-NEXT: s_mov_b32 s5, s1
338 ; GFX9-NEXT: s_waitcnt vmcnt(0)
339 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
340 ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0
341 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
342 ; GFX9-NEXT: s_endpgm
344 ; GFX10-LABEL: minnum_f16_imm_b:
345 ; GFX10: ; %bb.0: ; %entry
346 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
347 ; GFX10-NEXT: s_mov_b32 s6, -1
348 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
349 ; GFX10-NEXT: s_mov_b32 s10, s6
350 ; GFX10-NEXT: s_mov_b32 s11, s7
351 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
352 ; GFX10-NEXT: s_mov_b32 s8, s2
353 ; GFX10-NEXT: s_mov_b32 s9, s3
354 ; GFX10-NEXT: s_mov_b32 s4, s0
355 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
356 ; GFX10-NEXT: s_mov_b32 s5, s1
357 ; GFX10-NEXT: s_waitcnt vmcnt(0)
358 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
359 ; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0
360 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
361 ; GFX10-NEXT: s_endpgm
363 ; GFX11-LABEL: minnum_f16_imm_b:
364 ; GFX11: ; %bb.0: ; %entry
365 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
366 ; GFX11-NEXT: s_mov_b32 s6, -1
367 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
368 ; GFX11-NEXT: s_mov_b32 s10, s6
369 ; GFX11-NEXT: s_mov_b32 s11, s7
370 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
371 ; GFX11-NEXT: s_mov_b32 s8, s2
372 ; GFX11-NEXT: s_mov_b32 s9, s3
373 ; GFX11-NEXT: s_mov_b32 s4, s0
374 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
375 ; GFX11-NEXT: s_mov_b32 s5, s1
376 ; GFX11-NEXT: s_waitcnt vmcnt(0)
377 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
378 ; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0
379 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
380 ; GFX11-NEXT: s_endpgm
382 ptr addrspace(1) %a) #0 {
384 %a.val = load half, ptr addrspace(1) %a
385 %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
386 store half %r.val, ptr addrspace(1) %r
; Kernel test: loads two <2 x half> vectors from %a and %b, takes their
; llvm.minnum.v2f16 and stores the result through %r.
; Expected codegen per the checks below:
;   SI    - both lanes are converted to f32, min'ed with v_min_f32, converted
;           back and repacked with a shift and an or.
;   VI    - per-lane "v_max_f16 x, x", then v_min_f16 for the low half and
;           v_min_f16_sdwa writing WORD_1 for the high half, combined by v_or_b32.
;   GFX9+ - packed v_pk_max_f16 on each input followed by one v_pk_min_f16.
390 define amdgpu_kernel void @minnum_v2f16_ieee(
391 ; SI-LABEL: minnum_v2f16_ieee:
392 ; SI: ; %bb.0: ; %entry
393 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
394 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
395 ; SI-NEXT: s_waitcnt lgkmcnt(0)
396 ; SI-NEXT: s_load_dword s2, s[2:3], 0x0
397 ; SI-NEXT: s_load_dword s4, s[4:5], 0x0
398 ; SI-NEXT: s_mov_b32 s3, 0xf000
399 ; SI-NEXT: s_waitcnt lgkmcnt(0)
400 ; SI-NEXT: s_lshr_b32 s5, s2, 16
401 ; SI-NEXT: s_lshr_b32 s6, s4, 16
402 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s5
403 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s6
404 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
405 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
406 ; SI-NEXT: s_mov_b32 s2, -1
407 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
408 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
409 ; SI-NEXT: v_min_f32_e32 v1, v2, v3
410 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
411 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
412 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
413 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
416 ; VI-LABEL: minnum_v2f16_ieee:
417 ; VI: ; %bb.0: ; %entry
418 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
419 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
420 ; VI-NEXT: s_mov_b32 s7, 0xf000
421 ; VI-NEXT: s_mov_b32 s6, -1
422 ; VI-NEXT: s_waitcnt lgkmcnt(0)
423 ; VI-NEXT: s_load_dword s8, s[8:9], 0x0
424 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
425 ; VI-NEXT: s_mov_b32 s4, s0
426 ; VI-NEXT: s_mov_b32 s5, s1
427 ; VI-NEXT: s_waitcnt lgkmcnt(0)
428 ; VI-NEXT: v_max_f16_e64 v0, s8, s8
429 ; VI-NEXT: v_max_f16_e64 v1, s2, s2
430 ; VI-NEXT: s_lshr_b32 s0, s8, 16
431 ; VI-NEXT: v_min_f16_e32 v0, v1, v0
432 ; VI-NEXT: v_max_f16_e64 v1, s0, s0
433 ; VI-NEXT: s_lshr_b32 s0, s2, 16
434 ; VI-NEXT: v_max_f16_e64 v2, s0, s0
435 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
436 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
437 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
440 ; GFX9-LABEL: minnum_v2f16_ieee:
441 ; GFX9: ; %bb.0: ; %entry
442 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
443 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
444 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
445 ; GFX9-NEXT: s_mov_b32 s6, -1
446 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
447 ; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
448 ; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0
449 ; GFX9-NEXT: s_mov_b32 s4, s0
450 ; GFX9-NEXT: s_mov_b32 s5, s1
451 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
452 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
453 ; GFX9-NEXT: v_pk_max_f16 v1, s11, s11
454 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
455 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
456 ; GFX9-NEXT: s_endpgm
458 ; GFX10-LABEL: minnum_v2f16_ieee:
459 ; GFX10: ; %bb.0: ; %entry
460 ; GFX10-NEXT: s_clause 0x1
461 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
462 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
463 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
464 ; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0
465 ; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0
466 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
467 ; GFX10-NEXT: s_mov_b32 s2, -1
468 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
469 ; GFX10-NEXT: v_pk_max_f16 v0, s4, s4
470 ; GFX10-NEXT: v_pk_max_f16 v1, s5, s5
471 ; GFX10-NEXT: v_pk_min_f16 v0, v1, v0
472 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
473 ; GFX10-NEXT: s_endpgm
475 ; GFX11-LABEL: minnum_v2f16_ieee:
476 ; GFX11: ; %bb.0: ; %entry
477 ; GFX11-NEXT: s_clause 0x1
478 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
479 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
480 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0
482 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
483 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
484 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
485 ; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
486 ; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
487 ; GFX11-NEXT: s_mov_b32 s2, -1
488 ; GFX11-NEXT: v_pk_min_f16 v0, v1, v0
489 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
490 ; GFX11-NEXT: s_endpgm
493 ptr addrspace(1) %b) #0 {
495 %a.val = load <2 x half>, ptr addrspace(1) %a
496 %b.val = load <2 x half>, ptr addrspace(1) %b
497 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
498 store <2 x half> %r.val, ptr addrspace(1) %r
; amdgpu_ps test: llvm.minnum.v2f16 applied directly to the two <2 x half>
; arguments, with the result returned.
; Expected codegen per the checks below: GFX9 and GFX10PLUS emit a single
; v_pk_min_f16; VI uses v_min_f16_sdwa on WORD_1 of both inputs plus a plain
; v_min_f16 and ors the halves together; SI round-trips all four inputs through
; v_cvt_f16_f32 / v_cvt_f32_f16 and uses two v_min_f32. No "max x, x" is
; expected on any target.
502 define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
503 ; SI-LABEL: minnum_v2f16_no_ieee:
505 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
506 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
507 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
508 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
509 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
510 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
511 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
512 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
513 ; SI-NEXT: v_min_f32_e32 v0, v0, v2
514 ; SI-NEXT: v_min_f32_e32 v1, v1, v3
515 ; SI-NEXT: ; return to shader part epilog
517 ; VI-LABEL: minnum_v2f16_no_ieee:
519 ; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
520 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
521 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
522 ; VI-NEXT: ; return to shader part epilog
524 ; GFX9-LABEL: minnum_v2f16_no_ieee:
526 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
527 ; GFX9-NEXT: ; return to shader part epilog
529 ; GFX10PLUS-LABEL: minnum_v2f16_no_ieee:
530 ; GFX10PLUS: ; %bb.0:
531 ; GFX10PLUS-NEXT: v_pk_min_f16 v0, v0, v1
532 ; GFX10PLUS-NEXT: ; return to shader part epilog
533 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
534 ret <2 x half> %r.val
; Kernel test: llvm.minnum.v2f16 with the constant vector <3.0, 4.0> as the
; first operand and a <2 x half> loaded from %b as the second; the result is
; stored through %r.
; Expected codegen per the checks below:
;   SI       - v_min_f32 with 0x40400000 for the low lane and 4.0 for the
;              high lane.
;   VI       - literal 0x4200 for the low lane; 0x4400 is first moved into v2
;              for the high-lane v_min_f16_sdwa.
;   GFX9     - the packed constant 0x44004200 is materialized in s4 and used
;              as an operand of v_pk_min_f16.
;   GFX10/11 - 0x44004200 is used directly as a literal operand of v_pk_min_f16.
537 define amdgpu_kernel void @minnum_v2f16_imm_a(
538 ; SI-LABEL: minnum_v2f16_imm_a:
539 ; SI: ; %bb.0: ; %entry
540 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
541 ; SI-NEXT: s_waitcnt lgkmcnt(0)
542 ; SI-NEXT: s_load_dword s2, s[2:3], 0x0
543 ; SI-NEXT: s_waitcnt lgkmcnt(0)
544 ; SI-NEXT: s_lshr_b32 s3, s2, 16
545 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
546 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
547 ; SI-NEXT: s_mov_b32 s3, 0xf000
548 ; SI-NEXT: s_mov_b32 s2, -1
549 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
550 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
551 ; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
552 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
553 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
554 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
555 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
558 ; VI-LABEL: minnum_v2f16_imm_a:
559 ; VI: ; %bb.0: ; %entry
560 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
561 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400
562 ; VI-NEXT: s_waitcnt lgkmcnt(0)
563 ; VI-NEXT: s_load_dword s4, s[2:3], 0x0
564 ; VI-NEXT: s_mov_b32 s3, 0xf000
565 ; VI-NEXT: s_mov_b32 s2, -1
566 ; VI-NEXT: s_waitcnt lgkmcnt(0)
567 ; VI-NEXT: v_max_f16_e64 v0, s4, s4
568 ; VI-NEXT: s_lshr_b32 s4, s4, 16
569 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
570 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
571 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
572 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
573 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
576 ; GFX9-LABEL: minnum_v2f16_imm_a:
577 ; GFX9: ; %bb.0: ; %entry
578 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
579 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
581 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
582 ; GFX9-NEXT: s_mov_b32 s2, -1
583 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
584 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
585 ; GFX9-NEXT: s_mov_b32 s4, 0x44004200
586 ; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
587 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
588 ; GFX9-NEXT: s_endpgm
590 ; GFX10-LABEL: minnum_v2f16_imm_a:
591 ; GFX10: ; %bb.0: ; %entry
592 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
593 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
594 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
595 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
596 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
597 ; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
598 ; GFX10-NEXT: s_mov_b32 s2, -1
599 ; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0
600 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
601 ; GFX10-NEXT: s_endpgm
603 ; GFX11-LABEL: minnum_v2f16_imm_a:
604 ; GFX11: ; %bb.0: ; %entry
605 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
606 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
607 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
608 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
609 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
610 ; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
611 ; GFX11-NEXT: s_mov_b32 s2, -1
612 ; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0
613 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
614 ; GFX11-NEXT: s_endpgm
616 ptr addrspace(1) %b) #0 {
618 %b.val = load <2 x half>, ptr addrspace(1) %b
619 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
620 store <2 x half> %r.val, ptr addrspace(1) %r
; Kernel test: llvm.minnum.v2f16 with a <2 x half> loaded from %a as the first
; operand and the constant vector <4.0, 3.0> as the second; the result is
; stored through %r.
; Expected codegen per the checks below mirrors minnum_v2f16_imm_a with the
; lane constants swapped:
;   SI       - v_min_f32 with 4.0 for the low lane and 0x40400000 for the
;              high lane.
;   VI       - "4.0" for the low lane; 0x4200 is first moved into v2 for the
;              high-lane v_min_f16_sdwa.
;   GFX9     - the packed constant 0x42004400 is materialized in s4.
;   GFX10/11 - 0x42004400 is used directly as a literal operand of v_pk_min_f16.
624 define amdgpu_kernel void @minnum_v2f16_imm_b(
625 ; SI-LABEL: minnum_v2f16_imm_b:
626 ; SI: ; %bb.0: ; %entry
627 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
628 ; SI-NEXT: s_waitcnt lgkmcnt(0)
629 ; SI-NEXT: s_load_dword s2, s[2:3], 0x0
630 ; SI-NEXT: s_waitcnt lgkmcnt(0)
631 ; SI-NEXT: s_lshr_b32 s3, s2, 16
632 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
633 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
634 ; SI-NEXT: s_mov_b32 s3, 0xf000
635 ; SI-NEXT: s_mov_b32 s2, -1
636 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
637 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
638 ; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
639 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
640 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
641 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
642 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
645 ; VI-LABEL: minnum_v2f16_imm_b:
646 ; VI: ; %bb.0: ; %entry
647 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
648 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200
649 ; VI-NEXT: s_waitcnt lgkmcnt(0)
650 ; VI-NEXT: s_load_dword s4, s[2:3], 0x0
651 ; VI-NEXT: s_mov_b32 s3, 0xf000
652 ; VI-NEXT: s_mov_b32 s2, -1
653 ; VI-NEXT: s_waitcnt lgkmcnt(0)
654 ; VI-NEXT: v_max_f16_e64 v0, s4, s4
655 ; VI-NEXT: s_lshr_b32 s4, s4, 16
656 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
657 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
658 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
659 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
660 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
663 ; GFX9-LABEL: minnum_v2f16_imm_b:
664 ; GFX9: ; %bb.0: ; %entry
665 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
666 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
667 ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
668 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
669 ; GFX9-NEXT: s_mov_b32 s2, -1
670 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
671 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
672 ; GFX9-NEXT: s_mov_b32 s4, 0x42004400
673 ; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
674 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
675 ; GFX9-NEXT: s_endpgm
677 ; GFX10-LABEL: minnum_v2f16_imm_b:
678 ; GFX10: ; %bb.0: ; %entry
679 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
680 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
681 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
682 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
683 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
684 ; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
685 ; GFX10-NEXT: s_mov_b32 s2, -1
686 ; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0
687 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
688 ; GFX10-NEXT: s_endpgm
690 ; GFX11-LABEL: minnum_v2f16_imm_b:
691 ; GFX11: ; %bb.0: ; %entry
692 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
693 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
694 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
695 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
696 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
697 ; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
698 ; GFX11-NEXT: s_mov_b32 s2, -1
699 ; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0
700 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
701 ; GFX11-NEXT: s_endpgm
703 ptr addrspace(1) %a) #0 {
705 %a.val = load <2 x half>, ptr addrspace(1) %a
706 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
707 store <2 x half> %r.val, ptr addrspace(1) %r
; Kernel test: loads two <3 x half> vectors from %a and %b, takes their
; llvm.minnum.v3f16 and stores the result through %r.
; Expected codegen per the checks below: each input is loaded as 64 bits, and
; the result is written with a dword store for the first two elements plus a
; 16-bit store at offset 4 for the third.
;   SI    - three v_min_f32 on f32-converted lanes.
;   VI    - v_min_f16 / v_min_f16_sdwa for the first two lanes and a separate
;           v_min_f16 for the third, each input first passed through v_max_f16.
;   GFX9+ - two v_pk_min_f16 (with v_pk_max_f16 on every input); the second
;           packed op covers the third element.
711 ; FIXME: Scalarize with undef half
712 define amdgpu_kernel void @minnum_v3f16(
713 ; SI-LABEL: minnum_v3f16:
714 ; SI: ; %bb.0: ; %entry
715 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
716 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
717 ; SI-NEXT: s_waitcnt lgkmcnt(0)
718 ; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
719 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
720 ; SI-NEXT: s_mov_b32 s3, 0xf000
721 ; SI-NEXT: s_mov_b32 s2, -1
722 ; SI-NEXT: s_waitcnt lgkmcnt(0)
723 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s7
724 ; SI-NEXT: s_lshr_b32 s7, s6, 16
725 ; SI-NEXT: s_lshr_b32 s8, s4, 16
726 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
727 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s8
728 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
729 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
730 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s5
731 ; SI-NEXT: v_min_f32_e32 v1, v1, v2
732 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
733 ; SI-NEXT: v_min_f32_e32 v2, v3, v4
734 ; SI-NEXT: v_min_f32_e32 v0, v0, v5
735 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
736 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
737 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
738 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
739 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
740 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
743 ; VI-LABEL: minnum_v3f16:
744 ; VI: ; %bb.0: ; %entry
745 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
746 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
747 ; VI-NEXT: s_mov_b32 s7, 0xf000
748 ; VI-NEXT: s_mov_b32 s6, -1
749 ; VI-NEXT: s_waitcnt lgkmcnt(0)
750 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
751 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
752 ; VI-NEXT: s_mov_b32 s4, s0
753 ; VI-NEXT: s_mov_b32 s5, s1
754 ; VI-NEXT: s_waitcnt lgkmcnt(0)
755 ; VI-NEXT: v_max_f16_e64 v0, s8, s8
756 ; VI-NEXT: v_max_f16_e64 v1, s2, s2
757 ; VI-NEXT: s_lshr_b32 s0, s8, 16
758 ; VI-NEXT: v_min_f16_e32 v0, v1, v0
759 ; VI-NEXT: v_max_f16_e64 v1, s0, s0
760 ; VI-NEXT: s_lshr_b32 s0, s2, 16
761 ; VI-NEXT: v_max_f16_e64 v2, s0, s0
762 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
763 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
764 ; VI-NEXT: v_max_f16_e64 v1, s9, s9
765 ; VI-NEXT: v_max_f16_e64 v2, s3, s3
766 ; VI-NEXT: v_min_f16_e32 v1, v2, v1
767 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
768 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
771 ; GFX9-LABEL: minnum_v3f16:
772 ; GFX9: ; %bb.0: ; %entry
773 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
774 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
775 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
776 ; GFX9-NEXT: s_mov_b32 s6, -1
777 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
778 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
779 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0
780 ; GFX9-NEXT: s_mov_b32 s4, s0
781 ; GFX9-NEXT: s_mov_b32 s5, s1
782 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
783 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
784 ; GFX9-NEXT: v_pk_max_f16 v1, s12, s12
785 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11
786 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
787 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13
788 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2
789 ; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
790 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
791 ; GFX9-NEXT: s_endpgm
793 ; GFX10-LABEL: minnum_v3f16:
794 ; GFX10: ; %bb.0: ; %entry
795 ; GFX10-NEXT: s_clause 0x1
796 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
797 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
798 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
799 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
800 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
801 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
802 ; GFX10-NEXT: s_mov_b32 s2, -1
803 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
804 ; GFX10-NEXT: v_pk_max_f16 v1, s5, s5
805 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9
806 ; GFX10-NEXT: v_pk_max_f16 v0, s4, s4
807 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
808 ; GFX10-NEXT: v_pk_min_f16 v1, v2, v1
809 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v0
810 ; GFX10-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
811 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
812 ; GFX10-NEXT: s_endpgm
814 ; GFX11-LABEL: minnum_v3f16:
815 ; GFX11: ; %bb.0: ; %entry
816 ; GFX11-NEXT: s_clause 0x1
817 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
818 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
819 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
820 ; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
821 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
822 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
823 ; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
824 ; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
825 ; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
826 ; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
827 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
828 ; GFX11-NEXT: s_mov_b32 s2, -1
829 ; GFX11-NEXT: v_pk_min_f16 v1, v2, v1
830 ; GFX11-NEXT: v_pk_min_f16 v0, v3, v0
831 ; GFX11-NEXT: s_clause 0x1
832 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
833 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
834 ; GFX11-NEXT: s_endpgm
837 ptr addrspace(1) %b) #0 {
839 %a.val = load <3 x half>, ptr addrspace(1) %a
840 %b.val = load <3 x half>, ptr addrspace(1) %b
841 %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
842 store <3 x half> %r.val, ptr addrspace(1) %r
; Kernel test for llvm.minnum on <4 x half>. The IR at the end of this
; definition loads one vector from %a and one from %b, takes their element-wise
; minnum and stores the result to %r.
; The autogenerated checks record the following lowering per target
; - on SI, every half is converted to f32, combined with v_min_f32, converted
;   back to f16, and the halves are re-packed with a shift and an or
; - on VI, v_min_f16 is used per element, with the SDWA form writing the high
;   half of each dword, after v_max_f16 x, x on every input
; - on GFX9, GFX10 and GFX11, one v_pk_min_f16 is used per dword, after
;   v_pk_max_f16 x, x on every input
; NOTE(review) the max(x, x) steps presumably quiet signaling NaNs before the
; min - confirm against the AMDGPU minnum lowering.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
846 define amdgpu_kernel void @minnum_v4f16(
847 ; SI-LABEL: minnum_v4f16:
848 ; SI: ; %bb.0: ; %entry
849 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
850 ; SI-NEXT: s_mov_b32 s3, 0xf000
851 ; SI-NEXT: s_mov_b32 s2, -1
852 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
853 ; SI-NEXT: s_waitcnt lgkmcnt(0)
854 ; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0
855 ; SI-NEXT: s_mov_b32 s0, s8
856 ; SI-NEXT: s_mov_b32 s1, s9
857 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
858 ; SI-NEXT: s_waitcnt lgkmcnt(0)
859 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
860 ; SI-NEXT: s_lshr_b32 s6, s6, 16
861 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
862 ; SI-NEXT: s_lshr_b32 s6, s7, 16
863 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
864 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
865 ; SI-NEXT: s_lshr_b32 s6, s5, 16
866 ; SI-NEXT: s_lshr_b32 s4, s4, 16
867 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
868 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
869 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
870 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
871 ; SI-NEXT: v_min_f32_e32 v3, v3, v5
872 ; SI-NEXT: v_min_f32_e32 v2, v2, v7
873 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
874 ; SI-NEXT: v_min_f32_e32 v1, v1, v6
875 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
876 ; SI-NEXT: v_min_f32_e32 v0, v0, v4
877 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
878 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
879 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
880 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
881 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
882 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
883 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
886 ; VI-LABEL: minnum_v4f16:
887 ; VI: ; %bb.0: ; %entry
888 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
889 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
890 ; VI-NEXT: s_mov_b32 s7, 0xf000
891 ; VI-NEXT: s_mov_b32 s6, -1
892 ; VI-NEXT: s_waitcnt lgkmcnt(0)
893 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
894 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
895 ; VI-NEXT: s_mov_b32 s4, s0
896 ; VI-NEXT: s_mov_b32 s5, s1
897 ; VI-NEXT: s_waitcnt lgkmcnt(0)
898 ; VI-NEXT: v_max_f16_e64 v0, s9, s9
899 ; VI-NEXT: v_max_f16_e64 v1, s3, s3
900 ; VI-NEXT: s_lshr_b32 s0, s9, 16
901 ; VI-NEXT: v_min_f16_e32 v0, v1, v0
902 ; VI-NEXT: v_max_f16_e64 v1, s0, s0
903 ; VI-NEXT: s_lshr_b32 s0, s3, 16
904 ; VI-NEXT: v_max_f16_e64 v2, s0, s0
905 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
906 ; VI-NEXT: v_or_b32_e32 v1, v0, v1
907 ; VI-NEXT: v_max_f16_e64 v0, s8, s8
908 ; VI-NEXT: v_max_f16_e64 v2, s2, s2
909 ; VI-NEXT: s_lshr_b32 s0, s8, 16
910 ; VI-NEXT: v_min_f16_e32 v0, v2, v0
911 ; VI-NEXT: v_max_f16_e64 v2, s0, s0
912 ; VI-NEXT: s_lshr_b32 s0, s2, 16
913 ; VI-NEXT: v_max_f16_e64 v3, s0, s0
914 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
915 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
916 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
919 ; GFX9-LABEL: minnum_v4f16:
920 ; GFX9: ; %bb.0: ; %entry
921 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
922 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
923 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
924 ; GFX9-NEXT: s_mov_b32 s6, -1
925 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
926 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
927 ; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0
928 ; GFX9-NEXT: s_mov_b32 s4, s0
929 ; GFX9-NEXT: s_mov_b32 s5, s1
930 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
931 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
932 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13
933 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
934 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0
935 ; GFX9-NEXT: v_pk_max_f16 v0, s12, s12
936 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
937 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
938 ; GFX9-NEXT: s_endpgm
940 ; GFX10-LABEL: minnum_v4f16:
941 ; GFX10: ; %bb.0: ; %entry
942 ; GFX10-NEXT: s_clause 0x1
943 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
944 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
945 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
946 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
947 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
948 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
949 ; GFX10-NEXT: s_mov_b32 s2, -1
950 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
951 ; GFX10-NEXT: v_pk_max_f16 v0, s5, s5
952 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9
953 ; GFX10-NEXT: v_pk_max_f16 v2, s4, s4
954 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
955 ; GFX10-NEXT: v_pk_min_f16 v1, v1, v0
956 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v2
957 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
958 ; GFX10-NEXT: s_endpgm
960 ; GFX11-LABEL: minnum_v4f16:
961 ; GFX11: ; %bb.0: ; %entry
962 ; GFX11-NEXT: s_clause 0x1
963 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
964 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
965 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
966 ; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
967 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
968 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
969 ; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
970 ; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
971 ; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
972 ; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
973 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
974 ; GFX11-NEXT: s_mov_b32 s2, -1
975 ; GFX11-NEXT: v_pk_min_f16 v1, v1, v0
976 ; GFX11-NEXT: v_pk_min_f16 v0, v3, v2
977 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
978 ; GFX11-NEXT: s_endpgm
981 ptr addrspace(1) %b) #0 {
983 %a.val = load <4 x half>, ptr addrspace(1) %a
984 %b.val = load <4 x half>, ptr addrspace(1) %b
985 %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
986 store <4 x half> %r.val, ptr addrspace(1) %r
; Kernel test for llvm.minnum on <4 x half> with a constant first operand. The
; IR at the end of this definition loads a vector from %b, takes its minnum
; with the constant vector <8.0, 2.0, 3.0, 4.0> and stores the result to %r.
; The autogenerated checks record how each target encodes the constants
; - on SI, the min is done in f32 with literals 0x41000000 (8.0) and
;   0x40400000 (3.0) and inline operands 2.0 and 4.0
; - on VI, f16 bit patterns are used, 0x4800 (8.0) and 0x4200 (3.0) as
;   literals of v_min_f16, and 0x4000 (2.0) and 0x4400 (4.0) moved into VGPRs
;   for the SDWA forms that write the high halves
; - on GFX9, the packed pairs 0x40004800 (elements 0 and 1) and 0x44004200
;   (elements 2 and 3) are placed in SGPRs and fed to v_pk_min_f16
; - on GFX10 and GFX11, the same packed pairs are literal operands of
;   v_pk_min_f16
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
990 define amdgpu_kernel void @fmin_v4f16_imm_a(
991 ; SI-LABEL: fmin_v4f16_imm_a:
992 ; SI: ; %bb.0: ; %entry
993 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
994 ; SI-NEXT: s_waitcnt lgkmcnt(0)
995 ; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
996 ; SI-NEXT: s_mov_b32 s3, 0xf000
997 ; SI-NEXT: s_mov_b32 s2, -1
998 ; SI-NEXT: s_waitcnt lgkmcnt(0)
999 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
1000 ; SI-NEXT: s_lshr_b32 s5, s5, 16
1001 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
1002 ; SI-NEXT: s_lshr_b32 s4, s4, 16
1003 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
1004 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
1005 ; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
1006 ; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
1007 ; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
1008 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1009 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3
1010 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1011 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
1012 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1013 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1014 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
1015 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1016 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
1017 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1020 ; VI-LABEL: fmin_v4f16_imm_a:
1021 ; VI: ; %bb.0: ; %entry
1022 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1023 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400
1024 ; VI-NEXT: s_mov_b32 s7, 0xf000
1025 ; VI-NEXT: s_mov_b32 s6, -1
1026 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1027 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1028 ; VI-NEXT: s_mov_b32 s4, s0
1029 ; VI-NEXT: s_mov_b32 s5, s1
1030 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1031 ; VI-NEXT: s_lshr_b32 s0, s3, 16
1032 ; VI-NEXT: v_max_f16_e64 v1, s3, s3
1033 ; VI-NEXT: v_max_f16_e64 v3, s0, s0
1034 ; VI-NEXT: v_max_f16_e64 v2, s2, s2
1035 ; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
1036 ; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1037 ; VI-NEXT: s_lshr_b32 s0, s2, 16
1038 ; VI-NEXT: v_or_b32_e32 v1, v1, v0
1039 ; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
1040 ; VI-NEXT: v_max_f16_e64 v2, s0, s0
1041 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000
1042 ; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1043 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
1044 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1047 ; GFX9-LABEL: fmin_v4f16_imm_a:
1048 ; GFX9: ; %bb.0: ; %entry
1049 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1050 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200
1051 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800
1052 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
1053 ; GFX9-NEXT: s_mov_b32 s6, -1
1054 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1055 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1056 ; GFX9-NEXT: s_mov_b32 s4, s0
1057 ; GFX9-NEXT: s_mov_b32 s5, s1
1058 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1059 ; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
1060 ; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
1061 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
1062 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
1063 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1064 ; GFX9-NEXT: s_endpgm
1066 ; GFX10-LABEL: fmin_v4f16_imm_a:
1067 ; GFX10: ; %bb.0: ; %entry
1068 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1069 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1070 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1071 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1072 ; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
1073 ; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
1074 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
1075 ; GFX10-NEXT: s_mov_b32 s2, -1
1076 ; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0
1077 ; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2
1078 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1079 ; GFX10-NEXT: s_endpgm
1081 ; GFX11-LABEL: fmin_v4f16_imm_a:
1082 ; GFX11: ; %bb.0: ; %entry
1083 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1084 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1085 ; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
1086 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1087 ; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
1088 ; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
1089 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
1090 ; GFX11-NEXT: s_mov_b32 s2, -1
1091 ; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0
1092 ; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2
1093 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
1094 ; GFX11-NEXT: s_endpgm
1095 ptr addrspace(1) %r,
1096 ptr addrspace(1) %b) #0 {
1098 %b.val = load <4 x half>, ptr addrspace(1) %b
1099 %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
1100 store <4 x half> %r.val, ptr addrspace(1) %r
; Attribute group #0, referenced by the kernel definitions above. It sets
; "denormal-fp-math-f32" to preserve-sign in both comma-separated fields.
; NOTE(review) the two fields are presumably the output and input denormal
; modes for f32 - confirm against the LLVM Language Reference.
1104 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }