1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
; Declarations of the llvm.minnum intrinsic overloads called by the tests in
; this file: scalar half plus the 2-, 3- and 4-element half vector forms.
7 declare half @llvm.minnum.f16(half %a, half %b)
8 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
9 declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
10 declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
; Scalar half minnum in a compute kernel: loads %a and %b (volatile), takes
; llvm.minnum.f16 of the two values and stores the result to %r.
; The expected code passes each loaded operand through v_max_f16 x, x before
; the v_min_f16 on fiji/gfx900/gfx1010; on tahiti the operands are converted
; to f32, multiplied by 1.0, and the min is done with v_min_f32.
; NOTE(review): the extra max/mul is presumably operand canonicalization for
; IEEE mode (per the test name) - confirm against the AMDGPU backend.
12 define amdgpu_kernel void @minnum_f16_ieee(
13 ; SI-LABEL: minnum_f16_ieee:
14 ; SI: ; %bb.0: ; %entry
15 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
16 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
17 ; SI-NEXT: s_mov_b32 s3, 0xf000
18 ; SI-NEXT: s_mov_b32 s2, -1
19 ; SI-NEXT: s_mov_b32 s14, s2
20 ; SI-NEXT: s_waitcnt lgkmcnt(0)
21 ; SI-NEXT: s_mov_b32 s12, s6
22 ; SI-NEXT: s_mov_b32 s13, s7
23 ; SI-NEXT: s_mov_b32 s15, s3
24 ; SI-NEXT: s_mov_b32 s10, s2
25 ; SI-NEXT: s_mov_b32 s11, s3
26 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
27 ; SI-NEXT: s_waitcnt vmcnt(0)
28 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: s_mov_b32 s0, s4
31 ; SI-NEXT: s_mov_b32 s1, s5
32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
34 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
35 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
36 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
37 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
38 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
41 ; VI-LABEL: minnum_f16_ieee:
42 ; VI: ; %bb.0: ; %entry
43 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
44 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
45 ; VI-NEXT: s_mov_b32 s3, 0xf000
46 ; VI-NEXT: s_mov_b32 s2, -1
47 ; VI-NEXT: s_mov_b32 s14, s2
48 ; VI-NEXT: s_waitcnt lgkmcnt(0)
49 ; VI-NEXT: s_mov_b32 s12, s6
50 ; VI-NEXT: s_mov_b32 s13, s7
51 ; VI-NEXT: s_mov_b32 s15, s3
52 ; VI-NEXT: s_mov_b32 s10, s2
53 ; VI-NEXT: s_mov_b32 s11, s3
54 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
55 ; VI-NEXT: s_waitcnt vmcnt(0)
56 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
57 ; VI-NEXT: s_waitcnt vmcnt(0)
58 ; VI-NEXT: s_mov_b32 s0, s4
59 ; VI-NEXT: s_mov_b32 s1, s5
60 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
61 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
62 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
63 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
66 ; GFX9-LABEL: minnum_f16_ieee:
67 ; GFX9: ; %bb.0: ; %entry
68 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
69 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
70 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
71 ; GFX9-NEXT: s_mov_b32 s2, -1
72 ; GFX9-NEXT: s_mov_b32 s14, s2
73 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX9-NEXT: s_mov_b32 s12, s6
75 ; GFX9-NEXT: s_mov_b32 s13, s7
76 ; GFX9-NEXT: s_mov_b32 s15, s3
77 ; GFX9-NEXT: s_mov_b32 s10, s2
78 ; GFX9-NEXT: s_mov_b32 s11, s3
79 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
80 ; GFX9-NEXT: s_waitcnt vmcnt(0)
81 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
82 ; GFX9-NEXT: s_waitcnt vmcnt(0)
83 ; GFX9-NEXT: s_mov_b32 s0, s4
84 ; GFX9-NEXT: s_mov_b32 s1, s5
85 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
86 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
87 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
88 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
91 ; GFX10-LABEL: minnum_f16_ieee:
92 ; GFX10: ; %bb.0: ; %entry
93 ; GFX10-NEXT: s_clause 0x1
94 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
95 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
96 ; GFX10-NEXT: s_mov_b32 s2, -1
97 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
98 ; GFX10-NEXT: s_mov_b32 s14, s2
99 ; GFX10-NEXT: s_mov_b32 s15, s3
100 ; GFX10-NEXT: s_mov_b32 s10, s2
101 ; GFX10-NEXT: s_mov_b32 s11, s3
102 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
103 ; GFX10-NEXT: s_mov_b32 s12, s6
104 ; GFX10-NEXT: s_mov_b32 s13, s7
105 ; GFX10-NEXT: s_mov_b32 s0, s4
106 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
107 ; GFX10-NEXT: s_waitcnt vmcnt(0)
108 ; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
109 ; GFX10-NEXT: s_waitcnt vmcnt(0)
110 ; GFX10-NEXT: s_mov_b32 s1, s5
111 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
112 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
113 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
114 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
115 ; GFX10-NEXT: s_endpgm
116 half addrspace(1)* %r,
117 half addrspace(1)* %a,
118 half addrspace(1)* %b) #0 {
120 %a.val = load volatile half, half addrspace(1)* %a
121 %b.val = load volatile half, half addrspace(1)* %b
122 %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
123 store half %r.val, half addrspace(1)* %r
; Same scalar minnum, but as an amdgpu_ps function taking %a and %b directly
; as arguments. The expected code on fiji/gfx900/gfx1010 is a lone v_min_f16
; with no preceding v_max_f16 of the inputs; the tahiti checks convert each
; argument to f16 and back to f32 and then use v_min_f32, with no multiply
; by 1.0.
127 define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
128 ; SI-LABEL: minnum_f16_no_ieee:
130 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
131 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
132 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
133 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
134 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
135 ; SI-NEXT: ; return to shader part epilog
137 ; VI-LABEL: minnum_f16_no_ieee:
139 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
140 ; VI-NEXT: ; return to shader part epilog
142 ; GFX9-LABEL: minnum_f16_no_ieee:
144 ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
145 ; GFX9-NEXT: ; return to shader part epilog
147 ; GFX10-LABEL: minnum_f16_no_ieee:
149 ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
150 ; GFX10-NEXT: ; return to shader part epilog
151 %r.val = call half @llvm.minnum.f16(half %a, half %b)
; minnum with the constant 3.0 as the first operand and a value loaded from %b
; as the second; the result is stored to %r.
; The expected code encodes 3.0 as a literal operand of the min: 0x4200 (half)
; on fiji/gfx900/gfx1010 and 0x40400000 (float) on tahiti. Only the loaded
; operand goes through v_max_f16 x, x (or v_mul_f32 1.0, x on tahiti).
155 define amdgpu_kernel void @minnum_f16_imm_a(
156 ; SI-LABEL: minnum_f16_imm_a:
157 ; SI: ; %bb.0: ; %entry
158 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
159 ; SI-NEXT: s_mov_b32 s3, 0xf000
160 ; SI-NEXT: s_mov_b32 s2, -1
161 ; SI-NEXT: s_mov_b32 s10, s2
162 ; SI-NEXT: s_mov_b32 s11, s3
163 ; SI-NEXT: s_waitcnt lgkmcnt(0)
164 ; SI-NEXT: s_mov_b32 s8, s6
165 ; SI-NEXT: s_mov_b32 s9, s7
166 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
167 ; SI-NEXT: s_mov_b32 s0, s4
168 ; SI-NEXT: s_mov_b32 s1, s5
169 ; SI-NEXT: s_waitcnt vmcnt(0)
170 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
171 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
172 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
173 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
174 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
177 ; VI-LABEL: minnum_f16_imm_a:
178 ; VI: ; %bb.0: ; %entry
179 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
180 ; VI-NEXT: s_mov_b32 s3, 0xf000
181 ; VI-NEXT: s_mov_b32 s2, -1
182 ; VI-NEXT: s_waitcnt lgkmcnt(0)
183 ; VI-NEXT: s_mov_b32 s0, s4
184 ; VI-NEXT: s_mov_b32 s1, s5
185 ; VI-NEXT: s_mov_b32 s4, s6
186 ; VI-NEXT: s_mov_b32 s5, s7
187 ; VI-NEXT: s_mov_b32 s6, s2
188 ; VI-NEXT: s_mov_b32 s7, s3
189 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
190 ; VI-NEXT: s_waitcnt vmcnt(0)
191 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
192 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
193 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
196 ; GFX9-LABEL: minnum_f16_imm_a:
197 ; GFX9: ; %bb.0: ; %entry
198 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
199 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
200 ; GFX9-NEXT: s_mov_b32 s2, -1
201 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
202 ; GFX9-NEXT: s_mov_b32 s0, s4
203 ; GFX9-NEXT: s_mov_b32 s1, s5
204 ; GFX9-NEXT: s_mov_b32 s4, s6
205 ; GFX9-NEXT: s_mov_b32 s5, s7
206 ; GFX9-NEXT: s_mov_b32 s6, s2
207 ; GFX9-NEXT: s_mov_b32 s7, s3
208 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
209 ; GFX9-NEXT: s_waitcnt vmcnt(0)
210 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
211 ; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0
212 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
213 ; GFX9-NEXT: s_endpgm
215 ; GFX10-LABEL: minnum_f16_imm_a:
216 ; GFX10: ; %bb.0: ; %entry
217 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
218 ; GFX10-NEXT: s_mov_b32 s6, -1
219 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
220 ; GFX10-NEXT: s_mov_b32 s10, s6
221 ; GFX10-NEXT: s_mov_b32 s11, s7
222 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
223 ; GFX10-NEXT: s_mov_b32 s8, s2
224 ; GFX10-NEXT: s_mov_b32 s9, s3
225 ; GFX10-NEXT: s_mov_b32 s4, s0
226 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
227 ; GFX10-NEXT: s_mov_b32 s5, s1
228 ; GFX10-NEXT: s_waitcnt vmcnt(0)
229 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
230 ; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0
231 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
232 ; GFX10-NEXT: s_endpgm
233 half addrspace(1)* %r,
234 half addrspace(1)* %b) #0 {
236 %b.val = load half, half addrspace(1)* %b
237 %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val)
238 store half %r.val, half addrspace(1)* %r
; minnum with a value loaded from %a as the first operand and the constant 4.0
; as the second; the result is stored to %r.
; The expected min instruction takes 4.0 directly as an operand on all four
; targets (no hex literal, unlike the 3.0 case in minnum_f16_imm_a).
242 define amdgpu_kernel void @minnum_f16_imm_b(
243 ; SI-LABEL: minnum_f16_imm_b:
244 ; SI: ; %bb.0: ; %entry
245 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
246 ; SI-NEXT: s_mov_b32 s3, 0xf000
247 ; SI-NEXT: s_mov_b32 s2, -1
248 ; SI-NEXT: s_mov_b32 s10, s2
249 ; SI-NEXT: s_mov_b32 s11, s3
250 ; SI-NEXT: s_waitcnt lgkmcnt(0)
251 ; SI-NEXT: s_mov_b32 s8, s6
252 ; SI-NEXT: s_mov_b32 s9, s7
253 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
254 ; SI-NEXT: s_mov_b32 s0, s4
255 ; SI-NEXT: s_mov_b32 s1, s5
256 ; SI-NEXT: s_waitcnt vmcnt(0)
257 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
258 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
259 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
260 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
261 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
264 ; VI-LABEL: minnum_f16_imm_b:
265 ; VI: ; %bb.0: ; %entry
266 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
267 ; VI-NEXT: s_mov_b32 s3, 0xf000
268 ; VI-NEXT: s_mov_b32 s2, -1
269 ; VI-NEXT: s_waitcnt lgkmcnt(0)
270 ; VI-NEXT: s_mov_b32 s0, s4
271 ; VI-NEXT: s_mov_b32 s1, s5
272 ; VI-NEXT: s_mov_b32 s4, s6
273 ; VI-NEXT: s_mov_b32 s5, s7
274 ; VI-NEXT: s_mov_b32 s6, s2
275 ; VI-NEXT: s_mov_b32 s7, s3
276 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
277 ; VI-NEXT: s_waitcnt vmcnt(0)
278 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
279 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
280 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
283 ; GFX9-LABEL: minnum_f16_imm_b:
284 ; GFX9: ; %bb.0: ; %entry
285 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
286 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
287 ; GFX9-NEXT: s_mov_b32 s2, -1
288 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
289 ; GFX9-NEXT: s_mov_b32 s0, s4
290 ; GFX9-NEXT: s_mov_b32 s1, s5
291 ; GFX9-NEXT: s_mov_b32 s4, s6
292 ; GFX9-NEXT: s_mov_b32 s5, s7
293 ; GFX9-NEXT: s_mov_b32 s6, s2
294 ; GFX9-NEXT: s_mov_b32 s7, s3
295 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
296 ; GFX9-NEXT: s_waitcnt vmcnt(0)
297 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
298 ; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0
299 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
300 ; GFX9-NEXT: s_endpgm
302 ; GFX10-LABEL: minnum_f16_imm_b:
303 ; GFX10: ; %bb.0: ; %entry
304 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
305 ; GFX10-NEXT: s_mov_b32 s6, -1
306 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
307 ; GFX10-NEXT: s_mov_b32 s10, s6
308 ; GFX10-NEXT: s_mov_b32 s11, s7
309 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
310 ; GFX10-NEXT: s_mov_b32 s8, s2
311 ; GFX10-NEXT: s_mov_b32 s9, s3
312 ; GFX10-NEXT: s_mov_b32 s4, s0
313 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
314 ; GFX10-NEXT: s_mov_b32 s5, s1
315 ; GFX10-NEXT: s_waitcnt vmcnt(0)
316 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
317 ; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0
318 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
319 ; GFX10-NEXT: s_endpgm
320 half addrspace(1)* %r,
321 half addrspace(1)* %a) #0 {
323 %a.val = load half, half addrspace(1)* %a
324 %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0)
325 store half %r.val, half addrspace(1)* %r
; <2 x half> minnum in a compute kernel: loads both vectors, takes
; llvm.minnum.v2f16 and stores the result to %r.
; Expected code per target:
;  - gfx900/gfx1010 use packed instructions: v_pk_max_f16 x, x on each input,
;    then one v_pk_min_f16.
;  - fiji handles the two halves separately; the high half uses
;    v_min_f16_sdwa writing WORD_1, and the halves are combined with v_or_b32.
;  - tahiti converts each element to f32, multiplies by 1.0, uses v_min_f32,
;    and repacks with a shift and v_or_b32.
329 define amdgpu_kernel void @minnum_v2f16_ieee(
330 ; SI-LABEL: minnum_v2f16_ieee:
331 ; SI: ; %bb.0: ; %entry
332 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
333 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
334 ; SI-NEXT: s_mov_b32 s3, 0xf000
335 ; SI-NEXT: s_mov_b32 s2, -1
336 ; SI-NEXT: s_waitcnt lgkmcnt(0)
337 ; SI-NEXT: s_load_dword s6, s[6:7], 0x0
338 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0
339 ; SI-NEXT: s_waitcnt lgkmcnt(0)
340 ; SI-NEXT: s_lshr_b32 s1, s6, 16
341 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
342 ; SI-NEXT: s_lshr_b32 s0, s0, 16
343 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
344 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
345 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
346 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
347 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
348 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
349 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
350 ; SI-NEXT: v_min_f32_e32 v2, v3, v2
351 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
352 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
353 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
354 ; SI-NEXT: s_mov_b32 s0, s4
355 ; SI-NEXT: s_mov_b32 s1, s5
356 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
357 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
358 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
361 ; VI-LABEL: minnum_v2f16_ieee:
362 ; VI: ; %bb.0: ; %entry
363 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
364 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
365 ; VI-NEXT: s_mov_b32 s3, 0xf000
366 ; VI-NEXT: s_mov_b32 s2, -1
367 ; VI-NEXT: s_waitcnt lgkmcnt(0)
368 ; VI-NEXT: s_mov_b32 s0, s4
369 ; VI-NEXT: s_mov_b32 s1, s5
370 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0
371 ; VI-NEXT: s_load_dword s5, s[8:9], 0x0
372 ; VI-NEXT: s_waitcnt lgkmcnt(0)
373 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
374 ; VI-NEXT: v_max_f16_e64 v0, s5, s5
375 ; VI-NEXT: s_lshr_b32 s4, s4, 16
376 ; VI-NEXT: s_lshr_b32 s5, s5, 16
377 ; VI-NEXT: v_min_f16_e32 v0, v1, v0
378 ; VI-NEXT: v_max_f16_e64 v1, s5, s5
379 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
380 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
381 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
382 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
385 ; GFX9-LABEL: minnum_v2f16_ieee:
386 ; GFX9: ; %bb.0: ; %entry
387 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
388 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
389 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
390 ; GFX9-NEXT: s_mov_b32 s2, -1
391 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
392 ; GFX9-NEXT: s_mov_b32 s0, s4
393 ; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0
394 ; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0
395 ; GFX9-NEXT: s_mov_b32 s1, s5
396 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX9-NEXT: v_pk_max_f16 v1, s10, s10
398 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
399 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
400 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
401 ; GFX9-NEXT: s_endpgm
403 ; GFX10-LABEL: minnum_v2f16_ieee:
404 ; GFX10: ; %bb.0: ; %entry
405 ; GFX10-NEXT: s_clause 0x1
406 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
407 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
408 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
409 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
410 ; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
411 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
412 ; GFX10-NEXT: s_mov_b32 s6, -1
413 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
414 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
415 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1
416 ; GFX10-NEXT: v_pk_min_f16 v0, v1, v0
417 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
418 ; GFX10-NEXT: s_endpgm
419 <2 x half> addrspace(1)* %r,
420 <2 x half> addrspace(1)* %a,
421 <2 x half> addrspace(1)* %b) #0 {
423 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
424 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
425 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
426 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; <2 x half> minnum as an amdgpu_ps function on argument values. Expected code
; is a single v_pk_min_f16 on gfx900/gfx1010; a v_min_f16_sdwa for the high
; half plus v_min_f16 for the low half, combined with v_or_b32, on fiji; on
; tahiti a cvt to f16 and back to f32 of each of the four inputs followed by
; two v_min_f32. None of the targets get the v_max/v_mul of the inputs seen in
; minnum_v2f16_ieee.
430 define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 {
431 ; SI-LABEL: minnum_v2f16_no_ieee:
433 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
434 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
435 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
436 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
437 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
438 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
439 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
440 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
441 ; SI-NEXT: v_min_f32_e32 v0, v0, v2
442 ; SI-NEXT: v_min_f32_e32 v1, v1, v3
443 ; SI-NEXT: ; return to shader part epilog
445 ; VI-LABEL: minnum_v2f16_no_ieee:
447 ; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
448 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
449 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
450 ; VI-NEXT: ; return to shader part epilog
452 ; GFX9-LABEL: minnum_v2f16_no_ieee:
454 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
455 ; GFX9-NEXT: ; return to shader part epilog
457 ; GFX10-LABEL: minnum_v2f16_no_ieee:
459 ; GFX10-NEXT: v_pk_min_f16 v0, v0, v1
460 ; GFX10-NEXT: ; return to shader part epilog
461 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
462 ret <2 x half> %r.val
; <2 x half> minnum with the constant vector <3.0, 4.0> as the first operand
; and a vector loaded from %b as the second; the result is stored to %r.
; Expected constant handling per target:
;  - gfx900 materializes the packed constant 0x44004200 with s_mov_b32 and
;    uses it as the second source of v_pk_min_f16.
;  - gfx1010 uses 0x44004200 as a literal operand of v_pk_min_f16.
;  - fiji uses 0x4200 as a literal for the low half and moves 0x4400 into a
;    VGPR for the v_min_f16_sdwa on the high half.
;  - tahiti uses 0x40400000 for the low element and 4.0 for the high element.
465 define amdgpu_kernel void @minnum_v2f16_imm_a(
466 ; SI-LABEL: minnum_v2f16_imm_a:
467 ; SI: ; %bb.0: ; %entry
468 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
469 ; SI-NEXT: s_waitcnt lgkmcnt(0)
470 ; SI-NEXT: s_load_dword s2, s[2:3], 0x0
471 ; SI-NEXT: s_mov_b32 s3, 0xf000
472 ; SI-NEXT: s_waitcnt lgkmcnt(0)
473 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
474 ; SI-NEXT: s_lshr_b32 s2, s2, 16
475 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
476 ; SI-NEXT: s_mov_b32 s2, -1
477 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
478 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
479 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
480 ; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
481 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
482 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
483 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
484 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
485 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
488 ; VI-LABEL: minnum_v2f16_imm_a:
489 ; VI: ; %bb.0: ; %entry
490 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
491 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400
492 ; VI-NEXT: s_mov_b32 s3, 0xf000
493 ; VI-NEXT: s_mov_b32 s2, -1
494 ; VI-NEXT: s_waitcnt lgkmcnt(0)
495 ; VI-NEXT: s_mov_b32 s0, s4
496 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0
497 ; VI-NEXT: s_mov_b32 s1, s5
498 ; VI-NEXT: s_waitcnt lgkmcnt(0)
499 ; VI-NEXT: v_max_f16_e64 v0, s4, s4
500 ; VI-NEXT: s_lshr_b32 s4, s4, 16
501 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
502 ; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
503 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
504 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
505 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
508 ; GFX9-LABEL: minnum_v2f16_imm_a:
509 ; GFX9: ; %bb.0: ; %entry
510 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
511 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
512 ; GFX9-NEXT: s_mov_b32 s6, -1
513 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
514 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
515 ; GFX9-NEXT: s_mov_b32 s4, s0
516 ; GFX9-NEXT: s_mov_b32 s0, 0x44004200
517 ; GFX9-NEXT: s_mov_b32 s5, s1
518 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
519 ; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
520 ; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
521 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
522 ; GFX9-NEXT: s_endpgm
524 ; GFX10-LABEL: minnum_v2f16_imm_a:
525 ; GFX10: ; %bb.0: ; %entry
526 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
527 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
528 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
529 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
530 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
531 ; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
532 ; GFX10-NEXT: s_mov_b32 s2, -1
533 ; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0
534 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
535 ; GFX10-NEXT: s_endpgm
536 <2 x half> addrspace(1)* %r,
537 <2 x half> addrspace(1)* %b) #0 {
539 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
540 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
541 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; <2 x half> minnum with a vector loaded from %a as the first operand and the
; constant vector <4.0, 3.0> as the second; the result is stored to %r.
; Expected constant handling per target:
;  - gfx900 materializes the packed constant 0x42004400 with s_mov_b32 and
;    uses it as the second source of v_pk_min_f16.
;  - gfx1010 uses 0x42004400 as a literal operand of v_pk_min_f16.
;  - fiji uses 4.0 directly for the low half and moves 0x4200 into a VGPR for
;    the v_min_f16_sdwa on the high half.
;  - tahiti uses 4.0 for the low element and 0x40400000 for the high element.
545 define amdgpu_kernel void @minnum_v2f16_imm_b(
546 ; SI-LABEL: minnum_v2f16_imm_b:
547 ; SI: ; %bb.0: ; %entry
548 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
549 ; SI-NEXT: s_waitcnt lgkmcnt(0)
550 ; SI-NEXT: s_load_dword s2, s[2:3], 0x0
551 ; SI-NEXT: s_mov_b32 s3, 0xf000
552 ; SI-NEXT: s_waitcnt lgkmcnt(0)
553 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
554 ; SI-NEXT: s_lshr_b32 s2, s2, 16
555 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
556 ; SI-NEXT: s_mov_b32 s2, -1
557 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
558 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
559 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
560 ; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
561 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
562 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
563 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
564 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
565 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
568 ; VI-LABEL: minnum_v2f16_imm_b:
569 ; VI: ; %bb.0: ; %entry
570 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
571 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200
572 ; VI-NEXT: s_mov_b32 s3, 0xf000
573 ; VI-NEXT: s_mov_b32 s2, -1
574 ; VI-NEXT: s_waitcnt lgkmcnt(0)
575 ; VI-NEXT: s_mov_b32 s0, s4
576 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0
577 ; VI-NEXT: s_mov_b32 s1, s5
578 ; VI-NEXT: s_waitcnt lgkmcnt(0)
579 ; VI-NEXT: v_max_f16_e64 v0, s4, s4
580 ; VI-NEXT: s_lshr_b32 s4, s4, 16
581 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
582 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
583 ; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
584 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
585 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
588 ; GFX9-LABEL: minnum_v2f16_imm_b:
589 ; GFX9: ; %bb.0: ; %entry
590 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
591 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
592 ; GFX9-NEXT: s_mov_b32 s6, -1
593 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
594 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
595 ; GFX9-NEXT: s_mov_b32 s4, s0
596 ; GFX9-NEXT: s_mov_b32 s0, 0x42004400
597 ; GFX9-NEXT: s_mov_b32 s5, s1
598 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
600 ; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
601 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
602 ; GFX9-NEXT: s_endpgm
604 ; GFX10-LABEL: minnum_v2f16_imm_b:
605 ; GFX10: ; %bb.0: ; %entry
606 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
607 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
608 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
609 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
610 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
611 ; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
612 ; GFX10-NEXT: s_mov_b32 s2, -1
613 ; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0
614 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
615 ; GFX10-NEXT: s_endpgm
616 <2 x half> addrspace(1)* %r,
617 <2 x half> addrspace(1)* %a) #0 {
619 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
620 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
621 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; <3 x half> minnum in a compute kernel: loads both vectors, takes
; llvm.minnum.v3f16 and stores the result to %r.
; In the expected code each input is fetched with s_load_dwordx2 and the
; result is written with two stores: a buffer_store_short at offset 4 for the
; third element and a buffer_store_dword for the first two.
; On gfx900/gfx1010 the third element is still processed with the packed
; v_pk_max_f16/v_pk_min_f16 instructions.
625 ; FIXME: Scalarize with undef half
626 define amdgpu_kernel void @minnum_v3f16(
627 ; SI-LABEL: minnum_v3f16:
628 ; SI: ; %bb.0: ; %entry
629 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
630 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
631 ; SI-NEXT: s_mov_b32 s3, 0xf000
632 ; SI-NEXT: s_mov_b32 s2, -1
633 ; SI-NEXT: s_waitcnt lgkmcnt(0)
634 ; SI-NEXT: s_mov_b32 s0, s4
635 ; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
636 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
637 ; SI-NEXT: s_waitcnt lgkmcnt(0)
638 ; SI-NEXT: s_lshr_b32 s1, s6, 16
639 ; SI-NEXT: s_lshr_b32 s4, s8, 16
640 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
641 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
642 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s6
643 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s8
644 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s7
645 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
646 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
647 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
648 ; SI-NEXT: v_min_f32_e32 v2, v3, v2
649 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
650 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
651 ; SI-NEXT: v_min_f32_e32 v1, v1, v3
652 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
653 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
654 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
655 ; SI-NEXT: v_min_f32_e32 v0, v0, v3
656 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
657 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
658 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
659 ; SI-NEXT: s_mov_b32 s1, s5
660 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
661 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
662 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
665 ; VI-LABEL: minnum_v3f16:
666 ; VI: ; %bb.0: ; %entry
667 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
668 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
669 ; VI-NEXT: s_mov_b32 s3, 0xf000
670 ; VI-NEXT: s_mov_b32 s2, -1
671 ; VI-NEXT: s_waitcnt lgkmcnt(0)
672 ; VI-NEXT: s_mov_b32 s0, s4
673 ; VI-NEXT: s_mov_b32 s1, s5
674 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
675 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
676 ; VI-NEXT: s_waitcnt lgkmcnt(0)
677 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
678 ; VI-NEXT: v_max_f16_e64 v0, s6, s6
679 ; VI-NEXT: s_lshr_b32 s4, s4, 16
680 ; VI-NEXT: s_lshr_b32 s6, s6, 16
681 ; VI-NEXT: v_min_f16_e32 v0, v1, v0
682 ; VI-NEXT: v_max_f16_e64 v1, s6, s6
683 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
684 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
685 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
686 ; VI-NEXT: v_max_f16_e64 v1, s7, s7
687 ; VI-NEXT: v_max_f16_e64 v2, s5, s5
688 ; VI-NEXT: v_min_f16_e32 v1, v2, v1
689 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
690 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
693 ; GFX9-LABEL: minnum_v3f16:
694 ; GFX9: ; %bb.0: ; %entry
695 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
696 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
697 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
698 ; GFX9-NEXT: s_mov_b32 s2, -1
699 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
700 ; GFX9-NEXT: s_mov_b32 s0, s4
701 ; GFX9-NEXT: s_mov_b32 s1, s5
702 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
703 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
704 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
705 ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
706 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
707 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
708 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11
709 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
710 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2
711 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
712 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
713 ; GFX9-NEXT: s_endpgm
715 ; GFX10-LABEL: minnum_v3f16:
716 ; GFX10: ; %bb.0: ; %entry
717 ; GFX10-NEXT: s_clause 0x1
718 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
719 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
720 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
721 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
722 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
723 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
724 ; GFX10-NEXT: s_mov_b32 s6, -1
725 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
726 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1
727 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9
728 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
729 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
730 ; GFX10-NEXT: v_pk_min_f16 v1, v2, v1
731 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v0
732 ; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
733 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
734 ; GFX10-NEXT: s_endpgm
735 <3 x half> addrspace(1)* %r,
736 <3 x half> addrspace(1)* %a,
737 <3 x half> addrspace(1)* %b) #0 {
739 %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
740 %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
741 %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
742 store <3 x half> %r.val, <3 x half> addrspace(1)* %r
; <4 x half> minnum in a compute kernel: loads both vectors, takes
; llvm.minnum.v4f16 and stores the result with a single buffer_store_dwordx2.
; gfx900/gfx1010 are expected to use two v_pk_min_f16 (one per dword), each
; fed by v_pk_max_f16 x, x of its inputs; fiji and tahiti handle the four
; elements individually and repack them with v_or_b32.
746 define amdgpu_kernel void @minnum_v4f16(
747 ; SI-LABEL: minnum_v4f16:
748 ; SI: ; %bb.0: ; %entry
749 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
750 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
751 ; SI-NEXT: s_mov_b32 s3, 0xf000
752 ; SI-NEXT: s_mov_b32 s2, -1
753 ; SI-NEXT: s_waitcnt lgkmcnt(0)
754 ; SI-NEXT: s_mov_b32 s0, s4
755 ; SI-NEXT: s_mov_b32 s1, s5
756 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
757 ; SI-NEXT: s_waitcnt lgkmcnt(0)
758 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
759 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
760 ; SI-NEXT: s_lshr_b32 s4, s4, 16
761 ; SI-NEXT: s_lshr_b32 s5, s5, 16
762 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
763 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5
764 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
765 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
766 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
767 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
768 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
769 ; SI-NEXT: s_waitcnt lgkmcnt(0)
770 ; SI-NEXT: s_lshr_b32 s6, s5, 16
771 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
772 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
773 ; SI-NEXT: s_lshr_b32 s4, s4, 16
774 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
775 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
776 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
777 ; SI-NEXT: v_min_f32_e32 v3, v3, v5
778 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
779 ; SI-NEXT: v_min_f32_e32 v1, v1, v5
780 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
781 ; SI-NEXT: v_min_f32_e32 v2, v2, v5
782 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
783 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
784 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
785 ; SI-NEXT: v_min_f32_e32 v0, v0, v4
786 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
787 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
788 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
789 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
790 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
791 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
792 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
795 ; VI-LABEL: minnum_v4f16:
796 ; VI: ; %bb.0: ; %entry
797 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
798 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
799 ; VI-NEXT: s_mov_b32 s3, 0xf000
800 ; VI-NEXT: s_mov_b32 s2, -1
801 ; VI-NEXT: s_waitcnt lgkmcnt(0)
802 ; VI-NEXT: s_mov_b32 s0, s4
803 ; VI-NEXT: s_mov_b32 s1, s5
804 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
805 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
806 ; VI-NEXT: s_waitcnt lgkmcnt(0)
807 ; VI-NEXT: v_max_f16_e64 v1, s5, s5
808 ; VI-NEXT: v_max_f16_e64 v0, s7, s7
809 ; VI-NEXT: s_lshr_b32 s5, s5, 16
810 ; VI-NEXT: s_lshr_b32 s7, s7, 16
811 ; VI-NEXT: v_min_f16_e32 v0, v1, v0
812 ; VI-NEXT: v_max_f16_e64 v2, s5, s5
813 ; VI-NEXT: v_max_f16_e64 v1, s7, s7
814 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
815 ; VI-NEXT: v_or_b32_e32 v1, v0, v1
816 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
817 ; VI-NEXT: v_max_f16_e64 v0, s6, s6
818 ; VI-NEXT: s_lshr_b32 s4, s4, 16
819 ; VI-NEXT: s_lshr_b32 s5, s6, 16
820 ; VI-NEXT: v_min_f16_e32 v0, v2, v0
821 ; VI-NEXT: v_max_f16_e64 v2, s5, s5
822 ; VI-NEXT: v_max_f16_e64 v3, s4, s4
823 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
824 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
825 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
828 ; GFX9-LABEL: minnum_v4f16:
829 ; GFX9: ; %bb.0: ; %entry
830 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
831 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
832 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
833 ; GFX9-NEXT: s_mov_b32 s2, -1
834 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
835 ; GFX9-NEXT: s_mov_b32 s0, s4
836 ; GFX9-NEXT: s_mov_b32 s1, s5
837 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
838 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
839 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
840 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
841 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
842 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v0
843 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
844 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
845 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
846 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
847 ; GFX9-NEXT: s_endpgm
849 ; GFX10-LABEL: minnum_v4f16:
850 ; GFX10: ; %bb.0: ; %entry
851 ; GFX10-NEXT: s_clause 0x1
852 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
853 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
854 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
855 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
856 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
857 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
858 ; GFX10-NEXT: s_mov_b32 s6, -1
859 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
860 ; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
861 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9
862 ; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
863 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
864 ; GFX10-NEXT: v_pk_min_f16 v1, v1, v0
865 ; GFX10-NEXT: v_pk_min_f16 v0, v3, v2
866 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
867 ; GFX10-NEXT: s_endpgm
868 <4 x half> addrspace(1)* %r,
869 <4 x half> addrspace(1)* %a,
870 <4 x half> addrspace(1)* %b) #0 {
872 %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
873 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
874 %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
875 store <4 x half> %r.val, <4 x half> addrspace(1)* %r
; fmin_v4f16_imm_a - minnum on <4 x half> where the first operand ("a") is the
; constant vector <8.0, 2.0, 3.0, 4.0> and the second operand is loaded from %b;
; the result is stored to %r.
; The check lines below (SI, VI, GFX9 and GFX10 prefixes) are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
; In those lines the constant elements appear as f32 values on SI (0x41000000 is
; 8.0, 0x40400000 is 3.0, plus literal 2.0 and 4.0), as f16 bit patterns on VI
; (0x4800, 0x4000, 0x4200, 0x4400) and as packed f16 pairs on GFX9 and GFX10
; (0x40004800 holds 2.0 and 8.0, 0x44004200 holds 4.0 and 3.0).
; Review note - the v_mul_f32 by 1.0 (SI) and v_max_f16 / v_pk_max_f16 of a value
; with itself that precede each min look like input canonicalization; confirm
; against the backend before relying on that reading.
879 define amdgpu_kernel void @fmin_v4f16_imm_a(
880 ; SI-LABEL: fmin_v4f16_imm_a:
881 ; SI: ; %bb.0: ; %entry
882 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
883 ; SI-NEXT: s_mov_b32 s3, 0xf000
884 ; SI-NEXT: s_mov_b32 s2, -1
885 ; SI-NEXT: s_waitcnt lgkmcnt(0)
886 ; SI-NEXT: s_mov_b32 s0, s4
887 ; SI-NEXT: s_mov_b32 s1, s5
888 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
889 ; SI-NEXT: s_waitcnt lgkmcnt(0)
890 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
891 ; SI-NEXT: s_lshr_b32 s5, s5, 16
892 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
893 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
894 ; SI-NEXT: s_lshr_b32 s4, s4, 16
895 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
896 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
897 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
898 ; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
899 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
900 ; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
901 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
902 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3
903 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
904 ; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
905 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
906 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
907 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
908 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
909 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
910 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
911 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
912 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
915 ; VI-LABEL: fmin_v4f16_imm_a:
916 ; VI: ; %bb.0: ; %entry
917 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
918 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400
919 ; VI-NEXT: s_mov_b32 s3, 0xf000
920 ; VI-NEXT: s_mov_b32 s2, -1
921 ; VI-NEXT: s_waitcnt lgkmcnt(0)
922 ; VI-NEXT: s_mov_b32 s0, s4
923 ; VI-NEXT: s_mov_b32 s1, s5
924 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
925 ; VI-NEXT: s_waitcnt lgkmcnt(0)
926 ; VI-NEXT: v_max_f16_e64 v1, s5, s5
927 ; VI-NEXT: s_lshr_b32 s5, s5, 16
928 ; VI-NEXT: v_max_f16_e64 v3, s5, s5
929 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
930 ; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
931 ; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
932 ; VI-NEXT: s_lshr_b32 s4, s4, 16
933 ; VI-NEXT: v_or_b32_e32 v1, v1, v0
934 ; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
935 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
936 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000
937 ; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
938 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
939 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
942 ; GFX9-LABEL: fmin_v4f16_imm_a:
943 ; GFX9: ; %bb.0: ; %entry
944 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
945 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200
946 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800
947 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
948 ; GFX9-NEXT: s_mov_b32 s2, -1
949 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
951 ; GFX9-NEXT: s_mov_b32 s0, s4
952 ; GFX9-NEXT: s_mov_b32 s1, s5
953 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
954 ; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
955 ; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
956 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
957 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
958 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
959 ; GFX9-NEXT: s_endpgm
961 ; GFX10-LABEL: fmin_v4f16_imm_a:
962 ; GFX10: ; %bb.0: ; %entry
963 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
964 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
965 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
966 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
967 ; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
968 ; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
969 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
970 ; GFX10-NEXT: s_mov_b32 s2, -1
971 ; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0
972 ; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2
973 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
974 ; GFX10-NEXT: s_endpgm
975 <4 x half> addrspace(1)* %r,
976 <4 x half> addrspace(1)* %b) #0 {
; Only %b is loaded from memory; operand a of the intrinsic is the immediate
; vector, and the minnum result is written to %r.
978 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
979 %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
980 store <4 x half> %r.val, <4 x half> addrspace(1)* %r
; Attribute group #0, referenced by the kernel definitions in this file via the
; trailing "#0" on their signatures. It sets the string attribute
; "denormal-fp-math-f32" to "preserve-sign,preserve-sign"; per the LLVM LangRef
; this names the denormal handling mode for 32-bit float operations, given as an
; output mode followed by an input mode.
984 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }