1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
7 declare half @llvm.maxnum.f16(half %a, half %b)
8 declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
9 declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
10 declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
12 define amdgpu_kernel void @maxnum_f16(
13 ; SI-LABEL: maxnum_f16:
14 ; SI: ; %bb.0: ; %entry
15 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
16 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
17 ; SI-NEXT: s_mov_b32 s3, 0xf000
18 ; SI-NEXT: s_mov_b32 s2, -1
19 ; SI-NEXT: s_mov_b32 s14, s2
20 ; SI-NEXT: s_waitcnt lgkmcnt(0)
21 ; SI-NEXT: s_mov_b32 s12, s6
22 ; SI-NEXT: s_mov_b32 s13, s7
23 ; SI-NEXT: s_mov_b32 s15, s3
24 ; SI-NEXT: s_mov_b32 s10, s2
25 ; SI-NEXT: s_mov_b32 s11, s3
26 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
27 ; SI-NEXT: s_waitcnt vmcnt(0)
28 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: s_mov_b32 s0, s4
31 ; SI-NEXT: s_mov_b32 s1, s5
32 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
33 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
34 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
35 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
36 ; SI-NEXT: v_max_f32_e32 v0, v0, v1
37 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
38 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
41 ; VI-LABEL: maxnum_f16:
42 ; VI: ; %bb.0: ; %entry
43 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
44 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
45 ; VI-NEXT: s_mov_b32 s3, 0xf000
46 ; VI-NEXT: s_mov_b32 s2, -1
47 ; VI-NEXT: s_mov_b32 s14, s2
48 ; VI-NEXT: s_waitcnt lgkmcnt(0)
49 ; VI-NEXT: s_mov_b32 s12, s6
50 ; VI-NEXT: s_mov_b32 s13, s7
51 ; VI-NEXT: s_mov_b32 s15, s3
52 ; VI-NEXT: s_mov_b32 s10, s2
53 ; VI-NEXT: s_mov_b32 s11, s3
54 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
55 ; VI-NEXT: s_waitcnt vmcnt(0)
56 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
57 ; VI-NEXT: s_waitcnt vmcnt(0)
58 ; VI-NEXT: s_mov_b32 s0, s4
59 ; VI-NEXT: s_mov_b32 s1, s5
60 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
61 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
62 ; VI-NEXT: v_max_f16_e32 v0, v0, v1
63 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
66 ; GFX9-LABEL: maxnum_f16:
67 ; GFX9: ; %bb.0: ; %entry
68 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
69 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
70 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
71 ; GFX9-NEXT: s_mov_b32 s2, -1
72 ; GFX9-NEXT: s_mov_b32 s14, s2
73 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX9-NEXT: s_mov_b32 s12, s6
75 ; GFX9-NEXT: s_mov_b32 s13, s7
76 ; GFX9-NEXT: s_mov_b32 s15, s3
77 ; GFX9-NEXT: s_mov_b32 s10, s2
78 ; GFX9-NEXT: s_mov_b32 s11, s3
79 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
80 ; GFX9-NEXT: s_waitcnt vmcnt(0)
81 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
82 ; GFX9-NEXT: s_waitcnt vmcnt(0)
83 ; GFX9-NEXT: s_mov_b32 s0, s4
84 ; GFX9-NEXT: s_mov_b32 s1, s5
85 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
86 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
87 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
88 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
91 ; GFX10-LABEL: maxnum_f16:
92 ; GFX10: ; %bb.0: ; %entry
93 ; GFX10-NEXT: s_clause 0x1
94 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
95 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
96 ; GFX10-NEXT: s_mov_b32 s2, -1
97 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
98 ; GFX10-NEXT: s_mov_b32 s14, s2
99 ; GFX10-NEXT: s_mov_b32 s15, s3
100 ; GFX10-NEXT: s_mov_b32 s10, s2
101 ; GFX10-NEXT: s_mov_b32 s11, s3
102 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
103 ; GFX10-NEXT: s_mov_b32 s12, s6
104 ; GFX10-NEXT: s_mov_b32 s13, s7
105 ; GFX10-NEXT: s_mov_b32 s0, s4
106 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
107 ; GFX10-NEXT: s_waitcnt vmcnt(0)
108 ; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
109 ; GFX10-NEXT: s_waitcnt vmcnt(0)
110 ; GFX10-NEXT: s_mov_b32 s1, s5
111 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
112 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
113 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1
114 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
115 ; GFX10-NEXT: s_endpgm
116 half addrspace(1)* %r,
117 half addrspace(1)* %a,
118 half addrspace(1)* %b) #0 {
120 %a.val = load volatile half, half addrspace(1)* %a
121 %b.val = load volatile half, half addrspace(1)* %b
122 %r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
123 store half %r.val, half addrspace(1)* %r
127 define amdgpu_kernel void @maxnum_f16_imm_a(
128 ; SI-LABEL: maxnum_f16_imm_a:
129 ; SI: ; %bb.0: ; %entry
130 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
131 ; SI-NEXT: s_mov_b32 s3, 0xf000
132 ; SI-NEXT: s_mov_b32 s2, -1
133 ; SI-NEXT: s_mov_b32 s10, s2
134 ; SI-NEXT: s_mov_b32 s11, s3
135 ; SI-NEXT: s_waitcnt lgkmcnt(0)
136 ; SI-NEXT: s_mov_b32 s8, s6
137 ; SI-NEXT: s_mov_b32 s9, s7
138 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
139 ; SI-NEXT: s_mov_b32 s0, s4
140 ; SI-NEXT: s_mov_b32 s1, s5
141 ; SI-NEXT: s_waitcnt vmcnt(0)
142 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
143 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
144 ; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
145 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
146 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
149 ; VI-LABEL: maxnum_f16_imm_a:
150 ; VI: ; %bb.0: ; %entry
151 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
152 ; VI-NEXT: s_mov_b32 s3, 0xf000
153 ; VI-NEXT: s_mov_b32 s2, -1
154 ; VI-NEXT: s_waitcnt lgkmcnt(0)
155 ; VI-NEXT: s_mov_b32 s0, s4
156 ; VI-NEXT: s_mov_b32 s1, s5
157 ; VI-NEXT: s_mov_b32 s4, s6
158 ; VI-NEXT: s_mov_b32 s5, s7
159 ; VI-NEXT: s_mov_b32 s6, s2
160 ; VI-NEXT: s_mov_b32 s7, s3
161 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
162 ; VI-NEXT: s_waitcnt vmcnt(0)
163 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
164 ; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
165 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
168 ; GFX9-LABEL: maxnum_f16_imm_a:
169 ; GFX9: ; %bb.0: ; %entry
170 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
171 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
172 ; GFX9-NEXT: s_mov_b32 s2, -1
173 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX9-NEXT: s_mov_b32 s0, s4
175 ; GFX9-NEXT: s_mov_b32 s1, s5
176 ; GFX9-NEXT: s_mov_b32 s4, s6
177 ; GFX9-NEXT: s_mov_b32 s5, s7
178 ; GFX9-NEXT: s_mov_b32 s6, s2
179 ; GFX9-NEXT: s_mov_b32 s7, s3
180 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
181 ; GFX9-NEXT: s_waitcnt vmcnt(0)
182 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
183 ; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0
184 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
185 ; GFX9-NEXT: s_endpgm
187 ; GFX10-LABEL: maxnum_f16_imm_a:
188 ; GFX10: ; %bb.0: ; %entry
189 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
190 ; GFX10-NEXT: s_mov_b32 s6, -1
191 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
192 ; GFX10-NEXT: s_mov_b32 s10, s6
193 ; GFX10-NEXT: s_mov_b32 s11, s7
194 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
195 ; GFX10-NEXT: s_mov_b32 s8, s2
196 ; GFX10-NEXT: s_mov_b32 s9, s3
197 ; GFX10-NEXT: s_mov_b32 s4, s0
198 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
199 ; GFX10-NEXT: s_mov_b32 s5, s1
200 ; GFX10-NEXT: s_waitcnt vmcnt(0)
201 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
202 ; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0
203 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
204 ; GFX10-NEXT: s_endpgm
205 half addrspace(1)* %r,
206 half addrspace(1)* %b) #0 {
208 %b.val = load half, half addrspace(1)* %b
209 %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val)
210 store half %r.val, half addrspace(1)* %r
214 define amdgpu_kernel void @maxnum_f16_imm_b(
215 ; SI-LABEL: maxnum_f16_imm_b:
216 ; SI: ; %bb.0: ; %entry
217 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
218 ; SI-NEXT: s_mov_b32 s3, 0xf000
219 ; SI-NEXT: s_mov_b32 s2, -1
220 ; SI-NEXT: s_mov_b32 s10, s2
221 ; SI-NEXT: s_mov_b32 s11, s3
222 ; SI-NEXT: s_waitcnt lgkmcnt(0)
223 ; SI-NEXT: s_mov_b32 s8, s6
224 ; SI-NEXT: s_mov_b32 s9, s7
225 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
226 ; SI-NEXT: s_mov_b32 s0, s4
227 ; SI-NEXT: s_mov_b32 s1, s5
228 ; SI-NEXT: s_waitcnt vmcnt(0)
229 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
230 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
231 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
232 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
233 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
236 ; VI-LABEL: maxnum_f16_imm_b:
237 ; VI: ; %bb.0: ; %entry
238 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
239 ; VI-NEXT: s_mov_b32 s3, 0xf000
240 ; VI-NEXT: s_mov_b32 s2, -1
241 ; VI-NEXT: s_waitcnt lgkmcnt(0)
242 ; VI-NEXT: s_mov_b32 s0, s4
243 ; VI-NEXT: s_mov_b32 s1, s5
244 ; VI-NEXT: s_mov_b32 s4, s6
245 ; VI-NEXT: s_mov_b32 s5, s7
246 ; VI-NEXT: s_mov_b32 s6, s2
247 ; VI-NEXT: s_mov_b32 s7, s3
248 ; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
249 ; VI-NEXT: s_waitcnt vmcnt(0)
250 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
251 ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
252 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
255 ; GFX9-LABEL: maxnum_f16_imm_b:
256 ; GFX9: ; %bb.0: ; %entry
257 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
258 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
259 ; GFX9-NEXT: s_mov_b32 s2, -1
260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX9-NEXT: s_mov_b32 s0, s4
262 ; GFX9-NEXT: s_mov_b32 s1, s5
263 ; GFX9-NEXT: s_mov_b32 s4, s6
264 ; GFX9-NEXT: s_mov_b32 s5, s7
265 ; GFX9-NEXT: s_mov_b32 s6, s2
266 ; GFX9-NEXT: s_mov_b32 s7, s3
267 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
269 ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
270 ; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0
271 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
272 ; GFX9-NEXT: s_endpgm
274 ; GFX10-LABEL: maxnum_f16_imm_b:
275 ; GFX10: ; %bb.0: ; %entry
276 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
277 ; GFX10-NEXT: s_mov_b32 s6, -1
278 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
279 ; GFX10-NEXT: s_mov_b32 s10, s6
280 ; GFX10-NEXT: s_mov_b32 s11, s7
281 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
282 ; GFX10-NEXT: s_mov_b32 s8, s2
283 ; GFX10-NEXT: s_mov_b32 s9, s3
284 ; GFX10-NEXT: s_mov_b32 s4, s0
285 ; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
286 ; GFX10-NEXT: s_mov_b32 s5, s1
287 ; GFX10-NEXT: s_waitcnt vmcnt(0)
288 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
289 ; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0
290 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
291 ; GFX10-NEXT: s_endpgm
292 half addrspace(1)* %r,
293 half addrspace(1)* %a) #0 {
295 %a.val = load half, half addrspace(1)* %a
296 %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0)
297 store half %r.val, half addrspace(1)* %r
301 define amdgpu_kernel void @maxnum_v2f16(
302 ; SI-LABEL: maxnum_v2f16:
303 ; SI: ; %bb.0: ; %entry
304 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
305 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
306 ; SI-NEXT: s_mov_b32 s3, 0xf000
307 ; SI-NEXT: s_mov_b32 s2, -1
308 ; SI-NEXT: s_waitcnt lgkmcnt(0)
309 ; SI-NEXT: s_load_dword s6, s[6:7], 0x0
310 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0
311 ; SI-NEXT: s_waitcnt lgkmcnt(0)
312 ; SI-NEXT: s_lshr_b32 s1, s6, 16
313 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
314 ; SI-NEXT: s_lshr_b32 s0, s0, 16
315 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
316 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
317 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
318 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
319 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
320 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
321 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
322 ; SI-NEXT: v_max_f32_e32 v2, v3, v2
323 ; SI-NEXT: v_max_f32_e32 v0, v0, v1
324 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
325 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
326 ; SI-NEXT: s_mov_b32 s0, s4
327 ; SI-NEXT: s_mov_b32 s1, s5
328 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
329 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
330 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
333 ; VI-LABEL: maxnum_v2f16:
334 ; VI: ; %bb.0: ; %entry
335 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
336 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
337 ; VI-NEXT: s_mov_b32 s3, 0xf000
338 ; VI-NEXT: s_mov_b32 s2, -1
339 ; VI-NEXT: s_waitcnt lgkmcnt(0)
340 ; VI-NEXT: s_mov_b32 s0, s4
341 ; VI-NEXT: s_mov_b32 s1, s5
342 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0
343 ; VI-NEXT: s_load_dword s5, s[8:9], 0x0
344 ; VI-NEXT: s_waitcnt lgkmcnt(0)
345 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
346 ; VI-NEXT: v_max_f16_e64 v0, s5, s5
347 ; VI-NEXT: s_lshr_b32 s4, s4, 16
348 ; VI-NEXT: s_lshr_b32 s5, s5, 16
349 ; VI-NEXT: v_max_f16_e32 v0, v1, v0
350 ; VI-NEXT: v_max_f16_e64 v1, s5, s5
351 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
352 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
353 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
354 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
357 ; GFX9-LABEL: maxnum_v2f16:
358 ; GFX9: ; %bb.0: ; %entry
359 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
360 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
361 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
362 ; GFX9-NEXT: s_mov_b32 s2, -1
363 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
364 ; GFX9-NEXT: s_mov_b32 s0, s4
365 ; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0
366 ; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0
367 ; GFX9-NEXT: s_mov_b32 s1, s5
368 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
369 ; GFX9-NEXT: v_pk_max_f16 v1, s10, s10
370 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
371 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
372 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
373 ; GFX9-NEXT: s_endpgm
375 ; GFX10-LABEL: maxnum_v2f16:
376 ; GFX10: ; %bb.0: ; %entry
377 ; GFX10-NEXT: s_clause 0x1
378 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
379 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
380 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
382 ; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
383 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
384 ; GFX10-NEXT: s_mov_b32 s6, -1
385 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
386 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
387 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1
388 ; GFX10-NEXT: v_pk_max_f16 v0, v1, v0
389 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
390 ; GFX10-NEXT: s_endpgm
391 <2 x half> addrspace(1)* %r,
392 <2 x half> addrspace(1)* %a,
393 <2 x half> addrspace(1)* %b) #0 {
395 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
396 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
397 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val)
398 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
402 define amdgpu_kernel void @maxnum_v2f16_imm_a(
403 ; SI-LABEL: maxnum_v2f16_imm_a:
404 ; SI: ; %bb.0: ; %entry
405 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
406 ; SI-NEXT: s_waitcnt lgkmcnt(0)
407 ; SI-NEXT: s_load_dword s2, s[2:3], 0x0
408 ; SI-NEXT: s_mov_b32 s3, 0xf000
409 ; SI-NEXT: s_waitcnt lgkmcnt(0)
410 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
411 ; SI-NEXT: s_lshr_b32 s2, s2, 16
412 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
413 ; SI-NEXT: s_mov_b32 s2, -1
414 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
415 ; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
416 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
417 ; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
418 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
419 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
420 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
421 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
422 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
425 ; VI-LABEL: maxnum_v2f16_imm_a:
426 ; VI: ; %bb.0: ; %entry
427 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
428 ; VI-NEXT: v_mov_b32_e32 v2, 0x4400
429 ; VI-NEXT: s_mov_b32 s3, 0xf000
430 ; VI-NEXT: s_mov_b32 s2, -1
431 ; VI-NEXT: s_waitcnt lgkmcnt(0)
432 ; VI-NEXT: s_mov_b32 s0, s4
433 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0
434 ; VI-NEXT: s_mov_b32 s1, s5
435 ; VI-NEXT: s_waitcnt lgkmcnt(0)
436 ; VI-NEXT: v_max_f16_e64 v0, s4, s4
437 ; VI-NEXT: s_lshr_b32 s4, s4, 16
438 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
439 ; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
440 ; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
441 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
442 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
445 ; GFX9-LABEL: maxnum_v2f16_imm_a:
446 ; GFX9: ; %bb.0: ; %entry
447 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
448 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
449 ; GFX9-NEXT: s_mov_b32 s6, -1
450 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
451 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
452 ; GFX9-NEXT: s_mov_b32 s4, s0
453 ; GFX9-NEXT: s_mov_b32 s0, 0x44004200
454 ; GFX9-NEXT: s_mov_b32 s5, s1
455 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
456 ; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
457 ; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
458 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
459 ; GFX9-NEXT: s_endpgm
461 ; GFX10-LABEL: maxnum_v2f16_imm_a:
462 ; GFX10: ; %bb.0: ; %entry
463 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
464 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
465 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
466 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
467 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
468 ; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
469 ; GFX10-NEXT: s_mov_b32 s2, -1
470 ; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0
471 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
472 ; GFX10-NEXT: s_endpgm
473 <2 x half> addrspace(1)* %r,
474 <2 x half> addrspace(1)* %b) #0 {
476 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
477 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val)
478 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
482 define amdgpu_kernel void @maxnum_v2f16_imm_b(
483 ; SI-LABEL: maxnum_v2f16_imm_b:
484 ; SI: ; %bb.0: ; %entry
485 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
486 ; SI-NEXT: s_waitcnt lgkmcnt(0)
487 ; SI-NEXT: s_load_dword s2, s[2:3], 0x0
488 ; SI-NEXT: s_mov_b32 s3, 0xf000
489 ; SI-NEXT: s_waitcnt lgkmcnt(0)
490 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
491 ; SI-NEXT: s_lshr_b32 s2, s2, 16
492 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
493 ; SI-NEXT: s_mov_b32 s2, -1
494 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
495 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
496 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
497 ; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
498 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
499 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
500 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
501 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
502 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
505 ; VI-LABEL: maxnum_v2f16_imm_b:
506 ; VI: ; %bb.0: ; %entry
507 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
508 ; VI-NEXT: v_mov_b32_e32 v2, 0x4200
509 ; VI-NEXT: s_mov_b32 s3, 0xf000
510 ; VI-NEXT: s_mov_b32 s2, -1
511 ; VI-NEXT: s_waitcnt lgkmcnt(0)
512 ; VI-NEXT: s_mov_b32 s0, s4
513 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0
514 ; VI-NEXT: s_mov_b32 s1, s5
515 ; VI-NEXT: s_waitcnt lgkmcnt(0)
516 ; VI-NEXT: v_max_f16_e64 v0, s4, s4
517 ; VI-NEXT: s_lshr_b32 s4, s4, 16
518 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
519 ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
520 ; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
521 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
522 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
525 ; GFX9-LABEL: maxnum_v2f16_imm_b:
526 ; GFX9: ; %bb.0: ; %entry
527 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
528 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
529 ; GFX9-NEXT: s_mov_b32 s6, -1
530 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
531 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
532 ; GFX9-NEXT: s_mov_b32 s4, s0
533 ; GFX9-NEXT: s_mov_b32 s0, 0x42004400
534 ; GFX9-NEXT: s_mov_b32 s5, s1
535 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
536 ; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
537 ; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
538 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
539 ; GFX9-NEXT: s_endpgm
541 ; GFX10-LABEL: maxnum_v2f16_imm_b:
542 ; GFX10: ; %bb.0: ; %entry
543 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
544 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
545 ; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
546 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
547 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
548 ; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
549 ; GFX10-NEXT: s_mov_b32 s2, -1
550 ; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0
551 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
552 ; GFX10-NEXT: s_endpgm
553 <2 x half> addrspace(1)* %r,
554 <2 x half> addrspace(1)* %a) #0 {
556 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
557 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>)
558 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
562 ; FIXME: Scalarize with undef half
563 define amdgpu_kernel void @maxnum_v3f16(
564 ; SI-LABEL: maxnum_v3f16:
565 ; SI: ; %bb.0: ; %entry
566 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
567 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
568 ; SI-NEXT: s_mov_b32 s3, 0xf000
569 ; SI-NEXT: s_mov_b32 s2, -1
570 ; SI-NEXT: s_waitcnt lgkmcnt(0)
571 ; SI-NEXT: s_mov_b32 s0, s4
572 ; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
573 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
574 ; SI-NEXT: s_waitcnt lgkmcnt(0)
575 ; SI-NEXT: s_lshr_b32 s1, s6, 16
576 ; SI-NEXT: s_lshr_b32 s4, s8, 16
577 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
578 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
579 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s6
580 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s8
581 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s7
582 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s9
583 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
584 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
585 ; SI-NEXT: v_max_f32_e32 v2, v3, v2
586 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
587 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
588 ; SI-NEXT: v_max_f32_e32 v1, v1, v3
589 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
590 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
591 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
592 ; SI-NEXT: v_max_f32_e32 v0, v0, v3
593 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
594 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
595 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
596 ; SI-NEXT: s_mov_b32 s1, s5
597 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
598 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
599 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
602 ; VI-LABEL: maxnum_v3f16:
603 ; VI: ; %bb.0: ; %entry
604 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
605 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
606 ; VI-NEXT: s_mov_b32 s3, 0xf000
607 ; VI-NEXT: s_mov_b32 s2, -1
608 ; VI-NEXT: s_waitcnt lgkmcnt(0)
609 ; VI-NEXT: s_mov_b32 s0, s4
610 ; VI-NEXT: s_mov_b32 s1, s5
611 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
612 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
613 ; VI-NEXT: s_waitcnt lgkmcnt(0)
614 ; VI-NEXT: v_max_f16_e64 v1, s4, s4
615 ; VI-NEXT: v_max_f16_e64 v0, s6, s6
616 ; VI-NEXT: s_lshr_b32 s4, s4, 16
617 ; VI-NEXT: s_lshr_b32 s6, s6, 16
618 ; VI-NEXT: v_max_f16_e32 v0, v1, v0
619 ; VI-NEXT: v_max_f16_e64 v1, s6, s6
620 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
621 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
622 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
623 ; VI-NEXT: v_max_f16_e64 v1, s7, s7
624 ; VI-NEXT: v_max_f16_e64 v2, s5, s5
625 ; VI-NEXT: v_max_f16_e32 v1, v2, v1
626 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
627 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
630 ; GFX9-LABEL: maxnum_v3f16:
631 ; GFX9: ; %bb.0: ; %entry
632 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
633 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
634 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
635 ; GFX9-NEXT: s_mov_b32 s2, -1
636 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
637 ; GFX9-NEXT: s_mov_b32 s0, s4
638 ; GFX9-NEXT: s_mov_b32 s1, s5
639 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
640 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
641 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
642 ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
643 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
644 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
645 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11
646 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
647 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
648 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
649 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
650 ; GFX9-NEXT: s_endpgm
652 ; GFX10-LABEL: maxnum_v3f16:
653 ; GFX10: ; %bb.0: ; %entry
654 ; GFX10-NEXT: s_clause 0x1
655 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
656 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
657 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
658 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
659 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
660 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
661 ; GFX10-NEXT: s_mov_b32 s6, -1
662 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
663 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1
664 ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9
665 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
666 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
667 ; GFX10-NEXT: v_pk_max_f16 v1, v2, v1
668 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v0
669 ; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
670 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
671 ; GFX10-NEXT: s_endpgm
672 <3 x half> addrspace(1)* %r,
673 <3 x half> addrspace(1)* %a,
674 <3 x half> addrspace(1)* %b) #0 {
676 %a.val = load <3 x half>, <3 x half> addrspace(1)* %a
677 %b.val = load <3 x half>, <3 x half> addrspace(1)* %b
678 %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
679 store <3 x half> %r.val, <3 x half> addrspace(1)* %r
683 define amdgpu_kernel void @maxnum_v4f16(
684 ; SI-LABEL: maxnum_v4f16:
685 ; SI: ; %bb.0: ; %entry
686 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
687 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
688 ; SI-NEXT: s_mov_b32 s3, 0xf000
689 ; SI-NEXT: s_mov_b32 s2, -1
690 ; SI-NEXT: s_waitcnt lgkmcnt(0)
691 ; SI-NEXT: s_mov_b32 s0, s4
692 ; SI-NEXT: s_mov_b32 s1, s5
693 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
694 ; SI-NEXT: s_waitcnt lgkmcnt(0)
695 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
696 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
697 ; SI-NEXT: s_lshr_b32 s4, s4, 16
698 ; SI-NEXT: s_lshr_b32 s5, s5, 16
699 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
700 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5
701 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
702 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
703 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
704 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
705 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
706 ; SI-NEXT: s_waitcnt lgkmcnt(0)
707 ; SI-NEXT: s_lshr_b32 s6, s5, 16
708 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
709 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
710 ; SI-NEXT: s_lshr_b32 s4, s4, 16
711 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
712 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
713 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
714 ; SI-NEXT: v_max_f32_e32 v3, v3, v5
715 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
716 ; SI-NEXT: v_max_f32_e32 v1, v1, v5
717 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
718 ; SI-NEXT: v_max_f32_e32 v2, v2, v5
719 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
720 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
721 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
722 ; SI-NEXT: v_max_f32_e32 v0, v0, v4
723 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
724 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
725 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
726 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
727 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
728 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
729 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
732 ; VI-LABEL: maxnum_v4f16:
733 ; VI: ; %bb.0: ; %entry
734 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
735 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
736 ; VI-NEXT: s_mov_b32 s3, 0xf000
737 ; VI-NEXT: s_mov_b32 s2, -1
738 ; VI-NEXT: s_waitcnt lgkmcnt(0)
739 ; VI-NEXT: s_mov_b32 s0, s4
740 ; VI-NEXT: s_mov_b32 s1, s5
741 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
742 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
743 ; VI-NEXT: s_waitcnt lgkmcnt(0)
744 ; VI-NEXT: v_max_f16_e64 v1, s5, s5
745 ; VI-NEXT: v_max_f16_e64 v0, s7, s7
746 ; VI-NEXT: s_lshr_b32 s5, s5, 16
747 ; VI-NEXT: s_lshr_b32 s7, s7, 16
748 ; VI-NEXT: v_max_f16_e32 v0, v1, v0
749 ; VI-NEXT: v_max_f16_e64 v2, s5, s5
750 ; VI-NEXT: v_max_f16_e64 v1, s7, s7
751 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
752 ; VI-NEXT: v_or_b32_e32 v1, v0, v1
753 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
754 ; VI-NEXT: v_max_f16_e64 v0, s6, s6
755 ; VI-NEXT: s_lshr_b32 s4, s4, 16
756 ; VI-NEXT: s_lshr_b32 s5, s6, 16
757 ; VI-NEXT: v_max_f16_e32 v0, v2, v0
758 ; VI-NEXT: v_max_f16_e64 v2, s5, s5
759 ; VI-NEXT: v_max_f16_e64 v3, s4, s4
760 ; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
761 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
762 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
765 ; GFX9-LABEL: maxnum_v4f16:
766 ; GFX9: ; %bb.0: ; %entry
767 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
768 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
769 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
770 ; GFX9-NEXT: s_mov_b32 s2, -1
771 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
772 ; GFX9-NEXT: s_mov_b32 s0, s4
773 ; GFX9-NEXT: s_mov_b32 s1, s5
774 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
775 ; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
776 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
777 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
778 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
779 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v0
780 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
781 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
782 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
783 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
784 ; GFX9-NEXT: s_endpgm
786 ; GFX10-LABEL: maxnum_v4f16:
787 ; GFX10: ; %bb.0: ; %entry
788 ; GFX10-NEXT: s_clause 0x1
789 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
790 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
791 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
792 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
793 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
794 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
795 ; GFX10-NEXT: s_mov_b32 s6, -1
796 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
797 ; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
798 ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9
799 ; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
800 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
801 ; GFX10-NEXT: v_pk_max_f16 v1, v1, v0
802 ; GFX10-NEXT: v_pk_max_f16 v0, v3, v2
803 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
804 ; GFX10-NEXT: s_endpgm
805 <4 x half> addrspace(1)* %r,
806 <4 x half> addrspace(1)* %a,
807 <4 x half> addrspace(1)* %b) #0 {
809 %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
810 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
811 %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
812 store <4 x half> %r.val, <4 x half> addrspace(1)* %r
816 define amdgpu_kernel void @fmax_v4f16_imm_a(
817 ; SI-LABEL: fmax_v4f16_imm_a:
818 ; SI: ; %bb.0: ; %entry
819 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
820 ; SI-NEXT: s_mov_b32 s3, 0xf000
821 ; SI-NEXT: s_mov_b32 s2, -1
822 ; SI-NEXT: s_waitcnt lgkmcnt(0)
823 ; SI-NEXT: s_mov_b32 s0, s4
824 ; SI-NEXT: s_mov_b32 s1, s5
825 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
826 ; SI-NEXT: s_waitcnt lgkmcnt(0)
827 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
828 ; SI-NEXT: s_lshr_b32 s5, s5, 16
829 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
830 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
831 ; SI-NEXT: s_lshr_b32 s4, s4, 16
832 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
833 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
834 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
835 ; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
836 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
837 ; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
838 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
839 ; SI-NEXT: v_max_f32_e32 v3, 2.0, v3
840 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
841 ; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
842 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
843 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
844 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
845 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
846 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
847 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
848 ; SI-NEXT: v_or_b32_e32 v0, v0, v2
849 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
852 ; VI-LABEL: fmax_v4f16_imm_a:
853 ; VI: ; %bb.0: ; %entry
854 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
855 ; VI-NEXT: v_mov_b32_e32 v0, 0x4400
856 ; VI-NEXT: s_mov_b32 s3, 0xf000
857 ; VI-NEXT: s_mov_b32 s2, -1
858 ; VI-NEXT: s_waitcnt lgkmcnt(0)
859 ; VI-NEXT: s_mov_b32 s0, s4
860 ; VI-NEXT: s_mov_b32 s1, s5
861 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
862 ; VI-NEXT: s_waitcnt lgkmcnt(0)
863 ; VI-NEXT: v_max_f16_e64 v1, s5, s5
864 ; VI-NEXT: s_lshr_b32 s5, s5, 16
865 ; VI-NEXT: v_max_f16_e64 v3, s5, s5
866 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
867 ; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
868 ; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
869 ; VI-NEXT: s_lshr_b32 s4, s4, 16
870 ; VI-NEXT: v_or_b32_e32 v1, v1, v0
871 ; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
872 ; VI-NEXT: v_max_f16_e64 v2, s4, s4
873 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000
874 ; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
875 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
876 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
879 ; GFX9-LABEL: fmax_v4f16_imm_a:
880 ; GFX9: ; %bb.0: ; %entry
881 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
882 ; GFX9-NEXT: s_mov_b32 s8, 0x44004200
883 ; GFX9-NEXT: s_mov_b32 s9, 0x40004800
884 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
885 ; GFX9-NEXT: s_mov_b32 s2, -1
886 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
887 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
888 ; GFX9-NEXT: s_mov_b32 s0, s4
889 ; GFX9-NEXT: s_mov_b32 s1, s5
890 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
891 ; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
892 ; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
893 ; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
894 ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
895 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
896 ; GFX9-NEXT: s_endpgm
898 ; GFX10-LABEL: fmax_v4f16_imm_a:
899 ; GFX10: ; %bb.0: ; %entry
900 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
901 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
902 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
903 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
904 ; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
905 ; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
906 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
907 ; GFX10-NEXT: s_mov_b32 s2, -1
908 ; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0
909 ; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2
910 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
911 ; GFX10-NEXT: s_endpgm
912 <4 x half> addrspace(1)* %r,
913 <4 x half> addrspace(1)* %b) #0 {
915 %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
916 %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
917 store <4 x half> %r.val, <4 x half> addrspace(1)* %r
921 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }