1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; Kernel test: loads three floats from %aptr, %bptr and %cptr and stores
; maxnum(maxnum(a, b), c) to %out.
; The autogenerated checks below expect the two nested maxnum calls to be
; selected as one v_max3_f32 v0, v0, v1, v2 in all four llc configurations
; (default amdgcn target, tonga, gfx900, gfx1100).
7 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
8 ; SI-LABEL: test_fmax3_olt_0_f32:
10 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
11 ; SI-NEXT: s_mov_b32 s11, 0xf000
12 ; SI-NEXT: s_mov_b32 s10, -1
13 ; SI-NEXT: s_mov_b32 s14, s10
14 ; SI-NEXT: s_mov_b32 s15, s11
15 ; SI-NEXT: s_mov_b32 s18, s10
16 ; SI-NEXT: s_mov_b32 s19, s11
17 ; SI-NEXT: s_mov_b32 s22, s10
18 ; SI-NEXT: s_mov_b32 s23, s11
19 ; SI-NEXT: s_waitcnt lgkmcnt(0)
20 ; SI-NEXT: s_mov_b32 s12, s2
21 ; SI-NEXT: s_mov_b32 s13, s3
22 ; SI-NEXT: s_mov_b32 s16, s4
23 ; SI-NEXT: s_mov_b32 s17, s5
24 ; SI-NEXT: s_mov_b32 s20, s6
25 ; SI-NEXT: s_mov_b32 s21, s7
26 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
27 ; SI-NEXT: s_waitcnt vmcnt(0)
28 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
31 ; SI-NEXT: s_waitcnt vmcnt(0)
32 ; SI-NEXT: s_mov_b32 s8, s0
33 ; SI-NEXT: s_mov_b32 s9, s1
34 ; SI-NEXT: v_max3_f32 v0, v0, v1, v2
35 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
38 ; VI-LABEL: test_fmax3_olt_0_f32:
40 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
41 ; VI-NEXT: s_mov_b32 s11, 0xf000
42 ; VI-NEXT: s_mov_b32 s10, -1
43 ; VI-NEXT: s_mov_b32 s14, s10
44 ; VI-NEXT: s_mov_b32 s15, s11
45 ; VI-NEXT: s_waitcnt lgkmcnt(0)
46 ; VI-NEXT: s_mov_b32 s12, s2
47 ; VI-NEXT: s_mov_b32 s13, s3
48 ; VI-NEXT: s_mov_b32 s16, s4
49 ; VI-NEXT: s_mov_b32 s17, s5
50 ; VI-NEXT: s_mov_b32 s18, s10
51 ; VI-NEXT: s_mov_b32 s19, s11
52 ; VI-NEXT: s_mov_b32 s4, s6
53 ; VI-NEXT: s_mov_b32 s5, s7
54 ; VI-NEXT: s_mov_b32 s6, s10
55 ; VI-NEXT: s_mov_b32 s7, s11
56 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
57 ; VI-NEXT: s_waitcnt vmcnt(0)
58 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
59 ; VI-NEXT: s_waitcnt vmcnt(0)
60 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
61 ; VI-NEXT: s_waitcnt vmcnt(0)
62 ; VI-NEXT: s_mov_b32 s8, s0
63 ; VI-NEXT: s_mov_b32 s9, s1
64 ; VI-NEXT: v_max3_f32 v0, v0, v1, v2
65 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
68 ; GFX9-LABEL: test_fmax3_olt_0_f32:
70 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
71 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
72 ; GFX9-NEXT: s_mov_b32 s10, -1
73 ; GFX9-NEXT: s_mov_b32 s14, s10
74 ; GFX9-NEXT: s_mov_b32 s15, s11
75 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
76 ; GFX9-NEXT: s_mov_b32 s12, s2
77 ; GFX9-NEXT: s_mov_b32 s13, s3
78 ; GFX9-NEXT: s_mov_b32 s16, s4
79 ; GFX9-NEXT: s_mov_b32 s17, s5
80 ; GFX9-NEXT: s_mov_b32 s18, s10
81 ; GFX9-NEXT: s_mov_b32 s19, s11
82 ; GFX9-NEXT: s_mov_b32 s4, s6
83 ; GFX9-NEXT: s_mov_b32 s5, s7
84 ; GFX9-NEXT: s_mov_b32 s6, s10
85 ; GFX9-NEXT: s_mov_b32 s7, s11
86 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
87 ; GFX9-NEXT: s_waitcnt vmcnt(0)
88 ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
89 ; GFX9-NEXT: s_waitcnt vmcnt(0)
90 ; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
91 ; GFX9-NEXT: s_waitcnt vmcnt(0)
92 ; GFX9-NEXT: s_mov_b32 s8, s0
93 ; GFX9-NEXT: s_mov_b32 s9, s1
94 ; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
95 ; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
98 ; GFX11-LABEL: test_fmax3_olt_0_f32:
100 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
101 ; GFX11-NEXT: s_mov_b32 s10, -1
102 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
103 ; GFX11-NEXT: s_mov_b32 s14, s10
104 ; GFX11-NEXT: s_mov_b32 s15, s11
105 ; GFX11-NEXT: s_mov_b32 s18, s10
106 ; GFX11-NEXT: s_mov_b32 s19, s11
107 ; GFX11-NEXT: s_mov_b32 s22, s10
108 ; GFX11-NEXT: s_mov_b32 s23, s11
109 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
110 ; GFX11-NEXT: s_mov_b32 s12, s2
111 ; GFX11-NEXT: s_mov_b32 s13, s3
112 ; GFX11-NEXT: s_mov_b32 s16, s4
113 ; GFX11-NEXT: s_mov_b32 s17, s5
114 ; GFX11-NEXT: s_mov_b32 s20, s6
115 ; GFX11-NEXT: s_mov_b32 s21, s7
116 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
117 ; GFX11-NEXT: s_waitcnt vmcnt(0)
118 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
119 ; GFX11-NEXT: s_waitcnt vmcnt(0)
120 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
121 ; GFX11-NEXT: s_waitcnt vmcnt(0)
122 ; GFX11-NEXT: s_mov_b32 s8, s0
123 ; GFX11-NEXT: s_mov_b32 s9, s1
124 ; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
125 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
126 ; GFX11-NEXT: s_nop 0
127 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
128 ; GFX11-NEXT: s_endpgm
; The three inputs are read with volatile loads; the checks above show each
; buffer load followed by its own s_waitcnt vmcnt(0).
129 %a = load volatile float, ptr addrspace(1) %aptr, align 4
130 %b = load volatile float, ptr addrspace(1) %bptr, align 4
131 %c = load volatile float, ptr addrspace(1) %cptr, align 4
; Nested maxnum: the inner result %f0 is the FIRST operand of the outer call.
132 %f0 = call float @llvm.maxnum.f32(float %a, float %b)
133 %f1 = call float @llvm.maxnum.f32(float %f0, float %c)
134 store float %f1, ptr addrspace(1) %out, align 4
138 ; Commute the operands of the second fmax.
; Kernel test: loads three floats from %aptr, %bptr and %cptr and stores
; maxnum(c, maxnum(a, b)) to %out, i.e. the inner maxnum result is the
; second operand of the outer call.
; The autogenerated checks below expect one v_max3_f32 v0, v2, v0, v1 in all
; four llc configurations (default amdgcn target, tonga, gfx900, gfx1100).
139 define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
140 ; SI-LABEL: test_fmax3_olt_1_f32:
142 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
143 ; SI-NEXT: s_mov_b32 s11, 0xf000
144 ; SI-NEXT: s_mov_b32 s10, -1
145 ; SI-NEXT: s_mov_b32 s14, s10
146 ; SI-NEXT: s_mov_b32 s15, s11
147 ; SI-NEXT: s_mov_b32 s18, s10
148 ; SI-NEXT: s_mov_b32 s19, s11
149 ; SI-NEXT: s_mov_b32 s22, s10
150 ; SI-NEXT: s_mov_b32 s23, s11
151 ; SI-NEXT: s_waitcnt lgkmcnt(0)
152 ; SI-NEXT: s_mov_b32 s12, s2
153 ; SI-NEXT: s_mov_b32 s13, s3
154 ; SI-NEXT: s_mov_b32 s16, s4
155 ; SI-NEXT: s_mov_b32 s17, s5
156 ; SI-NEXT: s_mov_b32 s20, s6
157 ; SI-NEXT: s_mov_b32 s21, s7
158 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
159 ; SI-NEXT: s_waitcnt vmcnt(0)
160 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
161 ; SI-NEXT: s_waitcnt vmcnt(0)
162 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
163 ; SI-NEXT: s_waitcnt vmcnt(0)
164 ; SI-NEXT: s_mov_b32 s8, s0
165 ; SI-NEXT: s_mov_b32 s9, s1
166 ; SI-NEXT: v_max3_f32 v0, v2, v0, v1
167 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
170 ; VI-LABEL: test_fmax3_olt_1_f32:
172 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
173 ; VI-NEXT: s_mov_b32 s11, 0xf000
174 ; VI-NEXT: s_mov_b32 s10, -1
175 ; VI-NEXT: s_mov_b32 s14, s10
176 ; VI-NEXT: s_mov_b32 s15, s11
177 ; VI-NEXT: s_waitcnt lgkmcnt(0)
178 ; VI-NEXT: s_mov_b32 s12, s2
179 ; VI-NEXT: s_mov_b32 s13, s3
180 ; VI-NEXT: s_mov_b32 s16, s4
181 ; VI-NEXT: s_mov_b32 s17, s5
182 ; VI-NEXT: s_mov_b32 s18, s10
183 ; VI-NEXT: s_mov_b32 s19, s11
184 ; VI-NEXT: s_mov_b32 s4, s6
185 ; VI-NEXT: s_mov_b32 s5, s7
186 ; VI-NEXT: s_mov_b32 s6, s10
187 ; VI-NEXT: s_mov_b32 s7, s11
188 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
189 ; VI-NEXT: s_waitcnt vmcnt(0)
190 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
191 ; VI-NEXT: s_waitcnt vmcnt(0)
192 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
193 ; VI-NEXT: s_waitcnt vmcnt(0)
194 ; VI-NEXT: s_mov_b32 s8, s0
195 ; VI-NEXT: s_mov_b32 s9, s1
196 ; VI-NEXT: v_max3_f32 v0, v2, v0, v1
197 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
200 ; GFX9-LABEL: test_fmax3_olt_1_f32:
202 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
203 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
204 ; GFX9-NEXT: s_mov_b32 s10, -1
205 ; GFX9-NEXT: s_mov_b32 s14, s10
206 ; GFX9-NEXT: s_mov_b32 s15, s11
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: s_mov_b32 s12, s2
209 ; GFX9-NEXT: s_mov_b32 s13, s3
210 ; GFX9-NEXT: s_mov_b32 s16, s4
211 ; GFX9-NEXT: s_mov_b32 s17, s5
212 ; GFX9-NEXT: s_mov_b32 s18, s10
213 ; GFX9-NEXT: s_mov_b32 s19, s11
214 ; GFX9-NEXT: s_mov_b32 s4, s6
215 ; GFX9-NEXT: s_mov_b32 s5, s7
216 ; GFX9-NEXT: s_mov_b32 s6, s10
217 ; GFX9-NEXT: s_mov_b32 s7, s11
218 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
219 ; GFX9-NEXT: s_waitcnt vmcnt(0)
220 ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
221 ; GFX9-NEXT: s_waitcnt vmcnt(0)
222 ; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
223 ; GFX9-NEXT: s_waitcnt vmcnt(0)
224 ; GFX9-NEXT: s_mov_b32 s8, s0
225 ; GFX9-NEXT: s_mov_b32 s9, s1
226 ; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1
227 ; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
228 ; GFX9-NEXT: s_endpgm
230 ; GFX11-LABEL: test_fmax3_olt_1_f32:
232 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
233 ; GFX11-NEXT: s_mov_b32 s10, -1
234 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
235 ; GFX11-NEXT: s_mov_b32 s14, s10
236 ; GFX11-NEXT: s_mov_b32 s15, s11
237 ; GFX11-NEXT: s_mov_b32 s18, s10
238 ; GFX11-NEXT: s_mov_b32 s19, s11
239 ; GFX11-NEXT: s_mov_b32 s22, s10
240 ; GFX11-NEXT: s_mov_b32 s23, s11
241 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
242 ; GFX11-NEXT: s_mov_b32 s12, s2
243 ; GFX11-NEXT: s_mov_b32 s13, s3
244 ; GFX11-NEXT: s_mov_b32 s16, s4
245 ; GFX11-NEXT: s_mov_b32 s17, s5
246 ; GFX11-NEXT: s_mov_b32 s20, s6
247 ; GFX11-NEXT: s_mov_b32 s21, s7
248 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
249 ; GFX11-NEXT: s_waitcnt vmcnt(0)
250 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
251 ; GFX11-NEXT: s_waitcnt vmcnt(0)
252 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
253 ; GFX11-NEXT: s_waitcnt vmcnt(0)
254 ; GFX11-NEXT: s_mov_b32 s8, s0
255 ; GFX11-NEXT: s_mov_b32 s9, s1
256 ; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
257 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
258 ; GFX11-NEXT: s_nop 0
259 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
260 ; GFX11-NEXT: s_endpgm
; The three inputs are read with volatile loads; the checks above show each
; buffer load followed by its own s_waitcnt vmcnt(0).
261 %a = load volatile float, ptr addrspace(1) %aptr, align 4
262 %b = load volatile float, ptr addrspace(1) %bptr, align 4
263 %c = load volatile float, ptr addrspace(1) %cptr, align 4
; Nested maxnum: the inner result %f0 is the SECOND operand of the outer call.
264 %f0 = call float @llvm.maxnum.f32(float %a, float %b)
265 %f1 = call float @llvm.maxnum.f32(float %c, float %f0)
266 store float %f1, ptr addrspace(1) %out, align 4
; Kernel test: half-precision version. Loads three halves from %aptr, %bptr
; and %cptr and stores maxnum(maxnum(a, b), c) to %out.
; The autogenerated checks below expect a different selection per llc
; configuration:
;   - default amdgcn target: convert the inputs to f32, one v_max3_f32, then
;     convert the result back to f16;
;   - tonga: a sequence of two-operand v_max_f16 instructions, no max3;
;   - gfx900 and gfx1100: one v_max3_f16 v0, v0, v1, v2.
270 define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
271 ; SI-LABEL: test_fmax3_olt_0_f16:
273 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
274 ; SI-NEXT: s_mov_b32 s11, 0xf000
275 ; SI-NEXT: s_mov_b32 s10, -1
276 ; SI-NEXT: s_mov_b32 s14, s10
277 ; SI-NEXT: s_mov_b32 s15, s11
278 ; SI-NEXT: s_mov_b32 s18, s10
279 ; SI-NEXT: s_mov_b32 s19, s11
280 ; SI-NEXT: s_mov_b32 s22, s10
281 ; SI-NEXT: s_mov_b32 s23, s11
282 ; SI-NEXT: s_waitcnt lgkmcnt(0)
283 ; SI-NEXT: s_mov_b32 s12, s2
284 ; SI-NEXT: s_mov_b32 s13, s3
285 ; SI-NEXT: s_mov_b32 s16, s4
286 ; SI-NEXT: s_mov_b32 s17, s5
287 ; SI-NEXT: s_mov_b32 s20, s6
288 ; SI-NEXT: s_mov_b32 s21, s7
289 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
290 ; SI-NEXT: s_waitcnt vmcnt(0)
291 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
292 ; SI-NEXT: s_waitcnt vmcnt(0)
293 ; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
294 ; SI-NEXT: s_waitcnt vmcnt(0)
295 ; SI-NEXT: s_mov_b32 s8, s0
296 ; SI-NEXT: s_mov_b32 s9, s1
297 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
298 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
299 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
300 ; SI-NEXT: v_max3_f32 v0, v0, v1, v2
301 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
302 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
305 ; VI-LABEL: test_fmax3_olt_0_f16:
307 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
308 ; VI-NEXT: s_mov_b32 s11, 0xf000
309 ; VI-NEXT: s_mov_b32 s10, -1
310 ; VI-NEXT: s_mov_b32 s14, s10
311 ; VI-NEXT: s_mov_b32 s15, s11
312 ; VI-NEXT: s_waitcnt lgkmcnt(0)
313 ; VI-NEXT: s_mov_b32 s12, s2
314 ; VI-NEXT: s_mov_b32 s13, s3
315 ; VI-NEXT: s_mov_b32 s16, s4
316 ; VI-NEXT: s_mov_b32 s17, s5
317 ; VI-NEXT: s_mov_b32 s18, s10
318 ; VI-NEXT: s_mov_b32 s19, s11
319 ; VI-NEXT: s_mov_b32 s4, s6
320 ; VI-NEXT: s_mov_b32 s5, s7
321 ; VI-NEXT: s_mov_b32 s6, s10
322 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
323 ; VI-NEXT: s_waitcnt vmcnt(0)
324 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
325 ; VI-NEXT: s_waitcnt vmcnt(0)
326 ; VI-NEXT: s_mov_b32 s7, s11
327 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
328 ; VI-NEXT: s_waitcnt vmcnt(0)
329 ; VI-NEXT: s_mov_b32 s8, s0
330 ; VI-NEXT: s_mov_b32 s9, s1
331 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
332 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
333 ; VI-NEXT: v_max_f16_e32 v0, v0, v1
334 ; VI-NEXT: v_max_f16_e32 v1, v2, v2
335 ; VI-NEXT: v_max_f16_e32 v0, v0, v1
336 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
339 ; GFX9-LABEL: test_fmax3_olt_0_f16:
341 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
342 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
343 ; GFX9-NEXT: s_mov_b32 s10, -1
344 ; GFX9-NEXT: s_mov_b32 s14, s10
345 ; GFX9-NEXT: s_mov_b32 s15, s11
346 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX9-NEXT: s_mov_b32 s12, s2
348 ; GFX9-NEXT: s_mov_b32 s13, s3
349 ; GFX9-NEXT: s_mov_b32 s16, s4
350 ; GFX9-NEXT: s_mov_b32 s17, s5
351 ; GFX9-NEXT: s_mov_b32 s18, s10
352 ; GFX9-NEXT: s_mov_b32 s19, s11
353 ; GFX9-NEXT: s_mov_b32 s4, s6
354 ; GFX9-NEXT: s_mov_b32 s5, s7
355 ; GFX9-NEXT: s_mov_b32 s6, s10
356 ; GFX9-NEXT: s_mov_b32 s7, s11
357 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
358 ; GFX9-NEXT: s_waitcnt vmcnt(0)
359 ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
360 ; GFX9-NEXT: s_waitcnt vmcnt(0)
361 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
362 ; GFX9-NEXT: s_waitcnt vmcnt(0)
363 ; GFX9-NEXT: s_mov_b32 s8, s0
364 ; GFX9-NEXT: s_mov_b32 s9, s1
365 ; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
366 ; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
367 ; GFX9-NEXT: s_endpgm
369 ; GFX11-LABEL: test_fmax3_olt_0_f16:
371 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
372 ; GFX11-NEXT: s_mov_b32 s10, -1
373 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
374 ; GFX11-NEXT: s_mov_b32 s14, s10
375 ; GFX11-NEXT: s_mov_b32 s15, s11
376 ; GFX11-NEXT: s_mov_b32 s18, s10
377 ; GFX11-NEXT: s_mov_b32 s19, s11
378 ; GFX11-NEXT: s_mov_b32 s22, s10
379 ; GFX11-NEXT: s_mov_b32 s23, s11
380 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX11-NEXT: s_mov_b32 s12, s2
382 ; GFX11-NEXT: s_mov_b32 s13, s3
383 ; GFX11-NEXT: s_mov_b32 s16, s4
384 ; GFX11-NEXT: s_mov_b32 s17, s5
385 ; GFX11-NEXT: s_mov_b32 s20, s6
386 ; GFX11-NEXT: s_mov_b32 s21, s7
387 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
388 ; GFX11-NEXT: s_waitcnt vmcnt(0)
389 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
390 ; GFX11-NEXT: s_waitcnt vmcnt(0)
391 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
392 ; GFX11-NEXT: s_waitcnt vmcnt(0)
393 ; GFX11-NEXT: s_mov_b32 s8, s0
394 ; GFX11-NEXT: s_mov_b32 s9, s1
395 ; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
396 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
397 ; GFX11-NEXT: s_nop 0
398 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
399 ; GFX11-NEXT: s_endpgm
; NOTE(review): in the tonga checks above, the self-max instructions
; (v_max_f16 vN, vN, vN) look like input canonicalization emitted before each
; real max - confirm against the backend before relying on this reading.
; The three inputs are read with volatile loads.
400 %a = load volatile half, ptr addrspace(1) %aptr, align 2
401 %b = load volatile half, ptr addrspace(1) %bptr, align 2
402 %c = load volatile half, ptr addrspace(1) %cptr, align 2
; Nested maxnum: the inner result %f0 is the FIRST operand of the outer call.
403 %f0 = call half @llvm.maxnum.f16(half %a, half %b)
404 %f1 = call half @llvm.maxnum.f16(half %f0, half %c)
405 store half %f1, ptr addrspace(1) %out, align 2
409 ; Commute the operands of the second fmax.
; Kernel test: half-precision version with the outer operands swapped. Loads
; three halves from %aptr, %bptr and %cptr and stores maxnum(c, maxnum(a, b))
; to %out.
; The autogenerated checks below expect a different selection per llc
; configuration:
;   - default amdgcn target: convert the inputs to f32, one
;     v_max3_f32 v0, v2, v0, v1, then convert the result back to f16;
;   - tonga: a sequence of two-operand v_max_f16 instructions, no max3;
;   - gfx900 and gfx1100: one v_max3_f16 v0, v2, v0, v1.
410 define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
411 ; SI-LABEL: test_fmax3_olt_1_f16:
413 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
414 ; SI-NEXT: s_mov_b32 s11, 0xf000
415 ; SI-NEXT: s_mov_b32 s10, -1
416 ; SI-NEXT: s_mov_b32 s14, s10
417 ; SI-NEXT: s_mov_b32 s15, s11
418 ; SI-NEXT: s_mov_b32 s18, s10
419 ; SI-NEXT: s_mov_b32 s19, s11
420 ; SI-NEXT: s_mov_b32 s22, s10
421 ; SI-NEXT: s_mov_b32 s23, s11
422 ; SI-NEXT: s_waitcnt lgkmcnt(0)
423 ; SI-NEXT: s_mov_b32 s12, s2
424 ; SI-NEXT: s_mov_b32 s13, s3
425 ; SI-NEXT: s_mov_b32 s16, s4
426 ; SI-NEXT: s_mov_b32 s17, s5
427 ; SI-NEXT: s_mov_b32 s20, s6
428 ; SI-NEXT: s_mov_b32 s21, s7
429 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
430 ; SI-NEXT: s_waitcnt vmcnt(0)
431 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
432 ; SI-NEXT: s_waitcnt vmcnt(0)
433 ; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
434 ; SI-NEXT: s_waitcnt vmcnt(0)
435 ; SI-NEXT: s_mov_b32 s8, s0
436 ; SI-NEXT: s_mov_b32 s9, s1
437 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
438 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
439 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
440 ; SI-NEXT: v_max3_f32 v0, v2, v0, v1
441 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
442 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
445 ; VI-LABEL: test_fmax3_olt_1_f16:
447 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
448 ; VI-NEXT: s_mov_b32 s11, 0xf000
449 ; VI-NEXT: s_mov_b32 s10, -1
450 ; VI-NEXT: s_mov_b32 s14, s10
451 ; VI-NEXT: s_mov_b32 s15, s11
452 ; VI-NEXT: s_waitcnt lgkmcnt(0)
453 ; VI-NEXT: s_mov_b32 s12, s2
454 ; VI-NEXT: s_mov_b32 s13, s3
455 ; VI-NEXT: s_mov_b32 s16, s4
456 ; VI-NEXT: s_mov_b32 s17, s5
457 ; VI-NEXT: s_mov_b32 s18, s10
458 ; VI-NEXT: s_mov_b32 s19, s11
459 ; VI-NEXT: s_mov_b32 s4, s6
460 ; VI-NEXT: s_mov_b32 s5, s7
461 ; VI-NEXT: s_mov_b32 s6, s10
462 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
463 ; VI-NEXT: s_waitcnt vmcnt(0)
464 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
465 ; VI-NEXT: s_waitcnt vmcnt(0)
466 ; VI-NEXT: s_mov_b32 s7, s11
467 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
468 ; VI-NEXT: s_waitcnt vmcnt(0)
469 ; VI-NEXT: s_mov_b32 s8, s0
470 ; VI-NEXT: s_mov_b32 s9, s1
471 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
472 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
473 ; VI-NEXT: v_max_f16_e32 v0, v0, v1
474 ; VI-NEXT: v_max_f16_e32 v1, v2, v2
475 ; VI-NEXT: v_max_f16_e32 v0, v1, v0
476 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
479 ; GFX9-LABEL: test_fmax3_olt_1_f16:
481 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
482 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
483 ; GFX9-NEXT: s_mov_b32 s10, -1
484 ; GFX9-NEXT: s_mov_b32 s14, s10
485 ; GFX9-NEXT: s_mov_b32 s15, s11
486 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX9-NEXT: s_mov_b32 s12, s2
488 ; GFX9-NEXT: s_mov_b32 s13, s3
489 ; GFX9-NEXT: s_mov_b32 s16, s4
490 ; GFX9-NEXT: s_mov_b32 s17, s5
491 ; GFX9-NEXT: s_mov_b32 s18, s10
492 ; GFX9-NEXT: s_mov_b32 s19, s11
493 ; GFX9-NEXT: s_mov_b32 s4, s6
494 ; GFX9-NEXT: s_mov_b32 s5, s7
495 ; GFX9-NEXT: s_mov_b32 s6, s10
496 ; GFX9-NEXT: s_mov_b32 s7, s11
497 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
498 ; GFX9-NEXT: s_waitcnt vmcnt(0)
499 ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
500 ; GFX9-NEXT: s_waitcnt vmcnt(0)
501 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
502 ; GFX9-NEXT: s_waitcnt vmcnt(0)
503 ; GFX9-NEXT: s_mov_b32 s8, s0
504 ; GFX9-NEXT: s_mov_b32 s9, s1
505 ; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1
506 ; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
507 ; GFX9-NEXT: s_endpgm
509 ; GFX11-LABEL: test_fmax3_olt_1_f16:
511 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
512 ; GFX11-NEXT: s_mov_b32 s10, -1
513 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
514 ; GFX11-NEXT: s_mov_b32 s14, s10
515 ; GFX11-NEXT: s_mov_b32 s15, s11
516 ; GFX11-NEXT: s_mov_b32 s18, s10
517 ; GFX11-NEXT: s_mov_b32 s19, s11
518 ; GFX11-NEXT: s_mov_b32 s22, s10
519 ; GFX11-NEXT: s_mov_b32 s23, s11
520 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX11-NEXT: s_mov_b32 s12, s2
522 ; GFX11-NEXT: s_mov_b32 s13, s3
523 ; GFX11-NEXT: s_mov_b32 s16, s4
524 ; GFX11-NEXT: s_mov_b32 s17, s5
525 ; GFX11-NEXT: s_mov_b32 s20, s6
526 ; GFX11-NEXT: s_mov_b32 s21, s7
527 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
528 ; GFX11-NEXT: s_waitcnt vmcnt(0)
529 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
530 ; GFX11-NEXT: s_waitcnt vmcnt(0)
531 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
532 ; GFX11-NEXT: s_waitcnt vmcnt(0)
533 ; GFX11-NEXT: s_mov_b32 s8, s0
534 ; GFX11-NEXT: s_mov_b32 s9, s1
535 ; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
536 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
537 ; GFX11-NEXT: s_nop 0
538 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
539 ; GFX11-NEXT: s_endpgm
; NOTE(review): in the tonga checks above, the self-max instructions
; (v_max_f16 vN, vN, vN) look like input canonicalization emitted before each
; real max - confirm against the backend before relying on this reading.
; The three inputs are read with volatile loads.
540 %a = load volatile half, ptr addrspace(1) %aptr, align 2
541 %b = load volatile half, ptr addrspace(1) %bptr, align 2
542 %c = load volatile half, ptr addrspace(1) %cptr, align 2
; Nested maxnum: the inner result %f0 is the SECOND operand of the outer call.
543 %f0 = call half @llvm.maxnum.f16(half %a, half %b)
544 %f1 = call half @llvm.maxnum.f16(half %c, half %f0)
545 store half %f1, ptr addrspace(1) %out, align 2
549 ; Check that performMinMaxCombine() does not fold a chain of vector maxnum operations into max3,
550 ; since there is no packed fmax3 instruction.
; Non-kernel function over <2 x half> values, using attribute group #2.
; Computes %res = maxnum(maxnum(c, maxnum(a, b)), d) with the v2f16 maxnum
; intrinsic.
; The autogenerated checks below expect, per llc configuration:
;   - gfx900 and gfx1100: three v_pk_max_f16 instructions and no max3;
;   - tonga: two-operand v_max_f16 instructions (some in SDWA form) whose
;     halves are merged with v_or_b32, and no max3;
;   - default amdgcn target: f16/f32 conversions, then per element one
;     v_max_f32 followed by one v_max3_f32.
551 define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
552 ; SI-LABEL: no_fmax3_v2f16:
553 ; SI: ; %bb.0: ; %entry
554 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
556 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
557 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
558 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
559 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
560 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
561 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
562 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
563 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
564 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
565 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
566 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
567 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
568 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
569 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
570 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
571 ; SI-NEXT: v_max_f32_e32 v1, v1, v3
572 ; SI-NEXT: v_max_f32_e32 v0, v0, v2
573 ; SI-NEXT: v_max3_f32 v0, v4, v0, v6
574 ; SI-NEXT: v_max3_f32 v1, v5, v1, v7
575 ; SI-NEXT: s_setpc_b64 s[30:31]
577 ; VI-LABEL: no_fmax3_v2f16:
578 ; VI: ; %bb.0: ; %entry
579 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580 ; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
581 ; VI-NEXT: v_max_f16_e32 v0, v0, v1
582 ; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
583 ; VI-NEXT: v_max_f16_e32 v0, v2, v0
584 ; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
585 ; VI-NEXT: v_max_f16_e32 v0, v0, v3
586 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
587 ; VI-NEXT: s_setpc_b64 s[30:31]
589 ; GFX9-LABEL: no_fmax3_v2f16:
590 ; GFX9: ; %bb.0: ; %entry
591 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
593 ; GFX9-NEXT: v_pk_max_f16 v0, v2, v0
594 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v3
595 ; GFX9-NEXT: s_setpc_b64 s[30:31]
597 ; GFX11-LABEL: no_fmax3_v2f16:
598 ; GFX11: ; %bb.0: ; %entry
599 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
600 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v1
601 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
602 ; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
603 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
604 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Chain of three vector maxnum calls: (a, b), then (c, previous), then
; (previous, d).
606 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
607 %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
608 %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
; Intrinsic declarations.
; NOTE(review): no call to @llvm.amdgcn.workitem.id.x is visible in the
; functions in this view - it may be unused; confirm before removing.
612 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Scalar maxnum intrinsics called by the f32 and f16 kernel tests.
613 declare float @llvm.maxnum.f32(float, float) #1
614 declare half @llvm.maxnum.f16(half, half) #1
; Vector maxnum intrinsic called by no_fmax3_v2f16; declared without an
; attribute group.
615 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
; Attribute groups: #0 is applied to the four kernel tests, #1 to the first
; three intrinsic declarations, #2 (which adds "no-nans-fp-math") to
; no_fmax3_v2f16.
617 attributes #0 = { nounwind }
618 attributes #1 = { nounwind readnone speculatable }
619 attributes #2 = { nounwind "no-nans-fp-math"="true" }