1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; test_fmin3_olt_0_f32: three volatile f32 loads (%aptr, %bptr, %cptr) feed
; minnum(minnum(a, b), c), and the result is stored to %out.
; The SI, VI, GFX9 and GFX11 checks all expect the two minnum calls to be
; selected as a single v_min3_f32 with the operands in load order (v0, v1, v2).
7 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
8 ; SI-LABEL: test_fmin3_olt_0_f32:
10 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
11 ; SI-NEXT: s_mov_b32 s11, 0xf000
12 ; SI-NEXT: s_mov_b32 s10, -1
13 ; SI-NEXT: s_mov_b32 s14, s10
14 ; SI-NEXT: s_mov_b32 s15, s11
15 ; SI-NEXT: s_mov_b32 s18, s10
16 ; SI-NEXT: s_mov_b32 s19, s11
17 ; SI-NEXT: s_mov_b32 s22, s10
18 ; SI-NEXT: s_mov_b32 s23, s11
19 ; SI-NEXT: s_waitcnt lgkmcnt(0)
20 ; SI-NEXT: s_mov_b32 s12, s2
21 ; SI-NEXT: s_mov_b32 s13, s3
22 ; SI-NEXT: s_mov_b32 s16, s4
23 ; SI-NEXT: s_mov_b32 s17, s5
24 ; SI-NEXT: s_mov_b32 s20, s6
25 ; SI-NEXT: s_mov_b32 s21, s7
26 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
27 ; SI-NEXT: s_waitcnt vmcnt(0)
28 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
31 ; SI-NEXT: s_waitcnt vmcnt(0)
32 ; SI-NEXT: s_mov_b32 s8, s0
33 ; SI-NEXT: s_mov_b32 s9, s1
34 ; SI-NEXT: v_min3_f32 v0, v0, v1, v2
35 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
38 ; VI-LABEL: test_fmin3_olt_0_f32:
40 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
41 ; VI-NEXT: s_mov_b32 s11, 0xf000
42 ; VI-NEXT: s_mov_b32 s10, -1
43 ; VI-NEXT: s_mov_b32 s14, s10
44 ; VI-NEXT: s_mov_b32 s15, s11
45 ; VI-NEXT: s_waitcnt lgkmcnt(0)
46 ; VI-NEXT: s_mov_b32 s12, s2
47 ; VI-NEXT: s_mov_b32 s13, s3
48 ; VI-NEXT: s_mov_b32 s16, s4
49 ; VI-NEXT: s_mov_b32 s17, s5
50 ; VI-NEXT: s_mov_b32 s18, s10
51 ; VI-NEXT: s_mov_b32 s19, s11
52 ; VI-NEXT: s_mov_b32 s4, s6
53 ; VI-NEXT: s_mov_b32 s5, s7
54 ; VI-NEXT: s_mov_b32 s6, s10
55 ; VI-NEXT: s_mov_b32 s7, s11
56 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
57 ; VI-NEXT: s_waitcnt vmcnt(0)
58 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
59 ; VI-NEXT: s_waitcnt vmcnt(0)
60 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
61 ; VI-NEXT: s_waitcnt vmcnt(0)
62 ; VI-NEXT: s_mov_b32 s8, s0
63 ; VI-NEXT: s_mov_b32 s9, s1
64 ; VI-NEXT: v_min3_f32 v0, v0, v1, v2
65 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
68 ; GFX9-LABEL: test_fmin3_olt_0_f32:
70 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
71 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
72 ; GFX9-NEXT: s_mov_b32 s10, -1
73 ; GFX9-NEXT: s_mov_b32 s14, s10
74 ; GFX9-NEXT: s_mov_b32 s15, s11
75 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
76 ; GFX9-NEXT: s_mov_b32 s12, s2
77 ; GFX9-NEXT: s_mov_b32 s13, s3
78 ; GFX9-NEXT: s_mov_b32 s16, s4
79 ; GFX9-NEXT: s_mov_b32 s17, s5
80 ; GFX9-NEXT: s_mov_b32 s18, s10
81 ; GFX9-NEXT: s_mov_b32 s19, s11
82 ; GFX9-NEXT: s_mov_b32 s4, s6
83 ; GFX9-NEXT: s_mov_b32 s5, s7
84 ; GFX9-NEXT: s_mov_b32 s6, s10
85 ; GFX9-NEXT: s_mov_b32 s7, s11
86 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
87 ; GFX9-NEXT: s_waitcnt vmcnt(0)
88 ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
89 ; GFX9-NEXT: s_waitcnt vmcnt(0)
90 ; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
91 ; GFX9-NEXT: s_waitcnt vmcnt(0)
92 ; GFX9-NEXT: s_mov_b32 s8, s0
93 ; GFX9-NEXT: s_mov_b32 s9, s1
94 ; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
95 ; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
98 ; GFX11-LABEL: test_fmin3_olt_0_f32:
100 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
101 ; GFX11-NEXT: s_mov_b32 s10, -1
102 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
103 ; GFX11-NEXT: s_mov_b32 s14, s10
104 ; GFX11-NEXT: s_mov_b32 s15, s11
105 ; GFX11-NEXT: s_mov_b32 s18, s10
106 ; GFX11-NEXT: s_mov_b32 s19, s11
107 ; GFX11-NEXT: s_mov_b32 s22, s10
108 ; GFX11-NEXT: s_mov_b32 s23, s11
109 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
110 ; GFX11-NEXT: s_mov_b32 s12, s2
111 ; GFX11-NEXT: s_mov_b32 s13, s3
112 ; GFX11-NEXT: s_mov_b32 s16, s4
113 ; GFX11-NEXT: s_mov_b32 s17, s5
114 ; GFX11-NEXT: s_mov_b32 s20, s6
115 ; GFX11-NEXT: s_mov_b32 s21, s7
116 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
117 ; GFX11-NEXT: s_waitcnt vmcnt(0)
118 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
119 ; GFX11-NEXT: s_waitcnt vmcnt(0)
120 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
121 ; GFX11-NEXT: s_waitcnt vmcnt(0)
122 ; GFX11-NEXT: s_mov_b32 s8, s0
123 ; GFX11-NEXT: s_mov_b32 s9, s1
124 ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
125 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
126 ; GFX11-NEXT: s_nop 0
127 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
128 ; GFX11-NEXT: s_endpgm
; The loads are volatile; the checks above expect each one to be followed by
; its own s_waitcnt vmcnt(0).
129 %a = load volatile float, ptr addrspace(1) %aptr, align 4
130 %b = load volatile float, ptr addrspace(1) %bptr, align 4
131 %c = load volatile float, ptr addrspace(1) %cptr, align 4
; Inner minnum result %f0 is the first operand of the outer minnum.
132 %f0 = call float @llvm.minnum.f32(float %a, float %b)
133 %f1 = call float @llvm.minnum.f32(float %f0, float %c)
134 store float %f1, ptr addrspace(1) %out, align 4
138 ; Commute operand of second fmin
; test_fmin3_olt_1_f32: same shape as test_fmin3_olt_0_f32, but the outer
; minnum takes %c as its first operand and the inner result as its second:
; minnum(c, minnum(a, b)).
; All four check prefixes still expect a single v_min3_f32, with the operand
; order following the IR (v2, v0, v1).
139 define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
140 ; SI-LABEL: test_fmin3_olt_1_f32:
142 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
143 ; SI-NEXT: s_mov_b32 s11, 0xf000
144 ; SI-NEXT: s_mov_b32 s10, -1
145 ; SI-NEXT: s_mov_b32 s14, s10
146 ; SI-NEXT: s_mov_b32 s15, s11
147 ; SI-NEXT: s_mov_b32 s18, s10
148 ; SI-NEXT: s_mov_b32 s19, s11
149 ; SI-NEXT: s_mov_b32 s22, s10
150 ; SI-NEXT: s_mov_b32 s23, s11
151 ; SI-NEXT: s_waitcnt lgkmcnt(0)
152 ; SI-NEXT: s_mov_b32 s12, s2
153 ; SI-NEXT: s_mov_b32 s13, s3
154 ; SI-NEXT: s_mov_b32 s16, s4
155 ; SI-NEXT: s_mov_b32 s17, s5
156 ; SI-NEXT: s_mov_b32 s20, s6
157 ; SI-NEXT: s_mov_b32 s21, s7
158 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
159 ; SI-NEXT: s_waitcnt vmcnt(0)
160 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
161 ; SI-NEXT: s_waitcnt vmcnt(0)
162 ; SI-NEXT: buffer_load_dword v2, off, s[20:23], 0 glc
163 ; SI-NEXT: s_waitcnt vmcnt(0)
164 ; SI-NEXT: s_mov_b32 s8, s0
165 ; SI-NEXT: s_mov_b32 s9, s1
166 ; SI-NEXT: v_min3_f32 v0, v2, v0, v1
167 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
170 ; VI-LABEL: test_fmin3_olt_1_f32:
172 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
173 ; VI-NEXT: s_mov_b32 s11, 0xf000
174 ; VI-NEXT: s_mov_b32 s10, -1
175 ; VI-NEXT: s_mov_b32 s14, s10
176 ; VI-NEXT: s_mov_b32 s15, s11
177 ; VI-NEXT: s_waitcnt lgkmcnt(0)
178 ; VI-NEXT: s_mov_b32 s12, s2
179 ; VI-NEXT: s_mov_b32 s13, s3
180 ; VI-NEXT: s_mov_b32 s16, s4
181 ; VI-NEXT: s_mov_b32 s17, s5
182 ; VI-NEXT: s_mov_b32 s18, s10
183 ; VI-NEXT: s_mov_b32 s19, s11
184 ; VI-NEXT: s_mov_b32 s4, s6
185 ; VI-NEXT: s_mov_b32 s5, s7
186 ; VI-NEXT: s_mov_b32 s6, s10
187 ; VI-NEXT: s_mov_b32 s7, s11
188 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
189 ; VI-NEXT: s_waitcnt vmcnt(0)
190 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
191 ; VI-NEXT: s_waitcnt vmcnt(0)
192 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
193 ; VI-NEXT: s_waitcnt vmcnt(0)
194 ; VI-NEXT: s_mov_b32 s8, s0
195 ; VI-NEXT: s_mov_b32 s9, s1
196 ; VI-NEXT: v_min3_f32 v0, v2, v0, v1
197 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
200 ; GFX9-LABEL: test_fmin3_olt_1_f32:
202 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
203 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
204 ; GFX9-NEXT: s_mov_b32 s10, -1
205 ; GFX9-NEXT: s_mov_b32 s14, s10
206 ; GFX9-NEXT: s_mov_b32 s15, s11
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: s_mov_b32 s12, s2
209 ; GFX9-NEXT: s_mov_b32 s13, s3
210 ; GFX9-NEXT: s_mov_b32 s16, s4
211 ; GFX9-NEXT: s_mov_b32 s17, s5
212 ; GFX9-NEXT: s_mov_b32 s18, s10
213 ; GFX9-NEXT: s_mov_b32 s19, s11
214 ; GFX9-NEXT: s_mov_b32 s4, s6
215 ; GFX9-NEXT: s_mov_b32 s5, s7
216 ; GFX9-NEXT: s_mov_b32 s6, s10
217 ; GFX9-NEXT: s_mov_b32 s7, s11
218 ; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
219 ; GFX9-NEXT: s_waitcnt vmcnt(0)
220 ; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
221 ; GFX9-NEXT: s_waitcnt vmcnt(0)
222 ; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
223 ; GFX9-NEXT: s_waitcnt vmcnt(0)
224 ; GFX9-NEXT: s_mov_b32 s8, s0
225 ; GFX9-NEXT: s_mov_b32 s9, s1
226 ; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1
227 ; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
228 ; GFX9-NEXT: s_endpgm
230 ; GFX11-LABEL: test_fmin3_olt_1_f32:
232 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
233 ; GFX11-NEXT: s_mov_b32 s10, -1
234 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
235 ; GFX11-NEXT: s_mov_b32 s14, s10
236 ; GFX11-NEXT: s_mov_b32 s15, s11
237 ; GFX11-NEXT: s_mov_b32 s18, s10
238 ; GFX11-NEXT: s_mov_b32 s19, s11
239 ; GFX11-NEXT: s_mov_b32 s22, s10
240 ; GFX11-NEXT: s_mov_b32 s23, s11
241 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
242 ; GFX11-NEXT: s_mov_b32 s12, s2
243 ; GFX11-NEXT: s_mov_b32 s13, s3
244 ; GFX11-NEXT: s_mov_b32 s16, s4
245 ; GFX11-NEXT: s_mov_b32 s17, s5
246 ; GFX11-NEXT: s_mov_b32 s20, s6
247 ; GFX11-NEXT: s_mov_b32 s21, s7
248 ; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
249 ; GFX11-NEXT: s_waitcnt vmcnt(0)
250 ; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
251 ; GFX11-NEXT: s_waitcnt vmcnt(0)
252 ; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
253 ; GFX11-NEXT: s_waitcnt vmcnt(0)
254 ; GFX11-NEXT: s_mov_b32 s8, s0
255 ; GFX11-NEXT: s_mov_b32 s9, s1
256 ; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1
257 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
258 ; GFX11-NEXT: s_nop 0
259 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
260 ; GFX11-NEXT: s_endpgm
; The loads are volatile; the checks above expect each one to be followed by
; its own s_waitcnt vmcnt(0).
261 %a = load volatile float, ptr addrspace(1) %aptr, align 4
262 %b = load volatile float, ptr addrspace(1) %bptr, align 4
263 %c = load volatile float, ptr addrspace(1) %cptr, align 4
; Inner minnum result %f0 is the second operand of the outer minnum here.
264 %f0 = call float @llvm.minnum.f32(float %a, float %b)
265 %f1 = call float @llvm.minnum.f32(float %c, float %f0)
266 store float %f1, ptr addrspace(1) %out, align 4
; test_fmin3_olt_0_f16: half-precision variant of minnum(minnum(a, b), c) on
; three volatile loads, stored to %out. Expected selection differs per target:
;   SI:    inputs are converted to f32, combined with one v_min3_f32, and the
;          result is converted back to f16.
;   VI:    no min3; each input goes through v_max_f16 x, x (presumably input
;          canonicalization - confirm) and two v_min_f16 are emitted.
;   GFX9 / GFX11: a single v_min3_f16.
270 define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
271 ; SI-LABEL: test_fmin3_olt_0_f16:
273 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
274 ; SI-NEXT: s_mov_b32 s11, 0xf000
275 ; SI-NEXT: s_mov_b32 s10, -1
276 ; SI-NEXT: s_mov_b32 s14, s10
277 ; SI-NEXT: s_mov_b32 s15, s11
278 ; SI-NEXT: s_mov_b32 s18, s10
279 ; SI-NEXT: s_mov_b32 s19, s11
280 ; SI-NEXT: s_mov_b32 s22, s10
281 ; SI-NEXT: s_mov_b32 s23, s11
282 ; SI-NEXT: s_waitcnt lgkmcnt(0)
283 ; SI-NEXT: s_mov_b32 s12, s2
284 ; SI-NEXT: s_mov_b32 s13, s3
285 ; SI-NEXT: s_mov_b32 s16, s4
286 ; SI-NEXT: s_mov_b32 s17, s5
287 ; SI-NEXT: s_mov_b32 s20, s6
288 ; SI-NEXT: s_mov_b32 s21, s7
289 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
290 ; SI-NEXT: s_waitcnt vmcnt(0)
291 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
292 ; SI-NEXT: s_waitcnt vmcnt(0)
293 ; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
294 ; SI-NEXT: s_waitcnt vmcnt(0)
295 ; SI-NEXT: s_mov_b32 s8, s0
296 ; SI-NEXT: s_mov_b32 s9, s1
297 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
298 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
299 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
300 ; SI-NEXT: v_min3_f32 v0, v0, v1, v2
301 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
302 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
305 ; VI-LABEL: test_fmin3_olt_0_f16:
307 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
308 ; VI-NEXT: s_mov_b32 s11, 0xf000
309 ; VI-NEXT: s_mov_b32 s10, -1
310 ; VI-NEXT: s_mov_b32 s14, s10
311 ; VI-NEXT: s_mov_b32 s15, s11
312 ; VI-NEXT: s_waitcnt lgkmcnt(0)
313 ; VI-NEXT: s_mov_b32 s12, s2
314 ; VI-NEXT: s_mov_b32 s13, s3
315 ; VI-NEXT: s_mov_b32 s16, s4
316 ; VI-NEXT: s_mov_b32 s17, s5
317 ; VI-NEXT: s_mov_b32 s18, s10
318 ; VI-NEXT: s_mov_b32 s19, s11
319 ; VI-NEXT: s_mov_b32 s4, s6
320 ; VI-NEXT: s_mov_b32 s5, s7
321 ; VI-NEXT: s_mov_b32 s6, s10
322 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
323 ; VI-NEXT: s_waitcnt vmcnt(0)
324 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
325 ; VI-NEXT: s_waitcnt vmcnt(0)
326 ; VI-NEXT: s_mov_b32 s7, s11
327 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
328 ; VI-NEXT: s_waitcnt vmcnt(0)
329 ; VI-NEXT: s_mov_b32 s8, s0
330 ; VI-NEXT: s_mov_b32 s9, s1
331 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
332 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
333 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
334 ; VI-NEXT: v_max_f16_e32 v1, v2, v2
335 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
336 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
339 ; GFX9-LABEL: test_fmin3_olt_0_f16:
341 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
342 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
343 ; GFX9-NEXT: s_mov_b32 s10, -1
344 ; GFX9-NEXT: s_mov_b32 s14, s10
345 ; GFX9-NEXT: s_mov_b32 s15, s11
346 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX9-NEXT: s_mov_b32 s12, s2
348 ; GFX9-NEXT: s_mov_b32 s13, s3
349 ; GFX9-NEXT: s_mov_b32 s16, s4
350 ; GFX9-NEXT: s_mov_b32 s17, s5
351 ; GFX9-NEXT: s_mov_b32 s18, s10
352 ; GFX9-NEXT: s_mov_b32 s19, s11
353 ; GFX9-NEXT: s_mov_b32 s4, s6
354 ; GFX9-NEXT: s_mov_b32 s5, s7
355 ; GFX9-NEXT: s_mov_b32 s6, s10
356 ; GFX9-NEXT: s_mov_b32 s7, s11
357 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
358 ; GFX9-NEXT: s_waitcnt vmcnt(0)
359 ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
360 ; GFX9-NEXT: s_waitcnt vmcnt(0)
361 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
362 ; GFX9-NEXT: s_waitcnt vmcnt(0)
363 ; GFX9-NEXT: s_mov_b32 s8, s0
364 ; GFX9-NEXT: s_mov_b32 s9, s1
365 ; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2
366 ; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
367 ; GFX9-NEXT: s_endpgm
369 ; GFX11-LABEL: test_fmin3_olt_0_f16:
371 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
372 ; GFX11-NEXT: s_mov_b32 s10, -1
373 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
374 ; GFX11-NEXT: s_mov_b32 s14, s10
375 ; GFX11-NEXT: s_mov_b32 s15, s11
376 ; GFX11-NEXT: s_mov_b32 s18, s10
377 ; GFX11-NEXT: s_mov_b32 s19, s11
378 ; GFX11-NEXT: s_mov_b32 s22, s10
379 ; GFX11-NEXT: s_mov_b32 s23, s11
380 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX11-NEXT: s_mov_b32 s12, s2
382 ; GFX11-NEXT: s_mov_b32 s13, s3
383 ; GFX11-NEXT: s_mov_b32 s16, s4
384 ; GFX11-NEXT: s_mov_b32 s17, s5
385 ; GFX11-NEXT: s_mov_b32 s20, s6
386 ; GFX11-NEXT: s_mov_b32 s21, s7
387 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
388 ; GFX11-NEXT: s_waitcnt vmcnt(0)
389 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
390 ; GFX11-NEXT: s_waitcnt vmcnt(0)
391 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
392 ; GFX11-NEXT: s_waitcnt vmcnt(0)
393 ; GFX11-NEXT: s_mov_b32 s8, s0
394 ; GFX11-NEXT: s_mov_b32 s9, s1
395 ; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2
396 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
397 ; GFX11-NEXT: s_nop 0
398 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
399 ; GFX11-NEXT: s_endpgm
; The loads are volatile; the checks above expect each one to be followed by
; its own s_waitcnt vmcnt(0).
400 %a = load volatile half, ptr addrspace(1) %aptr, align 2
401 %b = load volatile half, ptr addrspace(1) %bptr, align 2
402 %c = load volatile half, ptr addrspace(1) %cptr, align 2
; Inner minnum result %f0 is the first operand of the outer minnum.
403 %f0 = call half @llvm.minnum.f16(half %a, half %b)
404 %f1 = call half @llvm.minnum.f16(half %f0, half %c)
405 store half %f1, ptr addrspace(1) %out, align 2
409 ; Commute operand of second fmin
; test_fmin3_olt_1_f16: half-precision minnum(c, minnum(a, b)), i.e. the
; outer minnum has its operands swapped relative to test_fmin3_olt_0_f16.
; Expected selection per target:
;   SI:    f16 -> f32 converts, one v_min3_f32 (v2, v0, v1), convert back.
;   VI:    no min3; v_max_f16 x, x on each input (presumably input
;          canonicalization - confirm) and two v_min_f16, the last one with
;          the %c value as its first source.
;   GFX9 / GFX11: a single v_min3_f16 (v2, v0, v1).
410 define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
411 ; SI-LABEL: test_fmin3_olt_1_f16:
413 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
414 ; SI-NEXT: s_mov_b32 s11, 0xf000
415 ; SI-NEXT: s_mov_b32 s10, -1
416 ; SI-NEXT: s_mov_b32 s14, s10
417 ; SI-NEXT: s_mov_b32 s15, s11
418 ; SI-NEXT: s_mov_b32 s18, s10
419 ; SI-NEXT: s_mov_b32 s19, s11
420 ; SI-NEXT: s_mov_b32 s22, s10
421 ; SI-NEXT: s_mov_b32 s23, s11
422 ; SI-NEXT: s_waitcnt lgkmcnt(0)
423 ; SI-NEXT: s_mov_b32 s12, s2
424 ; SI-NEXT: s_mov_b32 s13, s3
425 ; SI-NEXT: s_mov_b32 s16, s4
426 ; SI-NEXT: s_mov_b32 s17, s5
427 ; SI-NEXT: s_mov_b32 s20, s6
428 ; SI-NEXT: s_mov_b32 s21, s7
429 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
430 ; SI-NEXT: s_waitcnt vmcnt(0)
431 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
432 ; SI-NEXT: s_waitcnt vmcnt(0)
433 ; SI-NEXT: buffer_load_ushort v2, off, s[20:23], 0 glc
434 ; SI-NEXT: s_waitcnt vmcnt(0)
435 ; SI-NEXT: s_mov_b32 s8, s0
436 ; SI-NEXT: s_mov_b32 s9, s1
437 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
438 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
439 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
440 ; SI-NEXT: v_min3_f32 v0, v2, v0, v1
441 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
442 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
445 ; VI-LABEL: test_fmin3_olt_1_f16:
447 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
448 ; VI-NEXT: s_mov_b32 s11, 0xf000
449 ; VI-NEXT: s_mov_b32 s10, -1
450 ; VI-NEXT: s_mov_b32 s14, s10
451 ; VI-NEXT: s_mov_b32 s15, s11
452 ; VI-NEXT: s_waitcnt lgkmcnt(0)
453 ; VI-NEXT: s_mov_b32 s12, s2
454 ; VI-NEXT: s_mov_b32 s13, s3
455 ; VI-NEXT: s_mov_b32 s16, s4
456 ; VI-NEXT: s_mov_b32 s17, s5
457 ; VI-NEXT: s_mov_b32 s18, s10
458 ; VI-NEXT: s_mov_b32 s19, s11
459 ; VI-NEXT: s_mov_b32 s4, s6
460 ; VI-NEXT: s_mov_b32 s5, s7
461 ; VI-NEXT: s_mov_b32 s6, s10
462 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
463 ; VI-NEXT: s_waitcnt vmcnt(0)
464 ; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
465 ; VI-NEXT: s_waitcnt vmcnt(0)
466 ; VI-NEXT: s_mov_b32 s7, s11
467 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
468 ; VI-NEXT: s_waitcnt vmcnt(0)
469 ; VI-NEXT: s_mov_b32 s8, s0
470 ; VI-NEXT: s_mov_b32 s9, s1
471 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
472 ; VI-NEXT: v_max_f16_e32 v1, v1, v1
473 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
474 ; VI-NEXT: v_max_f16_e32 v1, v2, v2
475 ; VI-NEXT: v_min_f16_e32 v0, v1, v0
476 ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
479 ; GFX9-LABEL: test_fmin3_olt_1_f16:
481 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
482 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
483 ; GFX9-NEXT: s_mov_b32 s10, -1
484 ; GFX9-NEXT: s_mov_b32 s14, s10
485 ; GFX9-NEXT: s_mov_b32 s15, s11
486 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX9-NEXT: s_mov_b32 s12, s2
488 ; GFX9-NEXT: s_mov_b32 s13, s3
489 ; GFX9-NEXT: s_mov_b32 s16, s4
490 ; GFX9-NEXT: s_mov_b32 s17, s5
491 ; GFX9-NEXT: s_mov_b32 s18, s10
492 ; GFX9-NEXT: s_mov_b32 s19, s11
493 ; GFX9-NEXT: s_mov_b32 s4, s6
494 ; GFX9-NEXT: s_mov_b32 s5, s7
495 ; GFX9-NEXT: s_mov_b32 s6, s10
496 ; GFX9-NEXT: s_mov_b32 s7, s11
497 ; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
498 ; GFX9-NEXT: s_waitcnt vmcnt(0)
499 ; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
500 ; GFX9-NEXT: s_waitcnt vmcnt(0)
501 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
502 ; GFX9-NEXT: s_waitcnt vmcnt(0)
503 ; GFX9-NEXT: s_mov_b32 s8, s0
504 ; GFX9-NEXT: s_mov_b32 s9, s1
505 ; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1
506 ; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
507 ; GFX9-NEXT: s_endpgm
509 ; GFX11-LABEL: test_fmin3_olt_1_f16:
511 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
512 ; GFX11-NEXT: s_mov_b32 s10, -1
513 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
514 ; GFX11-NEXT: s_mov_b32 s14, s10
515 ; GFX11-NEXT: s_mov_b32 s15, s11
516 ; GFX11-NEXT: s_mov_b32 s18, s10
517 ; GFX11-NEXT: s_mov_b32 s19, s11
518 ; GFX11-NEXT: s_mov_b32 s22, s10
519 ; GFX11-NEXT: s_mov_b32 s23, s11
520 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX11-NEXT: s_mov_b32 s12, s2
522 ; GFX11-NEXT: s_mov_b32 s13, s3
523 ; GFX11-NEXT: s_mov_b32 s16, s4
524 ; GFX11-NEXT: s_mov_b32 s17, s5
525 ; GFX11-NEXT: s_mov_b32 s20, s6
526 ; GFX11-NEXT: s_mov_b32 s21, s7
527 ; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
528 ; GFX11-NEXT: s_waitcnt vmcnt(0)
529 ; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
530 ; GFX11-NEXT: s_waitcnt vmcnt(0)
531 ; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
532 ; GFX11-NEXT: s_waitcnt vmcnt(0)
533 ; GFX11-NEXT: s_mov_b32 s8, s0
534 ; GFX11-NEXT: s_mov_b32 s9, s1
535 ; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1
536 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
537 ; GFX11-NEXT: s_nop 0
538 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
539 ; GFX11-NEXT: s_endpgm
; The loads are volatile; the checks above expect each one to be followed by
; its own s_waitcnt vmcnt(0).
540 %a = load volatile half, ptr addrspace(1) %aptr, align 2
541 %b = load volatile half, ptr addrspace(1) %bptr, align 2
542 %c = load volatile half, ptr addrspace(1) %cptr, align 2
; Inner minnum result %f0 is the second operand of the outer minnum here.
543 %f0 = call half @llvm.minnum.f16(half %a, half %b)
544 %f1 = call half @llvm.minnum.f16(half %c, half %f0)
545 store half %f1, ptr addrspace(1) %out, align 2
549 ; Check that performMinMaxCombine() does not combine vector minnum chains into min3,
550 ; since there are no packed instructions for fmin3.
; no_fmin3_v2f16: non-kernel function (attribute group #2) that chains three
; <2 x half> minnum calls: minnum(minnum(c, minnum(a, b)), d).
; Expected selection per target:
;   GFX9 / GFX11: three v_pk_min_f16, no min3.
;   VI:    per-half v_min_f16 (e32 and SDWA forms) merged with v_or_b32.
;   SI:    the halves are handled as f32 after f16<->f32 converts; the checks
;          expect one v_min_f32 followed by one v_min3_f32 per element.
551 define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
552 ; SI-LABEL: no_fmin3_v2f16:
553 ; SI: ; %bb.0: ; %entry
554 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
556 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
557 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
558 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
559 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
560 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
561 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
562 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
563 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
564 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
565 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
566 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
567 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
568 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
569 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
570 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
571 ; SI-NEXT: v_min_f32_e32 v1, v1, v3
572 ; SI-NEXT: v_min_f32_e32 v0, v0, v2
573 ; SI-NEXT: v_min3_f32 v0, v4, v0, v6
574 ; SI-NEXT: v_min3_f32 v1, v5, v1, v7
575 ; SI-NEXT: s_setpc_b64 s[30:31]
577 ; VI-LABEL: no_fmin3_v2f16:
578 ; VI: ; %bb.0: ; %entry
579 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580 ; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
581 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
582 ; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
583 ; VI-NEXT: v_min_f16_e32 v0, v2, v0
584 ; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
585 ; VI-NEXT: v_min_f16_e32 v0, v0, v3
586 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
587 ; VI-NEXT: s_setpc_b64 s[30:31]
589 ; GFX9-LABEL: no_fmin3_v2f16:
590 ; GFX9: ; %bb.0: ; %entry
591 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
593 ; GFX9-NEXT: v_pk_min_f16 v0, v2, v0
594 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v3
595 ; GFX9-NEXT: s_setpc_b64 s[30:31]
597 ; GFX11-LABEL: no_fmin3_v2f16:
598 ; GFX11: ; %bb.0: ; %entry
599 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
600 ; GFX11-NEXT: v_pk_min_f16 v0, v0, v1
601 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
602 ; GFX11-NEXT: v_pk_min_f16 v0, v2, v0
603 ; GFX11-NEXT: v_pk_min_f16 v0, v0, v3
604 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Chain: %min = min(a, b); %min1 = min(c, %min); %res = min(%min1, d).
606 %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
607 %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
608 %res = call <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
; test_fmin3_olt_0_f64: double-precision minnum(minnum(a, b), c) on three
; volatile loads, stored to %out.
; None of the four check prefixes expects a min3 here: each input goes through
; v_max_f64 x, x (presumably input canonicalization - confirm) and the two
; minnum calls are emitted as two separate v_min_f64 instructions.
612 define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
613 ; SI-LABEL: test_fmin3_olt_0_f64:
615 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
616 ; SI-NEXT: s_mov_b32 s11, 0xf000
617 ; SI-NEXT: s_mov_b32 s10, -1
618 ; SI-NEXT: s_mov_b32 s14, s10
619 ; SI-NEXT: s_mov_b32 s15, s11
620 ; SI-NEXT: s_mov_b32 s18, s10
621 ; SI-NEXT: s_mov_b32 s19, s11
622 ; SI-NEXT: s_waitcnt lgkmcnt(0)
623 ; SI-NEXT: s_mov_b32 s12, s2
624 ; SI-NEXT: s_mov_b32 s13, s3
625 ; SI-NEXT: s_mov_b32 s16, s4
626 ; SI-NEXT: s_mov_b32 s17, s5
627 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
628 ; SI-NEXT: s_waitcnt vmcnt(0)
629 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
630 ; SI-NEXT: s_waitcnt vmcnt(0)
631 ; SI-NEXT: s_mov_b32 s2, s10
632 ; SI-NEXT: s_mov_b32 s3, s11
633 ; SI-NEXT: s_mov_b32 s8, s0
634 ; SI-NEXT: s_mov_b32 s9, s1
635 ; SI-NEXT: s_mov_b32 s0, s6
636 ; SI-NEXT: s_mov_b32 s1, s7
637 ; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
638 ; SI-NEXT: s_waitcnt vmcnt(0)
639 ; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
640 ; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
641 ; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
642 ; SI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
643 ; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
644 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
647 ; VI-LABEL: test_fmin3_olt_0_f64:
649 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
650 ; VI-NEXT: s_mov_b32 s11, 0xf000
651 ; VI-NEXT: s_mov_b32 s10, -1
652 ; VI-NEXT: s_mov_b32 s14, s10
653 ; VI-NEXT: s_mov_b32 s15, s11
654 ; VI-NEXT: s_waitcnt lgkmcnt(0)
655 ; VI-NEXT: s_mov_b32 s12, s2
656 ; VI-NEXT: s_mov_b32 s13, s3
657 ; VI-NEXT: s_mov_b32 s16, s4
658 ; VI-NEXT: s_mov_b32 s17, s5
659 ; VI-NEXT: s_mov_b32 s18, s10
660 ; VI-NEXT: s_mov_b32 s19, s11
661 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
662 ; VI-NEXT: s_waitcnt vmcnt(0)
663 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
664 ; VI-NEXT: s_waitcnt vmcnt(0)
665 ; VI-NEXT: s_mov_b32 s4, s6
666 ; VI-NEXT: s_mov_b32 s5, s7
667 ; VI-NEXT: s_mov_b32 s6, s10
668 ; VI-NEXT: s_mov_b32 s7, s11
669 ; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
670 ; VI-NEXT: s_waitcnt vmcnt(0)
671 ; VI-NEXT: s_mov_b32 s8, s0
672 ; VI-NEXT: s_mov_b32 s9, s1
673 ; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
674 ; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
675 ; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
676 ; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
677 ; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
678 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
681 ; GFX9-LABEL: test_fmin3_olt_0_f64:
683 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
684 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
685 ; GFX9-NEXT: s_mov_b32 s10, -1
686 ; GFX9-NEXT: s_mov_b32 s14, s10
687 ; GFX9-NEXT: s_mov_b32 s15, s11
688 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
689 ; GFX9-NEXT: s_mov_b32 s12, s2
690 ; GFX9-NEXT: s_mov_b32 s13, s3
691 ; GFX9-NEXT: s_mov_b32 s16, s4
692 ; GFX9-NEXT: s_mov_b32 s17, s5
693 ; GFX9-NEXT: s_mov_b32 s18, s10
694 ; GFX9-NEXT: s_mov_b32 s19, s11
695 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
696 ; GFX9-NEXT: s_waitcnt vmcnt(0)
697 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
698 ; GFX9-NEXT: s_waitcnt vmcnt(0)
699 ; GFX9-NEXT: s_mov_b32 s4, s6
700 ; GFX9-NEXT: s_mov_b32 s5, s7
701 ; GFX9-NEXT: s_mov_b32 s6, s10
702 ; GFX9-NEXT: s_mov_b32 s7, s11
703 ; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
704 ; GFX9-NEXT: s_waitcnt vmcnt(0)
705 ; GFX9-NEXT: s_mov_b32 s8, s0
706 ; GFX9-NEXT: s_mov_b32 s9, s1
707 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
708 ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
709 ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
710 ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
711 ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
712 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
713 ; GFX9-NEXT: s_endpgm
715 ; GFX11-LABEL: test_fmin3_olt_0_f64:
717 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
718 ; GFX11-NEXT: s_mov_b32 s10, -1
719 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
720 ; GFX11-NEXT: s_mov_b32 s14, s10
721 ; GFX11-NEXT: s_mov_b32 s15, s11
722 ; GFX11-NEXT: s_mov_b32 s18, s10
723 ; GFX11-NEXT: s_mov_b32 s19, s11
724 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
725 ; GFX11-NEXT: s_mov_b32 s12, s2
726 ; GFX11-NEXT: s_mov_b32 s13, s3
727 ; GFX11-NEXT: s_mov_b32 s16, s4
728 ; GFX11-NEXT: s_mov_b32 s17, s5
729 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
730 ; GFX11-NEXT: s_waitcnt vmcnt(0)
731 ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
732 ; GFX11-NEXT: s_waitcnt vmcnt(0)
733 ; GFX11-NEXT: s_mov_b32 s12, s6
734 ; GFX11-NEXT: s_mov_b32 s13, s7
735 ; GFX11-NEXT: s_mov_b32 s8, s0
736 ; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
737 ; GFX11-NEXT: s_waitcnt vmcnt(0)
738 ; GFX11-NEXT: s_mov_b32 s9, s1
739 ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
740 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
741 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
742 ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
743 ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
744 ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
745 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
746 ; GFX11-NEXT: s_nop 0
747 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
748 ; GFX11-NEXT: s_endpgm
; NOTE(review): the double loads and the store use align 4, which is below the
; 8-byte size of double - presumably intentional for this test; confirm.
749 %a = load volatile double, ptr addrspace(1) %aptr, align 4
750 %b = load volatile double, ptr addrspace(1) %bptr, align 4
751 %c = load volatile double, ptr addrspace(1) %cptr, align 4
; Inner minnum result %f0 is the first operand of the outer minnum.
752 %f0 = call double @llvm.minnum.f64(double %a, double %b)
753 %f1 = call double @llvm.minnum.f64(double %f0, double %c)
754 store double %f1, ptr addrspace(1) %out, align 4
758 ; Commute operand of second fmin
; test_fmin3_olt_1_f64: double-precision minnum(c, minnum(a, b)), i.e. the
; outer minnum has its operands swapped relative to test_fmin3_olt_0_f64.
; As in the non-commuted f64 test, no check prefix expects a min3: the checks
; list v_max_f64 x, x on each input and two v_min_f64, the final one taking
; the %c value (v[2:3]) as its first source.
759 define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
760 ; SI-LABEL: test_fmin3_olt_1_f64:
762 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
763 ; SI-NEXT: s_mov_b32 s11, 0xf000
764 ; SI-NEXT: s_mov_b32 s10, -1
765 ; SI-NEXT: s_mov_b32 s14, s10
766 ; SI-NEXT: s_mov_b32 s15, s11
767 ; SI-NEXT: s_mov_b32 s18, s10
768 ; SI-NEXT: s_mov_b32 s19, s11
769 ; SI-NEXT: s_waitcnt lgkmcnt(0)
770 ; SI-NEXT: s_mov_b32 s12, s2
771 ; SI-NEXT: s_mov_b32 s13, s3
772 ; SI-NEXT: s_mov_b32 s16, s4
773 ; SI-NEXT: s_mov_b32 s17, s5
774 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
775 ; SI-NEXT: s_waitcnt vmcnt(0)
776 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
777 ; SI-NEXT: s_waitcnt vmcnt(0)
778 ; SI-NEXT: s_mov_b32 s2, s10
779 ; SI-NEXT: s_mov_b32 s3, s11
780 ; SI-NEXT: s_mov_b32 s8, s0
781 ; SI-NEXT: s_mov_b32 s9, s1
782 ; SI-NEXT: s_mov_b32 s0, s6
783 ; SI-NEXT: s_mov_b32 s1, s7
784 ; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
785 ; SI-NEXT: s_waitcnt vmcnt(0)
786 ; SI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
787 ; SI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
788 ; SI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
789 ; SI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
790 ; SI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
791 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
794 ; VI-LABEL: test_fmin3_olt_1_f64:
796 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
797 ; VI-NEXT: s_mov_b32 s11, 0xf000
798 ; VI-NEXT: s_mov_b32 s10, -1
799 ; VI-NEXT: s_mov_b32 s14, s10
800 ; VI-NEXT: s_mov_b32 s15, s11
801 ; VI-NEXT: s_waitcnt lgkmcnt(0)
802 ; VI-NEXT: s_mov_b32 s12, s2
803 ; VI-NEXT: s_mov_b32 s13, s3
804 ; VI-NEXT: s_mov_b32 s16, s4
805 ; VI-NEXT: s_mov_b32 s17, s5
806 ; VI-NEXT: s_mov_b32 s18, s10
807 ; VI-NEXT: s_mov_b32 s19, s11
808 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
809 ; VI-NEXT: s_waitcnt vmcnt(0)
810 ; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
811 ; VI-NEXT: s_waitcnt vmcnt(0)
812 ; VI-NEXT: s_mov_b32 s4, s6
813 ; VI-NEXT: s_mov_b32 s5, s7
814 ; VI-NEXT: s_mov_b32 s6, s10
815 ; VI-NEXT: s_mov_b32 s7, s11
816 ; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
817 ; VI-NEXT: s_waitcnt vmcnt(0)
818 ; VI-NEXT: s_mov_b32 s8, s0
819 ; VI-NEXT: s_mov_b32 s9, s1
820 ; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
821 ; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
822 ; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
823 ; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
824 ; VI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
825 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
828 ; GFX9-LABEL: test_fmin3_olt_1_f64:
830 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
831 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
832 ; GFX9-NEXT: s_mov_b32 s10, -1
833 ; GFX9-NEXT: s_mov_b32 s14, s10
834 ; GFX9-NEXT: s_mov_b32 s15, s11
835 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
836 ; GFX9-NEXT: s_mov_b32 s12, s2
837 ; GFX9-NEXT: s_mov_b32 s13, s3
838 ; GFX9-NEXT: s_mov_b32 s16, s4
839 ; GFX9-NEXT: s_mov_b32 s17, s5
840 ; GFX9-NEXT: s_mov_b32 s18, s10
841 ; GFX9-NEXT: s_mov_b32 s19, s11
842 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
843 ; GFX9-NEXT: s_waitcnt vmcnt(0)
844 ; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
845 ; GFX9-NEXT: s_waitcnt vmcnt(0)
846 ; GFX9-NEXT: s_mov_b32 s4, s6
847 ; GFX9-NEXT: s_mov_b32 s5, s7
848 ; GFX9-NEXT: s_mov_b32 s6, s10
849 ; GFX9-NEXT: s_mov_b32 s7, s11
850 ; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
851 ; GFX9-NEXT: s_waitcnt vmcnt(0)
852 ; GFX9-NEXT: s_mov_b32 s8, s0
853 ; GFX9-NEXT: s_mov_b32 s9, s1
854 ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
855 ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
856 ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
857 ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
858 ; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
859 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
860 ; GFX9-NEXT: s_endpgm
862 ; GFX11-LABEL: test_fmin3_olt_1_f64:
864 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
865 ; GFX11-NEXT: s_mov_b32 s10, -1
866 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000
867 ; GFX11-NEXT: s_mov_b32 s14, s10
868 ; GFX11-NEXT: s_mov_b32 s15, s11
869 ; GFX11-NEXT: s_mov_b32 s18, s10
870 ; GFX11-NEXT: s_mov_b32 s19, s11
871 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
872 ; GFX11-NEXT: s_mov_b32 s12, s2
873 ; GFX11-NEXT: s_mov_b32 s13, s3
874 ; GFX11-NEXT: s_mov_b32 s16, s4
875 ; GFX11-NEXT: s_mov_b32 s17, s5
876 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
877 ; GFX11-NEXT: s_waitcnt vmcnt(0)
878 ; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
879 ; GFX11-NEXT: s_waitcnt vmcnt(0)
880 ; GFX11-NEXT: s_mov_b32 s12, s6
881 ; GFX11-NEXT: s_mov_b32 s13, s7
882 ; GFX11-NEXT: s_mov_b32 s8, s0
883 ; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
884 ; GFX11-NEXT: s_waitcnt vmcnt(0)
885 ; GFX11-NEXT: s_mov_b32 s9, s1
886 ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
887 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
888 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
889 ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
890 ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
891 ; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
892 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
893 ; GFX11-NEXT: s_nop 0
894 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
895 ; GFX11-NEXT: s_endpgm
; NOTE(review): the double loads and the store use align 4, which is below the
; 8-byte size of double - presumably intentional for this test; confirm.
896 %a = load volatile double, ptr addrspace(1) %aptr, align 4
897 %b = load volatile double, ptr addrspace(1) %bptr, align 4
898 %c = load volatile double, ptr addrspace(1) %cptr, align 4
; Inner minnum result %f0 is the second operand of the outer minnum here.
899 %f0 = call double @llvm.minnum.f64(double %a, double %b)
900 %f1 = call double @llvm.minnum.f64(double %c, double %f0)
901 store double %f1, ptr addrspace(1) %out, align 4
; Intrinsic declarations for the tests in this file.
; NOTE(review): llvm.amdgcn.workitem.id.x is not called by any function
; visible in this file - possibly a leftover; confirm before removing.
905 declare i32 @llvm.amdgcn.workitem.id.x() #1
906 declare double @llvm.minnum.f64(double, double) #1
907 declare float @llvm.minnum.f32(float, float) #1
908 declare half @llvm.minnum.f16(half, half) #1
; Unlike the scalar minnum declarations, the v2f16 one carries no attribute group.
909 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)
; Attribute groups: #0 is used by the amdgpu_kernel tests, #1 by the intrinsic
; declarations, and #2 (which adds "no-nans-fp-math") by no_fmin3_v2f16.
911 attributes #0 = { nounwind }
912 attributes #1 = { nounwind readnone speculatable }
913 attributes #2 = { nounwind "no-nans-fp-math"="true" }