1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
; Codegen check for an fmax reduction of a <2 x float> argument through
; @llvm.experimental.vector.reduce.fmax.f32.v2f32.
; Every configuration is expected to copy element 1 into lane 0 of xmm1 and
; then emit one packed max against xmm0. The SSE2 run needs movaps + shufps
; for that shuffle; the SSE4.1, AVX and AVX512 runs use a single (v)movshdup.
13 define float @test_v2f32(<2 x float> %a0) {
14 ; SSE2-LABEL: test_v2f32:
16 ; SSE2-NEXT: movaps %xmm0, %xmm1
17 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
18 ; SSE2-NEXT: maxps %xmm1, %xmm0
21 ; SSE41-LABEL: test_v2f32:
23 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
24 ; SSE41-NEXT: maxps %xmm1, %xmm0
27 ; AVX-LABEL: test_v2f32:
29 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
30 ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
33 ; AVX512-LABEL: test_v2f32:
35 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
36 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
38 %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v2f32(<2 x float> %a0)
; Codegen check for an fmax reduction of a <4 x float> argument through
; @llvm.experimental.vector.reduce.fmax.f32.v4f32.
; The expected sequence has two halving steps, each followed by a packed max:
;   1. bring the upper 64 bits down (movaps + unpckhpd on the SSE runs,
;      vpermilpd [1,0] on the AVX and AVX512 runs);
;   2. bring element 1 down (shufps on SSE2, (v)movshdup elsewhere).
42 define float @test_v4f32(<4 x float> %a0) {
43 ; SSE2-LABEL: test_v4f32:
45 ; SSE2-NEXT: movaps %xmm0, %xmm1
46 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
47 ; SSE2-NEXT: maxps %xmm1, %xmm0
48 ; SSE2-NEXT: movaps %xmm0, %xmm1
49 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
50 ; SSE2-NEXT: maxps %xmm1, %xmm0
53 ; SSE41-LABEL: test_v4f32:
55 ; SSE41-NEXT: movaps %xmm0, %xmm1
56 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
57 ; SSE41-NEXT: maxps %xmm1, %xmm0
58 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
59 ; SSE41-NEXT: maxps %xmm1, %xmm0
62 ; AVX-LABEL: test_v4f32:
64 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
65 ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
66 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
67 ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
70 ; AVX512-LABEL: test_v4f32:
72 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
73 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
74 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
75 ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0
77 %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a0)
; Codegen check for an fmax reduction of an <8 x float> argument through
; @llvm.experimental.vector.reduce.fmax.f32.v8f32.
; On the SSE runs the expected code first folds xmm1 into xmm0 with maxps and
; then repeats the two 128-bit halving steps.
; On the AVX and AVX512 runs the upper 128 bits are extracted with
; vextractf128, and every vmaxps in the expected output stays at ymm width
; even after the shuffles narrow to xmm. The checks therefore also expect the
; "kill" annotation for the xmm0 sub-register and a trailing vzeroupper.
81 define float @test_v8f32(<8 x float> %a0) {
82 ; SSE2-LABEL: test_v8f32:
84 ; SSE2-NEXT: maxps %xmm1, %xmm0
85 ; SSE2-NEXT: movaps %xmm0, %xmm1
86 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
87 ; SSE2-NEXT: maxps %xmm1, %xmm0
88 ; SSE2-NEXT: movaps %xmm0, %xmm1
89 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
90 ; SSE2-NEXT: maxps %xmm1, %xmm0
93 ; SSE41-LABEL: test_v8f32:
95 ; SSE41-NEXT: maxps %xmm1, %xmm0
96 ; SSE41-NEXT: movaps %xmm0, %xmm1
97 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
98 ; SSE41-NEXT: maxps %xmm1, %xmm0
99 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
100 ; SSE41-NEXT: maxps %xmm1, %xmm0
103 ; AVX-LABEL: test_v8f32:
105 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
106 ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
107 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
108 ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
109 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
110 ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
111 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
112 ; AVX-NEXT: vzeroupper
115 ; AVX512-LABEL: test_v8f32:
117 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
118 ; AVX512-NEXT: vmaxps %ymm1, %ymm0, %ymm0
119 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
120 ; AVX512-NEXT: vmaxps %ymm1, %ymm0, %ymm0
121 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
122 ; AVX512-NEXT: vmaxps %ymm1, %ymm0, %ymm0
123 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
124 ; AVX512-NEXT: vzeroupper
126 %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v8f32(<8 x float> %a0)
; Codegen check for an fmax reduction of a <16 x float> argument through
; @llvm.experimental.vector.reduce.fmax.f32.v16f32.
; Expected shape of the output per configuration:
;   - SSE runs fold xmm0..xmm3 together with three maxps, then do the two
;     128-bit halving steps;
;   - AVX runs fold ymm1 into ymm0 first, then extract the upper 128 bits and
;     keep every vmaxps at ymm width;
;   - AVX512 runs extract the upper 256 bits with vextractf64x4 and keep every
;     vmaxps at zmm width.
; The AVX and AVX512 checks end with the xmm0 "kill" annotation and vzeroupper.
130 define float @test_v16f32(<16 x float> %a0) {
131 ; SSE2-LABEL: test_v16f32:
133 ; SSE2-NEXT: maxps %xmm3, %xmm1
134 ; SSE2-NEXT: maxps %xmm2, %xmm0
135 ; SSE2-NEXT: maxps %xmm1, %xmm0
136 ; SSE2-NEXT: movaps %xmm0, %xmm1
137 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
138 ; SSE2-NEXT: maxps %xmm1, %xmm0
139 ; SSE2-NEXT: movaps %xmm0, %xmm1
140 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
141 ; SSE2-NEXT: maxps %xmm1, %xmm0
144 ; SSE41-LABEL: test_v16f32:
146 ; SSE41-NEXT: maxps %xmm3, %xmm1
147 ; SSE41-NEXT: maxps %xmm2, %xmm0
148 ; SSE41-NEXT: maxps %xmm1, %xmm0
149 ; SSE41-NEXT: movaps %xmm0, %xmm1
150 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
151 ; SSE41-NEXT: maxps %xmm1, %xmm0
152 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
153 ; SSE41-NEXT: maxps %xmm1, %xmm0
156 ; AVX-LABEL: test_v16f32:
158 ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
159 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
160 ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
161 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
162 ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
163 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
164 ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
165 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
166 ; AVX-NEXT: vzeroupper
169 ; AVX512-LABEL: test_v16f32:
171 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
172 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
173 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
175 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
176 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
177 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
178 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0
179 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
180 ; AVX512-NEXT: vzeroupper
182 %1 = call float @llvm.experimental.vector.reduce.fmax.f32.v16f32(<16 x float> %a0)
; Codegen check for an fmax reduction of a <2 x double> argument through
; @llvm.experimental.vector.reduce.fmax.f64.v2f64.
; The SSE2 and SSE4.1 runs share one set of checks here (movapd + unpckhpd to
; bring element 1 down, then maxpd). The AVX and AVX512 runs expect
; vpermilpd [1,0] followed by vmaxpd.
190 define double @test_v2f64(<2 x double> %a0) {
191 ; SSE-LABEL: test_v2f64:
193 ; SSE-NEXT: movapd %xmm0, %xmm1
194 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
195 ; SSE-NEXT: maxpd %xmm1, %xmm0
198 ; AVX-LABEL: test_v2f64:
200 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
201 ; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
204 ; AVX512-LABEL: test_v2f64:
206 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
207 ; AVX512-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
209 %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a0)
; Codegen check for an fmax reduction of a <4 x double> argument through
; @llvm.experimental.vector.reduce.fmax.f64.v4f64.
; The SSE runs fold xmm1 into xmm0 with maxpd before the final unpckhpd/maxpd
; step. The AVX and AVX512 runs extract the upper 128 bits with vextractf128
; and keep both vmaxpd instructions at ymm width, so the checks also expect
; the xmm0 "kill" annotation and a trailing vzeroupper.
213 define double @test_v4f64(<4 x double> %a0) {
214 ; SSE-LABEL: test_v4f64:
216 ; SSE-NEXT: maxpd %xmm1, %xmm0
217 ; SSE-NEXT: movapd %xmm0, %xmm1
218 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
219 ; SSE-NEXT: maxpd %xmm1, %xmm0
222 ; AVX-LABEL: test_v4f64:
224 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
225 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
226 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
227 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
228 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
229 ; AVX-NEXT: vzeroupper
232 ; AVX512-LABEL: test_v4f64:
234 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
235 ; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
236 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
237 ; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
238 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
239 ; AVX512-NEXT: vzeroupper
241 %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v4f64(<4 x double> %a0)
; Codegen check for an fmax reduction of an <8 x double> argument through
; @llvm.experimental.vector.reduce.fmax.f64.v8f64.
; Expected shape of the output per configuration:
;   - SSE runs fold xmm0..xmm3 together with three maxpd, then finish with
;     movapd + unpckhpd + maxpd;
;   - AVX runs fold ymm1 into ymm0, extract the upper 128 bits, and keep every
;     vmaxpd at ymm width;
;   - AVX512 runs extract the upper 256 bits with vextractf64x4 and keep every
;     vmaxpd at zmm width.
; The AVX and AVX512 checks end with the xmm0 "kill" annotation and vzeroupper.
245 define double @test_v8f64(<8 x double> %a0) {
246 ; SSE-LABEL: test_v8f64:
248 ; SSE-NEXT: maxpd %xmm3, %xmm1
249 ; SSE-NEXT: maxpd %xmm2, %xmm0
250 ; SSE-NEXT: maxpd %xmm1, %xmm0
251 ; SSE-NEXT: movapd %xmm0, %xmm1
252 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
253 ; SSE-NEXT: maxpd %xmm1, %xmm0
256 ; AVX-LABEL: test_v8f64:
258 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
259 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
260 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
261 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
262 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
263 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
264 ; AVX-NEXT: vzeroupper
267 ; AVX512-LABEL: test_v8f64:
269 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
270 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
271 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
272 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
273 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
274 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
275 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
276 ; AVX512-NEXT: vzeroupper
278 %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v8f64(<8 x double> %a0)
; Codegen check for an fmax reduction of a <16 x double> argument through
; @llvm.experimental.vector.reduce.fmax.f64.v16f64.
; Expected shape of the output per configuration:
;   - SSE runs fold xmm0..xmm7 together with seven maxpd, then finish with
;     movapd + unpckhpd + maxpd;
;   - AVX runs fold ymm0..ymm3 together with three vmaxpd, extract the upper
;     128 bits, and keep every vmaxpd at ymm width;
;   - AVX512 runs fold zmm1 into zmm0, then narrow via vextractf64x4 and
;     vextractf128 while keeping every vmaxpd at zmm width.
; The AVX and AVX512 checks end with the xmm0 "kill" annotation and vzeroupper.
282 define double @test_v16f64(<16 x double> %a0) {
283 ; SSE-LABEL: test_v16f64:
285 ; SSE-NEXT: maxpd %xmm6, %xmm2
286 ; SSE-NEXT: maxpd %xmm4, %xmm0
287 ; SSE-NEXT: maxpd %xmm2, %xmm0
288 ; SSE-NEXT: maxpd %xmm7, %xmm3
289 ; SSE-NEXT: maxpd %xmm5, %xmm1
290 ; SSE-NEXT: maxpd %xmm3, %xmm1
291 ; SSE-NEXT: maxpd %xmm1, %xmm0
292 ; SSE-NEXT: movapd %xmm0, %xmm1
293 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
294 ; SSE-NEXT: maxpd %xmm1, %xmm0
297 ; AVX-LABEL: test_v16f64:
299 ; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1
300 ; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0
301 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
302 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
303 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
304 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
305 ; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
306 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
307 ; AVX-NEXT: vzeroupper
310 ; AVX512-LABEL: test_v16f64:
312 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
313 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
314 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
315 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
316 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
317 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
318 ; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
319 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
320 ; AVX512-NEXT: vzeroupper
322 %1 = call double @llvm.experimental.vector.reduce.fmax.f64.v16f64(<16 x double> %a0)
; Declarations of the fmax reduction intrinsics called by the tests in this
; file: one overload per vector type exercised (four float widths, four
; double widths). Each takes the vector operand and yields a scalar of the
; element type.
326 declare float @llvm.experimental.vector.reduce.fmax.f32.v2f32(<2 x float>)
327 declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float>)
328 declare float @llvm.experimental.vector.reduce.fmax.f32.v8f32(<8 x float>)
329 declare float @llvm.experimental.vector.reduce.fmax.f32.v16f32(<16 x float>)
331 declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>)
332 declare double @llvm.experimental.vector.reduce.fmax.f64.v4f64(<4 x double>)
333 declare double @llvm.experimental.vector.reduce.fmax.f64.v8f64(<8 x double>)
334 declare double @llvm.experimental.vector.reduce.fmax.f64.v16f64(<16 x double>)