; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
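
; These tests cover expansion of the vector-predicated (VP) floating-point
; intrinsics on x86. The target has no native predication support, so the mask
; and EVL operands are expected to be ignored and the intrinsics lowered to
; plain vector instructions (or to libcalls, e.g. fmodf/fmaf, where no single
; instruction exists).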

define void @vp_fadd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fadd_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_fadd_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_fadd_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fsub_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fsub_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vsubps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_fsub_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_fsub_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fmul_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fmul_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_fmul_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_fmul_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fdiv_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fdiv_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vdivps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_fdiv_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_fdiv_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vdivps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_frem_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $80, %esp
; X86-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: vextractps $2, %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: calll fmodf
; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $1, %xmm0, (%esp)
; X86-NEXT: calll fmodf
; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll fmodf
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, (%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: calll fmodf
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X86-NEXT: vmovaps %xmm0, (%esi)
; X86-NEXT: addl $80, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; SSE-LABEL: vp_frem_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: movq %rdi, %rbx
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT: callq fmodf@PLT
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: callq fmodf@PLT
; SSE-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: callq fmodf@PLT
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: callq fmodf@PLT
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE-NEXT: movaps %xmm1, (%rbx)
; SSE-NEXT: addq $64, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: retq
;
; AVX-LABEL: vp_frem_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $48, %rsp
; AVX-NEXT: movq %rdi, %rbx
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: callq fmodf@PLT
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,1,3,3]
; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[1,1,3,3]
; AVX-NEXT: callq fmodf@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[1,0]
; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[1,0]
; AVX-NEXT: callq fmodf@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[3,3,3,3]
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[3,3,3,3]
; AVX-NEXT: callq fmodf@PLT
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: vmovaps %xmm0, (%rbx)
; AVX-NEXT: addq $48, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: retq
  %res = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fabs_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_fabs_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_fabs_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_fabs_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_fabs_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_sqrt_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vsqrtps %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_sqrt_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: vp_sqrt_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtps %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
  %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fneg_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_fneg_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_fneg_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_fneg_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX2-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_fneg_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
; X86-LABEL: vp_fma_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $84, %esp
; X86-NEXT: vmovupd %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: calll fmaf
; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $1, %xmm0, (%esp)
; X86-NEXT: vmovshdup {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X86-NEXT: # xmm0 = mem[1,1,3,3]
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: calll fmaf
; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll fmaf
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, (%esp)
; X86-NEXT: vpermilps $255, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X86-NEXT: # xmm0 = mem[3,3,3,3]
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: calll fmaf
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X86-NEXT: vmovaps %xmm0, (%esi)
; X86-NEXT: addl $84, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; SSE-LABEL: vp_fma_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $64, %rsp
; SSE-NEXT: movq %rdi, %rbx
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: callq fmaf@PLT
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: callq fmaf@PLT
; SSE-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: callq fmaf@PLT
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: callq fmaf@PLT
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE-NEXT: movaps %xmm1, (%rbx)
; SSE-NEXT: addq $64, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_fma_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $48, %rsp
; AVX1-NEXT: movq %rdi, %rbx
; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps %xmm1, %xmm2
; AVX1-NEXT: callq fmaf@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,1,3,3]
; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = mem[1,1,3,3]
; AVX1-NEXT: vmovaps %xmm1, %xmm2
; AVX1-NEXT: callq fmaf@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = mem[1,0]
; AVX1-NEXT: vmovapd %xmm1, %xmm2
; AVX1-NEXT: callq fmaf@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[3,3,3,3]
; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = mem[3,3,3,3]
; AVX1-NEXT: vmovaps %xmm1, %xmm2
; AVX1-NEXT: callq fmaf@PLT
; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vmovaps %xmm0, (%rbx)
; AVX1-NEXT: addq $48, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_fma_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $48, %rsp
; AVX2-NEXT: movq %rdi, %rbx
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm1, %xmm2
; AVX2-NEXT: callq fmaf@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,3,3]
; AVX2-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[1,1,3,3]
; AVX2-NEXT: vmovaps %xmm1, %xmm2
; AVX2-NEXT: callq fmaf@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[1,0]
; AVX2-NEXT: vmovapd %xmm1, %xmm2
; AVX2-NEXT: callq fmaf@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[3,3,3,3]
; AVX2-NEXT: vmovaps %xmm1, %xmm2
; AVX2-NEXT: callq fmaf@PLT
; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vmovaps %xmm0, (%rbx)
; AVX2-NEXT: addq $48, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_fma_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
; AVX512-NEXT: vmovaps %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
; X86-LABEL: vp_fmuladd_v4f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, (%eax)
; X86-NEXT: retl
;
; SSE-LABEL: vp_fmuladd_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: vp_fmuladd_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: vp_fmuladd_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: vp_fmuladd_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
; AVX512-NEXT: vmovaps %xmm0, (%rdi)
; AVX512-NEXT: retq
  %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32)

declare <4 x float> @llvm.vp.maxnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmax_vv_v4f32:
; X86: # %bb.0:
; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: retl
;
; SSE-LABEL: vfmax_vv_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: maxps %xmm0, %xmm2
; SSE-NEXT: cmpunordps %xmm0, %xmm0
; SSE-NEXT: andps %xmm0, %xmm1
; SSE-NEXT: andnps %xmm2, %xmm0
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: vfmax_vv_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vfmax_vv_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmaxps %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: vfmax_vv_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm2
; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovaps %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %v = call <4 x float> @llvm.vp.maxnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

declare <8 x float> @llvm.vp.maxnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
define <8 x float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmax_vv_v8f32:
; X86: # %bb.0:
; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm2
; X86-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
; X86-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; X86-NEXT: retl
;
; SSE-LABEL: vfmax_vv_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: maxps %xmm0, %xmm4
; SSE-NEXT: cmpunordps %xmm0, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm4, %xmm0
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: maxps %xmm1, %xmm2
; SSE-NEXT: cmpunordps %xmm1, %xmm1
; SSE-NEXT: andps %xmm1, %xmm3
; SSE-NEXT: andnps %xmm2, %xmm1
; SSE-NEXT: orps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: vfmax_vv_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmaxps %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vfmax_vv_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmaxps %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: vfmax_vv_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmaxps %ymm0, %ymm1, %ymm2
; AVX512-NEXT: vcmpunordps %ymm0, %ymm0, %k1
; AVX512-NEXT: vmovaps %ymm1, %ymm2 {%k1}
; AVX512-NEXT: vmovaps %ymm2, %ymm0
; AVX512-NEXT: retq
  %v = call <8 x float> @llvm.vp.maxnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
  ret <8 x float> %v
}

declare <4 x float> @llvm.vp.minnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmin_vv_v4f32:
; X86: # %bb.0:
; X86-NEXT: vminps %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: retl
;
; SSE-LABEL: vfmin_vv_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: minps %xmm0, %xmm2
; SSE-NEXT: cmpunordps %xmm0, %xmm0
; SSE-NEXT: andps %xmm0, %xmm1
; SSE-NEXT: andnps %xmm2, %xmm0
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: vfmin_vv_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vminps %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vfmin_vv_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vminps %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: vfmin_vv_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vminps %xmm0, %xmm1, %xmm2
; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovaps %xmm1, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
  %v = call <4 x float> @llvm.vp.minnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

declare <8 x float> @llvm.vp.minnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmin_vv_v8f32:
; X86: # %bb.0:
; X86-NEXT: vminps %ymm0, %ymm1, %ymm2
; X86-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
; X86-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; X86-NEXT: retl
;
; SSE-LABEL: vfmin_vv_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: minps %xmm0, %xmm4
; SSE-NEXT: cmpunordps %xmm0, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm4, %xmm0
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: minps %xmm1, %xmm2
; SSE-NEXT: cmpunordps %xmm1, %xmm1
; SSE-NEXT: andps %xmm1, %xmm3
; SSE-NEXT: andnps %xmm2, %xmm1
; SSE-NEXT: orps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: vfmin_vv_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vminps %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vfmin_vv_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vminps %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vcmpunordps %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: vfmin_vv_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vminps %ymm0, %ymm1, %ymm2
; AVX512-NEXT: vcmpunordps %ymm0, %ymm0, %k1
; AVX512-NEXT: vmovaps %ymm1, %ymm2 {%k1}
; AVX512-NEXT: vmovaps %ymm2, %ymm0
; AVX512-NEXT: retq
  %v = call <8 x float> @llvm.vp.minnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
  ret <8 x float> %v
}