1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
12 define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
13 ; CHECK-LABEL: stack_fold_addpd:
15 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
19 ; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
22 %2 = fadd <2 x double> %a0, %a1
26 define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) {
27 ; CHECK-LABEL: stack_fold_addpd_ymm:
29 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
33 ; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
35 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
36 %2 = fadd <4 x double> %a0, %a1
40 define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
41 ; CHECK-LABEL: stack_fold_addps:
43 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
47 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
49 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
50 %2 = fadd <4 x float> %a0, %a1
54 define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) {
55 ; CHECK-LABEL: stack_fold_addps_ymm:
57 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
61 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
63 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
64 %2 = fadd <8 x float> %a0, %a1
68 define double @stack_fold_addsd(double %a0, double %a1) {
69 ; CHECK-LABEL: stack_fold_addsd:
71 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
75 ; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
77 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
78 %2 = fadd double %a0, %a1
82 define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
83 ; CHECK-LABEL: stack_fold_addsd_int:
85 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
89 ; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
91 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
92 %2 = extractelement <2 x double> %a0, i32 0
93 %3 = extractelement <2 x double> %a1, i32 0
94 %4 = fadd double %2, %3
95 %5 = insertelement <2 x double> %a0, double %4, i32 0
98 declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
100 define float @stack_fold_addss(float %a0, float %a1) {
101 ; CHECK-LABEL: stack_fold_addss:
103 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
106 ; CHECK-NEXT: #NO_APP
107 ; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
109 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
110 %2 = fadd float %a0, %a1
114 define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
115 ; CHECK-LABEL: stack_fold_addss_int:
117 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
120 ; CHECK-NEXT: #NO_APP
121 ; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
123 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
124 %2 = extractelement <4 x float> %a0, i32 0
125 %3 = extractelement <4 x float> %a1, i32 0
126 %4 = fadd float %2, %3
127 %5 = insertelement <4 x float> %a0, float %4, i32 0
130 declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
132 define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
133 ; CHECK-LABEL: stack_fold_addsubpd:
135 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
138 ; CHECK-NEXT: #NO_APP
139 ; CHECK-NEXT: vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
141 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
142 %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
145 declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
147 define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
148 ; CHECK-LABEL: stack_fold_addsubpd_ymm:
150 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
153 ; CHECK-NEXT: #NO_APP
154 ; CHECK-NEXT: vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
156 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
157 %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
160 declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
162 define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
163 ; CHECK-LABEL: stack_fold_addsubps:
165 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
168 ; CHECK-NEXT: #NO_APP
169 ; CHECK-NEXT: vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
171 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
172 %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
175 declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
177 define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
178 ; CHECK-LABEL: stack_fold_addsubps_ymm:
180 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
183 ; CHECK-NEXT: #NO_APP
184 ; CHECK-NEXT: vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
186 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
187 %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
190 declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
192 define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
193 ; CHECK-LABEL: stack_fold_andnpd:
195 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
198 ; CHECK-NEXT: #NO_APP
199 ; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
200 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
201 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
204 %2 = bitcast <2 x double> %a0 to <2 x i64>
205 %3 = bitcast <2 x double> %a1 to <2 x i64>
206 %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
207 %5 = and <2 x i64> %4, %3
208 %6 = bitcast <2 x i64> %5 to <2 x double>
209 ; fadd forces execution domain
210 %7 = fadd <2 x double> %6, <double 0x0, double 0x0>
214 define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) {
215 ; CHECK-LABEL: stack_fold_andnpd_ymm:
217 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
220 ; CHECK-NEXT: #NO_APP
221 ; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
222 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
223 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
225 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
226 %2 = bitcast <4 x double> %a0 to <4 x i64>
227 %3 = bitcast <4 x double> %a1 to <4 x i64>
228 %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
229 %5 = and <4 x i64> %4, %3
230 %6 = bitcast <4 x i64> %5 to <4 x double>
231 ; fadd forces execution domain
232 %7 = fadd <4 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0>
236 define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
237 ; CHECK-LABEL: stack_fold_andnps:
239 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
242 ; CHECK-NEXT: #NO_APP
243 ; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
244 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
245 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
247 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
248 %2 = bitcast <4 x float> %a0 to <2 x i64>
249 %3 = bitcast <4 x float> %a1 to <2 x i64>
250 %4 = xor <2 x i64> %2, <i64 -1, i64 -1>
251 %5 = and <2 x i64> %4, %3
252 %6 = bitcast <2 x i64> %5 to <4 x float>
253 ; fadd forces execution domain
254 %7 = fadd <4 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0>
258 define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) {
259 ; CHECK-LABEL: stack_fold_andnps_ymm:
261 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
264 ; CHECK-NEXT: #NO_APP
265 ; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
266 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
267 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
270 %2 = bitcast <8 x float> %a0 to <4 x i64>
271 %3 = bitcast <8 x float> %a1 to <4 x i64>
272 %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
273 %5 = and <4 x i64> %4, %3
274 %6 = bitcast <4 x i64> %5 to <8 x float>
275 ; fadd forces execution domain
276 %7 = fadd <8 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
280 define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
281 ; CHECK-LABEL: stack_fold_andpd:
283 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
286 ; CHECK-NEXT: #NO_APP
287 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
288 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
289 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
291 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
292 %2 = bitcast <2 x double> %a0 to <2 x i64>
293 %3 = bitcast <2 x double> %a1 to <2 x i64>
294 %4 = and <2 x i64> %2, %3
295 %5 = bitcast <2 x i64> %4 to <2 x double>
296 ; fadd forces execution domain
297 %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
301 define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
302 ; CHECK-LABEL: stack_fold_andpd_ymm:
304 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
307 ; CHECK-NEXT: #NO_APP
308 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
309 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
310 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
312 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
313 %2 = bitcast <4 x double> %a0 to <4 x i64>
314 %3 = bitcast <4 x double> %a1 to <4 x i64>
315 %4 = and <4 x i64> %2, %3
316 %5 = bitcast <4 x i64> %4 to <4 x double>
317 ; fadd forces execution domain
318 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
322 define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
323 ; CHECK-LABEL: stack_fold_andps:
325 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
328 ; CHECK-NEXT: #NO_APP
329 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
330 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
331 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
333 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
334 %2 = bitcast <4 x float> %a0 to <2 x i64>
335 %3 = bitcast <4 x float> %a1 to <2 x i64>
336 %4 = and <2 x i64> %2, %3
337 %5 = bitcast <2 x i64> %4 to <4 x float>
338 ; fadd forces execution domain
339 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
343 define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
344 ; CHECK-LABEL: stack_fold_andps_ymm:
346 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
349 ; CHECK-NEXT: #NO_APP
350 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
351 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
352 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
355 %2 = bitcast <8 x float> %a0 to <4 x i64>
356 %3 = bitcast <8 x float> %a1 to <4 x i64>
357 %4 = and <4 x i64> %2, %3
358 %5 = bitcast <4 x i64> %4 to <8 x float>
359 ; fadd forces execution domain
360 %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
364 define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
365 ; CHECK-LABEL: stack_fold_blendpd:
367 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
370 ; CHECK-NEXT: #NO_APP
371 ; CHECK-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
372 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[1]
373 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
374 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
376 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
377 %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
378 ; fadd forces execution domain
379 %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
383 define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
384 ; CHECK-LABEL: stack_fold_blendpd_ymm:
386 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
389 ; CHECK-NEXT: #NO_APP
390 ; CHECK-NEXT: vblendpd $6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
391 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[1,2],ymm0[3]
392 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
393 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
395 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
396 %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
397 ; fadd forces execution domain
398 %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
401 define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
402 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
403 %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
404 ; fadd forces execution domain
405 %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
409 define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
410 ; CHECK-LABEL: stack_fold_blendps_ymm:
412 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
415 ; CHECK-NEXT: #NO_APP
416 ; CHECK-NEXT: vblendps $102, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
417 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[1,2],ymm0[3,4],mem[5,6],ymm0[7]
418 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
419 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
421 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
422 %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
423 ; fadd forces execution domain
424 %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
428 define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
429 ; CHECK-LABEL: stack_fold_blendvpd:
431 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
434 ; CHECK-NEXT: #NO_APP
435 ; CHECK-NEXT: vblendvpd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
437 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
438 %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
441 declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
443 define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) {
444 ; CHECK-LABEL: stack_fold_blendvpd_ymm:
446 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
449 ; CHECK-NEXT: #NO_APP
450 ; CHECK-NEXT: vblendvpd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
452 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
453 %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0)
456 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
458 define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
459 ; CHECK-LABEL: stack_fold_blendvps:
461 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
464 ; CHECK-NEXT: #NO_APP
465 ; CHECK-NEXT: vblendvps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
467 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
468 %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
471 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
473 define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) {
474 ; CHECK-LABEL: stack_fold_blendvps_ymm:
476 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
479 ; CHECK-NEXT: #NO_APP
480 ; CHECK-NEXT: vblendvps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
482 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
483 %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0)
486 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
488 define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
489 ; CHECK-LABEL: stack_fold_cmppd:
491 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
494 ; CHECK-NEXT: #NO_APP
495 ; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
497 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
498 %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
501 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
503 define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
504 ; CHECK-LABEL: stack_fold_cmppd_ymm:
506 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
509 ; CHECK-NEXT: #NO_APP
510 ; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
513 %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
516 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
518 define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
519 ; CHECK-LABEL: stack_fold_cmpps:
521 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
524 ; CHECK-NEXT: #NO_APP
525 ; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
527 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
528 %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
531 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
533 define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
534 ; CHECK-LABEL: stack_fold_cmpps_ymm:
536 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
539 ; CHECK-NEXT: #NO_APP
540 ; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
542 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
543 %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0)
546 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
548 define i32 @stack_fold_cmpsd(double %a0, double %a1) {
549 ; CHECK-LABEL: stack_fold_cmpsd:
551 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
554 ; CHECK-NEXT: #NO_APP
555 ; CHECK-NEXT: vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
556 ; CHECK-NEXT: vmovq %xmm0, %rax
557 ; CHECK-NEXT: andl $1, %eax
558 ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
560 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
561 %2 = fcmp oeq double %a0, %a1
562 %3 = zext i1 %2 to i32
566 define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
567 ; CHECK-LABEL: stack_fold_cmpsd_int:
569 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
572 ; CHECK-NEXT: #NO_APP
573 ; CHECK-NEXT: vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
575 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
576 %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
579 declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
581 define i32 @stack_fold_cmpss(float %a0, float %a1) {
582 ; CHECK-LABEL: stack_fold_cmpss:
584 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
587 ; CHECK-NEXT: #NO_APP
588 ; CHECK-NEXT: vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
589 ; CHECK-NEXT: vmovd %xmm0, %eax
590 ; CHECK-NEXT: andl $1, %eax
592 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
593 %2 = fcmp oeq float %a0, %a1
594 %3 = zext i1 %2 to i32
598 define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
599 ; CHECK-LABEL: stack_fold_cmpss_int:
601 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
604 ; CHECK-NEXT: #NO_APP
605 ; CHECK-NEXT: vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
607 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
608 %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
611 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
613 ; TODO stack_fold_comisd
615 define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
616 ; CHECK-LABEL: stack_fold_comisd_int:
618 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
621 ; CHECK-NEXT: #NO_APP
622 ; CHECK-NEXT: vcomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
623 ; CHECK-NEXT: setnp %al
624 ; CHECK-NEXT: sete %cl
625 ; CHECK-NEXT: andb %al, %cl
626 ; CHECK-NEXT: movzbl %cl, %eax
628 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
629 %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
632 declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
634 ; TODO stack_fold_comiss
636 define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
637 ; CHECK-LABEL: stack_fold_comiss_int:
639 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
642 ; CHECK-NEXT: #NO_APP
643 ; CHECK-NEXT: vcomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
644 ; CHECK-NEXT: setnp %al
645 ; CHECK-NEXT: sete %cl
646 ; CHECK-NEXT: andb %al, %cl
647 ; CHECK-NEXT: movzbl %cl, %eax
649 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
650 %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
653 declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
655 define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
656 ; CHECK-LABEL: stack_fold_cvtdq2pd:
658 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
661 ; CHECK-NEXT: #NO_APP
662 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
664 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
665 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
666 %3 = sitofp <2 x i32> %2 to <2 x double>
669 define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
670 ; CHECK-LABEL: stack_fold_cvtdq2pd_int:
672 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
675 ; CHECK-NEXT: #NO_APP
676 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
678 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
679 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1>
680 %cvt = sitofp <2 x i32> %2 to <2 x double>
681 ret <2 x double> %cvt
684 define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
685 ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm:
687 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
690 ; CHECK-NEXT: #NO_APP
691 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
693 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
694 %2 = sitofp <4 x i32> %a0 to <4 x double>
698 define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) {
699 ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int:
701 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
704 ; CHECK-NEXT: #NO_APP
705 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
707 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
708 %cvt = sitofp <4 x i32> %a0 to <4 x double>
709 ret <4 x double> %cvt
712 define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
713 ; CHECK-LABEL: stack_fold_cvtdq2ps:
715 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
718 ; CHECK-NEXT: #NO_APP
719 ; CHECK-NEXT: vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
721 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
722 %2 = sitofp <4 x i32> %a0 to <4 x float>
726 define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) {
727 ; CHECK-LABEL: stack_fold_cvtdq2ps_ymm:
729 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
732 ; CHECK-NEXT: #NO_APP
733 ; CHECK-NEXT: vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
735 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
736 %2 = sitofp <8 x i32> %a0 to <8 x float>
740 define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
741 ; CHECK-LABEL: stack_fold_cvtpd2dq:
743 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
746 ; CHECK-NEXT: #NO_APP
747 ; CHECK-NEXT: vcvtpd2dqx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
749 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
750 %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
753 declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
755 define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) {
756 ; CHECK-LABEL: stack_fold_cvtpd2dq_ymm:
758 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
761 ; CHECK-NEXT: #NO_APP
762 ; CHECK-NEXT: vcvtpd2dqy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
763 ; CHECK-NEXT: vzeroupper
765 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
766 %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
769 declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
771 define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
772 ; CHECK-LABEL: stack_fold_cvtpd2ps:
774 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
777 ; CHECK-NEXT: #NO_APP
778 ; CHECK-NEXT: vcvtpd2psx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
780 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
781 %2 = fptrunc <2 x double> %a0 to <2 x float>
785 define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
786 ; CHECK-LABEL: stack_fold_cvtpd2ps_ymm:
788 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
791 ; CHECK-NEXT: #NO_APP
792 ; CHECK-NEXT: vcvtpd2psy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
793 ; CHECK-NEXT: vzeroupper
795 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
796 %2 = fptrunc <4 x double> %a0 to <4 x float>
800 define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
801 ; CHECK-LABEL: stack_fold_cvtph2ps:
803 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
806 ; CHECK-NEXT: #NO_APP
807 ; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
809 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
810 %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
813 declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
815 define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
816 ; CHECK-LABEL: stack_fold_cvtph2ps_ymm:
818 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
821 ; CHECK-NEXT: #NO_APP
822 ; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
824 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
825 %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
828 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
830 define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
831 ; CHECK-LABEL: stack_fold_cvtps2dq:
833 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
836 ; CHECK-NEXT: #NO_APP
837 ; CHECK-NEXT: vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
840 %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
843 declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
845 define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
846 ; CHECK-LABEL: stack_fold_cvtps2dq_ymm:
848 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
851 ; CHECK-NEXT: #NO_APP
852 ; CHECK-NEXT: vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
854 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
855 %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
858 declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
860 define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
861 ; CHECK-LABEL: stack_fold_cvtps2pd:
863 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
866 ; CHECK-NEXT: #NO_APP
867 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
869 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
870 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
871 %3 = fpext <2 x float> %2 to <2 x double>
875 define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
876 ; CHECK-LABEL: stack_fold_cvtps2pd_int:
878 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
881 ; CHECK-NEXT: #NO_APP
882 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
884 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
885 %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
886 %cvtps2pd = fpext <2 x float> %2 to <2 x double>
887 ret <2 x double> %cvtps2pd
890 define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
891 ; CHECK-LABEL: stack_fold_cvtps2pd_ymm:
893 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
896 ; CHECK-NEXT: #NO_APP
897 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
899 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
900 %2 = fpext <4 x float> %a0 to <4 x double>
904 define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) {
905 ; CHECK-LABEL: stack_fold_cvtps2pd_ymm_int:
907 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
910 ; CHECK-NEXT: #NO_APP
911 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
913 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
914 %cvtps2pd = fpext <4 x float> %a0 to <4 x double>
915 ret <4 x double> %cvtps2pd
918 define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
919 ; CHECK-LABEL: stack_fold_cvtps2ph_ymm:
921 ; CHECK-NEXT: vcvtps2ph $0, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
924 ; CHECK-NEXT: #NO_APP
925 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
926 ; CHECK-NEXT: vzeroupper
928 %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
929 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
932 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
934 ; TODO stack_fold_cvtsd2si
936 define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
937 ; CHECK-LABEL: stack_fold_cvtsd2si_int:
939 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
942 ; CHECK-NEXT: #NO_APP
943 ; CHECK-NEXT: vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
945 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
946 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
949 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
951 ; TODO stack_fold_cvtsd2si64
953 define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
954 ; CHECK-LABEL: stack_fold_cvtsd2si64_int:
956 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
959 ; CHECK-NEXT: #NO_APP
960 ; CHECK-NEXT: vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
962 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
963 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
966 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
968 define double @stack_fold_cvtsi2sd(i32 %a0) {
969 ; CHECK-LABEL: stack_fold_cvtsi2sd:
971 ; CHECK-NEXT: pushq %rbp
972 ; CHECK-NEXT: .cfi_def_cfa_offset 16
973 ; CHECK-NEXT: pushq %r15
974 ; CHECK-NEXT: .cfi_def_cfa_offset 24
975 ; CHECK-NEXT: pushq %r14
976 ; CHECK-NEXT: .cfi_def_cfa_offset 32
977 ; CHECK-NEXT: pushq %r13
978 ; CHECK-NEXT: .cfi_def_cfa_offset 40
979 ; CHECK-NEXT: pushq %r12
980 ; CHECK-NEXT: .cfi_def_cfa_offset 48
981 ; CHECK-NEXT: pushq %rbx
982 ; CHECK-NEXT: .cfi_def_cfa_offset 56
983 ; CHECK-NEXT: .cfi_offset %rbx, -56
984 ; CHECK-NEXT: .cfi_offset %r12, -48
985 ; CHECK-NEXT: .cfi_offset %r13, -40
986 ; CHECK-NEXT: .cfi_offset %r14, -32
987 ; CHECK-NEXT: .cfi_offset %r15, -24
988 ; CHECK-NEXT: .cfi_offset %rbp, -16
989 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
992 ; CHECK-NEXT: #NO_APP
993 ; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
994 ; CHECK-NEXT: popq %rbx
995 ; CHECK-NEXT: .cfi_def_cfa_offset 48
996 ; CHECK-NEXT: popq %r12
997 ; CHECK-NEXT: .cfi_def_cfa_offset 40
998 ; CHECK-NEXT: popq %r13
999 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1000 ; CHECK-NEXT: popq %r14
1001 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1002 ; CHECK-NEXT: popq %r15
1003 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1004 ; CHECK-NEXT: popq %rbp
1005 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1007 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1008 %2 = sitofp i32 %a0 to double
1012 define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
1013 ; CHECK-LABEL: stack_fold_cvtsi2sd_int:
1015 ; CHECK-NEXT: pushq %rbp
1016 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1017 ; CHECK-NEXT: pushq %r15
1018 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1019 ; CHECK-NEXT: pushq %r14
1020 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1021 ; CHECK-NEXT: pushq %r13
1022 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1023 ; CHECK-NEXT: pushq %r12
1024 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1025 ; CHECK-NEXT: pushq %rbx
1026 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1027 ; CHECK-NEXT: .cfi_offset %rbx, -56
1028 ; CHECK-NEXT: .cfi_offset %r12, -48
1029 ; CHECK-NEXT: .cfi_offset %r13, -40
1030 ; CHECK-NEXT: .cfi_offset %r14, -32
1031 ; CHECK-NEXT: .cfi_offset %r15, -24
1032 ; CHECK-NEXT: .cfi_offset %rbp, -16
1033 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1036 ; CHECK-NEXT: #NO_APP
1037 ; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1038 ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1039 ; CHECK-NEXT: popq %rbx
1040 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1041 ; CHECK-NEXT: popq %r12
1042 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1043 ; CHECK-NEXT: popq %r13
1044 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1045 ; CHECK-NEXT: popq %r14
1046 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1047 ; CHECK-NEXT: popq %r15
1048 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1049 ; CHECK-NEXT: popq %rbp
1050 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1052 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1053 %2 = sitofp i32 %a0 to double
1054 %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0
1058 define double @stack_fold_cvtsi642sd(i64 %a0) {
1059 ; CHECK-LABEL: stack_fold_cvtsi642sd:
1061 ; CHECK-NEXT: pushq %rbp
1062 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1063 ; CHECK-NEXT: pushq %r15
1064 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1065 ; CHECK-NEXT: pushq %r14
1066 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1067 ; CHECK-NEXT: pushq %r13
1068 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1069 ; CHECK-NEXT: pushq %r12
1070 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1071 ; CHECK-NEXT: pushq %rbx
1072 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1073 ; CHECK-NEXT: .cfi_offset %rbx, -56
1074 ; CHECK-NEXT: .cfi_offset %r12, -48
1075 ; CHECK-NEXT: .cfi_offset %r13, -40
1076 ; CHECK-NEXT: .cfi_offset %r14, -32
1077 ; CHECK-NEXT: .cfi_offset %r15, -24
1078 ; CHECK-NEXT: .cfi_offset %rbp, -16
1079 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1082 ; CHECK-NEXT: #NO_APP
1083 ; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1084 ; CHECK-NEXT: popq %rbx
1085 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1086 ; CHECK-NEXT: popq %r12
1087 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1088 ; CHECK-NEXT: popq %r13
1089 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1090 ; CHECK-NEXT: popq %r14
1091 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1092 ; CHECK-NEXT: popq %r15
1093 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1094 ; CHECK-NEXT: popq %rbp
1095 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1097 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1098 %2 = sitofp i64 %a0 to double
1102 define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
1103 ; CHECK-LABEL: stack_fold_cvtsi642sd_int:
1105 ; CHECK-NEXT: pushq %rbp
1106 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1107 ; CHECK-NEXT: pushq %r15
1108 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1109 ; CHECK-NEXT: pushq %r14
1110 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1111 ; CHECK-NEXT: pushq %r13
1112 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1113 ; CHECK-NEXT: pushq %r12
1114 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1115 ; CHECK-NEXT: pushq %rbx
1116 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1117 ; CHECK-NEXT: .cfi_offset %rbx, -56
1118 ; CHECK-NEXT: .cfi_offset %r12, -48
1119 ; CHECK-NEXT: .cfi_offset %r13, -40
1120 ; CHECK-NEXT: .cfi_offset %r14, -32
1121 ; CHECK-NEXT: .cfi_offset %r15, -24
1122 ; CHECK-NEXT: .cfi_offset %rbp, -16
1123 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1126 ; CHECK-NEXT: #NO_APP
1127 ; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1128 ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1129 ; CHECK-NEXT: popq %rbx
1130 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1131 ; CHECK-NEXT: popq %r12
1132 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1133 ; CHECK-NEXT: popq %r13
1134 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1135 ; CHECK-NEXT: popq %r14
1136 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1137 ; CHECK-NEXT: popq %r15
1138 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1139 ; CHECK-NEXT: popq %rbp
1140 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1142 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1143 %2 = sitofp i64 %a0 to double
1144 %3 = insertelement <2 x double> zeroinitializer, double %2, i64 0
1148 define float @stack_fold_cvtsi2ss(i32 %a0) {
1149 ; CHECK-LABEL: stack_fold_cvtsi2ss:
1151 ; CHECK-NEXT: pushq %rbp
1152 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1153 ; CHECK-NEXT: pushq %r15
1154 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1155 ; CHECK-NEXT: pushq %r14
1156 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1157 ; CHECK-NEXT: pushq %r13
1158 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1159 ; CHECK-NEXT: pushq %r12
1160 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1161 ; CHECK-NEXT: pushq %rbx
1162 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1163 ; CHECK-NEXT: .cfi_offset %rbx, -56
1164 ; CHECK-NEXT: .cfi_offset %r12, -48
1165 ; CHECK-NEXT: .cfi_offset %r13, -40
1166 ; CHECK-NEXT: .cfi_offset %r14, -32
1167 ; CHECK-NEXT: .cfi_offset %r15, -24
1168 ; CHECK-NEXT: .cfi_offset %rbp, -16
1169 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1172 ; CHECK-NEXT: #NO_APP
1173 ; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1174 ; CHECK-NEXT: popq %rbx
1175 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1176 ; CHECK-NEXT: popq %r12
1177 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1178 ; CHECK-NEXT: popq %r13
1179 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1180 ; CHECK-NEXT: popq %r14
1181 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1182 ; CHECK-NEXT: popq %r15
1183 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1184 ; CHECK-NEXT: popq %rbp
1185 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1187 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1188 %2 = sitofp i32 %a0 to float
1192 define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
1193 ; CHECK-LABEL: stack_fold_cvtsi2ss_int:
1195 ; CHECK-NEXT: pushq %rbp
1196 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1197 ; CHECK-NEXT: pushq %r15
1198 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1199 ; CHECK-NEXT: pushq %r14
1200 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1201 ; CHECK-NEXT: pushq %r13
1202 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1203 ; CHECK-NEXT: pushq %r12
1204 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1205 ; CHECK-NEXT: pushq %rbx
1206 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1207 ; CHECK-NEXT: .cfi_offset %rbx, -56
1208 ; CHECK-NEXT: .cfi_offset %r12, -48
1209 ; CHECK-NEXT: .cfi_offset %r13, -40
1210 ; CHECK-NEXT: .cfi_offset %r14, -32
1211 ; CHECK-NEXT: .cfi_offset %r15, -24
1212 ; CHECK-NEXT: .cfi_offset %rbp, -16
1213 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1216 ; CHECK-NEXT: #NO_APP
1217 ; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1218 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
1219 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1220 ; CHECK-NEXT: popq %rbx
1221 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1222 ; CHECK-NEXT: popq %r12
1223 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1224 ; CHECK-NEXT: popq %r13
1225 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1226 ; CHECK-NEXT: popq %r14
1227 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1228 ; CHECK-NEXT: popq %r15
1229 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1230 ; CHECK-NEXT: popq %rbp
1231 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1233 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1234 %2 = sitofp i32 %a0 to float
1235 %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
1239 define float @stack_fold_cvtsi642ss(i64 %a0) {
1240 ; CHECK-LABEL: stack_fold_cvtsi642ss:
1242 ; CHECK-NEXT: pushq %rbp
1243 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1244 ; CHECK-NEXT: pushq %r15
1245 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1246 ; CHECK-NEXT: pushq %r14
1247 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1248 ; CHECK-NEXT: pushq %r13
1249 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1250 ; CHECK-NEXT: pushq %r12
1251 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1252 ; CHECK-NEXT: pushq %rbx
1253 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1254 ; CHECK-NEXT: .cfi_offset %rbx, -56
1255 ; CHECK-NEXT: .cfi_offset %r12, -48
1256 ; CHECK-NEXT: .cfi_offset %r13, -40
1257 ; CHECK-NEXT: .cfi_offset %r14, -32
1258 ; CHECK-NEXT: .cfi_offset %r15, -24
1259 ; CHECK-NEXT: .cfi_offset %rbp, -16
1260 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1263 ; CHECK-NEXT: #NO_APP
1264 ; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1265 ; CHECK-NEXT: popq %rbx
1266 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1267 ; CHECK-NEXT: popq %r12
1268 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1269 ; CHECK-NEXT: popq %r13
1270 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1271 ; CHECK-NEXT: popq %r14
1272 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1273 ; CHECK-NEXT: popq %r15
1274 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1275 ; CHECK-NEXT: popq %rbp
1276 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1278 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1279 %2 = sitofp i64 %a0 to float
1283 define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
1284 ; CHECK-LABEL: stack_fold_cvtsi642ss_int:
1286 ; CHECK-NEXT: pushq %rbp
1287 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1288 ; CHECK-NEXT: pushq %r15
1289 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1290 ; CHECK-NEXT: pushq %r14
1291 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1292 ; CHECK-NEXT: pushq %r13
1293 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1294 ; CHECK-NEXT: pushq %r12
1295 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1296 ; CHECK-NEXT: pushq %rbx
1297 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1298 ; CHECK-NEXT: .cfi_offset %rbx, -56
1299 ; CHECK-NEXT: .cfi_offset %r12, -48
1300 ; CHECK-NEXT: .cfi_offset %r13, -40
1301 ; CHECK-NEXT: .cfi_offset %r14, -32
1302 ; CHECK-NEXT: .cfi_offset %r15, -24
1303 ; CHECK-NEXT: .cfi_offset %rbp, -16
1304 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1307 ; CHECK-NEXT: #NO_APP
1308 ; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
1309 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
1310 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1311 ; CHECK-NEXT: popq %rbx
1312 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1313 ; CHECK-NEXT: popq %r12
1314 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1315 ; CHECK-NEXT: popq %r13
1316 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1317 ; CHECK-NEXT: popq %r14
1318 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1319 ; CHECK-NEXT: popq %r15
1320 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1321 ; CHECK-NEXT: popq %rbp
1322 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1324 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1325 %2 = sitofp i64 %a0 to float
1326 %3 = insertelement <4 x float> zeroinitializer, float %2, i64 0
1330 ; TODO stack_fold_cvtss2si
1332 define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
1333 ; CHECK-LABEL: stack_fold_cvtss2si_int:
1335 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1338 ; CHECK-NEXT: #NO_APP
1339 ; CHECK-NEXT: vcvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
1341 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1342 %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
1345 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
1347 ; TODO stack_fold_cvtss2si64
1349 define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) {
1350 ; CHECK-LABEL: stack_fold_cvtss2si64_int:
1352 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1355 ; CHECK-NEXT: #NO_APP
1356 ; CHECK-NEXT: vcvtss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
1358 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1359 %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
1362 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
1364 define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {
1365 ; CHECK-LABEL: stack_fold_cvttpd2dq:
1367 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1370 ; CHECK-NEXT: #NO_APP
1371 ; CHECK-NEXT: vcvttpd2dqx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1373 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1374 %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
1377 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
1379 define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
1380 ; CHECK-LABEL: stack_fold_cvttpd2dq_ymm:
1382 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1385 ; CHECK-NEXT: #NO_APP
1386 ; CHECK-NEXT: vcvttpd2dqy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
1387 ; CHECK-NEXT: vzeroupper
1389 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1390 %2 = fptosi <4 x double> %a0 to <4 x i32>
1394 define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) {
1395 ; CHECK-LABEL: stack_fold_cvttps2dq:
1397 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1400 ; CHECK-NEXT: #NO_APP
1401 ; CHECK-NEXT: vcvttps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1403 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1404 %2 = fptosi <4 x float> %a0 to <4 x i32>
1408 define <8 x i32> @stack_fold_cvttps2dq_ymm(<8 x float> %a0) {
1409 ; CHECK-LABEL: stack_fold_cvttps2dq_ymm:
1411 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1414 ; CHECK-NEXT: #NO_APP
1415 ; CHECK-NEXT: vcvttps2dq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1417 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1418 %2 = fptosi <8 x float> %a0 to <8 x i32>
1422 define i32 @stack_fold_cvttsd2si(double %a0) {
1423 ; CHECK-LABEL: stack_fold_cvttsd2si:
1425 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1428 ; CHECK-NEXT: #NO_APP
1429 ; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 8-byte Folded Reload
1431 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1432 %2 = fptosi double %a0 to i32
1436 define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) {
1437 ; CHECK-LABEL: stack_fold_cvttsd2si_int:
1439 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1442 ; CHECK-NEXT: #NO_APP
1443 ; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
1445 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1446 %2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
1449 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
1451 define i64 @stack_fold_cvttsd2si64(double %a0) {
1452 ; CHECK-LABEL: stack_fold_cvttsd2si64:
1454 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1457 ; CHECK-NEXT: #NO_APP
1458 ; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
1460 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1461 %2 = fptosi double %a0 to i64
1465 define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) {
1466 ; CHECK-LABEL: stack_fold_cvttsd2si64_int:
1468 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1471 ; CHECK-NEXT: #NO_APP
1472 ; CHECK-NEXT: vcvttsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
1474 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1475 %2 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
1478 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
1480 define i32 @stack_fold_cvttss2si(float %a0) {
1481 ; CHECK-LABEL: stack_fold_cvttss2si:
1483 ; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1486 ; CHECK-NEXT: #NO_APP
1487 ; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
1489 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1490 %2 = fptosi float %a0 to i32
1494 define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) {
1495 ; CHECK-LABEL: stack_fold_cvttss2si_int:
1497 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1500 ; CHECK-NEXT: #NO_APP
1501 ; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
1503 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1504 %2 = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
1507 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
1509 define i64 @stack_fold_cvttss2si64(float %a0) {
1510 ; CHECK-LABEL: stack_fold_cvttss2si64:
1512 ; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1515 ; CHECK-NEXT: #NO_APP
1516 ; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 4-byte Folded Reload
1518 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1519 %2 = fptosi float %a0 to i64
1523 define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
1524 ; CHECK-LABEL: stack_fold_cvttss2si64_int:
1526 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1529 ; CHECK-NEXT: #NO_APP
1530 ; CHECK-NEXT: vcvttss2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1533 %2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
1536 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
1538 define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) {
1539 ; CHECK-LABEL: stack_fold_divpd:
1541 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1544 ; CHECK-NEXT: #NO_APP
1545 ; CHECK-NEXT: vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1547 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1548 %2 = fdiv <2 x double> %a0, %a1
1552 define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) {
1553 ; CHECK-LABEL: stack_fold_divpd_ymm:
1555 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1558 ; CHECK-NEXT: #NO_APP
1559 ; CHECK-NEXT: vdivpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1561 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1562 %2 = fdiv <4 x double> %a0, %a1
1566 define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) {
1567 ; CHECK-LABEL: stack_fold_divps:
1569 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1572 ; CHECK-NEXT: #NO_APP
1573 ; CHECK-NEXT: vdivps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1575 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1576 %2 = fdiv <4 x float> %a0, %a1
1580 define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
1581 ; CHECK-LABEL: stack_fold_divps_ymm:
1583 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1586 ; CHECK-NEXT: #NO_APP
1587 ; CHECK-NEXT: vdivps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1589 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1590 %2 = fdiv <8 x float> %a0, %a1
1594 define double @stack_fold_divsd(double %a0, double %a1) {
1595 ; CHECK-LABEL: stack_fold_divsd:
1597 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1600 ; CHECK-NEXT: #NO_APP
1601 ; CHECK-NEXT: vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
1603 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1604 %2 = fdiv double %a0, %a1
1608 define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
1609 ; CHECK-LABEL: stack_fold_divsd_int:
1611 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1614 ; CHECK-NEXT: #NO_APP
1615 ; CHECK-NEXT: vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1617 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1618 %2 = extractelement <2 x double> %a0, i32 0
1619 %3 = extractelement <2 x double> %a1, i32 0
1620 %4 = fdiv double %2, %3
1621 %5 = insertelement <2 x double> %a0, double %4, i32 0
1625 define float @stack_fold_divss(float %a0, float %a1) {
1626 ; CHECK-LABEL: stack_fold_divss:
1628 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1631 ; CHECK-NEXT: #NO_APP
1632 ; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1634 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1635 %2 = fdiv float %a0, %a1
1639 define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
1640 ; CHECK-LABEL: stack_fold_divss_int:
1642 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1645 ; CHECK-NEXT: #NO_APP
1646 ; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1648 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1649 %2 = extractelement <4 x float> %a0, i32 0
1650 %3 = extractelement <4 x float> %a1, i32 0
1651 %4 = fdiv float %2, %3
1652 %5 = insertelement <4 x float> %a0, float %4, i32 0
1656 define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {
1657 ; CHECK-LABEL: stack_fold_dppd:
1659 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1662 ; CHECK-NEXT: #NO_APP
1663 ; CHECK-NEXT: vdppd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1665 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1666 %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
1669 declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
1671 define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {
1672 ; CHECK-LABEL: stack_fold_dpps:
1674 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1677 ; CHECK-NEXT: #NO_APP
1678 ; CHECK-NEXT: vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1680 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1681 %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
1684 declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
1686 define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {
1687 ; CHECK-LABEL: stack_fold_dpps_ymm:
1689 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1692 ; CHECK-NEXT: #NO_APP
1693 ; CHECK-NEXT: vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1695 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1696 %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
1699 declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
1701 define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
1702 ; CHECK-LABEL: stack_fold_extractf128:
1704 ; CHECK-NEXT: vextractf128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
1707 ; CHECK-NEXT: #NO_APP
1708 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1709 ; CHECK-NEXT: vzeroupper
1711 %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1712 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1716 define i32 @stack_fold_extractps(<4 x float> %a0, <4 x float> %a1) {
1717 ; CHECK-LABEL: stack_fold_extractps:
1719 ; CHECK-NEXT: pushq %rbp
1720 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1721 ; CHECK-NEXT: pushq %r15
1722 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1723 ; CHECK-NEXT: pushq %r14
1724 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1725 ; CHECK-NEXT: pushq %r13
1726 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1727 ; CHECK-NEXT: pushq %r12
1728 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1729 ; CHECK-NEXT: pushq %rbx
1730 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1731 ; CHECK-NEXT: .cfi_offset %rbx, -56
1732 ; CHECK-NEXT: .cfi_offset %r12, -48
1733 ; CHECK-NEXT: .cfi_offset %r13, -40
1734 ; CHECK-NEXT: .cfi_offset %r14, -32
1735 ; CHECK-NEXT: .cfi_offset %r15, -24
1736 ; CHECK-NEXT: .cfi_offset %rbp, -16
1737 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
1738 ; CHECK-NEXT: vextractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1741 ; CHECK-NEXT: #NO_APP
1742 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1743 ; CHECK-NEXT: popq %rbx
1744 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1745 ; CHECK-NEXT: popq %r12
1746 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1747 ; CHECK-NEXT: popq %r13
1748 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1749 ; CHECK-NEXT: popq %r14
1750 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1751 ; CHECK-NEXT: popq %r15
1752 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1753 ; CHECK-NEXT: popq %rbp
1754 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1756 ; fadd forces execution domain
1757 %1 = fadd <4 x float> %a0, %a1
1758 %2 = extractelement <4 x float> %1, i32 1
1759 %3 = bitcast float %2 to i32
1760 %4 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1764 define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) {
1765 ; CHECK-LABEL: stack_fold_haddpd:
1767 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1770 ; CHECK-NEXT: #NO_APP
1771 ; CHECK-NEXT: vhaddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1773 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1774 %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
1777 declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
1779 define <4 x double> @stack_fold_haddpd_ymm(<4 x double> %a0, <4 x double> %a1) {
1780 ; CHECK-LABEL: stack_fold_haddpd_ymm:
1782 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1785 ; CHECK-NEXT: #NO_APP
1786 ; CHECK-NEXT: vhaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1789 %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
1792 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
1794 define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) {
1795 ; CHECK-LABEL: stack_fold_haddps:
1797 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1800 ; CHECK-NEXT: #NO_APP
1801 ; CHECK-NEXT: vhaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1803 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1804 %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
1807 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
1809 define <8 x float> @stack_fold_haddps_ymm(<8 x float> %a0, <8 x float> %a1) {
1810 ; CHECK-LABEL: stack_fold_haddps_ymm:
1812 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1815 ; CHECK-NEXT: #NO_APP
1816 ; CHECK-NEXT: vhaddps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1818 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1819 %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
1822 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
1824 define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) {
1825 ; CHECK-LABEL: stack_fold_hsubpd:
1827 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1830 ; CHECK-NEXT: #NO_APP
1831 ; CHECK-NEXT: vhsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1833 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1834 %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
1837 declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
1839 define <4 x double> @stack_fold_hsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
1840 ; CHECK-LABEL: stack_fold_hsubpd_ymm:
1842 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1845 ; CHECK-NEXT: #NO_APP
1846 ; CHECK-NEXT: vhsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1848 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1849 %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
1852 declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
1854 define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
1855 ; CHECK-LABEL: stack_fold_hsubps:
1857 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1860 ; CHECK-NEXT: #NO_APP
1861 ; CHECK-NEXT: vhsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1864 %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
1867 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
1869 define <8 x float> @stack_fold_hsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
1870 ; CHECK-LABEL: stack_fold_hsubps_ymm:
1872 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1875 ; CHECK-NEXT: #NO_APP
1876 ; CHECK-NEXT: vhsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1878 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1879 %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
1882 declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
1884 define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
1885 ; CHECK-LABEL: stack_fold_insertf128:
1887 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1888 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1891 ; CHECK-NEXT: #NO_APP
1892 ; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1894 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1895 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1899 define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
1900 ; CHECK-LABEL: stack_fold_insertps:
1902 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1905 ; CHECK-NEXT: #NO_APP
1906 ; CHECK-NEXT: vinsertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1907 ; CHECK-NEXT: # xmm0 = zero,mem[0],xmm0[2,3]
1909 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1910 %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
1913 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
1915 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
1916 ; CHECK-LABEL: stack_fold_maxpd:
1918 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1921 ; CHECK-NEXT: #NO_APP
1922 ; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1924 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1925 %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
1928 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
1930 define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
1931 ; CHECK-LABEL: stack_fold_maxpd_commutable:
1933 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1936 ; CHECK-NEXT: #NO_APP
1937 ; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1939 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1940 %2 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
1944 define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
1945 ; CHECK-LABEL: stack_fold_maxpd_ymm:
1947 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1950 ; CHECK-NEXT: #NO_APP
1951 ; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1953 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1954 %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
1957 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
1959 define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
1960 ; CHECK-LABEL: stack_fold_maxpd_ymm_commutable:
1962 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1965 ; CHECK-NEXT: #NO_APP
1966 ; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1968 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1969 %2 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
1973 define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 {
1974 ; CHECK-LABEL: stack_fold_maxps:
1976 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1979 ; CHECK-NEXT: #NO_APP
1980 ; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1983 %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
1986 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
1988 define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
1989 ; CHECK-LABEL: stack_fold_maxps_commutable:
1991 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1994 ; CHECK-NEXT: #NO_APP
1995 ; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1997 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1998 %2 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
2002 define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
2003 ; CHECK-LABEL: stack_fold_maxps_ymm:
2005 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2008 ; CHECK-NEXT: #NO_APP
2009 ; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2011 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2012 %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
2015 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
2017 define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
2018 ; CHECK-LABEL: stack_fold_maxps_ymm_commutable:
2020 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2023 ; CHECK-NEXT: #NO_APP
2024 ; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2026 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2027 %2 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
2031 define double @stack_fold_maxsd(double %a0, double %a1) #0 {
2032 ; CHECK-LABEL: stack_fold_maxsd:
2034 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2037 ; CHECK-NEXT: #NO_APP
2038 ; CHECK-NEXT: vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2040 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2041 %2 = fcmp ogt double %a0, %a1
2042 %3 = select i1 %2, double %a0, double %a1
2046 define double @stack_fold_maxsd_commutable(double %a0, double %a1) #1 {
2047 ; CHECK-LABEL: stack_fold_maxsd_commutable:
2049 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2052 ; CHECK-NEXT: #NO_APP
2053 ; CHECK-NEXT: vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2055 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2056 %2 = fcmp ogt double %a0, %a1
2057 %3 = select i1 %2, double %a0, double %a1
2061 define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 {
2062 ; CHECK-LABEL: stack_fold_maxsd_int:
2064 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2067 ; CHECK-NEXT: #NO_APP
2068 ; CHECK-NEXT: vmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2070 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2071 %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
2074 declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
2076 define float @stack_fold_maxss(float %a0, float %a1) #0 {
2077 ; CHECK-LABEL: stack_fold_maxss:
2079 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2082 ; CHECK-NEXT: #NO_APP
2083 ; CHECK-NEXT: vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2085 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2086 %2 = fcmp ogt float %a0, %a1
2087 %3 = select i1 %2, float %a0, float %a1
2091 define float @stack_fold_maxss_commutable(float %a0, float %a1) #1 {
2092 ; CHECK-LABEL: stack_fold_maxss_commutable:
2094 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2097 ; CHECK-NEXT: #NO_APP
2098 ; CHECK-NEXT: vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2100 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2101 %2 = fcmp ogt float %a0, %a1
2102 %3 = select i1 %2, float %a0, float %a1
2106 define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 {
2107 ; CHECK-LABEL: stack_fold_maxss_int:
2109 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2112 ; CHECK-NEXT: #NO_APP
2113 ; CHECK-NEXT: vmaxss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2115 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2116 %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
2119 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
2121 define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 {
2122 ; CHECK-LABEL: stack_fold_minpd:
2124 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2127 ; CHECK-NEXT: #NO_APP
2128 ; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2130 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2131 %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
2134 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
2136 define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 {
2137 ; CHECK-LABEL: stack_fold_minpd_commutable:
2139 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2142 ; CHECK-NEXT: #NO_APP
2143 ; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2145 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2146 %2 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
2150 define <4 x double> @stack_fold_minpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 {
2151 ; CHECK-LABEL: stack_fold_minpd_ymm:
2153 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2156 ; CHECK-NEXT: #NO_APP
2157 ; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2159 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2160 %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
2163 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
2165 define <4 x double> @stack_fold_minpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 {
2166 ; CHECK-LABEL: stack_fold_minpd_ymm_commutable:
2168 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2171 ; CHECK-NEXT: #NO_APP
2172 ; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2174 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2175 %2 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
2179 define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 {
2180 ; CHECK-LABEL: stack_fold_minps:
2182 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2185 ; CHECK-NEXT: #NO_APP
2186 ; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2188 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2189 %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
2192 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
2194 define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 {
2195 ; CHECK-LABEL: stack_fold_minps_commutable:
2197 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2200 ; CHECK-NEXT: #NO_APP
2201 ; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2204 %2 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
2208 define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 {
2209 ; CHECK-LABEL: stack_fold_minps_ymm:
2211 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2214 ; CHECK-NEXT: #NO_APP
2215 ; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2217 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2218 %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
2221 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
2223 define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 {
2224 ; CHECK-LABEL: stack_fold_minps_ymm_commutable:
2226 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2229 ; CHECK-NEXT: #NO_APP
2230 ; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2232 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2233 %2 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
2237 define double @stack_fold_minsd(double %a0, double %a1) #0 {
2238 ; CHECK-LABEL: stack_fold_minsd:
2240 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2243 ; CHECK-NEXT: #NO_APP
2244 ; CHECK-NEXT: vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2246 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2247 %2 = fcmp olt double %a0, %a1
2248 %3 = select i1 %2, double %a0, double %a1
2252 define double @stack_fold_minsd_commutable(double %a0, double %a1) #1 {
2253 ; CHECK-LABEL: stack_fold_minsd_commutable:
2255 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2258 ; CHECK-NEXT: #NO_APP
2259 ; CHECK-NEXT: vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2261 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2262 %2 = fcmp olt double %a0, %a1
2263 %3 = select i1 %2, double %a0, double %a1
2267 define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {
2268 ; CHECK-LABEL: stack_fold_minsd_int:
2270 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2273 ; CHECK-NEXT: #NO_APP
2274 ; CHECK-NEXT: vminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2276 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2277 %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
2280 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
2282 define float @stack_fold_minss(float %a0, float %a1) #0 {
2283 ; CHECK-LABEL: stack_fold_minss:
2285 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2288 ; CHECK-NEXT: #NO_APP
2289 ; CHECK-NEXT: vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2291 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2292 %2 = fcmp olt float %a0, %a1
2293 %3 = select i1 %2, float %a0, float %a1
2297 define float @stack_fold_minss_commutable(float %a0, float %a1) #1 {
2298 ; CHECK-LABEL: stack_fold_minss_commutable:
2300 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2303 ; CHECK-NEXT: #NO_APP
2304 ; CHECK-NEXT: vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2306 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2307 %2 = fcmp olt float %a0, %a1
2308 %3 = select i1 %2, float %a0, float %a1
2312 define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 {
2313 ; CHECK-LABEL: stack_fold_minss_int:
2315 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2318 ; CHECK-NEXT: #NO_APP
2319 ; CHECK-NEXT: vminss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2321 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2322 %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
2325 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
2327 define <2 x double> @stack_fold_movddup(<2 x double> %a0) {
2328 ; CHECK-LABEL: stack_fold_movddup:
2330 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2333 ; CHECK-NEXT: #NO_APP
2334 ; CHECK-NEXT: vmovddup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2335 ; CHECK-NEXT: # xmm0 = mem[0,0]
2337 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2338 %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 0, i32 0>
2342 define <4 x double> @stack_fold_movddup_ymm(<4 x double> %a0) {
2343 ; CHECK-LABEL: stack_fold_movddup_ymm:
2345 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2348 ; CHECK-NEXT: #NO_APP
2349 ; CHECK-NEXT: vmovddup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2350 ; CHECK-NEXT: # ymm0 = mem[0,0,2,2]
2352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2353 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2357 ; TODO stack_fold_movhpd (load / store)
2358 ; TODO stack_fold_movhps (load / store)
2360 ; TODO stack_fold_movlpd (load / store)
2361 ; TODO stack_fold_movlps (load / store)
2363 define <4 x float> @stack_fold_movshdup(<4 x float> %a0) {
2364 ; CHECK-LABEL: stack_fold_movshdup:
2366 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2369 ; CHECK-NEXT: #NO_APP
2370 ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2371 ; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
2373 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2374 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2378 define <8 x float> @stack_fold_movshdup_ymm(<8 x float> %a0) {
2379 ; CHECK-LABEL: stack_fold_movshdup_ymm:
2381 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2384 ; CHECK-NEXT: #NO_APP
2385 ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2386 ; CHECK-NEXT: # ymm0 = mem[1,1,3,3,5,5,7,7]
2388 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2389 %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2393 define <4 x float> @stack_fold_movsldup(<4 x float> %a0) {
2394 ; CHECK-LABEL: stack_fold_movsldup:
2396 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2399 ; CHECK-NEXT: #NO_APP
2400 ; CHECK-NEXT: vmovsldup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2401 ; CHECK-NEXT: # xmm0 = mem[0,0,2,2]
2403 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2404 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2408 define <8 x float> @stack_fold_movsldup_ymm(<8 x float> %a0) {
2409 ; CHECK-LABEL: stack_fold_movsldup_ymm:
2411 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2414 ; CHECK-NEXT: #NO_APP
2415 ; CHECK-NEXT: vmovsldup {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2416 ; CHECK-NEXT: # ymm0 = mem[0,0,2,2,4,4,6,6]
2418 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2419 %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2423 define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) {
2424 ; CHECK-LABEL: stack_fold_mulpd:
2426 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2429 ; CHECK-NEXT: #NO_APP
2430 ; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2432 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2433 %2 = fmul <2 x double> %a0, %a1
2437 define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) {
2438 ; CHECK-LABEL: stack_fold_mulpd_ymm:
2440 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2443 ; CHECK-NEXT: #NO_APP
2444 ; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2446 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2447 %2 = fmul <4 x double> %a0, %a1
2451 define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) {
2452 ; CHECK-LABEL: stack_fold_mulps:
2454 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2457 ; CHECK-NEXT: #NO_APP
2458 ; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2460 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2461 %2 = fmul <4 x float> %a0, %a1
2465 define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) {
2466 ; CHECK-LABEL: stack_fold_mulps_ymm:
2468 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2471 ; CHECK-NEXT: #NO_APP
2472 ; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2474 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2475 %2 = fmul <8 x float> %a0, %a1
2479 define double @stack_fold_mulsd(double %a0, double %a1) {
2480 ; CHECK-LABEL: stack_fold_mulsd:
2482 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2485 ; CHECK-NEXT: #NO_APP
2486 ; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2488 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2489 %2 = fmul double %a0, %a1
2493 define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
2494 ; CHECK-LABEL: stack_fold_mulsd_int:
2496 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2499 ; CHECK-NEXT: #NO_APP
2500 ; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2502 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2503 %2 = extractelement <2 x double> %a0, i32 0
2504 %3 = extractelement <2 x double> %a1, i32 0
2505 %4 = fmul double %2, %3
2506 %5 = insertelement <2 x double> %a0, double %4, i32 0
2510 define float @stack_fold_mulss(float %a0, float %a1) {
2511 ; CHECK-LABEL: stack_fold_mulss:
2513 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2516 ; CHECK-NEXT: #NO_APP
2517 ; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2519 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2520 %2 = fmul float %a0, %a1
2524 define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
2525 ; CHECK-LABEL: stack_fold_mulss_int:
2527 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2530 ; CHECK-NEXT: #NO_APP
2531 ; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2533 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2534 %2 = extractelement <4 x float> %a0, i32 0
2535 %3 = extractelement <4 x float> %a1, i32 0
2536 %4 = fmul float %2, %3
2537 %5 = insertelement <4 x float> %a0, float %4, i32 0
2541 define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
2542 ; CHECK-LABEL: stack_fold_orpd:
2544 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2547 ; CHECK-NEXT: #NO_APP
2548 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2549 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
2550 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
2552 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2553 %2 = bitcast <2 x double> %a0 to <2 x i64>
2554 %3 = bitcast <2 x double> %a1 to <2 x i64>
2555 %4 = or <2 x i64> %2, %3
2556 %5 = bitcast <2 x i64> %4 to <2 x double>
2557 ; fadd forces execution domain
2558 %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
2562 define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
2563 ; CHECK-LABEL: stack_fold_orpd_ymm:
2565 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2568 ; CHECK-NEXT: #NO_APP
2569 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2570 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
2571 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
2573 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2574 %2 = bitcast <4 x double> %a0 to <4 x i64>
2575 %3 = bitcast <4 x double> %a1 to <4 x i64>
2576 %4 = or <4 x i64> %2, %3
2577 %5 = bitcast <4 x i64> %4 to <4 x double>
2578 ; fadd forces execution domain
2579 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
2583 define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) {
2584 ; CHECK-LABEL: stack_fold_orps:
2586 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2589 ; CHECK-NEXT: #NO_APP
2590 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2591 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
2592 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
2594 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2595 %2 = bitcast <4 x float> %a0 to <2 x i64>
2596 %3 = bitcast <4 x float> %a1 to <2 x i64>
2597 %4 = or <2 x i64> %2, %3
2598 %5 = bitcast <2 x i64> %4 to <4 x float>
2599 ; fadd forces execution domain
2600 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
2604 define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
2605 ; CHECK-LABEL: stack_fold_orps_ymm:
2607 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2610 ; CHECK-NEXT: #NO_APP
2611 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2612 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
2613 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
2615 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2616 %2 = bitcast <8 x float> %a0 to <4 x i64>
2617 %3 = bitcast <8 x float> %a1 to <4 x i64>
2618 %4 = or <4 x i64> %2, %3
2619 %5 = bitcast <4 x i64> %4 to <8 x float>
2620 ; fadd forces execution domain
2621 %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
2625 define <8 x float> @stack_fold_perm2f128(<8 x float> %a0, <8 x float> %a1) {
2626 ; CHECK-LABEL: stack_fold_perm2f128:
2628 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2631 ; CHECK-NEXT: #NO_APP
2632 ; CHECK-NEXT: vperm2f128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2633 ; CHECK-NEXT: # ymm0 = ymm0[2,3],mem[0,1]
2635 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2636 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
2640 define <2 x double> @stack_fold_permilpd(<2 x double> %a0) {
2641 ; CHECK-LABEL: stack_fold_permilpd:
2643 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2646 ; CHECK-NEXT: #NO_APP
2647 ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2648 ; CHECK-NEXT: # xmm0 = mem[1,0]
2650 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2651 %2 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
2655 define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) {
2656 ; CHECK-LABEL: stack_fold_permilpd_ymm:
2658 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2661 ; CHECK-NEXT: #NO_APP
2662 ; CHECK-NEXT: vpermilpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2663 ; CHECK-NEXT: # ymm0 = mem[1,0,3,2]
2665 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2666 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
2670 define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {
2671 ; CHECK-LABEL: stack_fold_permilpdvar:
2673 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2676 ; CHECK-NEXT: #NO_APP
2677 ; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2679 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2680 %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
2683 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
2685 define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {
2686 ; CHECK-LABEL: stack_fold_permilpdvar_ymm:
2688 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2691 ; CHECK-NEXT: #NO_APP
2692 ; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2694 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2695 %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
2698 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
2700 define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
2701 ; CHECK-LABEL: stack_fold_permilps:
2703 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2706 ; CHECK-NEXT: #NO_APP
2707 ; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2708 ; CHECK-NEXT: # xmm0 = mem[3,2,1,0]
2710 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2711 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2715 define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) {
2716 ; CHECK-LABEL: stack_fold_permilps_ymm:
2718 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2721 ; CHECK-NEXT: #NO_APP
2722 ; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2723 ; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4]
2725 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2726 %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
2730 define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {
2731 ; CHECK-LABEL: stack_fold_permilpsvar:
2733 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2736 ; CHECK-NEXT: #NO_APP
2737 ; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2739 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2740 %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
2743 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
2745 define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {
2746 ; CHECK-LABEL: stack_fold_permilpsvar_ymm:
2748 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2751 ; CHECK-NEXT: #NO_APP
2752 ; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2754 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2755 %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
2758 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
2760 ; TODO stack_fold_rcpps
2762 define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
2763 ; CHECK-LABEL: stack_fold_rcpps_int:
2765 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2768 ; CHECK-NEXT: #NO_APP
2769 ; CHECK-NEXT: vrcpps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2772 %2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
2775 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
2777 ; TODO stack_fold_rcpps_ymm
2779 define <8 x float> @stack_fold_rcpps_ymm_int(<8 x float> %a0) {
2780 ; CHECK-LABEL: stack_fold_rcpps_ymm_int:
2782 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2785 ; CHECK-NEXT: #NO_APP
2786 ; CHECK-NEXT: vrcpps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2789 %2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
2792 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
2794 ; TODO stack_fold_rcpss
2795 ; TODO stack_fold_rcpss_int
2797 define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {
2798 ; CHECK-LABEL: stack_fold_roundpd:
2800 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2803 ; CHECK-NEXT: #NO_APP
2804 ; CHECK-NEXT: vroundpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2806 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2807 %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
2810 declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
2812 define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {
2813 ; CHECK-LABEL: stack_fold_roundpd_ymm:
2815 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2818 ; CHECK-NEXT: #NO_APP
2819 ; CHECK-NEXT: vroundpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2821 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2822 %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
2825 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
2827 define <4 x float> @stack_fold_roundps(<4 x float> %a0) {
2828 ; CHECK-LABEL: stack_fold_roundps:
2830 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2833 ; CHECK-NEXT: #NO_APP
2834 ; CHECK-NEXT: vroundps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2836 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2837 %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
2840 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
2842 define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {
2843 ; CHECK-LABEL: stack_fold_roundps_ymm:
2845 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2848 ; CHECK-NEXT: #NO_APP
2849 ; CHECK-NEXT: vroundps $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2851 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2852 %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
2855 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
2857 define double @stack_fold_roundsd(double %a0) optsize {
2858 ; CHECK-LABEL: stack_fold_roundsd:
2860 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2863 ; CHECK-NEXT: #NO_APP
2864 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
2865 ; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2868 %2 = call double @llvm.floor.f64(double %a0)
2872 define double @stack_fold_roundsd_minsize(double %a0) minsize {
2873 ; CHECK-LABEL: stack_fold_roundsd_minsize:
2875 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2878 ; CHECK-NEXT: #NO_APP
2879 ; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2881 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2882 %2 = call double @llvm.floor.f64(double %a0)
2885 declare double @llvm.floor.f64(double) nounwind readnone
2887 define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize {
2888 ; CHECK-LABEL: stack_fold_roundsd_int:
2890 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2891 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2894 ; CHECK-NEXT: #NO_APP
2895 ; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2896 ; CHECK-NEXT: vroundsd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2898 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2899 %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
2902 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
2904 define float @stack_fold_roundss(float %a0) optsize {
2905 ; CHECK-LABEL: stack_fold_roundss:
2907 ; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2910 ; CHECK-NEXT: #NO_APP
2911 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
2912 ; CHECK-NEXT: vroundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2914 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2915 %2 = call float @llvm.floor.f32(float %a0)
2918 declare float @llvm.floor.f32(float) nounwind readnone
2920 define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize {
2921 ; CHECK-LABEL: stack_fold_roundss_int:
2923 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2924 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2927 ; CHECK-NEXT: #NO_APP
2928 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2929 ; CHECK-NEXT: vroundss $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2931 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2932 %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
2935 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
2937 ; TODO stack_fold_rsqrtps
2939 define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
2940 ; CHECK-LABEL: stack_fold_rsqrtps_int:
2942 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2945 ; CHECK-NEXT: #NO_APP
2946 ; CHECK-NEXT: vrsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2948 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2949 %2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
2952 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
2954 ; TODO stack_fold_rsqrtps_ymm
2956 define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {
2957 ; CHECK-LABEL: stack_fold_rsqrtps_ymm_int:
2959 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2962 ; CHECK-NEXT: #NO_APP
2963 ; CHECK-NEXT: vrsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
2965 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2966 %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
2969 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
2971 ; TODO stack_fold_rsqrtss
2972 ; TODO stack_fold_rsqrtss_int
2974 define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
2975 ; CHECK-LABEL: stack_fold_shufpd:
2977 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2980 ; CHECK-NEXT: #NO_APP
2981 ; CHECK-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2982 ; CHECK-NEXT: # xmm0 = xmm0[1],mem[0]
2983 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
2984 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
2986 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2987 %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
2988 ; fadd forces execution domain
2989 %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
2993 define <4 x double> @stack_fold_shufpd_ymm(<4 x double> %a0, <4 x double> %a1) {
2994 ; CHECK-LABEL: stack_fold_shufpd_ymm:
2996 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2999 ; CHECK-NEXT: #NO_APP
3000 ; CHECK-NEXT: vshufpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3001 ; CHECK-NEXT: # ymm0 = ymm0[1],mem[0],ymm0[3],mem[2]
3002 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3003 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
3005 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3006 %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
3007 ; fadd forces execution domain
3008 %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
3012 define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
3013 ; CHECK-LABEL: stack_fold_shufps:
3015 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3018 ; CHECK-NEXT: #NO_APP
3019 ; CHECK-NEXT: vshufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3020 ; CHECK-NEXT: # xmm0 = xmm0[0,2],mem[0,3]
3022 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3023 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
3027 define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
3028 ; CHECK-LABEL: stack_fold_shufps_ymm:
3030 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3033 ; CHECK-NEXT: #NO_APP
3034 ; CHECK-NEXT: vshufps $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3035 ; CHECK-NEXT: # ymm0 = ymm0[0,1],mem[1,2],ymm0[4,5],mem[5,6]
3037 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3038 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14>
3042 define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) {
3043 ; CHECK-LABEL: stack_fold_sqrtpd:
3045 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3048 ; CHECK-NEXT: #NO_APP
3049 ; CHECK-NEXT: vsqrtpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3051 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3052 %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
3055 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
3057 define <4 x double> @stack_fold_sqrtpd_ymm(<4 x double> %a0) {
3058 ; CHECK-LABEL: stack_fold_sqrtpd_ymm:
3060 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3063 ; CHECK-NEXT: #NO_APP
3064 ; CHECK-NEXT: vsqrtpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3066 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3067 %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
3070 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
3072 define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) {
3073 ; CHECK-LABEL: stack_fold_sqrtps:
3075 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3078 ; CHECK-NEXT: #NO_APP
3079 ; CHECK-NEXT: vsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3081 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3082 %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
3085 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
3087 define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
3088 ; CHECK-LABEL: stack_fold_sqrtps_ymm:
3090 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3093 ; CHECK-NEXT: #NO_APP
3094 ; CHECK-NEXT: vsqrtps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3096 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3097 %2 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
3100 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
3102 define double @stack_fold_sqrtsd(double %a0) optsize {
3103 ; CHECK-LABEL: stack_fold_sqrtsd:
3105 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3108 ; CHECK-NEXT: #NO_APP
3109 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
3110 ; CHECK-NEXT: vsqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
3112 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3113 %2 = call double @llvm.sqrt.f64(double %a0)
3116 declare double @llvm.sqrt.f64(double) nounwind readnone
3118 ; TODO stack_fold_sqrtsd_int
3120 define float @stack_fold_sqrtss(float %a0) optsize {
3121 ; CHECK-LABEL: stack_fold_sqrtss:
3123 ; CHECK-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3126 ; CHECK-NEXT: #NO_APP
3127 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
3128 ; CHECK-NEXT: vsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
3130 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3131 %2 = call float @llvm.sqrt.f32(float %a0)
3134 declare float @llvm.sqrt.f32(float) nounwind readnone
3136 ; TODO stack_fold_sqrtss_int
3138 define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
3139 ; CHECK-LABEL: stack_fold_subpd:
3141 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3144 ; CHECK-NEXT: #NO_APP
3145 ; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3147 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3148 %2 = fsub <2 x double> %a0, %a1
3152 define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3153 ; CHECK-LABEL: stack_fold_subpd_ymm:
3155 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3158 ; CHECK-NEXT: #NO_APP
3159 ; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3161 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3162 %2 = fsub <4 x double> %a0, %a1
3166 define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) {
3167 ; CHECK-LABEL: stack_fold_subps:
3169 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3172 ; CHECK-NEXT: #NO_APP
3173 ; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3175 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3176 %2 = fsub <4 x float> %a0, %a1
3180 define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) {
3181 ; CHECK-LABEL: stack_fold_subps_ymm:
3183 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3186 ; CHECK-NEXT: #NO_APP
3187 ; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3189 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3190 %2 = fsub <8 x float> %a0, %a1
3194 define double @stack_fold_subsd(double %a0, double %a1) {
3195 ; CHECK-LABEL: stack_fold_subsd:
3197 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3200 ; CHECK-NEXT: #NO_APP
3201 ; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
3203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3204 %2 = fsub double %a0, %a1
3208 define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
3209 ; CHECK-LABEL: stack_fold_subsd_int:
3211 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3214 ; CHECK-NEXT: #NO_APP
3215 ; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3217 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3218 %2 = extractelement <2 x double> %a0, i32 0
3219 %3 = extractelement <2 x double> %a1, i32 0
3220 %4 = fsub double %2, %3
3221 %5 = insertelement <2 x double> %a0, double %4, i32 0
3225 define float @stack_fold_subss(float %a0, float %a1) {
3226 ; CHECK-LABEL: stack_fold_subss:
3228 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3231 ; CHECK-NEXT: #NO_APP
3232 ; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
3234 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3235 %2 = fsub float %a0, %a1
3239 define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
3240 ; CHECK-LABEL: stack_fold_subss_int:
3242 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3245 ; CHECK-NEXT: #NO_APP
3246 ; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3248 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3249 %2 = extractelement <4 x float> %a0, i32 0
3250 %3 = extractelement <4 x float> %a1, i32 0
3251 %4 = fsub float %2, %3
3252 %5 = insertelement <4 x float> %a0, float %4, i32 0
3256 define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {
3257 ; CHECK-LABEL: stack_fold_testpd:
3259 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3262 ; CHECK-NEXT: #NO_APP
3263 ; CHECK-NEXT: xorl %eax, %eax
3264 ; CHECK-NEXT: vtestpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3265 ; CHECK-NEXT: setb %al
3267 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3268 %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
3271 declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
3273 define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3274 ; CHECK-LABEL: stack_fold_testpd_ymm:
3276 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3279 ; CHECK-NEXT: #NO_APP
3280 ; CHECK-NEXT: xorl %eax, %eax
3281 ; CHECK-NEXT: vtestpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3282 ; CHECK-NEXT: setb %al
3283 ; CHECK-NEXT: vzeroupper
3285 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3286 %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
3289 declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
3291 define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {
3292 ; CHECK-LABEL: stack_fold_testps:
3294 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3297 ; CHECK-NEXT: #NO_APP
3298 ; CHECK-NEXT: xorl %eax, %eax
3299 ; CHECK-NEXT: vtestps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3300 ; CHECK-NEXT: setb %al
3302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3303 %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
3306 declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
3308 define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {
3309 ; CHECK-LABEL: stack_fold_testps_ymm:
3311 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3314 ; CHECK-NEXT: #NO_APP
3315 ; CHECK-NEXT: xorl %eax, %eax
3316 ; CHECK-NEXT: vtestps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
3317 ; CHECK-NEXT: setb %al
3318 ; CHECK-NEXT: vzeroupper
3320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3321 %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
3324 declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
3326 define i32 @stack_fold_ucomisd(double %a0, double %a1) {
3327 ; CHECK-LABEL: stack_fold_ucomisd:
3329 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
3332 ; CHECK-NEXT: #NO_APP
3333 ; CHECK-NEXT: xorl %eax, %eax
3334 ; CHECK-NEXT: vucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
3335 ; CHECK-NEXT: sete %al
3336 ; CHECK-NEXT: leal -1(%rax,%rax), %eax
3338 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3339 %2 = fcmp ueq double %a0, %a1
3340 %3 = select i1 %2, i32 1, i32 -1
3344 define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) {
3345 ; CHECK-LABEL: stack_fold_ucomisd_int:
3347 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3350 ; CHECK-NEXT: #NO_APP
3351 ; CHECK-NEXT: vucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3352 ; CHECK-NEXT: setnp %al
3353 ; CHECK-NEXT: sete %cl
3354 ; CHECK-NEXT: andb %al, %cl
3355 ; CHECK-NEXT: movzbl %cl, %eax
3357 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3358 %2 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
3361 declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
3363 define i32 @stack_fold_ucomiss(float %a0, float %a1) {
3364 ; CHECK-LABEL: stack_fold_ucomiss:
3366 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3369 ; CHECK-NEXT: #NO_APP
3370 ; CHECK-NEXT: xorl %eax, %eax
3371 ; CHECK-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
3372 ; CHECK-NEXT: sete %al
3373 ; CHECK-NEXT: leal -1(%rax,%rax), %eax
3375 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3376 %2 = fcmp ueq float %a0, %a1
3377 %3 = select i1 %2, i32 1, i32 -1
3381 define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) {
3382 ; CHECK-LABEL: stack_fold_ucomiss_int:
3384 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3387 ; CHECK-NEXT: #NO_APP
3388 ; CHECK-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3389 ; CHECK-NEXT: setnp %al
3390 ; CHECK-NEXT: sete %cl
3391 ; CHECK-NEXT: andb %al, %cl
3392 ; CHECK-NEXT: movzbl %cl, %eax
3394 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3395 %2 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
3398 declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
3400 define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) {
3401 ; CHECK-LABEL: stack_fold_unpckhpd:
3403 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3406 ; CHECK-NEXT: #NO_APP
3407 ; CHECK-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3408 ; CHECK-NEXT: # xmm0 = xmm0[1],mem[1]
3409 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3410 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
3412 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3413 %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
3414 ; fadd forces execution domain
3415 %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
3419 define <4 x double> @stack_fold_unpckhpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3420 ; CHECK-LABEL: stack_fold_unpckhpd_ymm:
3422 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3425 ; CHECK-NEXT: #NO_APP
3426 ; CHECK-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3427 ; CHECK-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
3428 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3429 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
3431 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3432 %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
3433 ; fadd forces execution domain
3434 %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
3438 define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) {
3439 ; CHECK-LABEL: stack_fold_unpckhps:
3441 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3444 ; CHECK-NEXT: #NO_APP
3445 ; CHECK-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3446 ; CHECK-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
3447 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3448 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
3450 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3451 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3452 ; fadd forces execution domain
3453 %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
3457 define <8 x float> @stack_fold_unpckhps_ymm(<8 x float> %a0, <8 x float> %a1) {
3458 ; CHECK-LABEL: stack_fold_unpckhps_ymm:
3460 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3463 ; CHECK-NEXT: #NO_APP
3464 ; CHECK-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3465 ; CHECK-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
3466 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3467 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
3469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3470 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
3471 ; fadd forces execution domain
3472 %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
3476 define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) {
3477 ; CHECK-LABEL: stack_fold_unpcklpd:
3479 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3482 ; CHECK-NEXT: #NO_APP
3483 ; CHECK-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3484 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
3485 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3486 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
3488 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3489 %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
3490 ; fadd forces execution domain
3491 %3 = fadd <2 x double> %2, <double 0x0, double 0x0>
3495 define <4 x double> @stack_fold_unpcklpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3496 ; CHECK-LABEL: stack_fold_unpcklpd_ymm:
3498 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3501 ; CHECK-NEXT: #NO_APP
3502 ; CHECK-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3503 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
3504 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3505 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
3507 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3508 %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
3509 ; fadd forces execution domain
3510 %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
3514 define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) {
3515 ; CHECK-LABEL: stack_fold_unpcklps:
3517 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3520 ; CHECK-NEXT: #NO_APP
3521 ; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3522 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3523 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3524 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
3526 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3527 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3528 ; fadd forces execution domain
3529 %3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
3533 define <8 x float> @stack_fold_unpcklps_ymm(<8 x float> %a0, <8 x float> %a1) {
3534 ; CHECK-LABEL: stack_fold_unpcklps_ymm:
3536 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3539 ; CHECK-NEXT: #NO_APP
3540 ; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3541 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
3542 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3543 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
3545 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3546 %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
3547 ; fadd forces execution domain
3548 %3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
3552 define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
3553 ; CHECK-LABEL: stack_fold_xorpd:
3555 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3558 ; CHECK-NEXT: #NO_APP
3559 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3560 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3561 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
3563 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3564 %2 = bitcast <2 x double> %a0 to <2 x i64>
3565 %3 = bitcast <2 x double> %a1 to <2 x i64>
3566 %4 = xor <2 x i64> %2, %3
3567 %5 = bitcast <2 x i64> %4 to <2 x double>
3568 ; fadd forces execution domain
3569 %6 = fadd <2 x double> %5, <double 0x0, double 0x0>
3573 define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
3574 ; CHECK-LABEL: stack_fold_xorpd_ymm:
3576 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3579 ; CHECK-NEXT: #NO_APP
3580 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3581 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3582 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
3584 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3585 %2 = bitcast <4 x double> %a0 to <4 x i64>
3586 %3 = bitcast <4 x double> %a1 to <4 x i64>
3587 %4 = xor <4 x i64> %2, %3
3588 %5 = bitcast <4 x i64> %4 to <4 x double>
3589 ; fadd forces execution domain
3590 %6 = fadd <4 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0>
3594 define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) {
3595 ; CHECK-LABEL: stack_fold_xorps:
3597 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3600 ; CHECK-NEXT: #NO_APP
3601 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3602 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3603 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
3605 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3606 %2 = bitcast <4 x float> %a0 to <2 x i64>
3607 %3 = bitcast <4 x float> %a1 to <2 x i64>
3608 %4 = xor <2 x i64> %2, %3
3609 %5 = bitcast <2 x i64> %4 to <4 x float>
3610 ; fadd forces execution domain
3611 %6 = fadd <4 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0>
3615 define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
3616 ; CHECK-LABEL: stack_fold_xorps_ymm:
3618 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3621 ; CHECK-NEXT: #NO_APP
3622 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3623 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3624 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
3626 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3627 %2 = bitcast <8 x float> %a0 to <4 x i64>
3628 %3 = bitcast <8 x float> %a1 to <4 x i64>
3629 %4 = xor <4 x i64> %2, %3
3630 %5 = bitcast <4 x i64> %4 to <8 x float>
3631 ; fadd forces execution domain
3632 %6 = fadd <8 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
3636 attributes #0 = { "unsafe-fp-math"="false" }
3637 attributes #1 = { "unsafe-fp-math"="true" }