1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
12 define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
13 ; CHECK-LABEL: stack_fold_broadcastsd_ymm:
15 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
19 ; CHECK-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
20 ; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
21 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
23 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
24 %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
25 ; fadd forces execution domain
26 %3 = fadd <4 x double> %2, <double 0x1, double 0x0, double 0x0, double 0x0>
30 define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
31 ; CHECK-LABEL: stack_fold_broadcastss:
33 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
37 ; CHECK-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
38 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
39 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
41 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
42 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
43 ; fadd forces execution domain
44 %3 = fadd <4 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0>
48 define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
49 ; CHECK-LABEL: stack_fold_broadcastss_ymm:
51 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
55 ; CHECK-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
56 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
57 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
59 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
60 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
61 ; fadd forces execution domain
62 %3 = fadd <8 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
66 define <4 x i32> @stack_fold_extracti128(<8 x i16> %a0, <8 x i32> %a1) {
67 ; CHECK-LABEL: stack_fold_extracti128:
69 ; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
70 ; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
74 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
75 ; CHECK-NEXT: vzeroupper
77 ; zext forces execution domain
78 %t1 = zext <8 x i16> %a0 to <8 x i32>
79 %t2 = shufflevector <8 x i32> %t1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
80 %t3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
84 define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) {
85 ; CHECK-LABEL: stack_fold_inserti128:
87 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
88 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
92 ; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
93 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
95 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
96 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
97 ; add forces execution domain
98 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
102 define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
103 ; CHECK-LABEL: stack_fold_mpsadbw:
105 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
108 ; CHECK-NEXT: #NO_APP
109 ; CHECK-NEXT: vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
111 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
112 %2 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
115 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
117 define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) {
118 ; CHECK-LABEL: stack_fold_pabsb:
120 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
123 ; CHECK-NEXT: #NO_APP
124 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
126 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
127 %2 = icmp sgt <32 x i8> %a0, zeroinitializer
128 %3 = sub <32 x i8> zeroinitializer, %a0
129 %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3
133 define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) {
134 ; CHECK-LABEL: stack_fold_pabsd:
136 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
139 ; CHECK-NEXT: #NO_APP
140 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
142 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
143 %2 = icmp sgt <8 x i32> %a0, zeroinitializer
144 %3 = sub <8 x i32> zeroinitializer, %a0
145 %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3
149 define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) {
150 ; CHECK-LABEL: stack_fold_pabsw:
152 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
155 ; CHECK-NEXT: #NO_APP
156 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
158 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
159 %2 = icmp sgt <16 x i16> %a0, zeroinitializer
160 %3 = sub <16 x i16> zeroinitializer, %a0
161 %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3
165 define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
166 ; CHECK-LABEL: stack_fold_packssdw:
168 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
171 ; CHECK-NEXT: #NO_APP
172 ; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
174 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
175 %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
178 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
180 define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
181 ; CHECK-LABEL: stack_fold_packsswb:
183 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
186 ; CHECK-NEXT: #NO_APP
187 ; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
189 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
190 %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
193 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
195 define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
196 ; CHECK-LABEL: stack_fold_packusdw:
198 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
201 ; CHECK-NEXT: #NO_APP
202 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
204 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
205 %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
208 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
210 define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
211 ; CHECK-LABEL: stack_fold_packuswb:
213 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
216 ; CHECK-NEXT: #NO_APP
217 ; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
219 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
220 %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
223 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
225 define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) {
226 ; CHECK-LABEL: stack_fold_paddb:
228 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
231 ; CHECK-NEXT: #NO_APP
232 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
234 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
235 %2 = add <32 x i8> %a0, %a1
239 define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) {
240 ; CHECK-LABEL: stack_fold_paddd:
242 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
245 ; CHECK-NEXT: #NO_APP
246 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
248 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
249 %2 = add <8 x i32> %a0, %a1
253 define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) {
254 ; CHECK-LABEL: stack_fold_paddq:
256 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
259 ; CHECK-NEXT: #NO_APP
260 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
262 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
263 %2 = add <4 x i64> %a0, %a1
267 define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) {
268 ; CHECK-LABEL: stack_fold_paddsb:
270 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
273 ; CHECK-NEXT: #NO_APP
274 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
276 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
277 %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
280 declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
282 define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) {
283 ; CHECK-LABEL: stack_fold_paddsw:
285 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
288 ; CHECK-NEXT: #NO_APP
289 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
291 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
292 %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
295 declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
297 define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) {
298 ; CHECK-LABEL: stack_fold_paddusb:
300 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
303 ; CHECK-NEXT: #NO_APP
304 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
306 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
307 %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
310 declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
312 define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) {
313 ; CHECK-LABEL: stack_fold_paddusw:
315 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
318 ; CHECK-NEXT: #NO_APP
319 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
321 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
322 %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
325 declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
327 define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) {
328 ; CHECK-LABEL: stack_fold_paddw:
330 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
333 ; CHECK-NEXT: #NO_APP
334 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
336 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
337 %2 = add <16 x i16> %a0, %a1
341 define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
342 ; CHECK-LABEL: stack_fold_palignr:
344 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
347 ; CHECK-NEXT: #NO_APP
348 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
349 ; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
351 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
352 %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
356 define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) {
357 ; CHECK-LABEL: stack_fold_pand:
359 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
362 ; CHECK-NEXT: #NO_APP
363 ; CHECK-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
364 ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
366 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
367 %2 = and <32 x i8> %a0, %a1
368 ; add forces execution domain
369 %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
373 define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) {
374 ; CHECK-LABEL: stack_fold_pandn:
376 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
379 ; CHECK-NEXT: #NO_APP
380 ; CHECK-NEXT: vpandn {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
381 ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
383 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
384 %2 = xor <32 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
385 %3 = and <32 x i8> %2, %a1
386 ; add forces execution domain
387 %4 = add <32 x i8> %3, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
391 define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) {
392 ; CHECK-LABEL: stack_fold_pavgb:
394 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
397 ; CHECK-NEXT: #NO_APP
398 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
400 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
401 %2 = zext <32 x i8> %a0 to <32 x i16>
402 %3 = zext <32 x i8> %a1 to <32 x i16>
403 %4 = add <32 x i16> %2, %3
404 %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
405 %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
406 %7 = trunc <32 x i16> %6 to <32 x i8>
410 define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) {
411 ; CHECK-LABEL: stack_fold_pavgw:
413 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
416 ; CHECK-NEXT: #NO_APP
417 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
419 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
420 %2 = zext <16 x i16> %a0 to <16 x i32>
421 %3 = zext <16 x i16> %a1 to <16 x i32>
422 %4 = add <16 x i32> %2, %3
423 %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
424 %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
425 %7 = trunc <16 x i32> %6 to <16 x i16>
429 define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
430 ; CHECK-LABEL: stack_fold_pblendd:
432 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
435 ; CHECK-NEXT: #NO_APP
436 ; CHECK-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
437 ; CHECK-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
438 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
439 ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
441 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
442 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
443 ; add forces execution domain
444 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
448 define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
449 ; CHECK-LABEL: stack_fold_pblendd_ymm:
451 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
454 ; CHECK-NEXT: #NO_APP
455 ; CHECK-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
456 ; CHECK-NEXT: # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7]
457 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
459 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
460 %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
461 ; add forces execution domain
462 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
466 define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
467 ; CHECK-LABEL: stack_fold_pblendvb:
469 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
472 ; CHECK-NEXT: #NO_APP
473 ; CHECK-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
476 %2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0)
479 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
481 define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
482 ; CHECK-LABEL: stack_fold_pblendw:
484 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
487 ; CHECK-NEXT: #NO_APP
488 ; CHECK-NEXT: vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
489 ; CHECK-NEXT: # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7],mem[8,9,10],ymm0[11,12,13,14,15]
491 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
492 %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
496 define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) {
497 ; CHECK-LABEL: stack_fold_pbroadcastb:
499 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
502 ; CHECK-NEXT: #NO_APP
503 ; CHECK-NEXT: vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
505 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
506 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
510 define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) {
511 ; CHECK-LABEL: stack_fold_pbroadcastb_ymm:
513 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
516 ; CHECK-NEXT: #NO_APP
517 ; CHECK-NEXT: vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
519 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
520 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> zeroinitializer
524 define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) {
525 ; CHECK-LABEL: stack_fold_pbroadcastd:
527 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
530 ; CHECK-NEXT: #NO_APP
531 ; CHECK-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
532 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
534 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
535 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer
536 ; add forces execution domain
537 %3 = add <4 x i32> %2, <i32 2, i32 1, i32 1, i32 1>
541 define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) {
542 ; CHECK-LABEL: stack_fold_pbroadcastd_ymm:
544 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
547 ; CHECK-NEXT: #NO_APP
548 ; CHECK-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
549 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
551 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
552 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> zeroinitializer
553 ; add forces execution domain
554 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
558 define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) {
559 ; CHECK-LABEL: stack_fold_pbroadcastq:
561 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
564 ; CHECK-NEXT: #NO_APP
565 ; CHECK-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
566 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
568 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
569 %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
570 ; add forces execution domain
571 %3 = add <2 x i64> %2, <i64 2, i64 1>
575 define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) {
576 ; CHECK-LABEL: stack_fold_pbroadcastq_ymm:
578 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
581 ; CHECK-NEXT: #NO_APP
582 ; CHECK-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
583 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
585 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
586 %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
587 ; add forces execution domain
588 %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
592 define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) {
593 ; CHECK-LABEL: stack_fold_pbroadcastw:
595 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
598 ; CHECK-NEXT: #NO_APP
599 ; CHECK-NEXT: vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
601 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
602 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
606 define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) {
607 ; CHECK-LABEL: stack_fold_pbroadcastw_ymm:
609 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
612 ; CHECK-NEXT: #NO_APP
613 ; CHECK-NEXT: vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
615 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
616 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> zeroinitializer
620 define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) {
621 ; CHECK-LABEL: stack_fold_pcmpeqb:
623 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
626 ; CHECK-NEXT: #NO_APP
627 ; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
629 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
630 %2 = icmp eq <32 x i8> %a0, %a1
631 %3 = sext <32 x i1> %2 to <32 x i8>
635 define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) {
636 ; CHECK-LABEL: stack_fold_pcmpeqd:
638 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
641 ; CHECK-NEXT: #NO_APP
642 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
644 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
645 %2 = icmp eq <8 x i32> %a0, %a1
646 %3 = sext <8 x i1> %2 to <8 x i32>
650 define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) {
651 ; CHECK-LABEL: stack_fold_pcmpeqq:
653 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
656 ; CHECK-NEXT: #NO_APP
657 ; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
660 %2 = icmp eq <4 x i64> %a0, %a1
661 %3 = sext <4 x i1> %2 to <4 x i64>
665 define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) {
666 ; CHECK-LABEL: stack_fold_pcmpeqw:
668 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
671 ; CHECK-NEXT: #NO_APP
672 ; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
674 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
675 %2 = icmp eq <16 x i16> %a0, %a1
676 %3 = sext <16 x i1> %2 to <16 x i16>
680 define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) {
681 ; CHECK-LABEL: stack_fold_pcmpgtb:
683 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
686 ; CHECK-NEXT: #NO_APP
687 ; CHECK-NEXT: vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
689 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
690 %2 = icmp sgt <32 x i8> %a0, %a1
691 %3 = sext <32 x i1> %2 to <32 x i8>
695 define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) {
696 ; CHECK-LABEL: stack_fold_pcmpgtd:
698 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
701 ; CHECK-NEXT: #NO_APP
702 ; CHECK-NEXT: vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
704 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
705 %2 = icmp sgt <8 x i32> %a0, %a1
706 %3 = sext <8 x i1> %2 to <8 x i32>
710 define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) {
711 ; CHECK-LABEL: stack_fold_pcmpgtq:
713 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
716 ; CHECK-NEXT: #NO_APP
717 ; CHECK-NEXT: vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
719 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
720 %2 = icmp sgt <4 x i64> %a0, %a1
721 %3 = sext <4 x i1> %2 to <4 x i64>
725 define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) {
726 ; CHECK-LABEL: stack_fold_pcmpgtw:
728 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
731 ; CHECK-NEXT: #NO_APP
732 ; CHECK-NEXT: vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
734 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
735 %2 = icmp sgt <16 x i16> %a0, %a1
736 %3 = sext <16 x i1> %2 to <16 x i16>
740 define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) {
741 ; CHECK-LABEL: stack_fold_perm2i128:
743 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
746 ; CHECK-NEXT: #NO_APP
747 ; CHECK-NEXT: vperm2i128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
748 ; CHECK-NEXT: # ymm0 = ymm0[2,3],mem[0,1]
749 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
751 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
752 %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
753 ; add forces execution domain
754 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
758 define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
759 ; CHECK-LABEL: stack_fold_permd:
761 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
764 ; CHECK-NEXT: #NO_APP
765 ; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
766 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
768 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
769 %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
770 ; add forces execution domain
771 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
774 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
776 define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
777 ; CHECK-LABEL: stack_fold_permpd:
779 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
782 ; CHECK-NEXT: #NO_APP
783 ; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
784 ; CHECK-NEXT: # ymm0 = mem[3,2,2,3]
785 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
786 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
789 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
790 ; fadd forces execution domain
791 %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
795 define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
796 ; CHECK-LABEL: stack_fold_permps:
798 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
801 ; CHECK-NEXT: #NO_APP
802 ; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
804 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
805 %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
808 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
810 define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
811 ; CHECK-LABEL: stack_fold_permq:
813 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
816 ; CHECK-NEXT: #NO_APP
817 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
818 ; CHECK-NEXT: # ymm0 = mem[3,2,2,3]
819 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
821 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
822 %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
823 ; add forces execution domain
824 %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
828 define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) {
829 ; CHECK-LABEL: stack_fold_phaddd:
831 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
834 ; CHECK-NEXT: #NO_APP
835 ; CHECK-NEXT: vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
837 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
838 %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
841 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
843 define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) {
844 ; CHECK-LABEL: stack_fold_phaddsw:
846 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
849 ; CHECK-NEXT: #NO_APP
850 ; CHECK-NEXT: vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
852 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
853 %2 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
856 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
858 define <16 x i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) {
859 ; CHECK-LABEL: stack_fold_phaddw:
861 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
864 ; CHECK-NEXT: #NO_APP
865 ; CHECK-NEXT: vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
868 %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
871 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
873 define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) {
874 ; CHECK-LABEL: stack_fold_phsubd:
876 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
879 ; CHECK-NEXT: #NO_APP
880 ; CHECK-NEXT: vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
882 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
883 %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
886 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
888 define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) {
889 ; CHECK-LABEL: stack_fold_phsubsw:
891 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
894 ; CHECK-NEXT: #NO_APP
895 ; CHECK-NEXT: vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
897 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
898 %2 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
901 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
903 define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) {
904 ; CHECK-LABEL: stack_fold_phsubw:
906 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
909 ; CHECK-NEXT: #NO_APP
910 ; CHECK-NEXT: vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
912 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
913 %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
916 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
918 define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) {
919 ; CHECK-LABEL: stack_fold_pmaddubsw:
921 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
924 ; CHECK-NEXT: #NO_APP
925 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
927 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
928 %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
931 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
933 define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) {
934 ; CHECK-LABEL: stack_fold_pmaddwd:
936 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
939 ; CHECK-NEXT: #NO_APP
940 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
942 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
943 %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
946 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
948 define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
949 ; CHECK-LABEL: stack_fold_pmaxsb:
951 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
954 ; CHECK-NEXT: #NO_APP
955 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
958 %2 = icmp sgt <32 x i8> %a0, %a1
959 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
963 define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
964 ; CHECK-LABEL: stack_fold_pmaxsd:
966 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
969 ; CHECK-NEXT: #NO_APP
970 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
972 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
973 %2 = icmp sgt <8 x i32> %a0, %a1
974 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
978 define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) {
979 ; CHECK-LABEL: stack_fold_pmaxsw:
981 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
984 ; CHECK-NEXT: #NO_APP
985 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
987 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
988 %2 = icmp sgt <16 x i16> %a0, %a1
989 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
993 define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) {
994 ; CHECK-LABEL: stack_fold_pmaxub:
996 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
999 ; CHECK-NEXT: #NO_APP
1000 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1002 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1003 %2 = icmp ugt <32 x i8> %a0, %a1
1004 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1008 define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
1009 ; CHECK-LABEL: stack_fold_pmaxud:
1011 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1014 ; CHECK-NEXT: #NO_APP
1015 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1017 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1018 %2 = icmp ugt <8 x i32> %a0, %a1
1019 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1023 define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
1024 ; CHECK-LABEL: stack_fold_pmaxuw:
1026 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1029 ; CHECK-NEXT: #NO_APP
1030 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1032 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1033 %2 = icmp ugt <16 x i16> %a0, %a1
1034 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1038 define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
1039 ; CHECK-LABEL: stack_fold_pminsb:
1041 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1044 ; CHECK-NEXT: #NO_APP
1045 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1047 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1048 %2 = icmp slt <32 x i8> %a0, %a1
1049 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1053 define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
1054 ; CHECK-LABEL: stack_fold_pminsd:
1056 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1059 ; CHECK-NEXT: #NO_APP
1060 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1062 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1063 %2 = icmp slt <8 x i32> %a0, %a1
1064 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1068 define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) {
1069 ; CHECK-LABEL: stack_fold_pminsw:
1071 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1074 ; CHECK-NEXT: #NO_APP
1075 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1077 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1078 %2 = icmp slt <16 x i16> %a0, %a1
1079 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1083 define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) {
1084 ; CHECK-LABEL: stack_fold_pminub:
1086 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1089 ; CHECK-NEXT: #NO_APP
1090 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1092 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1093 %2 = icmp ult <32 x i8> %a0, %a1
1094 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
1098 define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) {
1099 ; CHECK-LABEL: stack_fold_pminud:
1101 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1104 ; CHECK-NEXT: #NO_APP
1105 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1107 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1108 %2 = icmp ult <8 x i32> %a0, %a1
1109 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
1113 define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
1114 ; CHECK-LABEL: stack_fold_pminuw:
1116 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1119 ; CHECK-NEXT: #NO_APP
1120 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1122 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1123 %2 = icmp ult <16 x i16> %a0, %a1
1124 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
1128 define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
1129 ; CHECK-LABEL: stack_fold_pmovsxbd:
1131 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1134 ; CHECK-NEXT: #NO_APP
1135 ; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1137 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1138 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1139 %3 = sext <8 x i8> %2 to <8 x i32>
1143 define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
1144 ; CHECK-LABEL: stack_fold_pmovsxbq:
1146 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1149 ; CHECK-NEXT: #NO_APP
1150 ; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1152 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1153 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1154 %3 = sext <4 x i8> %2 to <4 x i64>
1158 define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
1159 ; CHECK-LABEL: stack_fold_pmovsxbw:
1161 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1164 ; CHECK-NEXT: #NO_APP
1165 ; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1167 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1168 %2 = sext <16 x i8> %a0 to <16 x i16>
1172 define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
1173 ; CHECK-LABEL: stack_fold_pmovsxdq:
1175 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1178 ; CHECK-NEXT: #NO_APP
1179 ; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1181 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1182 %2 = sext <4 x i32> %a0 to <4 x i64>
1186 define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
1187 ; CHECK-LABEL: stack_fold_pmovsxwd:
1189 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1192 ; CHECK-NEXT: #NO_APP
1193 ; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1196 %2 = sext <8 x i16> %a0 to <8 x i32>
1200 define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
1201 ; CHECK-LABEL: stack_fold_pmovsxwq:
1203 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1206 ; CHECK-NEXT: #NO_APP
1207 ; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1210 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1211 %3 = sext <4 x i16> %2 to <4 x i64>
1215 define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
1216 ; CHECK-LABEL: stack_fold_pmovzxbd:
1218 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1221 ; CHECK-NEXT: #NO_APP
1222 ; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1223 ; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1225 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1226 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1227 %3 = zext <8 x i8> %2 to <8 x i32>
1231 define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
1232 ; CHECK-LABEL: stack_fold_pmovzxbq:
1234 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1237 ; CHECK-NEXT: #NO_APP
1238 ; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1239 ; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
1241 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1242 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1243 %3 = zext <4 x i8> %2 to <4 x i64>
1247 define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
1248 ; CHECK-LABEL: stack_fold_pmovzxbw:
1250 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1253 ; CHECK-NEXT: #NO_APP
1254 ; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1255 ; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1257 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1258 %2 = zext <16 x i8> %a0 to <16 x i16>
1262 define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
1263 ; CHECK-LABEL: stack_fold_pmovzxdq:
1265 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1268 ; CHECK-NEXT: #NO_APP
1269 ; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1270 ; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1272 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1273 %2 = zext <4 x i32> %a0 to <4 x i64>
1277 define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
1278 ; CHECK-LABEL: stack_fold_pmovzxwd:
1280 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1283 ; CHECK-NEXT: #NO_APP
1284 ; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1285 ; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1287 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1288 %2 = zext <8 x i16> %a0 to <8 x i32>
1292 define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
1293 ; CHECK-LABEL: stack_fold_pmovzxwq:
1295 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1298 ; CHECK-NEXT: #NO_APP
1299 ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
1300 ; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1303 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1304 %3 = zext <4 x i16> %2 to <4 x i64>
1308 define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
1309 ; CHECK-LABEL: stack_fold_pmuldq:
1311 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1314 ; CHECK-NEXT: #NO_APP
1315 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1317 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1318 %2 = bitcast <8 x i32> %a0 to <4 x i64>
1319 %3 = bitcast <8 x i32> %a1 to <4 x i64>
1320 %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32>
1321 %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
1322 %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32>
1323 %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
1324 %8 = mul <4 x i64> %5, %7
1328 define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) {
1329 ; CHECK-LABEL: stack_fold_pmulhrsw:
1331 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1334 ; CHECK-NEXT: #NO_APP
1335 ; CHECK-NEXT: vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1337 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1338 %2 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
1341 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
1343 define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) {
1344 ; CHECK-LABEL: stack_fold_pmulhuw:
1346 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1349 ; CHECK-NEXT: #NO_APP
1350 ; CHECK-NEXT: vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1353 %2 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
1356 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
1358 define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) {
1359 ; CHECK-LABEL: stack_fold_pmulhw:
1361 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1364 ; CHECK-NEXT: #NO_APP
1365 ; CHECK-NEXT: vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1367 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1368 %2 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
1371 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
1373 define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) {
1374 ; CHECK-LABEL: stack_fold_pmulld:
1376 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1379 ; CHECK-NEXT: #NO_APP
1380 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1382 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1383 %2 = mul <8 x i32> %a0, %a1
1387 define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) {
1388 ; CHECK-LABEL: stack_fold_pmullw:
1390 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1393 ; CHECK-NEXT: #NO_APP
1394 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1396 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1397 %2 = mul <16 x i16> %a0, %a1
1401 define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) {
1402 ; CHECK-LABEL: stack_fold_pmuludq:
1404 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1407 ; CHECK-NEXT: #NO_APP
1408 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1410 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1411 %2 = bitcast <8 x i32> %a0 to <4 x i64>
1412 %3 = bitcast <8 x i32> %a1 to <4 x i64>
1413 %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1414 %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1415 %6 = mul <4 x i64> %4, %5
1419 define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) {
1420 ; CHECK-LABEL: stack_fold_por:
1422 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1425 ; CHECK-NEXT: #NO_APP
1426 ; CHECK-NEXT: vpor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1427 ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1429 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1430 %2 = or <32 x i8> %a0, %a1
1431 ; add forces execution domain
1432 %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1436 define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) {
1437 ; CHECK-LABEL: stack_fold_psadbw:
1439 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1442 ; CHECK-NEXT: #NO_APP
1443 ; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1445 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1446 %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
1449 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
1451 define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) {
1452 ; CHECK-LABEL: stack_fold_pshufb:
1454 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1457 ; CHECK-NEXT: #NO_APP
1458 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1460 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1461 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
1464 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
1466 define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
1467 ; CHECK-LABEL: stack_fold_pshufd:
1469 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1472 ; CHECK-NEXT: #NO_APP
1473 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1474 ; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4]
1475 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1477 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1478 %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
1479 ; add forces execution domain
1480 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1484 define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) {
1485 ; CHECK-LABEL: stack_fold_vpshufhw:
1487 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1490 ; CHECK-NEXT: #NO_APP
1491 ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1492 ; CHECK-NEXT: # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
1494 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1495 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
1499 define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) {
1500 ; CHECK-LABEL: stack_fold_vpshuflw:
1502 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1505 ; CHECK-NEXT: #NO_APP
1506 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1507 ; CHECK-NEXT: # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
1509 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1510 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
1514 define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) {
1515 ; CHECK-LABEL: stack_fold_psignb:
1517 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1520 ; CHECK-NEXT: #NO_APP
1521 ; CHECK-NEXT: vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1523 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1524 %2 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
1527 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
1529 define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) {
1530 ; CHECK-LABEL: stack_fold_psignd:
1532 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1535 ; CHECK-NEXT: #NO_APP
1536 ; CHECK-NEXT: vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1538 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1539 %2 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
1542 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
1544 define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) {
1545 ; CHECK-LABEL: stack_fold_psignw:
1547 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1550 ; CHECK-NEXT: #NO_APP
1551 ; CHECK-NEXT: vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1553 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1554 %2 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
1557 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
1559 define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) {
1560 ; CHECK-LABEL: stack_fold_pslld:
1562 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1565 ; CHECK-NEXT: #NO_APP
1566 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1568 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1569 %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
1572 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
1574 define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) {
1575 ; CHECK-LABEL: stack_fold_psllq:
1577 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1580 ; CHECK-NEXT: #NO_APP
1581 ; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1583 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1584 %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
1587 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
1589 define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
1590 ; CHECK-LABEL: stack_fold_psllvd:
1592 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1595 ; CHECK-NEXT: #NO_APP
1596 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1598 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1599 %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
1602 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
1604 define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1605 ; CHECK-LABEL: stack_fold_psllvd_ymm:
1607 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1610 ; CHECK-NEXT: #NO_APP
1611 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1613 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1614 %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
1617 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
1619 define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
1620 ; CHECK-LABEL: stack_fold_psllvq:
1622 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1625 ; CHECK-NEXT: #NO_APP
1626 ; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1628 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1629 %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
1632 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
1634 define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1635 ; CHECK-LABEL: stack_fold_psllvq_ymm:
1637 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1640 ; CHECK-NEXT: #NO_APP
1641 ; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1643 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1644 %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
1647 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
1649 define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) {
1650 ; CHECK-LABEL: stack_fold_psllw:
1652 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1655 ; CHECK-NEXT: #NO_APP
1656 ; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1658 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1659 %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
1662 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
1664 define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) {
1665 ; CHECK-LABEL: stack_fold_psrad:
1667 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1670 ; CHECK-NEXT: #NO_APP
1671 ; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1673 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1674 %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
1677 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
1679 define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
1680 ; CHECK-LABEL: stack_fold_psravd:
1682 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1685 ; CHECK-NEXT: #NO_APP
1686 ; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1688 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1689 %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
1692 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
1694 define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1695 ; CHECK-LABEL: stack_fold_psravd_ymm:
1697 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1700 ; CHECK-NEXT: #NO_APP
1701 ; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1703 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1704 %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
1707 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
1709 define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) {
1710 ; CHECK-LABEL: stack_fold_psraw:
1712 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1715 ; CHECK-NEXT: #NO_APP
1716 ; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1718 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1719 %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
1722 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
1724 define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) {
1725 ; CHECK-LABEL: stack_fold_psrld:
1727 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1730 ; CHECK-NEXT: #NO_APP
1731 ; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1733 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1734 %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
1737 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
1739 define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) {
1740 ; CHECK-LABEL: stack_fold_psrlq:
1742 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745 ; CHECK-NEXT: #NO_APP
1746 ; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1748 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1749 %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
1752 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
1754 define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
1755 ; CHECK-LABEL: stack_fold_psrlvd:
1757 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1760 ; CHECK-NEXT: #NO_APP
1761 ; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1763 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1764 %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
1767 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
1769 define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
1770 ; CHECK-LABEL: stack_fold_psrlvd_ymm:
1772 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1775 ; CHECK-NEXT: #NO_APP
1776 ; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1778 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1779 %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
1782 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
1784 define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
1785 ; CHECK-LABEL: stack_fold_psrlvq:
1787 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1790 ; CHECK-NEXT: #NO_APP
1791 ; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1793 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1794 %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
1797 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
1799 define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1800 ; CHECK-LABEL: stack_fold_psrlvq_ymm:
1802 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1805 ; CHECK-NEXT: #NO_APP
1806 ; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1808 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1809 %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
1812 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
1814 define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) {
1815 ; CHECK-LABEL: stack_fold_psrlw:
1817 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1820 ; CHECK-NEXT: #NO_APP
1821 ; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
1823 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1824 %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
1827 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
1829 define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) {
1830 ; CHECK-LABEL: stack_fold_psubb:
1832 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1835 ; CHECK-NEXT: #NO_APP
1836 ; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1838 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1839 %2 = sub <32 x i8> %a0, %a1
1843 define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) {
1844 ; CHECK-LABEL: stack_fold_psubd:
1846 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1849 ; CHECK-NEXT: #NO_APP
1850 ; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1852 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1853 %2 = sub <8 x i32> %a0, %a1
1857 define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) {
1858 ; CHECK-LABEL: stack_fold_psubq:
1860 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1863 ; CHECK-NEXT: #NO_APP
1864 ; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1866 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1867 %2 = sub <4 x i64> %a0, %a1
1871 define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) {
1872 ; CHECK-LABEL: stack_fold_psubsb:
1874 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1877 ; CHECK-NEXT: #NO_APP
1878 ; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1880 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1881 %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
1884 declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
1886 define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
1887 ; CHECK-LABEL: stack_fold_psubsw:
1889 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1892 ; CHECK-NEXT: #NO_APP
1893 ; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1895 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1896 %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
1899 declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
1901 define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
1902 ; CHECK-LABEL: stack_fold_psubusb:
1904 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1907 ; CHECK-NEXT: #NO_APP
1908 ; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1910 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1911 %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
1914 declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
1916 define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
1917 ; CHECK-LABEL: stack_fold_psubusw:
1919 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1922 ; CHECK-NEXT: #NO_APP
1923 ; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1925 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1926 %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
1929 declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
1931 define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {
1932 ; CHECK-LABEL: stack_fold_psubw:
1934 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1937 ; CHECK-NEXT: #NO_APP
1938 ; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1940 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1941 %2 = sub <16 x i16> %a0, %a1
1945 define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) {
1946 ; CHECK-LABEL: stack_fold_punpckhbw:
1948 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1951 ; CHECK-NEXT: #NO_APP
1952 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1953 ; CHECK-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
1955 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1956 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
1960 define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) {
1961 ; CHECK-LABEL: stack_fold_punpckhdq:
1963 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1966 ; CHECK-NEXT: #NO_APP
1967 ; CHECK-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1968 ; CHECK-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
1969 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1971 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1972 %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
1973 ; add forces execution domain
1974 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1978 define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) {
1979 ; CHECK-LABEL: stack_fold_punpckhqdq:
1981 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1984 ; CHECK-NEXT: #NO_APP
1985 ; CHECK-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
1986 ; CHECK-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
1987 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1989 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1990 %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1991 ; add forces execution domain
1992 %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
1996 define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
1997 ; CHECK-LABEL: stack_fold_punpckhwd:
1999 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2002 ; CHECK-NEXT: #NO_APP
2003 ; CHECK-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2004 ; CHECK-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
2006 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2007 %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2011 define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
2012 ; CHECK-LABEL: stack_fold_punpcklbw:
2014 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2017 ; CHECK-NEXT: #NO_APP
2018 ; CHECK-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2019 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
2021 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2022 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
2026 define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) {
2027 ; CHECK-LABEL: stack_fold_punpckldq:
2029 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2032 ; CHECK-NEXT: #NO_APP
2033 ; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2034 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
2035 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2037 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2038 %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
2039 ; add forces execution domain
2040 %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2044 define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) {
2045 ; CHECK-LABEL: stack_fold_punpcklqdq:
2047 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2050 ; CHECK-NEXT: #NO_APP
2051 ; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2052 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
2053 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2055 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2056 %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
2057 ; add forces execution domain
2058 %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
2062 define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
2063 ; CHECK-LABEL: stack_fold_punpcklwd:
2065 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2068 ; CHECK-NEXT: #NO_APP
2069 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2070 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
2072 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2073 %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
2077 define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) {
2078 ; CHECK-LABEL: stack_fold_pxor:
2080 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2083 ; CHECK-NEXT: #NO_APP
2084 ; CHECK-NEXT: vpxor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2085 ; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2087 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2088 %2 = xor <32 x i8> %a0, %a1
2089 ; add forces execution domain
2090 %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>