1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+aes,+pclmul < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with side effects that clobbers most registers, we can force a partial register spill of the
10 ; relevant registers and check that the reload (or, for the store tests, the spill) is correctly folded into the instruction.
12 define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
13 ; CHECK-LABEL: stack_fold_aesdec:
15 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
19 ; CHECK-NEXT: vaesdec {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
22 %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
25 declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
27 define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
28 ; CHECK-LABEL: stack_fold_aesdeclast:
30 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
34 ; CHECK-NEXT: vaesdeclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
36 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
37 %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
40 declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
42 define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
43 ; CHECK-LABEL: stack_fold_aesenc:
45 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
49 ; CHECK-NEXT: vaesenc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
51 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
52 %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
55 declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
57 define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
58 ; CHECK-LABEL: stack_fold_aesenclast:
60 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
64 ; CHECK-NEXT: vaesenclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
66 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
67 %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
70 declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
72 define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
73 ; CHECK-LABEL: stack_fold_aesimc:
75 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
79 ; CHECK-NEXT: vaesimc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
81 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
82 %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
85 declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
87 define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
88 ; CHECK-LABEL: stack_fold_aeskeygenassist:
90 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
94 ; CHECK-NEXT: vaeskeygenassist $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
96 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
97 %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
100 declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
102 define <4 x i32> @stack_fold_movd_load(i32 %a0) {
103 ; CHECK-LABEL: stack_fold_movd_load:
105 ; CHECK-NEXT: pushq %rbp
106 ; CHECK-NEXT: .cfi_def_cfa_offset 16
107 ; CHECK-NEXT: pushq %r15
108 ; CHECK-NEXT: .cfi_def_cfa_offset 24
109 ; CHECK-NEXT: pushq %r14
110 ; CHECK-NEXT: .cfi_def_cfa_offset 32
111 ; CHECK-NEXT: pushq %r13
112 ; CHECK-NEXT: .cfi_def_cfa_offset 40
113 ; CHECK-NEXT: pushq %r12
114 ; CHECK-NEXT: .cfi_def_cfa_offset 48
115 ; CHECK-NEXT: pushq %rbx
116 ; CHECK-NEXT: .cfi_def_cfa_offset 56
117 ; CHECK-NEXT: .cfi_offset %rbx, -56
118 ; CHECK-NEXT: .cfi_offset %r12, -48
119 ; CHECK-NEXT: .cfi_offset %r13, -40
120 ; CHECK-NEXT: .cfi_offset %r14, -32
121 ; CHECK-NEXT: .cfi_offset %r15, -24
122 ; CHECK-NEXT: .cfi_offset %rbp, -16
123 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
126 ; CHECK-NEXT: #NO_APP
127 ; CHECK-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
128 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
129 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
130 ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
131 ; CHECK-NEXT: popq %rbx
132 ; CHECK-NEXT: .cfi_def_cfa_offset 48
133 ; CHECK-NEXT: popq %r12
134 ; CHECK-NEXT: .cfi_def_cfa_offset 40
135 ; CHECK-NEXT: popq %r13
136 ; CHECK-NEXT: .cfi_def_cfa_offset 32
137 ; CHECK-NEXT: popq %r14
138 ; CHECK-NEXT: .cfi_def_cfa_offset 24
139 ; CHECK-NEXT: popq %r15
140 ; CHECK-NEXT: .cfi_def_cfa_offset 16
141 ; CHECK-NEXT: popq %rbp
142 ; CHECK-NEXT: .cfi_def_cfa_offset 8
144 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
145 %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
146 ; add forces execution domain
147 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
151 define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
152 ; CHECK-LABEL: stack_fold_movd_store:
154 ; CHECK-NEXT: pushq %rbp
155 ; CHECK-NEXT: .cfi_def_cfa_offset 16
156 ; CHECK-NEXT: pushq %r15
157 ; CHECK-NEXT: .cfi_def_cfa_offset 24
158 ; CHECK-NEXT: pushq %r14
159 ; CHECK-NEXT: .cfi_def_cfa_offset 32
160 ; CHECK-NEXT: pushq %r13
161 ; CHECK-NEXT: .cfi_def_cfa_offset 40
162 ; CHECK-NEXT: pushq %r12
163 ; CHECK-NEXT: .cfi_def_cfa_offset 48
164 ; CHECK-NEXT: pushq %rbx
165 ; CHECK-NEXT: .cfi_def_cfa_offset 56
166 ; CHECK-NEXT: .cfi_offset %rbx, -56
167 ; CHECK-NEXT: .cfi_offset %r12, -48
168 ; CHECK-NEXT: .cfi_offset %r13, -40
169 ; CHECK-NEXT: .cfi_offset %r14, -32
170 ; CHECK-NEXT: .cfi_offset %r15, -24
171 ; CHECK-NEXT: .cfi_offset %rbp, -16
172 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
173 ; CHECK-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
176 ; CHECK-NEXT: #NO_APP
177 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
178 ; CHECK-NEXT: popq %rbx
179 ; CHECK-NEXT: .cfi_def_cfa_offset 48
180 ; CHECK-NEXT: popq %r12
181 ; CHECK-NEXT: .cfi_def_cfa_offset 40
182 ; CHECK-NEXT: popq %r13
183 ; CHECK-NEXT: .cfi_def_cfa_offset 32
184 ; CHECK-NEXT: popq %r14
185 ; CHECK-NEXT: .cfi_def_cfa_offset 24
186 ; CHECK-NEXT: popq %r15
187 ; CHECK-NEXT: .cfi_def_cfa_offset 16
188 ; CHECK-NEXT: popq %rbp
189 ; CHECK-NEXT: .cfi_def_cfa_offset 8
191 ; add forces execution domain
192 %1 = add <4 x i32> %a0, %a1
193 %2 = extractelement <4 x i32> %1, i32 0
194 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
198 define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
199 ; CHECK-LABEL: stack_fold_movq_load:
201 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
204 ; CHECK-NEXT: #NO_APP
205 ; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
206 ; CHECK-NEXT: # xmm0 = mem[0],zero
207 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
208 ; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0
210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
211 %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
212 ; add forces execution domain
213 %3 = add <2 x i64> %2, <i64 1, i64 1>
217 define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
218 ; CHECK-LABEL: stack_fold_movq_store:
220 ; CHECK-NEXT: pushq %rbp
221 ; CHECK-NEXT: .cfi_def_cfa_offset 16
222 ; CHECK-NEXT: pushq %r15
223 ; CHECK-NEXT: .cfi_def_cfa_offset 24
224 ; CHECK-NEXT: pushq %r14
225 ; CHECK-NEXT: .cfi_def_cfa_offset 32
226 ; CHECK-NEXT: pushq %r13
227 ; CHECK-NEXT: .cfi_def_cfa_offset 40
228 ; CHECK-NEXT: pushq %r12
229 ; CHECK-NEXT: .cfi_def_cfa_offset 48
230 ; CHECK-NEXT: pushq %rbx
231 ; CHECK-NEXT: .cfi_def_cfa_offset 56
232 ; CHECK-NEXT: .cfi_offset %rbx, -56
233 ; CHECK-NEXT: .cfi_offset %r12, -48
234 ; CHECK-NEXT: .cfi_offset %r13, -40
235 ; CHECK-NEXT: .cfi_offset %r14, -32
236 ; CHECK-NEXT: .cfi_offset %r15, -24
237 ; CHECK-NEXT: .cfi_offset %rbp, -16
238 ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
239 ; CHECK-NEXT: vmovq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
242 ; CHECK-NEXT: #NO_APP
243 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
244 ; CHECK-NEXT: popq %rbx
245 ; CHECK-NEXT: .cfi_def_cfa_offset 48
246 ; CHECK-NEXT: popq %r12
247 ; CHECK-NEXT: .cfi_def_cfa_offset 40
248 ; CHECK-NEXT: popq %r13
249 ; CHECK-NEXT: .cfi_def_cfa_offset 32
250 ; CHECK-NEXT: popq %r14
251 ; CHECK-NEXT: .cfi_def_cfa_offset 24
252 ; CHECK-NEXT: popq %r15
253 ; CHECK-NEXT: .cfi_def_cfa_offset 16
254 ; CHECK-NEXT: popq %rbp
255 ; CHECK-NEXT: .cfi_def_cfa_offset 8
257 ; add forces execution domain
258 %1 = add <2 x i64> %a0, %a1
259 %2 = extractelement <2 x i64> %1, i32 0
260 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
264 define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
265 ; CHECK-LABEL: stack_fold_mpsadbw:
267 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
270 ; CHECK-NEXT: #NO_APP
271 ; CHECK-NEXT: vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
273 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
274 %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
277 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
279 define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
280 ; CHECK-LABEL: stack_fold_pabsb:
282 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
285 ; CHECK-NEXT: #NO_APP
286 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
289 %2 = icmp sgt <16 x i8> %a0, zeroinitializer
290 %3 = sub <16 x i8> zeroinitializer, %a0
291 %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
295 define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
296 ; CHECK-LABEL: stack_fold_pabsd:
298 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
301 ; CHECK-NEXT: #NO_APP
302 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
305 %2 = icmp sgt <4 x i32> %a0, zeroinitializer
306 %3 = sub <4 x i32> zeroinitializer, %a0
307 %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
311 define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
312 ; CHECK-LABEL: stack_fold_pabsw:
314 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
317 ; CHECK-NEXT: #NO_APP
318 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
321 %2 = icmp sgt <8 x i16> %a0, zeroinitializer
322 %3 = sub <8 x i16> zeroinitializer, %a0
323 %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
327 define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
328 ; CHECK-LABEL: stack_fold_packssdw:
330 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
333 ; CHECK-NEXT: #NO_APP
334 ; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
336 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
337 %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
340 declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
342 define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
343 ; CHECK-LABEL: stack_fold_packsswb:
345 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
348 ; CHECK-NEXT: #NO_APP
349 ; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
351 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
352 %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
355 declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
357 define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
358 ; CHECK-LABEL: stack_fold_packusdw:
360 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
363 ; CHECK-NEXT: #NO_APP
364 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
366 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
367 %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
370 declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
372 define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
373 ; CHECK-LABEL: stack_fold_packuswb:
375 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
378 ; CHECK-NEXT: #NO_APP
379 ; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
381 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
382 %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
385 declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
387 define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
388 ; CHECK-LABEL: stack_fold_paddb:
390 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
393 ; CHECK-NEXT: #NO_APP
394 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
396 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
397 %2 = add <16 x i8> %a0, %a1
401 define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
402 ; CHECK-LABEL: stack_fold_paddd:
404 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
407 ; CHECK-NEXT: #NO_APP
408 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
410 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
411 %2 = add <4 x i32> %a0, %a1
415 define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
416 ; CHECK-LABEL: stack_fold_paddq:
418 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
421 ; CHECK-NEXT: #NO_APP
422 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
424 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
425 %2 = add <2 x i64> %a0, %a1
429 define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
430 ; CHECK-LABEL: stack_fold_paddsb:
432 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
435 ; CHECK-NEXT: #NO_APP
436 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
438 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
439 %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
442 declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
444 define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
445 ; CHECK-LABEL: stack_fold_paddsw:
447 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
450 ; CHECK-NEXT: #NO_APP
451 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
453 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
454 %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
457 declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
459 define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
460 ; CHECK-LABEL: stack_fold_paddusb:
462 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
465 ; CHECK-NEXT: #NO_APP
466 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
468 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
469 %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
472 declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
474 define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
475 ; CHECK-LABEL: stack_fold_paddusw:
477 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
480 ; CHECK-NEXT: #NO_APP
481 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
483 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
484 %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
487 declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
489 define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
490 ; CHECK-LABEL: stack_fold_paddw:
492 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
495 ; CHECK-NEXT: #NO_APP
496 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
498 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
499 %2 = add <8 x i16> %a0, %a1
503 define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
504 ; CHECK-LABEL: stack_fold_palignr:
506 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
509 ; CHECK-NEXT: #NO_APP
510 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
511 ; CHECK-NEXT: # xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
513 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
514 %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
518 define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
519 ; CHECK-LABEL: stack_fold_pand:
521 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
524 ; CHECK-NEXT: #NO_APP
525 ; CHECK-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
526 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
527 ; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
529 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
530 %2 = and <16 x i8> %a0, %a1
531 ; add forces execution domain
532 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
536 define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
537 ; CHECK-LABEL: stack_fold_pandn:
539 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
542 ; CHECK-NEXT: #NO_APP
543 ; CHECK-NEXT: vpandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
544 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
545 ; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
547 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
548 %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
549 %3 = and <16 x i8> %2, %a1
550 ; add forces execution domain
551 %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
555 define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
556 ; CHECK-LABEL: stack_fold_pavgb:
558 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
561 ; CHECK-NEXT: #NO_APP
562 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
564 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
565 %2 = zext <16 x i8> %a0 to <16 x i16>
566 %3 = zext <16 x i8> %a1 to <16 x i16>
567 %4 = add <16 x i16> %2, %3
568 %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
569 %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
570 %7 = trunc <16 x i16> %6 to <16 x i8>
574 define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
575 ; CHECK-LABEL: stack_fold_pavgw:
577 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
580 ; CHECK-NEXT: #NO_APP
581 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
583 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
584 %2 = zext <8 x i16> %a0 to <8 x i32>
585 %3 = zext <8 x i16> %a1 to <8 x i32>
586 %4 = add <8 x i32> %2, %3
587 %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
588 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
589 %7 = trunc <8 x i32> %6 to <8 x i16>
593 define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
594 ; CHECK-LABEL: stack_fold_pblendvb:
596 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
599 ; CHECK-NEXT: #NO_APP
600 ; CHECK-NEXT: vpblendvb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
602 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
603 %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
606 declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
608 define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
609 ; CHECK-LABEL: stack_fold_pblendw:
611 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
614 ; CHECK-NEXT: #NO_APP
615 ; CHECK-NEXT: vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
616 ; CHECK-NEXT: # xmm0 = mem[0,1,2],xmm0[3,4,5,6,7]
618 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
619 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
623 define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
624 ; CHECK-LABEL: stack_fold_pclmulqdq:
626 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
629 ; CHECK-NEXT: #NO_APP
630 ; CHECK-NEXT: vpclmulqdq $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
632 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
633 %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
636 declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
638 define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
639 ; CHECK-LABEL: stack_fold_pcmpeqb:
641 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
644 ; CHECK-NEXT: #NO_APP
645 ; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
647 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
648 %2 = icmp eq <16 x i8> %a0, %a1
649 %3 = sext <16 x i1> %2 to <16 x i8>
653 define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
654 ; CHECK-LABEL: stack_fold_pcmpeqd:
656 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
659 ; CHECK-NEXT: #NO_APP
660 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
662 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
663 %2 = icmp eq <4 x i32> %a0, %a1
664 %3 = sext <4 x i1> %2 to <4 x i32>
668 define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
669 ; CHECK-LABEL: stack_fold_pcmpeqq:
671 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
674 ; CHECK-NEXT: #NO_APP
675 ; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
677 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
678 %2 = icmp eq <2 x i64> %a0, %a1
679 %3 = sext <2 x i1> %2 to <2 x i64>
683 define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
684 ; CHECK-LABEL: stack_fold_pcmpeqw:
686 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
689 ; CHECK-NEXT: #NO_APP
690 ; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
692 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
693 %2 = icmp eq <8 x i16> %a0, %a1
694 %3 = sext <8 x i1> %2 to <8 x i16>
698 define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
699 ; CHECK-LABEL: stack_fold_pcmpestri:
701 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
704 ; CHECK-NEXT: #NO_APP
705 ; CHECK-NEXT: movl $7, %eax
706 ; CHECK-NEXT: movl $7, %edx
707 ; CHECK-NEXT: vpcmpestri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
708 ; CHECK-NEXT: movl %ecx, %eax
710 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
711 %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
714 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
716 define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
717 ; CHECK-LABEL: stack_fold_pcmpestrm:
719 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
722 ; CHECK-NEXT: #NO_APP
723 ; CHECK-NEXT: movl $7, %eax
724 ; CHECK-NEXT: movl $7, %edx
725 ; CHECK-NEXT: vpcmpestrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
727 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
728 %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
731 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
733 define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
734 ; CHECK-LABEL: stack_fold_pcmpgtb:
736 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
739 ; CHECK-NEXT: #NO_APP
740 ; CHECK-NEXT: vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
742 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
743 %2 = icmp sgt <16 x i8> %a0, %a1
744 %3 = sext <16 x i1> %2 to <16 x i8>
748 define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
749 ; CHECK-LABEL: stack_fold_pcmpgtd:
751 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
754 ; CHECK-NEXT: #NO_APP
755 ; CHECK-NEXT: vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
758 %2 = icmp sgt <4 x i32> %a0, %a1
759 %3 = sext <4 x i1> %2 to <4 x i32>
763 define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
764 ; CHECK-LABEL: stack_fold_pcmpgtq:
766 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
769 ; CHECK-NEXT: #NO_APP
770 ; CHECK-NEXT: vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
772 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
773 %2 = icmp sgt <2 x i64> %a0, %a1
774 %3 = sext <2 x i1> %2 to <2 x i64>
; Stack-fold test for vpcmpgtw, written as icmp sgt + sext on <8 x i16>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpcmpgtw.
778 define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
779 ; CHECK-LABEL: stack_fold_pcmpgtw:
781 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
784 ; CHECK-NEXT: #NO_APP
785 ; CHECK-NEXT: vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
788 %2 = icmp sgt <8 x i16> %a0, %a1
789 %3 = sext <8 x i1> %2 to <8 x i16>
; Stack-fold test for vpcmpistri with immediate 7 (llvm.x86.sse42.pcmpistri128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled, its reload folded into vpcmpistri, and the result
; copied from ecx to eax.
793 define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
794 ; CHECK-LABEL: stack_fold_pcmpistri:
796 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
799 ; CHECK-NEXT: #NO_APP
800 ; CHECK-NEXT: vpcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
801 ; CHECK-NEXT: movl %ecx, %eax
803 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
804 %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
807 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
; Stack-fold test for vpcmpistrm with immediate 7 (llvm.x86.sse42.pcmpistrm128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpcmpistrm.
809 define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
810 ; CHECK-LABEL: stack_fold_pcmpistrm:
812 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
815 ; CHECK-NEXT: #NO_APP
816 ; CHECK-NEXT: vpcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
818 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
819 %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
822 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
824 ; TODO: add a stack_fold_pextrb test.
; Stack-fold test for vpextrd: element 1 of (%a0 + %a1) is extracted before an
; inline-asm nop that clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15.
; The FileCheck assertions expect vpextrd to store straight to a stack slot
; (folded spill) and the value to be reloaded into eax after the asm block.
826 define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
827 ; CHECK-LABEL: stack_fold_pextrd:
829 ; CHECK-NEXT: pushq %rbp
830 ; CHECK-NEXT: .cfi_def_cfa_offset 16
831 ; CHECK-NEXT: pushq %r15
832 ; CHECK-NEXT: .cfi_def_cfa_offset 24
833 ; CHECK-NEXT: pushq %r14
834 ; CHECK-NEXT: .cfi_def_cfa_offset 32
835 ; CHECK-NEXT: pushq %r13
836 ; CHECK-NEXT: .cfi_def_cfa_offset 40
837 ; CHECK-NEXT: pushq %r12
838 ; CHECK-NEXT: .cfi_def_cfa_offset 48
839 ; CHECK-NEXT: pushq %rbx
840 ; CHECK-NEXT: .cfi_def_cfa_offset 56
841 ; CHECK-NEXT: .cfi_offset %rbx, -56
842 ; CHECK-NEXT: .cfi_offset %r12, -48
843 ; CHECK-NEXT: .cfi_offset %r13, -40
844 ; CHECK-NEXT: .cfi_offset %r14, -32
845 ; CHECK-NEXT: .cfi_offset %r15, -24
846 ; CHECK-NEXT: .cfi_offset %rbp, -16
847 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
848 ; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
851 ; CHECK-NEXT: #NO_APP
852 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
853 ; CHECK-NEXT: popq %rbx
854 ; CHECK-NEXT: .cfi_def_cfa_offset 48
855 ; CHECK-NEXT: popq %r12
856 ; CHECK-NEXT: .cfi_def_cfa_offset 40
857 ; CHECK-NEXT: popq %r13
858 ; CHECK-NEXT: .cfi_def_cfa_offset 32
859 ; CHECK-NEXT: popq %r14
860 ; CHECK-NEXT: .cfi_def_cfa_offset 24
861 ; CHECK-NEXT: popq %r15
862 ; CHECK-NEXT: .cfi_def_cfa_offset 16
863 ; CHECK-NEXT: popq %rbp
864 ; CHECK-NEXT: .cfi_def_cfa_offset 8
866 ; add forces execution domain
867 %1 = add <4 x i32> %a0, %a1
868 %2 = extractelement <4 x i32> %1, i32 1
869 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
; Stack-fold test for vpextrq: element 1 of %a0 is extracted before an
; inline-asm nop that clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15.
; The FileCheck assertions expect vpextrq to store straight to a stack slot
; (folded spill) and the value to be reloaded into rax after the asm block.
873 define i64 @stack_fold_pextrq(<2 x i64> %a0) {
874 ; CHECK-LABEL: stack_fold_pextrq:
876 ; CHECK-NEXT: pushq %rbp
877 ; CHECK-NEXT: .cfi_def_cfa_offset 16
878 ; CHECK-NEXT: pushq %r15
879 ; CHECK-NEXT: .cfi_def_cfa_offset 24
880 ; CHECK-NEXT: pushq %r14
881 ; CHECK-NEXT: .cfi_def_cfa_offset 32
882 ; CHECK-NEXT: pushq %r13
883 ; CHECK-NEXT: .cfi_def_cfa_offset 40
884 ; CHECK-NEXT: pushq %r12
885 ; CHECK-NEXT: .cfi_def_cfa_offset 48
886 ; CHECK-NEXT: pushq %rbx
887 ; CHECK-NEXT: .cfi_def_cfa_offset 56
888 ; CHECK-NEXT: .cfi_offset %rbx, -56
889 ; CHECK-NEXT: .cfi_offset %r12, -48
890 ; CHECK-NEXT: .cfi_offset %r13, -40
891 ; CHECK-NEXT: .cfi_offset %r14, -32
892 ; CHECK-NEXT: .cfi_offset %r15, -24
893 ; CHECK-NEXT: .cfi_offset %rbp, -16
894 ; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
897 ; CHECK-NEXT: #NO_APP
898 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
899 ; CHECK-NEXT: popq %rbx
900 ; CHECK-NEXT: .cfi_def_cfa_offset 48
901 ; CHECK-NEXT: popq %r12
902 ; CHECK-NEXT: .cfi_def_cfa_offset 40
903 ; CHECK-NEXT: popq %r13
904 ; CHECK-NEXT: .cfi_def_cfa_offset 32
905 ; CHECK-NEXT: popq %r14
906 ; CHECK-NEXT: .cfi_def_cfa_offset 24
907 ; CHECK-NEXT: popq %r15
908 ; CHECK-NEXT: .cfi_def_cfa_offset 16
909 ; CHECK-NEXT: popq %rbp
910 ; CHECK-NEXT: .cfi_def_cfa_offset 8
912 %1 = extractelement <2 x i64> %a0, i32 1
913 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
917 ; TODO: add a stack_fold_pextrw test.
; Stack-fold test for vphaddd (llvm.x86.ssse3.phadd.d.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vphaddd.
919 define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
920 ; CHECK-LABEL: stack_fold_phaddd:
922 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
925 ; CHECK-NEXT: #NO_APP
926 ; CHECK-NEXT: vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
928 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
929 %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
932 declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
; Stack-fold test for vphaddsw (llvm.x86.ssse3.phadd.sw.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vphaddsw.
934 define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
935 ; CHECK-LABEL: stack_fold_phaddsw:
937 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
940 ; CHECK-NEXT: #NO_APP
941 ; CHECK-NEXT: vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
943 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
944 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
947 declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vphaddw (llvm.x86.ssse3.phadd.w.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vphaddw.
949 define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
950 ; CHECK-LABEL: stack_fold_phaddw:
952 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
955 ; CHECK-NEXT: #NO_APP
956 ; CHECK-NEXT: vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
958 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
959 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
962 declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vphminposuw (llvm.x86.sse41.phminposuw), a unary op.
; The inline-asm nop clobbers xmm1-xmm15 and flags; the FileCheck assertions
; expect xmm0 to be spilled and its reload folded into vphminposuw.
964 define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
965 ; CHECK-LABEL: stack_fold_phminposuw:
967 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
970 ; CHECK-NEXT: #NO_APP
971 ; CHECK-NEXT: vphminposuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
973 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
974 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
977 declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
; Stack-fold test for vphsubd (llvm.x86.ssse3.phsub.d.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vphsubd.
979 define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
980 ; CHECK-LABEL: stack_fold_phsubd:
982 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
985 ; CHECK-NEXT: #NO_APP
986 ; CHECK-NEXT: vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
988 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
989 %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
992 declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
; Stack-fold test for vphsubsw (llvm.x86.ssse3.phsub.sw.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vphsubsw.
994 define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
995 ; CHECK-LABEL: stack_fold_phsubsw:
997 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1000 ; CHECK-NEXT: #NO_APP
1001 ; CHECK-NEXT: vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1003 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1004 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
1007 declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vphsubw (llvm.x86.ssse3.phsub.w.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vphsubw.
1009 define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
1010 ; CHECK-LABEL: stack_fold_phsubw:
1012 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1015 ; CHECK-NEXT: #NO_APP
1016 ; CHECK-NEXT: vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1018 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1019 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
1022 declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vpinsrb: the scalar %a1 is inserted at element 1 after an
; inline-asm nop that clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15.
; The FileCheck assertions expect edi to be spilled with a 4-byte movl and the
; reload to be folded into vpinsrb as a memory operand.
1024 define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
1025 ; CHECK-LABEL: stack_fold_pinsrb:
1027 ; CHECK-NEXT: pushq %rbp
1028 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1029 ; CHECK-NEXT: pushq %r15
1030 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1031 ; CHECK-NEXT: pushq %r14
1032 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1033 ; CHECK-NEXT: pushq %r13
1034 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1035 ; CHECK-NEXT: pushq %r12
1036 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1037 ; CHECK-NEXT: pushq %rbx
1038 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1039 ; CHECK-NEXT: .cfi_offset %rbx, -56
1040 ; CHECK-NEXT: .cfi_offset %r12, -48
1041 ; CHECK-NEXT: .cfi_offset %r13, -40
1042 ; CHECK-NEXT: .cfi_offset %r14, -32
1043 ; CHECK-NEXT: .cfi_offset %r15, -24
1044 ; CHECK-NEXT: .cfi_offset %rbp, -16
1045 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1048 ; CHECK-NEXT: #NO_APP
1049 ; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1050 ; CHECK-NEXT: popq %rbx
1051 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1052 ; CHECK-NEXT: popq %r12
1053 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1054 ; CHECK-NEXT: popq %r13
1055 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1056 ; CHECK-NEXT: popq %r14
1057 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1058 ; CHECK-NEXT: popq %r15
1059 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1060 ; CHECK-NEXT: popq %rbp
1061 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1063 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1064 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
; Stack-fold test for vpinsrd: the scalar %a1 is inserted at element 1 after an
; inline-asm nop that clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15.
; The FileCheck assertions expect edi to be spilled with a 4-byte movl and the
; reload to be folded into vpinsrd as a memory operand.
1068 define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
1069 ; CHECK-LABEL: stack_fold_pinsrd:
1071 ; CHECK-NEXT: pushq %rbp
1072 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1073 ; CHECK-NEXT: pushq %r15
1074 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1075 ; CHECK-NEXT: pushq %r14
1076 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1077 ; CHECK-NEXT: pushq %r13
1078 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1079 ; CHECK-NEXT: pushq %r12
1080 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1081 ; CHECK-NEXT: pushq %rbx
1082 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1083 ; CHECK-NEXT: .cfi_offset %rbx, -56
1084 ; CHECK-NEXT: .cfi_offset %r12, -48
1085 ; CHECK-NEXT: .cfi_offset %r13, -40
1086 ; CHECK-NEXT: .cfi_offset %r14, -32
1087 ; CHECK-NEXT: .cfi_offset %r15, -24
1088 ; CHECK-NEXT: .cfi_offset %rbp, -16
1089 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1092 ; CHECK-NEXT: #NO_APP
1093 ; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1094 ; CHECK-NEXT: popq %rbx
1095 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1096 ; CHECK-NEXT: popq %r12
1097 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1098 ; CHECK-NEXT: popq %r13
1099 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1100 ; CHECK-NEXT: popq %r14
1101 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1102 ; CHECK-NEXT: popq %r15
1103 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1104 ; CHECK-NEXT: popq %rbp
1105 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1107 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1108 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
; Stack-fold test for vpinsrq: the scalar %a1 is inserted at element 1 after an
; inline-asm nop that clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15.
; The FileCheck assertions expect rdi to be spilled with an 8-byte movq and the
; reload to be folded into vpinsrq as a memory operand.
1112 define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
1113 ; CHECK-LABEL: stack_fold_pinsrq:
1115 ; CHECK-NEXT: pushq %rbp
1116 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1117 ; CHECK-NEXT: pushq %r15
1118 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1119 ; CHECK-NEXT: pushq %r14
1120 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1121 ; CHECK-NEXT: pushq %r13
1122 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1123 ; CHECK-NEXT: pushq %r12
1124 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1125 ; CHECK-NEXT: pushq %rbx
1126 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1127 ; CHECK-NEXT: .cfi_offset %rbx, -56
1128 ; CHECK-NEXT: .cfi_offset %r12, -48
1129 ; CHECK-NEXT: .cfi_offset %r13, -40
1130 ; CHECK-NEXT: .cfi_offset %r14, -32
1131 ; CHECK-NEXT: .cfi_offset %r15, -24
1132 ; CHECK-NEXT: .cfi_offset %rbp, -16
1133 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1136 ; CHECK-NEXT: #NO_APP
1137 ; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
1138 ; CHECK-NEXT: popq %rbx
1139 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1140 ; CHECK-NEXT: popq %r12
1141 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1142 ; CHECK-NEXT: popq %r13
1143 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1144 ; CHECK-NEXT: popq %r14
1145 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1146 ; CHECK-NEXT: popq %r15
1147 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1148 ; CHECK-NEXT: popq %rbp
1149 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1151 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1152 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
; Stack-fold test for vpinsrw: the scalar %a1 is inserted at element 1 after an
; inline-asm nop that clobbers rax, rbx, rcx, rdx, rsi, rdi, rbp and r8-r15.
; The FileCheck assertions expect edi to be spilled with a 4-byte movl and the
; reload to be folded into vpinsrw as a memory operand.
1156 define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
1157 ; CHECK-LABEL: stack_fold_pinsrw:
1159 ; CHECK-NEXT: pushq %rbp
1160 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1161 ; CHECK-NEXT: pushq %r15
1162 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1163 ; CHECK-NEXT: pushq %r14
1164 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1165 ; CHECK-NEXT: pushq %r13
1166 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1167 ; CHECK-NEXT: pushq %r12
1168 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1169 ; CHECK-NEXT: pushq %rbx
1170 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1171 ; CHECK-NEXT: .cfi_offset %rbx, -56
1172 ; CHECK-NEXT: .cfi_offset %r12, -48
1173 ; CHECK-NEXT: .cfi_offset %r13, -40
1174 ; CHECK-NEXT: .cfi_offset %r14, -32
1175 ; CHECK-NEXT: .cfi_offset %r15, -24
1176 ; CHECK-NEXT: .cfi_offset %rbp, -16
1177 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1180 ; CHECK-NEXT: #NO_APP
1181 ; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1182 ; CHECK-NEXT: popq %rbx
1183 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1184 ; CHECK-NEXT: popq %r12
1185 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1186 ; CHECK-NEXT: popq %r13
1187 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1188 ; CHECK-NEXT: popq %r14
1189 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1190 ; CHECK-NEXT: popq %r15
1191 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1192 ; CHECK-NEXT: popq %rbp
1193 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1196 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
; Stack-fold test for vpmaddubsw (llvm.x86.ssse3.pmadd.ub.sw.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaddubsw.
1200 define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
1201 ; CHECK-LABEL: stack_fold_pmaddubsw:
1203 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1206 ; CHECK-NEXT: #NO_APP
1207 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1210 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
1213 declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
; Stack-fold test for vpmaddwd (llvm.x86.sse2.pmadd.wd).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaddwd.
1215 define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
1216 ; CHECK-LABEL: stack_fold_pmaddwd:
1218 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1221 ; CHECK-NEXT: #NO_APP
1222 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1224 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1225 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1228 declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vpmaxsb, written as icmp sgt + select on <16 x i8>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaxsb.
1230 define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
1231 ; CHECK-LABEL: stack_fold_pmaxsb:
1233 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1236 ; CHECK-NEXT: #NO_APP
1237 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1239 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1240 %2 = icmp sgt <16 x i8> %a0, %a1
1241 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
; Stack-fold test for vpmaxsd, written as icmp sgt + select on <4 x i32>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaxsd.
1245 define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
1246 ; CHECK-LABEL: stack_fold_pmaxsd:
1248 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1251 ; CHECK-NEXT: #NO_APP
1252 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1255 %2 = icmp sgt <4 x i32> %a0, %a1
1256 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
; Stack-fold test for vpmaxsw, written as icmp sgt + select on <8 x i16>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaxsw.
1260 define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
1261 ; CHECK-LABEL: stack_fold_pmaxsw:
1263 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1266 ; CHECK-NEXT: #NO_APP
1267 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1270 %2 = icmp sgt <8 x i16> %a0, %a1
1271 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
; Stack-fold test for vpmaxub, written as icmp ugt + select on <16 x i8>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaxub.
1275 define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
1276 ; CHECK-LABEL: stack_fold_pmaxub:
1278 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1281 ; CHECK-NEXT: #NO_APP
1282 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1284 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1285 %2 = icmp ugt <16 x i8> %a0, %a1
1286 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
; Stack-fold test for vpmaxud, written as icmp ugt + select on <4 x i32>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaxud.
1290 define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
1291 ; CHECK-LABEL: stack_fold_pmaxud:
1293 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1296 ; CHECK-NEXT: #NO_APP
1297 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1299 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1300 %2 = icmp ugt <4 x i32> %a0, %a1
1301 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
; Stack-fold test for vpmaxuw, written as icmp ugt + select on <8 x i16>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmaxuw.
1305 define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
1306 ; CHECK-LABEL: stack_fold_pmaxuw:
1308 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1311 ; CHECK-NEXT: #NO_APP
1312 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1314 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1315 %2 = icmp ugt <8 x i16> %a0, %a1
1316 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
; Stack-fold test for vpminsb, written as icmp slt + select on <16 x i8>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpminsb.
1320 define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
1321 ; CHECK-LABEL: stack_fold_pminsb:
1323 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1326 ; CHECK-NEXT: #NO_APP
1327 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1329 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1330 %2 = icmp slt <16 x i8> %a0, %a1
1331 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
; Stack-fold test for vpminsd, written as icmp slt + select on <4 x i32>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpminsd.
1335 define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
1336 ; CHECK-LABEL: stack_fold_pminsd:
1338 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1341 ; CHECK-NEXT: #NO_APP
1342 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1344 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1345 %2 = icmp slt <4 x i32> %a0, %a1
1346 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
; Stack-fold test for vpminsw, written as icmp slt + select on <8 x i16>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpminsw.
1350 define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
1351 ; CHECK-LABEL: stack_fold_pminsw:
1353 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1356 ; CHECK-NEXT: #NO_APP
1357 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1359 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1360 %2 = icmp slt <8 x i16> %a0, %a1
1361 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
; Stack-fold test for vpminub, written as icmp ult + select on <16 x i8>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpminub.
1365 define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
1366 ; CHECK-LABEL: stack_fold_pminub:
1368 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1371 ; CHECK-NEXT: #NO_APP
1372 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1374 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1375 %2 = icmp ult <16 x i8> %a0, %a1
1376 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
; Stack-fold test for vpminud, written as icmp ult + select on <4 x i32>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpminud.
1380 define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
1381 ; CHECK-LABEL: stack_fold_pminud:
1383 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1386 ; CHECK-NEXT: #NO_APP
1387 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1389 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1390 %2 = icmp ult <4 x i32> %a0, %a1
1391 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
; Stack-fold test for vpminuw, written as icmp ult + select on <8 x i16>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpminuw.
1395 define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
1396 ; CHECK-LABEL: stack_fold_pminuw:
1398 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1401 ; CHECK-NEXT: #NO_APP
1402 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1404 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1405 %2 = icmp ult <8 x i16> %a0, %a1
1406 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
; Stack-fold test for vpmuldq, written in generic IR rather than an intrinsic.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmuldq.
1410 define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
1411 ; CHECK-LABEL: stack_fold_pmuldq:
1413 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1416 ; CHECK-NEXT: #NO_APP
1417 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1419 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1420 %2 = bitcast <4 x i32> %a0 to <2 x i64>
1421 %3 = bitcast <4 x i32> %a1 to <2 x i64>
; shl followed by ashr by 32 sign-extends the low 32 bits of each i64 lane.
1422 %4 = shl <2 x i64> %2, <i64 32, i64 32>
1423 %5 = ashr <2 x i64> %4, <i64 32, i64 32>
1424 %6 = shl <2 x i64> %3, <i64 32, i64 32>
1425 %7 = ashr <2 x i64> %6, <i64 32, i64 32>
1426 %8 = mul <2 x i64> %5, %7
; Stack-fold test for vpmulhrsw (llvm.x86.ssse3.pmul.hr.sw.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmulhrsw.
1430 define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
1431 ; CHECK-LABEL: stack_fold_pmulhrsw:
1433 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1436 ; CHECK-NEXT: #NO_APP
1437 ; CHECK-NEXT: vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1439 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1440 %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
1443 declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vpmulhuw (llvm.x86.sse2.pmulhu.w).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmulhuw.
1445 define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
1446 ; CHECK-LABEL: stack_fold_pmulhuw:
1448 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1451 ; CHECK-NEXT: #NO_APP
1452 ; CHECK-NEXT: vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1454 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1455 %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
1458 declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vpmulhw (llvm.x86.sse2.pmulh.w).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmulhw.
1460 define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
1461 ; CHECK-LABEL: stack_fold_pmulhw:
1463 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1466 ; CHECK-NEXT: #NO_APP
1467 ; CHECK-NEXT: vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1470 %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
1473 declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold test for vpmulld, written as a plain mul on <4 x i32>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmulld.
1475 define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
1476 ; CHECK-LABEL: stack_fold_pmulld:
1478 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1481 ; CHECK-NEXT: #NO_APP
1482 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1484 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1485 %2 = mul <4 x i32> %a0, %a1
; Stack-fold test for vpmullw, written as a plain mul on <8 x i16>.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmullw.
1489 define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
1490 ; CHECK-LABEL: stack_fold_pmullw:
1492 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1495 ; CHECK-NEXT: #NO_APP
1496 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1498 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1499 %2 = mul <8 x i16> %a0, %a1
; Stack-fold test for vpmuludq, written in generic IR rather than an intrinsic.
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpmuludq.
1503 define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
1504 ; CHECK-LABEL: stack_fold_pmuludq:
1506 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1509 ; CHECK-NEXT: #NO_APP
1510 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1513 %2 = bitcast <4 x i32> %a0 to <2 x i64>
1514 %3 = bitcast <4 x i32> %a1 to <2 x i64>
; Masking with 4294967295 (0xFFFFFFFF) keeps only the low 32 bits of each i64 lane.
1515 %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295>
1516 %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295>
1517 %6 = mul <2 x i64> %4, %5
; Stack-fold test for vpor. The inline-asm nop clobbers xmm2-xmm15 and flags;
; the FileCheck assertions expect xmm1 to be spilled and its reload folded into
; vpor. The trailing add of 1 shows up in the expected output as
; vpcmpeqd + vpsubb.
1521 define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
1522 ; CHECK-LABEL: stack_fold_por:
1524 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1527 ; CHECK-NEXT: #NO_APP
1528 ; CHECK-NEXT: vpor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1529 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1530 ; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1533 %2 = or <16 x i8> %a0, %a1
1534 ; add forces execution domain
1535 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; Stack-fold test for vpsadbw (llvm.x86.sse2.psad.bw).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpsadbw.
1539 define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
1540 ; CHECK-LABEL: stack_fold_psadbw:
1542 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1545 ; CHECK-NEXT: #NO_APP
1546 ; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1548 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1549 %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
1552 declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
; Stack-fold test for vpshufb (llvm.x86.ssse3.pshuf.b.128).
; The inline-asm nop clobbers xmm2-xmm15 and flags; the FileCheck assertions
; expect xmm1 to be spilled and its reload folded into vpshufb.
1554 define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
1555 ; CHECK-LABEL: stack_fold_pshufb:
1557 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1560 ; CHECK-NEXT: #NO_APP
1561 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1563 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1564 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
1567 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
; Stack-fold test for vpshufd with the element-reversing mask <3,2,1,0>.
; The inline-asm nop clobbers xmm1-xmm15 and flags; the FileCheck assertions
; expect xmm0 to be spilled and its reload folded into vpshufd $27. The
; trailing add of 1 shows up in the expected output as vpcmpeqd + vpsubd.
1569 define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
1570 ; CHECK-LABEL: stack_fold_pshufd:
1572 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1575 ; CHECK-NEXT: #NO_APP
1576 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1577 ; CHECK-NEXT: # xmm0 = mem[3,2,1,0]
1578 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1579 ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
1581 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1582 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1583 ; add forces execution domain
1584 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
; Stack-fold test for vpshufhw: the shuffle mask keeps elements 0-3 in place
; and permutes the upper four to <7,6,4,4>. The inline-asm nop clobbers
; xmm1-xmm15 and flags; the FileCheck assertions expect xmm0 to be spilled and
; its reload folded into vpshufhw $11.
1588 define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
1589 ; CHECK-LABEL: stack_fold_pshufhw:
1591 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1594 ; CHECK-NEXT: #NO_APP
1595 ; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1596 ; CHECK-NEXT: # xmm0 = mem[0,1,2,3,7,6,4,4]
1598 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1599 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
; Stack-fold check for VPSHUFLW (unary): xmm1-xmm15 are clobbered, so the input
; is spilled from xmm0 and reloaded directly by vpshuflw. The mask reverses
; words 0-3 (immediate $27) and leaves words 4-7 in place.
1603 define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
1604 ; CHECK-LABEL: stack_fold_pshuflw:
1606 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1609 ; CHECK-NEXT: #NO_APP
1610 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1611 ; CHECK-NEXT: # xmm0 = mem[3,2,1,0,4,5,6,7]
1613 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1614 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
; Stack-fold check for VPSIGNB: with xmm2-xmm15 clobbered by the asm, the second
; operand is spilled from xmm1 and must be folded into vpsignb as a memory
; operand.
1618 define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
1619 ; CHECK-LABEL: stack_fold_psignb:
1621 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1624 ; CHECK-NEXT: #NO_APP
1625 ; CHECK-NEXT: vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1627 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1628 %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
1631 declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
; Stack-fold check for VPSIGND: with xmm2-xmm15 clobbered by the asm, the second
; operand is spilled from xmm1 and must be folded into vpsignd as a memory
; operand.
1633 define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
1634 ; CHECK-LABEL: stack_fold_psignd:
1636 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1639 ; CHECK-NEXT: #NO_APP
1640 ; CHECK-NEXT: vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1642 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1643 %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
1646 declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
; Stack-fold check for VPSIGNW: with xmm2-xmm15 clobbered by the asm, the second
; operand is spilled from xmm1 and must be folded into vpsignw as a memory
; operand.
1648 define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
1649 ; CHECK-LABEL: stack_fold_psignw:
1651 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1654 ; CHECK-NEXT: #NO_APP
1655 ; CHECK-NEXT: vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1657 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1658 %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
1661 declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold check for VPSLLD (vector-operand form of the psll.d intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1663 define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
1664 ; CHECK-LABEL: stack_fold_pslld:
1666 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1669 ; CHECK-NEXT: #NO_APP
1670 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1673 %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
1676 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
; Stack-fold check for VPSLLQ (vector-operand form of the psll.q intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1678 define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
1679 ; CHECK-LABEL: stack_fold_psllq:
1681 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1684 ; CHECK-NEXT: #NO_APP
1685 ; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1687 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1688 %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
1691 declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
; Stack-fold check for VPSLLW (vector-operand form of the psll.w intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1693 define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
1694 ; CHECK-LABEL: stack_fold_psllw:
1696 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1699 ; CHECK-NEXT: #NO_APP
1700 ; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1702 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1703 %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
1706 declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold check for VPSRAD (vector-operand form of the psra.d intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1708 define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
1709 ; CHECK-LABEL: stack_fold_psrad:
1711 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1714 ; CHECK-NEXT: #NO_APP
1715 ; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1717 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1718 %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
1721 declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
; Stack-fold check for VPSRAW (vector-operand form of the psra.w intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1723 define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
1724 ; CHECK-LABEL: stack_fold_psraw:
1726 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1729 ; CHECK-NEXT: #NO_APP
1730 ; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1732 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1733 %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
1736 declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold check for VPSRLD (vector-operand form of the psrl.d intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1738 define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
1739 ; CHECK-LABEL: stack_fold_psrld:
1741 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1744 ; CHECK-NEXT: #NO_APP
1745 ; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1747 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1748 %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
1751 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
; Stack-fold check for VPSRLQ (vector-operand form of the psrl.q intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1753 define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
1754 ; CHECK-LABEL: stack_fold_psrlq:
1756 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1759 ; CHECK-NEXT: #NO_APP
1760 ; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1762 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1763 %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
1766 declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
; Stack-fold check for VPSRLW (vector-operand form of the psrl.w intrinsic):
; the second intrinsic operand is spilled from xmm1 and is expected to be
; reloaded as the instruction's memory operand.
1768 define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
1769 ; CHECK-LABEL: stack_fold_psrlw:
1771 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1774 ; CHECK-NEXT: #NO_APP
1775 ; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1777 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1778 %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
1781 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold check for VPSUBB, written as a plain IR `sub <16 x i8>`: the
; subtrahend %a1 is spilled from xmm1 and must be folded into vpsubb as its
; memory operand.
1783 define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
1784 ; CHECK-LABEL: stack_fold_psubb:
1786 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1789 ; CHECK-NEXT: #NO_APP
1790 ; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1792 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1793 %2 = sub <16 x i8> %a0, %a1
; Stack-fold check for VPSUBD, written as a plain IR `sub <4 x i32>`: the
; subtrahend %a1 is spilled from xmm1 and must be folded into vpsubd as its
; memory operand.
1797 define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
1798 ; CHECK-LABEL: stack_fold_psubd:
1800 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1803 ; CHECK-NEXT: #NO_APP
1804 ; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1806 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1807 %2 = sub <4 x i32> %a0, %a1
; Stack-fold check for VPSUBQ, written as a plain IR `sub <2 x i64>`: the
; subtrahend %a1 is spilled from xmm1 and must be folded into vpsubq as its
; memory operand.
1811 define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
1812 ; CHECK-LABEL: stack_fold_psubq:
1814 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1817 ; CHECK-NEXT: #NO_APP
1818 ; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1821 %2 = sub <2 x i64> %a0, %a1
; Stack-fold check for VPSUBSB, expressed through the generic
; llvm.ssub.sat.v16i8 intrinsic: %a1 is spilled from xmm1 and must be folded
; into vpsubsb as its memory operand.
1825 define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
1826 ; CHECK-LABEL: stack_fold_psubsb:
1828 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1831 ; CHECK-NEXT: #NO_APP
1832 ; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1834 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1835 %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
1838 declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
; Stack-fold check for VPSUBSW, expressed through the generic
; llvm.ssub.sat.v8i16 intrinsic: %a1 is spilled from xmm1 and must be folded
; into vpsubsw as its memory operand.
1840 define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
1841 ; CHECK-LABEL: stack_fold_psubsw:
1843 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1846 ; CHECK-NEXT: #NO_APP
1847 ; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1849 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1850 %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
1853 declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold check for VPSUBUSB, expressed through the generic
; llvm.usub.sat.v16i8 intrinsic: %a1 is spilled from xmm1 and must be folded
; into vpsubusb as its memory operand.
1855 define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
1856 ; CHECK-LABEL: stack_fold_psubusb:
1858 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1861 ; CHECK-NEXT: #NO_APP
1862 ; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1864 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1865 %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
1868 declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
; Stack-fold check for VPSUBUSW, expressed through the generic
; llvm.usub.sat.v8i16 intrinsic: %a1 is spilled from xmm1 and must be folded
; into vpsubusw as its memory operand.
1870 define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
1871 ; CHECK-LABEL: stack_fold_psubusw:
1873 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1876 ; CHECK-NEXT: #NO_APP
1877 ; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1879 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1880 %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
1883 declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
; Stack-fold check for VPSUBW, written as a plain IR `sub <8 x i16>`: the
; subtrahend %a1 is spilled from xmm1 and must be folded into vpsubw as its
; memory operand.
1885 define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
1886 ; CHECK-LABEL: stack_fold_psubw:
1888 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1891 ; CHECK-NEXT: #NO_APP
1892 ; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1894 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1895 %2 = sub <8 x i16> %a0, %a1
; Stack-fold check for VPTEST (128-bit) via the ptestc intrinsic: %a1 is
; spilled from xmm1 and must be folded into vptest as its memory operand. The
; i32 result is expected to be materialised with xorl + setb, i.e. from the
; carry flag.
1899 define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
1900 ; CHECK-LABEL: stack_fold_ptest:
1902 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1905 ; CHECK-NEXT: #NO_APP
1906 ; CHECK-NEXT: xorl %eax, %eax
1907 ; CHECK-NEXT: vptest {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1908 ; CHECK-NEXT: setb %al
1910 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1911 %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
1914 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
; Stack-fold check for the 256-bit VPTEST via the avx.ptestc.256 intrinsic:
; %a1 is spilled as a full 32-byte ymm1 (vmovups) and must be folded into
; vptest as its memory operand. The result comes from setb, and a vzeroupper
; is expected after the YMM use.
1916 define i32 @stack_fold_ptest_ymm(<4 x i64> %a0, <4 x i64> %a1) {
1917 ; CHECK-LABEL: stack_fold_ptest_ymm:
1919 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1922 ; CHECK-NEXT: #NO_APP
1923 ; CHECK-NEXT: xorl %eax, %eax
1924 ; CHECK-NEXT: vptest {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
1925 ; CHECK-NEXT: setb %al
1926 ; CHECK-NEXT: vzeroupper
1928 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1929 %2 = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
1932 declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
; Stack-fold check for VPUNPCKHBW: the shuffle mask interleaves bytes 8-15 of
; %a0 with bytes 8-15 of %a1; %a1 is spilled from xmm1 and must be supplied to
; vpunpckhbw as a folded memory operand.
1934 define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
1935 ; CHECK-LABEL: stack_fold_punpckhbw:
1937 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1940 ; CHECK-NEXT: #NO_APP
1941 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1942 ; CHECK-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
1944 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1945 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
; Stack-fold check for VPUNPCKHDQ: the mask <2,6,3,7> interleaves the upper two
; dwords of %a0 and %a1; %a1 is spilled from xmm1 and must be folded into the
; unpack as a memory operand. The trailing add of 1 is matched as vpcmpeqd
; (all-ones) followed by vpsubd.
1949 define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
1950 ; CHECK-LABEL: stack_fold_punpckhdq:
1952 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1955 ; CHECK-NEXT: #NO_APP
1956 ; CHECK-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1957 ; CHECK-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
1958 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1959 ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
1961 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1962 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
1963 ; add forces execution domain
1964 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
; Stack-fold check for VPUNPCKHQDQ: the mask <1,3> pairs the upper qword of %a0
; with the upper qword of %a1; %a1 is spilled from xmm1 and must be folded into
; the unpack as a memory operand. The trailing add of 1 is matched as vpcmpeqd
; (all-ones) followed by vpsubq.
1968 define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
1969 ; CHECK-LABEL: stack_fold_punpckhqdq:
1971 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1974 ; CHECK-NEXT: #NO_APP
1975 ; CHECK-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1976 ; CHECK-NEXT: # xmm0 = xmm0[1],mem[1]
1977 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1978 ; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0
1980 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1981 %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
1982 ; add forces execution domain
1983 %3 = add <2 x i64> %2, <i64 1, i64 1>
; Stack-fold check for VPUNPCKHWD: the shuffle mask interleaves words 4-7 of
; %a0 with words 4-7 of %a1; %a1 is spilled from xmm1 and must be supplied to
; vpunpckhwd as a folded memory operand.
1987 define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
1988 ; CHECK-LABEL: stack_fold_punpckhwd:
1990 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1993 ; CHECK-NEXT: #NO_APP
1994 ; CHECK-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1995 ; CHECK-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
1997 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1998 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
; Stack-fold check for VPUNPCKLBW: the shuffle mask interleaves bytes 0-7 of
; %a0 with bytes 0-7 of %a1; %a1 is spilled from xmm1 and must be supplied to
; vpunpcklbw as a folded memory operand.
2002 define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
2003 ; CHECK-LABEL: stack_fold_punpcklbw:
2005 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2008 ; CHECK-NEXT: #NO_APP
2009 ; CHECK-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2010 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2012 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2013 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
; Stack-fold check for VPUNPCKLDQ: the mask <0,4,1,5> interleaves the lower two
; dwords of %a0 and %a1; %a1 is spilled from xmm1 and must be folded into the
; unpack as a memory operand. The trailing add of 1 is matched as vpcmpeqd
; (all-ones) followed by vpsubd.
2017 define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
2018 ; CHECK-LABEL: stack_fold_punpckldq:
2020 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2023 ; CHECK-NEXT: #NO_APP
2024 ; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2025 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
2026 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2027 ; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
2029 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2030 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2031 ; add forces execution domain
2032 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
; Stack-fold check for VPUNPCKLQDQ: the mask <0,2> pairs the lower qword of %a0
; with the lower qword of %a1; %a1 is spilled from xmm1 and must be folded into
; the unpack as a memory operand. The trailing add of 1 is matched as vpcmpeqd
; (all-ones) followed by vpsubq.
2036 define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
2037 ; CHECK-LABEL: stack_fold_punpcklqdq:
2039 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2042 ; CHECK-NEXT: #NO_APP
2043 ; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2044 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
2045 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2046 ; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0
2048 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2049 %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
2050 ; add forces execution domain
2051 %3 = add <2 x i64> %2, <i64 1, i64 1>
; Stack-fold check for VPUNPCKLWD: the shuffle mask interleaves words 0-3 of
; %a0 with words 0-3 of %a1; %a1 is spilled from xmm1 and must be supplied to
; vpunpcklwd as a folded memory operand.
2055 define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
2056 ; CHECK-LABEL: stack_fold_punpcklwd:
2058 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2061 ; CHECK-NEXT: #NO_APP
2062 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2063 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
2065 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2066 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
; Stack-fold check for VPXOR: %a1 is spilled from xmm1 and must be folded into
; vpxor as its memory operand. The trailing byte-wise add of 1 is matched as
; vpcmpeqd (all-ones) followed by vpsubb; per the inline comment it is there to
; force the execution domain.
2070 define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
2071 ; CHECK-LABEL: stack_fold_pxor:
2073 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2076 ; CHECK-NEXT: #NO_APP
2077 ; CHECK-NEXT: vpxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2078 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2079 ; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
2081 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2082 %2 = xor <16 x i8> %a0, %a1
2083 ; add forces execution domain
2084 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>