1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+aes,+crc32,+pclmul < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
12 define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
13 ; CHECK-LABEL: stack_fold_aesdec:
15 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
19 ; CHECK-NEXT: aesdec {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
22 %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
25 declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
27 define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
28 ; CHECK-LABEL: stack_fold_aesdeclast:
30 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
34 ; CHECK-NEXT: aesdeclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
36 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
37 %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
40 declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
42 define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
43 ; CHECK-LABEL: stack_fold_aesenc:
45 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
49 ; CHECK-NEXT: aesenc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
51 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
52 %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
55 declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
57 define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
58 ; CHECK-LABEL: stack_fold_aesenclast:
60 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
64 ; CHECK-NEXT: aesenclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
66 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
67 %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
70 declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
72 define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
73 ; CHECK-LABEL: stack_fold_aesimc:
75 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
79 ; CHECK-NEXT: aesimc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
81 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
82 %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
85 declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
87 define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
88 ; CHECK-LABEL: stack_fold_aeskeygenassist:
90 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
94 ; CHECK-NEXT: aeskeygenassist $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
96 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
97 %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
100 declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
102 define i32 @stack_fold_crc32_32_8(i32 %a0, i8 %a1) {
103 ; CHECK-LABEL: stack_fold_crc32_32_8:
105 ; CHECK-NEXT: pushq %rbp
106 ; CHECK-NEXT: .cfi_def_cfa_offset 16
107 ; CHECK-NEXT: pushq %r15
108 ; CHECK-NEXT: .cfi_def_cfa_offset 24
109 ; CHECK-NEXT: pushq %r14
110 ; CHECK-NEXT: .cfi_def_cfa_offset 32
111 ; CHECK-NEXT: pushq %r13
112 ; CHECK-NEXT: .cfi_def_cfa_offset 40
113 ; CHECK-NEXT: pushq %r12
114 ; CHECK-NEXT: .cfi_def_cfa_offset 48
115 ; CHECK-NEXT: pushq %rbx
116 ; CHECK-NEXT: .cfi_def_cfa_offset 56
117 ; CHECK-NEXT: .cfi_offset %rbx, -56
118 ; CHECK-NEXT: .cfi_offset %r12, -48
119 ; CHECK-NEXT: .cfi_offset %r13, -40
120 ; CHECK-NEXT: .cfi_offset %r14, -32
121 ; CHECK-NEXT: .cfi_offset %r15, -24
122 ; CHECK-NEXT: .cfi_offset %rbp, -16
123 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
124 ; CHECK-NEXT: movl %edi, %eax
127 ; CHECK-NEXT: #NO_APP
128 ; CHECK-NEXT: crc32b {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
129 ; CHECK-NEXT: popq %rbx
130 ; CHECK-NEXT: .cfi_def_cfa_offset 48
131 ; CHECK-NEXT: popq %r12
132 ; CHECK-NEXT: .cfi_def_cfa_offset 40
133 ; CHECK-NEXT: popq %r13
134 ; CHECK-NEXT: .cfi_def_cfa_offset 32
135 ; CHECK-NEXT: popq %r14
136 ; CHECK-NEXT: .cfi_def_cfa_offset 24
137 ; CHECK-NEXT: popq %r15
138 ; CHECK-NEXT: .cfi_def_cfa_offset 16
139 ; CHECK-NEXT: popq %rbp
140 ; CHECK-NEXT: .cfi_def_cfa_offset 8
142 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
143 %2 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
146 declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
148 define i32 @stack_fold_crc32_32_16(i32 %a0, i16 %a1) {
149 ; CHECK-LABEL: stack_fold_crc32_32_16:
151 ; CHECK-NEXT: pushq %rbp
152 ; CHECK-NEXT: .cfi_def_cfa_offset 16
153 ; CHECK-NEXT: pushq %r15
154 ; CHECK-NEXT: .cfi_def_cfa_offset 24
155 ; CHECK-NEXT: pushq %r14
156 ; CHECK-NEXT: .cfi_def_cfa_offset 32
157 ; CHECK-NEXT: pushq %r13
158 ; CHECK-NEXT: .cfi_def_cfa_offset 40
159 ; CHECK-NEXT: pushq %r12
160 ; CHECK-NEXT: .cfi_def_cfa_offset 48
161 ; CHECK-NEXT: pushq %rbx
162 ; CHECK-NEXT: .cfi_def_cfa_offset 56
163 ; CHECK-NEXT: .cfi_offset %rbx, -56
164 ; CHECK-NEXT: .cfi_offset %r12, -48
165 ; CHECK-NEXT: .cfi_offset %r13, -40
166 ; CHECK-NEXT: .cfi_offset %r14, -32
167 ; CHECK-NEXT: .cfi_offset %r15, -24
168 ; CHECK-NEXT: .cfi_offset %rbp, -16
169 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
170 ; CHECK-NEXT: movl %edi, %eax
173 ; CHECK-NEXT: #NO_APP
174 ; CHECK-NEXT: crc32w {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
175 ; CHECK-NEXT: popq %rbx
176 ; CHECK-NEXT: .cfi_def_cfa_offset 48
177 ; CHECK-NEXT: popq %r12
178 ; CHECK-NEXT: .cfi_def_cfa_offset 40
179 ; CHECK-NEXT: popq %r13
180 ; CHECK-NEXT: .cfi_def_cfa_offset 32
181 ; CHECK-NEXT: popq %r14
182 ; CHECK-NEXT: .cfi_def_cfa_offset 24
183 ; CHECK-NEXT: popq %r15
184 ; CHECK-NEXT: .cfi_def_cfa_offset 16
185 ; CHECK-NEXT: popq %rbp
186 ; CHECK-NEXT: .cfi_def_cfa_offset 8
188 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
189 %2 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
192 declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
194 define i32 @stack_fold_crc32_32_32(i32 %a0, i32 %a1) {
195 ; CHECK-LABEL: stack_fold_crc32_32_32:
197 ; CHECK-NEXT: pushq %rbp
198 ; CHECK-NEXT: .cfi_def_cfa_offset 16
199 ; CHECK-NEXT: pushq %r15
200 ; CHECK-NEXT: .cfi_def_cfa_offset 24
201 ; CHECK-NEXT: pushq %r14
202 ; CHECK-NEXT: .cfi_def_cfa_offset 32
203 ; CHECK-NEXT: pushq %r13
204 ; CHECK-NEXT: .cfi_def_cfa_offset 40
205 ; CHECK-NEXT: pushq %r12
206 ; CHECK-NEXT: .cfi_def_cfa_offset 48
207 ; CHECK-NEXT: pushq %rbx
208 ; CHECK-NEXT: .cfi_def_cfa_offset 56
209 ; CHECK-NEXT: .cfi_offset %rbx, -56
210 ; CHECK-NEXT: .cfi_offset %r12, -48
211 ; CHECK-NEXT: .cfi_offset %r13, -40
212 ; CHECK-NEXT: .cfi_offset %r14, -32
213 ; CHECK-NEXT: .cfi_offset %r15, -24
214 ; CHECK-NEXT: .cfi_offset %rbp, -16
215 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
216 ; CHECK-NEXT: movl %edi, %eax
219 ; CHECK-NEXT: #NO_APP
220 ; CHECK-NEXT: crc32l {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
221 ; CHECK-NEXT: popq %rbx
222 ; CHECK-NEXT: .cfi_def_cfa_offset 48
223 ; CHECK-NEXT: popq %r12
224 ; CHECK-NEXT: .cfi_def_cfa_offset 40
225 ; CHECK-NEXT: popq %r13
226 ; CHECK-NEXT: .cfi_def_cfa_offset 32
227 ; CHECK-NEXT: popq %r14
228 ; CHECK-NEXT: .cfi_def_cfa_offset 24
229 ; CHECK-NEXT: popq %r15
230 ; CHECK-NEXT: .cfi_def_cfa_offset 16
231 ; CHECK-NEXT: popq %rbp
232 ; CHECK-NEXT: .cfi_def_cfa_offset 8
234 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
235 %2 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
238 declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
240 define i64 @stack_fold_crc32_64_64(i64 %a0, i64 %a1) {
241 ; CHECK-LABEL: stack_fold_crc32_64_64:
243 ; CHECK-NEXT: pushq %rbp
244 ; CHECK-NEXT: .cfi_def_cfa_offset 16
245 ; CHECK-NEXT: pushq %r15
246 ; CHECK-NEXT: .cfi_def_cfa_offset 24
247 ; CHECK-NEXT: pushq %r14
248 ; CHECK-NEXT: .cfi_def_cfa_offset 32
249 ; CHECK-NEXT: pushq %r13
250 ; CHECK-NEXT: .cfi_def_cfa_offset 40
251 ; CHECK-NEXT: pushq %r12
252 ; CHECK-NEXT: .cfi_def_cfa_offset 48
253 ; CHECK-NEXT: pushq %rbx
254 ; CHECK-NEXT: .cfi_def_cfa_offset 56
255 ; CHECK-NEXT: .cfi_offset %rbx, -56
256 ; CHECK-NEXT: .cfi_offset %r12, -48
257 ; CHECK-NEXT: .cfi_offset %r13, -40
258 ; CHECK-NEXT: .cfi_offset %r14, -32
259 ; CHECK-NEXT: .cfi_offset %r15, -24
260 ; CHECK-NEXT: .cfi_offset %rbp, -16
261 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
262 ; CHECK-NEXT: movq %rdi, %rax
265 ; CHECK-NEXT: #NO_APP
266 ; CHECK-NEXT: crc32q {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
267 ; CHECK-NEXT: popq %rbx
268 ; CHECK-NEXT: .cfi_def_cfa_offset 48
269 ; CHECK-NEXT: popq %r12
270 ; CHECK-NEXT: .cfi_def_cfa_offset 40
271 ; CHECK-NEXT: popq %r13
272 ; CHECK-NEXT: .cfi_def_cfa_offset 32
273 ; CHECK-NEXT: popq %r14
274 ; CHECK-NEXT: .cfi_def_cfa_offset 24
275 ; CHECK-NEXT: popq %r15
276 ; CHECK-NEXT: .cfi_def_cfa_offset 16
277 ; CHECK-NEXT: popq %rbp
278 ; CHECK-NEXT: .cfi_def_cfa_offset 8
280 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
281 %2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
284 declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
286 define <4 x i32> @stack_fold_movd_load(i32 %a0) {
287 ; CHECK-LABEL: stack_fold_movd_load:
289 ; CHECK-NEXT: pushq %rbp
290 ; CHECK-NEXT: .cfi_def_cfa_offset 16
291 ; CHECK-NEXT: pushq %r15
292 ; CHECK-NEXT: .cfi_def_cfa_offset 24
293 ; CHECK-NEXT: pushq %r14
294 ; CHECK-NEXT: .cfi_def_cfa_offset 32
295 ; CHECK-NEXT: pushq %r13
296 ; CHECK-NEXT: .cfi_def_cfa_offset 40
297 ; CHECK-NEXT: pushq %r12
298 ; CHECK-NEXT: .cfi_def_cfa_offset 48
299 ; CHECK-NEXT: pushq %rbx
300 ; CHECK-NEXT: .cfi_def_cfa_offset 56
301 ; CHECK-NEXT: .cfi_offset %rbx, -56
302 ; CHECK-NEXT: .cfi_offset %r12, -48
303 ; CHECK-NEXT: .cfi_offset %r13, -40
304 ; CHECK-NEXT: .cfi_offset %r14, -32
305 ; CHECK-NEXT: .cfi_offset %r15, -24
306 ; CHECK-NEXT: .cfi_offset %rbp, -16
307 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
310 ; CHECK-NEXT: #NO_APP
311 ; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
312 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
313 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
314 ; CHECK-NEXT: psubd %xmm1, %xmm0
315 ; CHECK-NEXT: popq %rbx
316 ; CHECK-NEXT: .cfi_def_cfa_offset 48
317 ; CHECK-NEXT: popq %r12
318 ; CHECK-NEXT: .cfi_def_cfa_offset 40
319 ; CHECK-NEXT: popq %r13
320 ; CHECK-NEXT: .cfi_def_cfa_offset 32
321 ; CHECK-NEXT: popq %r14
322 ; CHECK-NEXT: .cfi_def_cfa_offset 24
323 ; CHECK-NEXT: popq %r15
324 ; CHECK-NEXT: .cfi_def_cfa_offset 16
325 ; CHECK-NEXT: popq %rbp
326 ; CHECK-NEXT: .cfi_def_cfa_offset 8
328 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
329 %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
330 ; add forces execution domain
331 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
335 define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
336 ; CHECK-LABEL: stack_fold_movd_store:
338 ; CHECK-NEXT: pushq %rbp
339 ; CHECK-NEXT: .cfi_def_cfa_offset 16
340 ; CHECK-NEXT: pushq %r15
341 ; CHECK-NEXT: .cfi_def_cfa_offset 24
342 ; CHECK-NEXT: pushq %r14
343 ; CHECK-NEXT: .cfi_def_cfa_offset 32
344 ; CHECK-NEXT: pushq %r13
345 ; CHECK-NEXT: .cfi_def_cfa_offset 40
346 ; CHECK-NEXT: pushq %r12
347 ; CHECK-NEXT: .cfi_def_cfa_offset 48
348 ; CHECK-NEXT: pushq %rbx
349 ; CHECK-NEXT: .cfi_def_cfa_offset 56
350 ; CHECK-NEXT: .cfi_offset %rbx, -56
351 ; CHECK-NEXT: .cfi_offset %r12, -48
352 ; CHECK-NEXT: .cfi_offset %r13, -40
353 ; CHECK-NEXT: .cfi_offset %r14, -32
354 ; CHECK-NEXT: .cfi_offset %r15, -24
355 ; CHECK-NEXT: .cfi_offset %rbp, -16
356 ; CHECK-NEXT: paddd %xmm1, %xmm0
357 ; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
360 ; CHECK-NEXT: #NO_APP
361 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
362 ; CHECK-NEXT: popq %rbx
363 ; CHECK-NEXT: .cfi_def_cfa_offset 48
364 ; CHECK-NEXT: popq %r12
365 ; CHECK-NEXT: .cfi_def_cfa_offset 40
366 ; CHECK-NEXT: popq %r13
367 ; CHECK-NEXT: .cfi_def_cfa_offset 32
368 ; CHECK-NEXT: popq %r14
369 ; CHECK-NEXT: .cfi_def_cfa_offset 24
370 ; CHECK-NEXT: popq %r15
371 ; CHECK-NEXT: .cfi_def_cfa_offset 16
372 ; CHECK-NEXT: popq %rbp
373 ; CHECK-NEXT: .cfi_def_cfa_offset 8
375 ; add forces execution domain
376 %1 = add <4 x i32> %a0, %a1
377 %2 = extractelement <4 x i32> %1, i32 0
378 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
382 define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
383 ; CHECK-LABEL: stack_fold_movq_load:
385 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
388 ; CHECK-NEXT: #NO_APP
389 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
390 ; CHECK-NEXT: # xmm0 = mem[0],zero
391 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
392 ; CHECK-NEXT: psubq %xmm1, %xmm0
394 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
395 %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
396 ; add forces execution domain
397 %3 = add <2 x i64> %2, <i64 1, i64 1>
401 define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
402 ; CHECK-LABEL: stack_fold_movq_store:
404 ; CHECK-NEXT: pushq %rbp
405 ; CHECK-NEXT: .cfi_def_cfa_offset 16
406 ; CHECK-NEXT: pushq %r15
407 ; CHECK-NEXT: .cfi_def_cfa_offset 24
408 ; CHECK-NEXT: pushq %r14
409 ; CHECK-NEXT: .cfi_def_cfa_offset 32
410 ; CHECK-NEXT: pushq %r13
411 ; CHECK-NEXT: .cfi_def_cfa_offset 40
412 ; CHECK-NEXT: pushq %r12
413 ; CHECK-NEXT: .cfi_def_cfa_offset 48
414 ; CHECK-NEXT: pushq %rbx
415 ; CHECK-NEXT: .cfi_def_cfa_offset 56
416 ; CHECK-NEXT: .cfi_offset %rbx, -56
417 ; CHECK-NEXT: .cfi_offset %r12, -48
418 ; CHECK-NEXT: .cfi_offset %r13, -40
419 ; CHECK-NEXT: .cfi_offset %r14, -32
420 ; CHECK-NEXT: .cfi_offset %r15, -24
421 ; CHECK-NEXT: .cfi_offset %rbp, -16
422 ; CHECK-NEXT: paddq %xmm1, %xmm0
423 ; CHECK-NEXT: movq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
426 ; CHECK-NEXT: #NO_APP
427 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
428 ; CHECK-NEXT: popq %rbx
429 ; CHECK-NEXT: .cfi_def_cfa_offset 48
430 ; CHECK-NEXT: popq %r12
431 ; CHECK-NEXT: .cfi_def_cfa_offset 40
432 ; CHECK-NEXT: popq %r13
433 ; CHECK-NEXT: .cfi_def_cfa_offset 32
434 ; CHECK-NEXT: popq %r14
435 ; CHECK-NEXT: .cfi_def_cfa_offset 24
436 ; CHECK-NEXT: popq %r15
437 ; CHECK-NEXT: .cfi_def_cfa_offset 16
438 ; CHECK-NEXT: popq %rbp
439 ; CHECK-NEXT: .cfi_def_cfa_offset 8
441 ; add forces execution domain
442 %1 = add <2 x i64> %a0, %a1
443 %2 = extractelement <2 x i64> %1, i32 0
444 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
448 define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
449 ; CHECK-LABEL: stack_fold_mpsadbw:
451 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
454 ; CHECK-NEXT: #NO_APP
455 ; CHECK-NEXT: mpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
457 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
458 %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
461 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
463 define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
464 ; CHECK-LABEL: stack_fold_pabsb:
466 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
469 ; CHECK-NEXT: #NO_APP
470 ; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
472 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
473 %2 = icmp sgt <16 x i8> %a0, zeroinitializer
474 %3 = sub <16 x i8> zeroinitializer, %a0
475 %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
479 define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
480 ; CHECK-LABEL: stack_fold_pabsd:
482 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
485 ; CHECK-NEXT: #NO_APP
486 ; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
488 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
489 %2 = icmp sgt <4 x i32> %a0, zeroinitializer
490 %3 = sub <4 x i32> zeroinitializer, %a0
491 %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
495 define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
496 ; CHECK-LABEL: stack_fold_pabsw:
498 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
501 ; CHECK-NEXT: #NO_APP
502 ; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
504 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
505 %2 = icmp sgt <8 x i16> %a0, zeroinitializer
506 %3 = sub <8 x i16> zeroinitializer, %a0
507 %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
511 define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
512 ; CHECK-LABEL: stack_fold_packssdw:
514 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
517 ; CHECK-NEXT: #NO_APP
518 ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
520 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
521 %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
524 declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
526 define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
527 ; CHECK-LABEL: stack_fold_packsswb:
529 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
532 ; CHECK-NEXT: #NO_APP
533 ; CHECK-NEXT: packsswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
535 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
536 %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
539 declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
541 define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
542 ; CHECK-LABEL: stack_fold_packusdw:
544 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
547 ; CHECK-NEXT: #NO_APP
548 ; CHECK-NEXT: packusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
550 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
551 %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
554 declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
556 define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
557 ; CHECK-LABEL: stack_fold_packuswb:
559 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
562 ; CHECK-NEXT: #NO_APP
563 ; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
565 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
566 %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
569 declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
571 define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
572 ; CHECK-LABEL: stack_fold_paddb:
574 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
577 ; CHECK-NEXT: #NO_APP
578 ; CHECK-NEXT: paddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
580 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
581 %2 = add <16 x i8> %a0, %a1
585 define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
586 ; CHECK-LABEL: stack_fold_paddd:
588 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
591 ; CHECK-NEXT: #NO_APP
592 ; CHECK-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
594 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
595 %2 = add <4 x i32> %a0, %a1
599 define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
600 ; CHECK-LABEL: stack_fold_paddq:
602 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
605 ; CHECK-NEXT: #NO_APP
606 ; CHECK-NEXT: paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
608 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
609 %2 = add <2 x i64> %a0, %a1
613 define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
614 ; CHECK-LABEL: stack_fold_paddsb:
616 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
619 ; CHECK-NEXT: #NO_APP
620 ; CHECK-NEXT: paddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
622 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
623 %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
626 declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
628 define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
629 ; CHECK-LABEL: stack_fold_paddsw:
631 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
634 ; CHECK-NEXT: #NO_APP
635 ; CHECK-NEXT: paddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
637 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
638 %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
641 declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
643 define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
644 ; CHECK-LABEL: stack_fold_paddusb:
646 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
649 ; CHECK-NEXT: #NO_APP
650 ; CHECK-NEXT: paddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
652 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
653 %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
656 declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
658 define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
659 ; CHECK-LABEL: stack_fold_paddusw:
661 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
664 ; CHECK-NEXT: #NO_APP
665 ; CHECK-NEXT: paddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
667 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
668 %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
671 declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
673 define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
674 ; CHECK-LABEL: stack_fold_paddw:
676 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
679 ; CHECK-NEXT: #NO_APP
680 ; CHECK-NEXT: paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
682 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
683 %2 = add <8 x i16> %a0, %a1
687 define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
688 ; CHECK-LABEL: stack_fold_palignr:
690 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
693 ; CHECK-NEXT: #NO_APP
694 ; CHECK-NEXT: palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
695 ; CHECK-NEXT: # xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
697 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
698 %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
702 define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
703 ; CHECK-LABEL: stack_fold_pand:
705 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
708 ; CHECK-NEXT: #NO_APP
709 ; CHECK-NEXT: pand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
710 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
711 ; CHECK-NEXT: psubb %xmm1, %xmm0
713 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
714 %2 = and <16 x i8> %a0, %a1
715 ; add forces execution domain
716 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
720 define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
721 ; CHECK-LABEL: stack_fold_pandn:
723 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
726 ; CHECK-NEXT: #NO_APP
727 ; CHECK-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
728 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
729 ; CHECK-NEXT: psubb %xmm1, %xmm0
731 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
732 %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
733 %3 = and <16 x i8> %2, %a1
734 ; add forces execution domain
735 %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
739 define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
740 ; CHECK-LABEL: stack_fold_pavgb:
742 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
745 ; CHECK-NEXT: #NO_APP
746 ; CHECK-NEXT: pavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
748 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
749 %2 = zext <16 x i8> %a0 to <16 x i16>
750 %3 = zext <16 x i8> %a1 to <16 x i16>
751 %4 = add <16 x i16> %2, %3
752 %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
753 %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
754 %7 = trunc <16 x i16> %6 to <16 x i8>
758 define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
759 ; CHECK-LABEL: stack_fold_pavgw:
761 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
764 ; CHECK-NEXT: #NO_APP
765 ; CHECK-NEXT: pavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
767 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
768 %2 = zext <8 x i16> %a0 to <8 x i32>
769 %3 = zext <8 x i16> %a1 to <8 x i32>
770 %4 = add <8 x i32> %2, %3
771 %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
772 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
773 %7 = trunc <8 x i32> %6 to <8 x i16>
777 define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
778 ; CHECK-LABEL: stack_fold_pblendvb:
780 ; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
781 ; CHECK-NEXT: movdqa %xmm1, %xmm2
784 ; CHECK-NEXT: #NO_APP
785 ; CHECK-NEXT: pblendvb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
786 ; CHECK-NEXT: movdqa %xmm2, %xmm0
788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
789 %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
792 declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
794 define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
795 ; CHECK-LABEL: stack_fold_pblendw:
797 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
800 ; CHECK-NEXT: #NO_APP
801 ; CHECK-NEXT: pblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
802 ; CHECK-NEXT: # xmm0 = mem[0,1,2],xmm0[3,4,5,6,7]
804 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
805 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
809 define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
810 ; CHECK-LABEL: stack_fold_pclmulqdq:
812 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
815 ; CHECK-NEXT: #NO_APP
816 ; CHECK-NEXT: pclmulqdq $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
818 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
819 %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
822 declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
824 define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
825 ; CHECK-LABEL: stack_fold_pcmpeqb:
827 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
830 ; CHECK-NEXT: #NO_APP
831 ; CHECK-NEXT: pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
833 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
834 %2 = icmp eq <16 x i8> %a0, %a1
835 %3 = sext <16 x i1> %2 to <16 x i8>
839 define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
840 ; CHECK-LABEL: stack_fold_pcmpeqd:
842 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
845 ; CHECK-NEXT: #NO_APP
846 ; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
848 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
849 %2 = icmp eq <4 x i32> %a0, %a1
850 %3 = sext <4 x i1> %2 to <4 x i32>
854 define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
855 ; CHECK-LABEL: stack_fold_pcmpeqq:
857 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
860 ; CHECK-NEXT: #NO_APP
861 ; CHECK-NEXT: pcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
864 %2 = icmp eq <2 x i64> %a0, %a1
865 %3 = sext <2 x i1> %2 to <2 x i64>
869 define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
870 ; CHECK-LABEL: stack_fold_pcmpeqw:
872 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
875 ; CHECK-NEXT: #NO_APP
876 ; CHECK-NEXT: pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
878 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
879 %2 = icmp eq <8 x i16> %a0, %a1
880 %3 = sext <8 x i1> %2 to <8 x i16>
884 define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
885 ; CHECK-LABEL: stack_fold_pcmpestri:
887 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
890 ; CHECK-NEXT: #NO_APP
891 ; CHECK-NEXT: movl $7, %eax
892 ; CHECK-NEXT: movl $7, %edx
893 ; CHECK-NEXT: pcmpestri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
894 ; CHECK-NEXT: movl %ecx, %eax
896 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
897 %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
900 declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
902 define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
903 ; CHECK-LABEL: stack_fold_pcmpestrm:
905 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
908 ; CHECK-NEXT: #NO_APP
909 ; CHECK-NEXT: movl $7, %eax
910 ; CHECK-NEXT: movl $7, %edx
911 ; CHECK-NEXT: pcmpestrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
913 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
914 %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
917 declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
919 define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
920 ; CHECK-LABEL: stack_fold_pcmpgtb:
922 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
925 ; CHECK-NEXT: #NO_APP
926 ; CHECK-NEXT: pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
928 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
929 %2 = icmp sgt <16 x i8> %a0, %a1
930 %3 = sext <16 x i1> %2 to <16 x i8>
934 define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
935 ; CHECK-LABEL: stack_fold_pcmpgtd:
937 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
940 ; CHECK-NEXT: #NO_APP
941 ; CHECK-NEXT: pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
943 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
944 %2 = icmp sgt <4 x i32> %a0, %a1
945 %3 = sext <4 x i1> %2 to <4 x i32>
949 define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
950 ; CHECK-LABEL: stack_fold_pcmpgtq:
952 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
955 ; CHECK-NEXT: #NO_APP
956 ; CHECK-NEXT: pcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
958 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
959 %2 = icmp sgt <2 x i64> %a0, %a1
960 %3 = sext <2 x i1> %2 to <2 x i64>
964 define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
965 ; CHECK-LABEL: stack_fold_pcmpgtw:
967 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
970 ; CHECK-NEXT: #NO_APP
971 ; CHECK-NEXT: pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
973 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
974 %2 = icmp sgt <8 x i16> %a0, %a1
975 %3 = sext <8 x i1> %2 to <8 x i16>
979 define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
980 ; CHECK-LABEL: stack_fold_pcmpistri:
982 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
985 ; CHECK-NEXT: #NO_APP
986 ; CHECK-NEXT: pcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
987 ; CHECK-NEXT: movl %ecx, %eax
989 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
990 %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
993 declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
995 define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
996 ; CHECK-LABEL: stack_fold_pcmpistrm:
998 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1001 ; CHECK-NEXT: #NO_APP
1002 ; CHECK-NEXT: pcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1004 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1005 %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
1008 declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
1010 ; TODO stack_fold_pextrb
1012 ; We can't naively fold pextrw to a stack spill: its memory form writes only a
1013 ; 16-bit location, whereas its register form writes a full 32-bit register.
1014 define i16 @stack_fold_pextrw(<8 x i16> %a0) {
1015 ; CHECK-LABEL: stack_fold_pextrw:
1016 ; CHECK: # %bb.0: # %entry
1017 ; CHECK-NEXT: pushq %rbp
1018 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1019 ; CHECK-NEXT: pushq %r15
1020 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1021 ; CHECK-NEXT: pushq %r14
1022 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1023 ; CHECK-NEXT: pushq %r13
1024 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1025 ; CHECK-NEXT: pushq %r12
1026 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1027 ; CHECK-NEXT: pushq %rbx
1028 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1029 ; CHECK-NEXT: .cfi_offset %rbx, -56
1030 ; CHECK-NEXT: .cfi_offset %r12, -48
1031 ; CHECK-NEXT: .cfi_offset %r13, -40
1032 ; CHECK-NEXT: .cfi_offset %r14, -32
1033 ; CHECK-NEXT: .cfi_offset %r15, -24
1034 ; CHECK-NEXT: .cfi_offset %rbp, -16
1035 ; CHECK-NEXT: pextrw $1, %xmm0, %eax
1036 ; CHECK-NEXT: addl $2, %eax
1037 ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1040 ; CHECK-NEXT: #NO_APP
1041 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1042 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1043 ; CHECK-NEXT: popq %rbx
1044 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1045 ; CHECK-NEXT: popq %r12
1046 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1047 ; CHECK-NEXT: popq %r13
1048 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1049 ; CHECK-NEXT: popq %r14
1050 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1051 ; CHECK-NEXT: popq %r15
1052 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1053 ; CHECK-NEXT: popq %rbp
1054 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1057 ; add forces execution domain
1058 %add = add <8 x i16> %a0, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
1059 %extract = extractelement <8 x i16> %add, i32 1
1060 %asm = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1064 define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
1065 ; CHECK-LABEL: stack_fold_pextrd:
1067 ; CHECK-NEXT: pushq %rbp
1068 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1069 ; CHECK-NEXT: pushq %r15
1070 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1071 ; CHECK-NEXT: pushq %r14
1072 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1073 ; CHECK-NEXT: pushq %r13
1074 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1075 ; CHECK-NEXT: pushq %r12
1076 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1077 ; CHECK-NEXT: pushq %rbx
1078 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1079 ; CHECK-NEXT: .cfi_offset %rbx, -56
1080 ; CHECK-NEXT: .cfi_offset %r12, -48
1081 ; CHECK-NEXT: .cfi_offset %r13, -40
1082 ; CHECK-NEXT: .cfi_offset %r14, -32
1083 ; CHECK-NEXT: .cfi_offset %r15, -24
1084 ; CHECK-NEXT: .cfi_offset %rbp, -16
1085 ; CHECK-NEXT: paddd %xmm1, %xmm0
1086 ; CHECK-NEXT: pextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1089 ; CHECK-NEXT: #NO_APP
1090 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1091 ; CHECK-NEXT: popq %rbx
1092 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1093 ; CHECK-NEXT: popq %r12
1094 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1095 ; CHECK-NEXT: popq %r13
1096 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1097 ; CHECK-NEXT: popq %r14
1098 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1099 ; CHECK-NEXT: popq %r15
1100 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1101 ; CHECK-NEXT: popq %rbp
1102 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1104 ; add forces execution domain
1105 %1 = add <4 x i32> %a0, %a1
1106 %2 = extractelement <4 x i32> %1, i32 1
1107 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1111 define i64 @stack_fold_pextrq(<2 x i64> %a0) {
1112 ; CHECK-LABEL: stack_fold_pextrq:
1114 ; CHECK-NEXT: pushq %rbp
1115 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1116 ; CHECK-NEXT: pushq %r15
1117 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1118 ; CHECK-NEXT: pushq %r14
1119 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1120 ; CHECK-NEXT: pushq %r13
1121 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1122 ; CHECK-NEXT: pushq %r12
1123 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1124 ; CHECK-NEXT: pushq %rbx
1125 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1126 ; CHECK-NEXT: .cfi_offset %rbx, -56
1127 ; CHECK-NEXT: .cfi_offset %r12, -48
1128 ; CHECK-NEXT: .cfi_offset %r13, -40
1129 ; CHECK-NEXT: .cfi_offset %r14, -32
1130 ; CHECK-NEXT: .cfi_offset %r15, -24
1131 ; CHECK-NEXT: .cfi_offset %rbp, -16
1132 ; CHECK-NEXT: pextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
1135 ; CHECK-NEXT: #NO_APP
1136 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1137 ; CHECK-NEXT: popq %rbx
1138 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1139 ; CHECK-NEXT: popq %r12
1140 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1141 ; CHECK-NEXT: popq %r13
1142 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1143 ; CHECK-NEXT: popq %r14
1144 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1145 ; CHECK-NEXT: popq %r15
1146 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1147 ; CHECK-NEXT: popq %rbp
1148 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1150 %1 = extractelement <2 x i64> %a0, i32 1
1151 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1155 define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
1156 ; CHECK-LABEL: stack_fold_phaddd:
1158 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1161 ; CHECK-NEXT: #NO_APP
1162 ; CHECK-NEXT: phaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1164 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1165 %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
1168 declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
1170 define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
1171 ; CHECK-LABEL: stack_fold_phaddsw:
1173 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1176 ; CHECK-NEXT: #NO_APP
1177 ; CHECK-NEXT: phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1179 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1180 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
1183 declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
1185 define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
1186 ; CHECK-LABEL: stack_fold_phaddw:
1188 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1191 ; CHECK-NEXT: #NO_APP
1192 ; CHECK-NEXT: phaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1194 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1195 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
1198 declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
1200 define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
1201 ; CHECK-LABEL: stack_fold_phminposuw:
1203 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1206 ; CHECK-NEXT: #NO_APP
1207 ; CHECK-NEXT: phminposuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1210 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
1213 declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
1215 define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
1216 ; CHECK-LABEL: stack_fold_phsubd:
1218 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1221 ; CHECK-NEXT: #NO_APP
1222 ; CHECK-NEXT: phsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1224 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1225 %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
1228 declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
1230 define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
1231 ; CHECK-LABEL: stack_fold_phsubsw:
1233 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1236 ; CHECK-NEXT: #NO_APP
1237 ; CHECK-NEXT: phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1239 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1240 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
1243 declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
1245 define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
1246 ; CHECK-LABEL: stack_fold_phsubw:
1248 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1251 ; CHECK-NEXT: #NO_APP
1252 ; CHECK-NEXT: phsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1255 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
1258 declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
1260 define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
1261 ; CHECK-LABEL: stack_fold_pinsrb:
1263 ; CHECK-NEXT: pushq %rbp
1264 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1265 ; CHECK-NEXT: pushq %r15
1266 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1267 ; CHECK-NEXT: pushq %r14
1268 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1269 ; CHECK-NEXT: pushq %r13
1270 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1271 ; CHECK-NEXT: pushq %r12
1272 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1273 ; CHECK-NEXT: pushq %rbx
1274 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1275 ; CHECK-NEXT: .cfi_offset %rbx, -56
1276 ; CHECK-NEXT: .cfi_offset %r12, -48
1277 ; CHECK-NEXT: .cfi_offset %r13, -40
1278 ; CHECK-NEXT: .cfi_offset %r14, -32
1279 ; CHECK-NEXT: .cfi_offset %r15, -24
1280 ; CHECK-NEXT: .cfi_offset %rbp, -16
1281 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1284 ; CHECK-NEXT: #NO_APP
1285 ; CHECK-NEXT: pinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
1286 ; CHECK-NEXT: popq %rbx
1287 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1288 ; CHECK-NEXT: popq %r12
1289 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1290 ; CHECK-NEXT: popq %r13
1291 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1292 ; CHECK-NEXT: popq %r14
1293 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1294 ; CHECK-NEXT: popq %r15
1295 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1296 ; CHECK-NEXT: popq %rbp
1297 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1299 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1300 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
1304 define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
1305 ; CHECK-LABEL: stack_fold_pinsrd:
1307 ; CHECK-NEXT: pushq %rbp
1308 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1309 ; CHECK-NEXT: pushq %r15
1310 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1311 ; CHECK-NEXT: pushq %r14
1312 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1313 ; CHECK-NEXT: pushq %r13
1314 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1315 ; CHECK-NEXT: pushq %r12
1316 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1317 ; CHECK-NEXT: pushq %rbx
1318 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1319 ; CHECK-NEXT: .cfi_offset %rbx, -56
1320 ; CHECK-NEXT: .cfi_offset %r12, -48
1321 ; CHECK-NEXT: .cfi_offset %r13, -40
1322 ; CHECK-NEXT: .cfi_offset %r14, -32
1323 ; CHECK-NEXT: .cfi_offset %r15, -24
1324 ; CHECK-NEXT: .cfi_offset %rbp, -16
1325 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1328 ; CHECK-NEXT: #NO_APP
1329 ; CHECK-NEXT: pinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
1330 ; CHECK-NEXT: popq %rbx
1331 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1332 ; CHECK-NEXT: popq %r12
1333 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1334 ; CHECK-NEXT: popq %r13
1335 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1336 ; CHECK-NEXT: popq %r14
1337 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1338 ; CHECK-NEXT: popq %r15
1339 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1340 ; CHECK-NEXT: popq %rbp
1341 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1343 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1344 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
1348 define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
1349 ; CHECK-LABEL: stack_fold_pinsrq:
1351 ; CHECK-NEXT: pushq %rbp
1352 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1353 ; CHECK-NEXT: pushq %r15
1354 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1355 ; CHECK-NEXT: pushq %r14
1356 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1357 ; CHECK-NEXT: pushq %r13
1358 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1359 ; CHECK-NEXT: pushq %r12
1360 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1361 ; CHECK-NEXT: pushq %rbx
1362 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1363 ; CHECK-NEXT: .cfi_offset %rbx, -56
1364 ; CHECK-NEXT: .cfi_offset %r12, -48
1365 ; CHECK-NEXT: .cfi_offset %r13, -40
1366 ; CHECK-NEXT: .cfi_offset %r14, -32
1367 ; CHECK-NEXT: .cfi_offset %r15, -24
1368 ; CHECK-NEXT: .cfi_offset %rbp, -16
1369 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1372 ; CHECK-NEXT: #NO_APP
1373 ; CHECK-NEXT: pinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
1374 ; CHECK-NEXT: popq %rbx
1375 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1376 ; CHECK-NEXT: popq %r12
1377 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1378 ; CHECK-NEXT: popq %r13
1379 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1380 ; CHECK-NEXT: popq %r14
1381 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1382 ; CHECK-NEXT: popq %r15
1383 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1384 ; CHECK-NEXT: popq %rbp
1385 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1387 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1388 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
1392 define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
1393 ; CHECK-LABEL: stack_fold_pinsrw:
1395 ; CHECK-NEXT: pushq %rbp
1396 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1397 ; CHECK-NEXT: pushq %r15
1398 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1399 ; CHECK-NEXT: pushq %r14
1400 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1401 ; CHECK-NEXT: pushq %r13
1402 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1403 ; CHECK-NEXT: pushq %r12
1404 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1405 ; CHECK-NEXT: pushq %rbx
1406 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1407 ; CHECK-NEXT: .cfi_offset %rbx, -56
1408 ; CHECK-NEXT: .cfi_offset %r12, -48
1409 ; CHECK-NEXT: .cfi_offset %r13, -40
1410 ; CHECK-NEXT: .cfi_offset %r14, -32
1411 ; CHECK-NEXT: .cfi_offset %r15, -24
1412 ; CHECK-NEXT: .cfi_offset %rbp, -16
1413 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1416 ; CHECK-NEXT: #NO_APP
1417 ; CHECK-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
1418 ; CHECK-NEXT: popq %rbx
1419 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1420 ; CHECK-NEXT: popq %r12
1421 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1422 ; CHECK-NEXT: popq %r13
1423 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1424 ; CHECK-NEXT: popq %r14
1425 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1426 ; CHECK-NEXT: popq %r15
1427 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1428 ; CHECK-NEXT: popq %rbp
1429 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1431 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1432 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
1436 define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
1437 ; CHECK-LABEL: stack_fold_pmaddubsw:
1439 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1442 ; CHECK-NEXT: #NO_APP
1443 ; CHECK-NEXT: pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1445 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1446 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
1449 declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
1451 define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
1452 ; CHECK-LABEL: stack_fold_pmaddwd:
1454 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1457 ; CHECK-NEXT: #NO_APP
1458 ; CHECK-NEXT: pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1460 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1461 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
1464 declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
1466 define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
1467 ; CHECK-LABEL: stack_fold_pmaxsb:
1469 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1472 ; CHECK-NEXT: #NO_APP
1473 ; CHECK-NEXT: pmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1476 %2 = icmp sgt <16 x i8> %a0, %a1
1477 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1481 define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
1482 ; CHECK-LABEL: stack_fold_pmaxsd:
1484 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1487 ; CHECK-NEXT: #NO_APP
1488 ; CHECK-NEXT: pmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1490 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1491 %2 = icmp sgt <4 x i32> %a0, %a1
1492 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1496 define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
1497 ; CHECK-LABEL: stack_fold_pmaxsw:
1499 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1502 ; CHECK-NEXT: #NO_APP
1503 ; CHECK-NEXT: pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1505 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1506 %2 = icmp sgt <8 x i16> %a0, %a1
1507 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1511 define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
1512 ; CHECK-LABEL: stack_fold_pmaxub:
1514 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1517 ; CHECK-NEXT: #NO_APP
1518 ; CHECK-NEXT: pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1520 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1521 %2 = icmp ugt <16 x i8> %a0, %a1
1522 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1526 define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
1527 ; CHECK-LABEL: stack_fold_pmaxud:
1529 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1532 ; CHECK-NEXT: #NO_APP
1533 ; CHECK-NEXT: pmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1535 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1536 %2 = icmp ugt <4 x i32> %a0, %a1
1537 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1541 define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
1542 ; CHECK-LABEL: stack_fold_pmaxuw:
1544 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1547 ; CHECK-NEXT: #NO_APP
1548 ; CHECK-NEXT: pmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1550 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1551 %2 = icmp ugt <8 x i16> %a0, %a1
1552 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1556 define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
1557 ; CHECK-LABEL: stack_fold_pminsb:
1559 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1562 ; CHECK-NEXT: #NO_APP
1563 ; CHECK-NEXT: pminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1565 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1566 %2 = icmp slt <16 x i8> %a0, %a1
1567 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1571 define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
1572 ; CHECK-LABEL: stack_fold_pminsd:
1574 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1577 ; CHECK-NEXT: #NO_APP
1578 ; CHECK-NEXT: pminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1580 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1581 %2 = icmp slt <4 x i32> %a0, %a1
1582 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
1586 define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
1587 ; CHECK-LABEL: stack_fold_pminsw:
1589 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1592 ; CHECK-NEXT: #NO_APP
1593 ; CHECK-NEXT: pminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1595 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1596 %2 = icmp slt <8 x i16> %a0, %a1
1597 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
1601 define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
1602 ; CHECK-LABEL: stack_fold_pminub:
1604 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1607 ; CHECK-NEXT: #NO_APP
1608 ; CHECK-NEXT: pminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1610 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1611 %2 = icmp ult <16 x i8> %a0, %a1
1612 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
1616 define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
1617 ; CHECK-LABEL: stack_fold_pminud:
1619 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1622 ; CHECK-NEXT: #NO_APP
1623 ; CHECK-NEXT: pminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1625 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1626 %2 = icmp ult <4 x i32> %a0, %a1
1627 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <8 x i16> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
  ret <8 x i16> %3
}
define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = sext <4 x i8> %2 to <4 x i32>
  ret <4 x i32> %3
}
define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %3 = sext <2 x i8> %2 to <2 x i64>
  ret <2 x i64> %3
}
define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = sext <8 x i8> %2 to <8 x i16>
  ret <8 x i16> %3
}
define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovsxdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %3 = sext <2 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = sext <4 x i16> %2 to <4 x i32>
  ret <4 x i32> %3
}
define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %3 = sext <2 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}
define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
  %3 = bitcast <16 x i8> %2 to <4 x i32>
  ret <4 x i32> %3
}
define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
  %3 = bitcast <16 x i8> %2 to <2 x i64>
  ret <2 x i64> %3
}
define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovzxbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %3 = bitcast <16 x i8> %2 to <8 x i16>
  ret <8 x i16> %3
}
define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovzxdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovzxwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %3 = bitcast <8 x i16> %2 to <4 x i32>
  ret <4 x i32> %3
}
define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovzxwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}
define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x i32> %a0 to <2 x i64>
  %3 = bitcast <4 x i32> %a1 to <2 x i64>
  %4 = shl <2 x i64> %2, <i64 32, i64 32>
  %5 = ashr <2 x i64> %4, <i64 32, i64 32>
  %6 = shl <2 x i64> %3, <i64 32, i64 32>
  %7 = ashr <2 x i64> %6, <i64 32, i64 32>
  %8 = mul <2 x i64> %5, %7
  ret <2 x i64> %8
}
define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhrsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmulhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmulld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = mul <4 x i32> %a0, %a1
  ret <4 x i32> %2
}
define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmullw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmullw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = mul <8 x i16> %a0, %a1
  ret <8 x i16> %2
}
define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmuludq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = bitcast <4 x i32> %a0 to <2 x i64>
  %3 = bitcast <4 x i32> %a1 to <2 x i64>
  %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295>
  %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295>
  %6 = mul <2 x i64> %4, %5
  ret <2 x i64> %6
}
define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_por:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubb %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = or <16 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %3
}
define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pshufd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[3,2,1,0]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %2
}
define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0,1,2,3,7,6,4,4]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
  ret <8 x i16> %2
}
define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[3,2,1,0,4,5,6,7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}
define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psignb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psignb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psignd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psignd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psignw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psignw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psllw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrad:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psraw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <16 x i8> %a0, %a1
  ret <16 x i8> %2
}
define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <4 x i32> %a0, %a1
  ret <4 x i32> %2
}
define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <2 x i64> %a0, %a1
  ret <2 x i64> %2
}
define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <8 x i16> %a0, %a1
  ret <8 x i16> %2
}
define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_ptest:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    ptest {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpckhbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %2
}
define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckhdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}
define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpckhqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[1],mem[1]
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubq %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 1, i64 1>
  ret <2 x i64> %3
}
define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <8 x i16> %2
}
2402 define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
2403 ; CHECK-LABEL: stack_fold_punpcklbw:
2405 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2408 ; CHECK-NEXT: #NO_APP
2409 ; CHECK-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2410 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2412 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2413 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
2417 define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
2418 ; CHECK-LABEL: stack_fold_punpckldq:
2420 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2423 ; CHECK-NEXT: #NO_APP
2424 ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2425 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
2426 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
2427 ; CHECK-NEXT: psubd %xmm1, %xmm0
2429 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2430 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2431 ; add forces execution domain
2432 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
2436 define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
2437 ; CHECK-LABEL: stack_fold_punpcklqdq:
2439 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2442 ; CHECK-NEXT: #NO_APP
2443 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2444 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
2445 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
2446 ; CHECK-NEXT: psubq %xmm1, %xmm0
2448 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2449 %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
2450 ; add forces execution domain
2451 %3 = add <2 x i64> %2, <i64 1, i64 1>
2455 define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
2456 ; CHECK-LABEL: stack_fold_punpcklwd:
2458 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2461 ; CHECK-NEXT: #NO_APP
2462 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2463 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
2465 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2466 %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
2470 define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
2471 ; CHECK-LABEL: stack_fold_pxor:
2473 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2476 ; CHECK-NEXT: #NO_APP
2477 ; CHECK-NEXT: pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2478 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
2479 ; CHECK-NEXT: psubb %xmm1, %xmm0
2481 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2482 %2 = xor <16 x i8> %a0, %a1
2483 ; add forces execution domain
2484 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>