1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
12 define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
13 ; CHECK-LABEL: stack_fold_valignd:
15 ; CHECK-NEXT: pushq %rax
16 ; CHECK-NEXT: .cfi_def_cfa_offset 16
17 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23 ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
24 ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
25 ; CHECK-NEXT: popq %rax
26 ; CHECK-NEXT: .cfi_def_cfa_offset 8
28 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
29 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
33 define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, ptr %passthru, i16 %mask) {
34 ; CHECK-LABEL: stack_fold_valignd_mask:
36 ; CHECK-NEXT: pushq %rax
37 ; CHECK-NEXT: .cfi_def_cfa_offset 16
38 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
39 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
43 ; CHECK-NEXT: kmovd %esi, %k1
44 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
45 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
46 ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
47 ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
48 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
49 ; CHECK-NEXT: popq %rax
50 ; CHECK-NEXT: .cfi_def_cfa_offset 8
52 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
53 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
54 %3 = bitcast i16 %mask to <16 x i1>
55 %4 = load <16 x i32>, ptr %passthru
56 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
60 define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
61 ; CHECK-LABEL: stack_fold_valignd_maskz:
63 ; CHECK-NEXT: pushq %rax
64 ; CHECK-NEXT: .cfi_def_cfa_offset 16
65 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
66 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
70 ; CHECK-NEXT: kmovd %edi, %k1
71 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
72 ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
73 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
74 ; CHECK-NEXT: popq %rax
75 ; CHECK-NEXT: .cfi_def_cfa_offset 8
77 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
78 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
79 %3 = bitcast i16 %mask to <16 x i1>
80 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
84 define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
85 ; CHECK-LABEL: stack_fold_valignq:
87 ; CHECK-NEXT: pushq %rax
88 ; CHECK-NEXT: .cfi_def_cfa_offset 16
89 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
90 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
94 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
95 ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
96 ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0]
97 ; CHECK-NEXT: popq %rax
98 ; CHECK-NEXT: .cfi_def_cfa_offset 8
100 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
101 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
105 define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, ptr %passthru, i8 %mask) {
106 ; CHECK-LABEL: stack_fold_valignq_mask:
108 ; CHECK-NEXT: pushq %rax
109 ; CHECK-NEXT: .cfi_def_cfa_offset 16
110 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
111 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
114 ; CHECK-NEXT: #NO_APP
115 ; CHECK-NEXT: kmovd %esi, %k1
116 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
117 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
118 ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
119 ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0]
120 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
121 ; CHECK-NEXT: popq %rax
122 ; CHECK-NEXT: .cfi_def_cfa_offset 8
124 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
125 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
126 %3 = bitcast i8 %mask to <8 x i1>
127 %4 = load <8 x i64>, ptr %passthru
128 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
132 define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
133 ; CHECK-LABEL: stack_fold_valignq_maskz:
135 ; CHECK-NEXT: pushq %rax
136 ; CHECK-NEXT: .cfi_def_cfa_offset 16
137 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
138 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
141 ; CHECK-NEXT: #NO_APP
142 ; CHECK-NEXT: kmovd %edi, %k1
143 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
144 ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
145 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0]
146 ; CHECK-NEXT: popq %rax
147 ; CHECK-NEXT: .cfi_def_cfa_offset 8
149 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
150 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
151 %3 = bitcast i8 %mask to <8 x i1>
152 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
156 define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
157 ; CHECK-LABEL: stack_fold_pavgb:
159 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
162 ; CHECK-NEXT: #NO_APP
163 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
165 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
166 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
169 declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)
171 define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
172 ; CHECK-LABEL: stack_fold_pavgb_commuted:
174 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
177 ; CHECK-NEXT: #NO_APP
178 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
180 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
181 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
185 define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
186 ; CHECK-LABEL: stack_fold_pavgb_mask:
188 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
189 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
192 ; CHECK-NEXT: #NO_APP
193 ; CHECK-NEXT: kmovq %rsi, %k1
194 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
195 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
197 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
198 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
199 %3 = bitcast i64 %mask to <64 x i1>
200 ; load needed to keep the operation from being scheduled about the asm block
201 %4 = load <64 x i8>, ptr %a2
202 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
206 define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
207 ; CHECK-LABEL: stack_fold_pavgb_mask_commuted:
209 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
210 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
213 ; CHECK-NEXT: #NO_APP
214 ; CHECK-NEXT: kmovq %rsi, %k1
215 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
216 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
218 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
219 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
220 %3 = bitcast i64 %mask to <64 x i1>
221 ; load needed to keep the operation from being scheduled about the asm block
222 %4 = load <64 x i8>, ptr %a2
223 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
227 define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
228 ; CHECK-LABEL: stack_fold_pavgb_maskz:
230 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
233 ; CHECK-NEXT: #NO_APP
234 ; CHECK-NEXT: kmovq %rdi, %k1
235 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
237 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
238 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
239 %3 = bitcast i64 %mask to <64 x i1>
240 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
244 define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
245 ; CHECK-LABEL: stack_fold_pavgb_maskz_commuted:
247 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
250 ; CHECK-NEXT: #NO_APP
251 ; CHECK-NEXT: kmovq %rdi, %k1
252 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
255 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
256 %3 = bitcast i64 %mask to <64 x i1>
257 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
261 define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
262 ; CHECK-LABEL: stack_fold_pavgw:
264 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
267 ; CHECK-NEXT: #NO_APP
268 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
270 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
271 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
274 declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)
276 define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
277 ; CHECK-LABEL: stack_fold_pavgw_commuted:
279 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
282 ; CHECK-NEXT: #NO_APP
283 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
285 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
286 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
290 define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
291 ; CHECK-LABEL: stack_fold_pavgw_mask:
293 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
294 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
297 ; CHECK-NEXT: #NO_APP
298 ; CHECK-NEXT: kmovd %esi, %k1
299 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
300 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
303 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
304 %3 = bitcast i32 %mask to <32 x i1>
305 ; load needed to keep the operation from being scheduled about the asm block
306 %4 = load <32 x i16>, ptr %a2
307 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
311 define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
312 ; CHECK-LABEL: stack_fold_pavgw_mask_commuted:
314 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
315 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
318 ; CHECK-NEXT: #NO_APP
319 ; CHECK-NEXT: kmovd %esi, %k1
320 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
321 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
324 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
325 %3 = bitcast i32 %mask to <32 x i1>
326 ; load needed to keep the operation from being scheduled about the asm block
327 %4 = load <32 x i16>, ptr %a2
328 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
332 define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
333 ; CHECK-LABEL: stack_fold_pavgw_maskz:
335 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
338 ; CHECK-NEXT: #NO_APP
339 ; CHECK-NEXT: kmovd %edi, %k1
340 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
343 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
344 %3 = bitcast i32 %mask to <32 x i1>
345 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
349 define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
350 ; CHECK-LABEL: stack_fold_pavgw_maskz_commuted:
352 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
355 ; CHECK-NEXT: #NO_APP
356 ; CHECK-NEXT: kmovd %edi, %k1
357 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
359 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
360 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
361 %3 = bitcast i32 %mask to <32 x i1>
362 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
366 define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
367 ; CHECK-LABEL: stack_fold_extracti32x4:
369 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
370 ; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
373 ; CHECK-NEXT: #NO_APP
374 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
375 ; CHECK-NEXT: vzeroupper
377 ; zext forces execution domain
378 %1 = zext <16 x i16> %a0 to <16 x i32>
379 %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
380 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
384 define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
385 ; CHECK-LABEL: stack_fold_extracti64x2:
387 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
388 ; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
391 ; CHECK-NEXT: #NO_APP
392 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
393 ; CHECK-NEXT: vzeroupper
395 ; zext forces execution domain
396 %1 = zext <8 x i32> %a0 to <8 x i64>
397 %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
398 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
402 define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
403 ; CHECK-LABEL: stack_fold_extracti32x8:
405 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
406 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
409 ; CHECK-NEXT: #NO_APP
410 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
412 ; zext forces execution domain
413 %1 = zext <16 x i16> %a0 to <16 x i32>
414 %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
415 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
419 define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
420 ; CHECK-LABEL: stack_fold_extracti64x4:
422 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
423 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
426 ; CHECK-NEXT: #NO_APP
427 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
429 ; zext forces execution domain
430 %1 = zext <8 x i32> %a0 to <8 x i64>
431 %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
432 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
436 define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
437 ; CHECK-LABEL: stack_fold_inserti32x8:
439 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
440 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
443 ; CHECK-NEXT: #NO_APP
444 ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
445 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
446 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
448 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
449 %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
450 ; add forces execution domain
451 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
455 define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) {
456 ; CHECK-LABEL: stack_fold_inserti64x4:
458 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
459 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
462 ; CHECK-NEXT: #NO_APP
463 ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
464 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
465 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
467 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
468 %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
469 ; add forces execution domain
470 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
474 define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) {
475 ; CHECK-LABEL: stack_fold_pabsb:
477 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
480 ; CHECK-NEXT: #NO_APP
481 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
483 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
484 %2 = icmp sgt <64 x i8> %a0, zeroinitializer
485 %3 = sub <64 x i8> zeroinitializer, %a0
486 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
490 define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) {
491 ; CHECK-LABEL: stack_fold_pabsb_mask:
493 ; CHECK-NEXT: pushq %rax
494 ; CHECK-NEXT: .cfi_def_cfa_offset 16
495 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
496 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
499 ; CHECK-NEXT: #NO_APP
500 ; CHECK-NEXT: kmovq %rdi, %k1
501 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
502 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
503 ; CHECK-NEXT: popq %rax
504 ; CHECK-NEXT: .cfi_def_cfa_offset 8
506 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
507 %2 = icmp sgt <64 x i8> %a0, zeroinitializer
508 %3 = sub <64 x i8> zeroinitializer, %a0
509 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
510 %5 = bitcast i64 %mask to <64 x i1>
511 %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> %passthru
515 define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) {
516 ; CHECK-LABEL: stack_fold_pabsb_maskz:
518 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
521 ; CHECK-NEXT: #NO_APP
522 ; CHECK-NEXT: kmovq %rdi, %k1
523 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
526 %2 = icmp sgt <64 x i8> %a0, zeroinitializer
527 %3 = sub <64 x i8> zeroinitializer, %a0
528 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
529 %5 = bitcast i64 %mask to <64 x i1>
530 %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
534 define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) {
535 ; CHECK-LABEL: stack_fold_pabsd:
537 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
540 ; CHECK-NEXT: #NO_APP
541 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
544 %2 = icmp sgt <16 x i32> %a0, zeroinitializer
545 %3 = sub <16 x i32> zeroinitializer, %a0
546 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
550 define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
551 ; CHECK-LABEL: stack_fold_pabsd_mask:
553 ; CHECK-NEXT: pushq %rax
554 ; CHECK-NEXT: .cfi_def_cfa_offset 16
555 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
556 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
559 ; CHECK-NEXT: #NO_APP
560 ; CHECK-NEXT: kmovd %edi, %k1
561 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
562 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
563 ; CHECK-NEXT: popq %rax
564 ; CHECK-NEXT: .cfi_def_cfa_offset 8
566 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
567 %2 = icmp sgt <16 x i32> %a0, zeroinitializer
568 %3 = sub <16 x i32> zeroinitializer, %a0
569 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
570 %5 = bitcast i16 %mask to <16 x i1>
571 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> %passthru
575 define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) {
576 ; CHECK-LABEL: stack_fold_pabsd_maskz:
578 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
581 ; CHECK-NEXT: #NO_APP
582 ; CHECK-NEXT: kmovd %edi, %k1
583 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
585 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
586 %2 = icmp sgt <16 x i32> %a0, zeroinitializer
587 %3 = sub <16 x i32> zeroinitializer, %a0
588 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
589 %5 = bitcast i16 %mask to <16 x i1>
590 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
594 define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) {
595 ; CHECK-LABEL: stack_fold_pabsq:
597 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
600 ; CHECK-NEXT: #NO_APP
601 ; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
603 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
604 %2 = icmp sgt <8 x i64> %a0, zeroinitializer
605 %3 = sub <8 x i64> zeroinitializer, %a0
606 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
610 define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) {
611 ; CHECK-LABEL: stack_fold_pabsq_mask:
613 ; CHECK-NEXT: pushq %rax
614 ; CHECK-NEXT: .cfi_def_cfa_offset 16
615 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
616 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
619 ; CHECK-NEXT: #NO_APP
620 ; CHECK-NEXT: kmovd %edi, %k1
621 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
622 ; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
623 ; CHECK-NEXT: popq %rax
624 ; CHECK-NEXT: .cfi_def_cfa_offset 8
626 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
627 %2 = icmp sgt <8 x i64> %a0, zeroinitializer
628 %3 = sub <8 x i64> zeroinitializer, %a0
629 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
630 %5 = bitcast i8 %mask to <8 x i1>
631 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %passthru
635 define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) {
636 ; CHECK-LABEL: stack_fold_pabsq_maskz:
638 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
641 ; CHECK-NEXT: #NO_APP
642 ; CHECK-NEXT: kmovd %edi, %k1
643 ; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
645 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
646 %2 = icmp sgt <8 x i64> %a0, zeroinitializer
647 %3 = sub <8 x i64> zeroinitializer, %a0
648 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
649 %5 = bitcast i8 %mask to <8 x i1>
650 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
654 define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) {
655 ; CHECK-LABEL: stack_fold_pabsw:
657 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
660 ; CHECK-NEXT: #NO_APP
661 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
663 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
664 %2 = icmp sgt <32 x i16> %a0, zeroinitializer
665 %3 = sub <32 x i16> zeroinitializer, %a0
666 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
670 define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
671 ; CHECK-LABEL: stack_fold_pabsw_mask:
673 ; CHECK-NEXT: pushq %rax
674 ; CHECK-NEXT: .cfi_def_cfa_offset 16
675 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
676 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
679 ; CHECK-NEXT: #NO_APP
680 ; CHECK-NEXT: kmovd %edi, %k1
681 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
682 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
683 ; CHECK-NEXT: popq %rax
684 ; CHECK-NEXT: .cfi_def_cfa_offset 8
686 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
687 %2 = icmp sgt <32 x i16> %a0, zeroinitializer
688 %3 = sub <32 x i16> zeroinitializer, %a0
689 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
690 %5 = bitcast i32 %mask to <32 x i1>
691 %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> %passthru
695 define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) {
696 ; CHECK-LABEL: stack_fold_pabsw_maskz:
698 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
701 ; CHECK-NEXT: #NO_APP
702 ; CHECK-NEXT: kmovd %edi, %k1
703 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
705 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
706 %2 = icmp sgt <32 x i16> %a0, zeroinitializer
707 %3 = sub <32 x i16> zeroinitializer, %a0
708 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
709 %5 = bitcast i32 %mask to <32 x i1>
710 %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer
714 define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) {
715 ; CHECK-LABEL: stack_fold_packssdw:
717 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
720 ; CHECK-NEXT: #NO_APP
721 ; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
723 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
724 %2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1)
727 declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
729 define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) {
730 ; CHECK-LABEL: stack_fold_packsswb:
732 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
735 ; CHECK-NEXT: #NO_APP
736 ; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
739 %2 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1)
742 declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
744 define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) {
745 ; CHECK-LABEL: stack_fold_packusdw:
747 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
750 ; CHECK-NEXT: #NO_APP
751 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
753 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
754 %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
757 declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
759 define <32 x i16> @stack_fold_packusdw_mask(ptr %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
760 ; CHECK-LABEL: stack_fold_packusdw_mask:
762 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
765 ; CHECK-NEXT: #NO_APP
766 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
767 ; CHECK-NEXT: kmovd %esi, %k1
768 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
769 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
772 %2 = load <32 x i16>, ptr %passthru
773 %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
774 %4 = bitcast i32 %mask to <32 x i1>
775 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %2
779 define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
780 ; CHECK-LABEL: stack_fold_packusdw_maskz:
782 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
785 ; CHECK-NEXT: #NO_APP
786 ; CHECK-NEXT: kmovd %edi, %k1
787 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
789 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
790 %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
791 %3 = bitcast i32 %mask to <32 x i1>
792 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
796 define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) {
797 ; CHECK-LABEL: stack_fold_packuswb:
799 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
802 ; CHECK-NEXT: #NO_APP
803 ; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
806 %2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1)
809 declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
811 define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
812 ; CHECK-LABEL: stack_fold_paddb:
814 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
817 ; CHECK-NEXT: #NO_APP
818 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
821 %2 = add <64 x i8> %a0, %a1
825 define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
826 ; CHECK-LABEL: stack_fold_paddb_commuted:
828 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
831 ; CHECK-NEXT: #NO_APP
832 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
834 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
835 %2 = add <64 x i8> %a1, %a0
839 define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
840 ; CHECK-LABEL: stack_fold_paddb_mask:
842 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
843 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
846 ; CHECK-NEXT: #NO_APP
847 ; CHECK-NEXT: kmovq %rsi, %k1
848 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
849 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
851 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
852 %2 = add <64 x i8> %a0, %a1
853 %3 = bitcast i64 %mask to <64 x i1>
854 ; load needed to keep the operation from being scheduled about the asm block
855 %4 = load <64 x i8>, ptr %a2
856 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
860 define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
861 ; CHECK-LABEL: stack_fold_paddb_mask_commuted:
863 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
864 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
867 ; CHECK-NEXT: #NO_APP
868 ; CHECK-NEXT: kmovq %rsi, %k1
869 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
870 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
872 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
873 %2 = add <64 x i8> %a1, %a0
874 %3 = bitcast i64 %mask to <64 x i1>
875 ; load needed to keep the operation from being scheduled about the asm block
876 %4 = load <64 x i8>, ptr %a2
877 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
881 define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
882 ; CHECK-LABEL: stack_fold_paddb_maskz:
884 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
887 ; CHECK-NEXT: #NO_APP
888 ; CHECK-NEXT: kmovq %rdi, %k1
889 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
891 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
892 %2 = add <64 x i8> %a0, %a1
893 %3 = bitcast i64 %mask to <64 x i1>
894 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
898 define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
899 ; CHECK-LABEL: stack_fold_paddb_maskz_commuted:
901 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
904 ; CHECK-NEXT: #NO_APP
905 ; CHECK-NEXT: kmovq %rdi, %k1
906 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
908 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
909 %2 = add <64 x i8> %a1, %a0
910 %3 = bitcast i64 %mask to <64 x i1>
911 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
915 define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) {
916 ; CHECK-LABEL: stack_fold_paddd:
918 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
921 ; CHECK-NEXT: #NO_APP
922 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
924 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
925 %2 = add <16 x i32> %a0, %a1
929 define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
930 ; CHECK-LABEL: stack_fold_paddd_commuted:
932 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
935 ; CHECK-NEXT: #NO_APP
936 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
938 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
939 %2 = add <16 x i32> %a1, %a0
943 define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
944 ; CHECK-LABEL: stack_fold_paddd_mask:
946 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
947 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
950 ; CHECK-NEXT: #NO_APP
951 ; CHECK-NEXT: kmovd %esi, %k1
952 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
953 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
955 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
956 %2 = add <16 x i32> %a0, %a1
957 %3 = bitcast i16 %mask to <16 x i1>
958 ; load needed to keep the operation from being scheduled about the asm block
959 %4 = load <16 x i32>, ptr %a2
960 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
964 define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
965 ; CHECK-LABEL: stack_fold_paddd_mask_commuted:
967 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
968 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
971 ; CHECK-NEXT: #NO_APP
972 ; CHECK-NEXT: kmovd %esi, %k1
973 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
974 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
976 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
977 %2 = add <16 x i32> %a1, %a0
978 %3 = bitcast i16 %mask to <16 x i1>
979 ; load needed to keep the operation from being scheduled about the asm block
980 %4 = load <16 x i32>, ptr %a2
981 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
985 define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
986 ; CHECK-LABEL: stack_fold_paddd_maskz:
988 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
991 ; CHECK-NEXT: #NO_APP
992 ; CHECK-NEXT: kmovd %edi, %k1
993 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
995 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
996 %2 = add <16 x i32> %a0, %a1
997 %3 = bitcast i16 %mask to <16 x i1>
998 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1002 define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1003 ; CHECK-LABEL: stack_fold_paddd_maskz_commuted:
1005 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1008 ; CHECK-NEXT: #NO_APP
1009 ; CHECK-NEXT: kmovd %edi, %k1
1010 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1012 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1013 %2 = add <16 x i32> %a1, %a0
1014 %3 = bitcast i16 %mask to <16 x i1>
1015 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1019 define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) {
1020 ; CHECK-LABEL: stack_fold_paddq:
1022 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1025 ; CHECK-NEXT: #NO_APP
1026 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1028 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1029 %2 = add <8 x i64> %a0, %a1
1033 define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1034 ; CHECK-LABEL: stack_fold_paddq_commuted:
1036 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1039 ; CHECK-NEXT: #NO_APP
1040 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1043 %2 = add <8 x i64> %a1, %a0
1047 define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1048 ; CHECK-LABEL: stack_fold_paddq_mask:
1050 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1051 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1054 ; CHECK-NEXT: #NO_APP
1055 ; CHECK-NEXT: kmovd %esi, %k1
1056 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1057 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1059 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1060 %2 = add <8 x i64> %a0, %a1
1061 %3 = bitcast i8 %mask to <8 x i1>
1062 ; load needed to keep the operation from being scheduled about the asm block
1063 %4 = load <8 x i64>, ptr %a2
1064 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1068 define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1069 ; CHECK-LABEL: stack_fold_paddq_mask_commuted:
1071 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1072 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1075 ; CHECK-NEXT: #NO_APP
1076 ; CHECK-NEXT: kmovd %esi, %k1
1077 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1078 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1080 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1081 %2 = add <8 x i64> %a1, %a0
1082 %3 = bitcast i8 %mask to <8 x i1>
1083 ; load needed to keep the operation from being scheduled about the asm block
1084 %4 = load <8 x i64>, ptr %a2
1085 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1089 define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1090 ; CHECK-LABEL: stack_fold_paddq_maskz:
1092 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1095 ; CHECK-NEXT: #NO_APP
1096 ; CHECK-NEXT: kmovd %edi, %k1
1097 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1099 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1100 %2 = add <8 x i64> %a0, %a1
1101 %3 = bitcast i8 %mask to <8 x i1>
1102 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1106 define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1107 ; CHECK-LABEL: stack_fold_paddq_maskz_commuted:
1109 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1112 ; CHECK-NEXT: #NO_APP
1113 ; CHECK-NEXT: kmovd %edi, %k1
1114 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1116 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1117 %2 = add <8 x i64> %a1, %a0
1118 %3 = bitcast i8 %mask to <8 x i1>
1119 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1123 define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) {
1124 ; CHECK-LABEL: stack_fold_paddsb:
1126 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1129 ; CHECK-NEXT: #NO_APP
1130 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1132 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1133 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1137 define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1138 ; CHECK-LABEL: stack_fold_paddsb_commuted:
1140 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1143 ; CHECK-NEXT: #NO_APP
1144 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1146 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1147 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1151 define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1152 ; CHECK-LABEL: stack_fold_paddsb_mask:
1154 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1155 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1158 ; CHECK-NEXT: #NO_APP
1159 ; CHECK-NEXT: kmovq %rsi, %k1
1160 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1161 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1164 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1165 %3 = bitcast i64 %mask to <64 x i1>
1166 ; load needed to keep the operation from being scheduled about the asm block
1167 %4 = load <64 x i8>, ptr %a2
1168 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1172 define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1173 ; CHECK-LABEL: stack_fold_paddsb_mask_commuted:
1175 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1176 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1179 ; CHECK-NEXT: #NO_APP
1180 ; CHECK-NEXT: kmovq %rsi, %k1
1181 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1182 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1184 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1185 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1186 %3 = bitcast i64 %mask to <64 x i1>
1187 ; load needed to keep the operation from being scheduled about the asm block
1188 %4 = load <64 x i8>, ptr %a2
1189 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1193 define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1194 ; CHECK-LABEL: stack_fold_paddsb_maskz:
1196 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1199 ; CHECK-NEXT: #NO_APP
1200 ; CHECK-NEXT: kmovq %rdi, %k1
1201 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1204 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1205 %3 = bitcast i64 %mask to <64 x i1>
1206 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1210 define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1211 ; CHECK-LABEL: stack_fold_paddsb_maskz_commuted:
1213 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1216 ; CHECK-NEXT: #NO_APP
1217 ; CHECK-NEXT: kmovq %rdi, %k1
1218 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1220 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1221 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1222 %3 = bitcast i64 %mask to <64 x i1>
1223 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1227 define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) {
1228 ; CHECK-LABEL: stack_fold_paddsw:
1230 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1233 ; CHECK-NEXT: #NO_APP
1234 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1236 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1237 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1241 define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1242 ; CHECK-LABEL: stack_fold_paddsw_commuted:
1244 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1247 ; CHECK-NEXT: #NO_APP
1248 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1250 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1251 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1255 define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1256 ; CHECK-LABEL: stack_fold_paddsw_mask:
1258 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1259 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1262 ; CHECK-NEXT: #NO_APP
1263 ; CHECK-NEXT: kmovd %esi, %k1
1264 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1265 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1267 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1268 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1269 %3 = bitcast i32 %mask to <32 x i1>
1270 ; load needed to keep the operation from being scheduled about the asm block
1271 %4 = load <32 x i16>, ptr %a2
1272 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1276 define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1277 ; CHECK-LABEL: stack_fold_paddsw_mask_commuted:
1279 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1280 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1283 ; CHECK-NEXT: #NO_APP
1284 ; CHECK-NEXT: kmovd %esi, %k1
1285 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1286 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1289 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1290 %3 = bitcast i32 %mask to <32 x i1>
1291 ; load needed to keep the operation from being scheduled about the asm block
1292 %4 = load <32 x i16>, ptr %a2
1293 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1297 define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1298 ; CHECK-LABEL: stack_fold_paddsw_maskz:
1300 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1303 ; CHECK-NEXT: #NO_APP
1304 ; CHECK-NEXT: kmovd %edi, %k1
1305 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1307 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1308 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1309 %3 = bitcast i32 %mask to <32 x i1>
1310 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1314 define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1315 ; CHECK-LABEL: stack_fold_paddsw_maskz_commuted:
1317 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1320 ; CHECK-NEXT: #NO_APP
1321 ; CHECK-NEXT: kmovd %edi, %k1
1322 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1324 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1325 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1326 %3 = bitcast i32 %mask to <32 x i1>
1327 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1331 define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) {
1332 ; CHECK-LABEL: stack_fold_paddusb:
1334 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1337 ; CHECK-NEXT: #NO_APP
1338 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1340 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1341 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1345 define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1346 ; CHECK-LABEL: stack_fold_paddusb_commuted:
1348 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1351 ; CHECK-NEXT: #NO_APP
1352 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1355 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1359 define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1360 ; CHECK-LABEL: stack_fold_paddusb_mask:
1362 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1363 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1366 ; CHECK-NEXT: #NO_APP
1367 ; CHECK-NEXT: kmovq %rsi, %k1
1368 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1369 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1372 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1373 %3 = bitcast i64 %mask to <64 x i1>
1374 ; load needed to keep the operation from being scheduled about the asm block
1375 %4 = load <64 x i8>, ptr %a2
1376 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1380 define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, ptr %a2, i64 %mask) {
1381 ; CHECK-LABEL: stack_fold_paddusb_mask_commuted:
1383 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1384 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1387 ; CHECK-NEXT: #NO_APP
1388 ; CHECK-NEXT: kmovq %rsi, %k1
1389 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1390 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1392 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1393 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1394 %3 = bitcast i64 %mask to <64 x i1>
1395 ; load needed to keep the operation from being scheduled about the asm block
1396 %4 = load <64 x i8>, ptr %a2
1397 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1401 define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1402 ; CHECK-LABEL: stack_fold_paddusb_maskz:
1404 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1407 ; CHECK-NEXT: #NO_APP
1408 ; CHECK-NEXT: kmovq %rdi, %k1
1409 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1411 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1412 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1413 %3 = bitcast i64 %mask to <64 x i1>
1414 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1418 define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1419 ; CHECK-LABEL: stack_fold_paddusb_maskz_commuted:
1421 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1424 ; CHECK-NEXT: #NO_APP
1425 ; CHECK-NEXT: kmovq %rdi, %k1
1426 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1428 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1429 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1430 %3 = bitcast i64 %mask to <64 x i1>
1431 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1435 define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) {
1436 ; CHECK-LABEL: stack_fold_paddusw:
1438 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1441 ; CHECK-NEXT: #NO_APP
1442 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1444 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1445 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1449 define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1450 ; CHECK-LABEL: stack_fold_paddusw_commuted:
1452 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1455 ; CHECK-NEXT: #NO_APP
1456 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1458 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1459 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1463 define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1464 ; CHECK-LABEL: stack_fold_paddusw_mask:
1466 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1467 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1470 ; CHECK-NEXT: #NO_APP
1471 ; CHECK-NEXT: kmovd %esi, %k1
1472 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1473 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1476 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1477 %3 = bitcast i32 %mask to <32 x i1>
1478 ; load needed to keep the operation from being scheduled about the asm block
1479 %4 = load <32 x i16>, ptr %a2
1480 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1484 define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1485 ; CHECK-LABEL: stack_fold_paddusw_mask_commuted:
1487 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1488 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1491 ; CHECK-NEXT: #NO_APP
1492 ; CHECK-NEXT: kmovd %esi, %k1
1493 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1494 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1496 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1497 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1498 %3 = bitcast i32 %mask to <32 x i1>
1499 ; load needed to keep the operation from being scheduled about the asm block
1500 %4 = load <32 x i16>, ptr %a2
1501 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1505 define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1506 ; CHECK-LABEL: stack_fold_paddusw_maskz:
1508 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1511 ; CHECK-NEXT: #NO_APP
1512 ; CHECK-NEXT: kmovd %edi, %k1
1513 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1515 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1516 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1517 %3 = bitcast i32 %mask to <32 x i1>
1518 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1522 define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1523 ; CHECK-LABEL: stack_fold_paddusw_maskz_commuted:
1525 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1528 ; CHECK-NEXT: #NO_APP
1529 ; CHECK-NEXT: kmovd %edi, %k1
1530 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1533 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1534 %3 = bitcast i32 %mask to <32 x i1>
1535 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1539 define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) {
1540 ; CHECK-LABEL: stack_fold_paddw:
1542 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1545 ; CHECK-NEXT: #NO_APP
1546 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1548 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1549 %2 = add <32 x i16> %a0, %a1
1553 define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1554 ; CHECK-LABEL: stack_fold_paddw_commuted:
1556 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1559 ; CHECK-NEXT: #NO_APP
1560 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1562 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1563 %2 = add <32 x i16> %a1, %a0
1567 define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1568 ; CHECK-LABEL: stack_fold_paddw_mask:
1570 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1571 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1574 ; CHECK-NEXT: #NO_APP
1575 ; CHECK-NEXT: kmovd %esi, %k1
1576 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1577 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1579 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1580 %2 = add <32 x i16> %a0, %a1
1581 %3 = bitcast i32 %mask to <32 x i1>
1582 ; load needed to keep the operation from being scheduled about the asm block
1583 %4 = load <32 x i16>, ptr %a2
1584 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1588 define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
1589 ; CHECK-LABEL: stack_fold_paddw_mask_commuted:
1591 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1592 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1595 ; CHECK-NEXT: #NO_APP
1596 ; CHECK-NEXT: kmovd %esi, %k1
1597 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1598 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1600 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1601 %2 = add <32 x i16> %a1, %a0
1602 %3 = bitcast i32 %mask to <32 x i1>
1603 ; load needed to keep the operation from being scheduled about the asm block
1604 %4 = load <32 x i16>, ptr %a2
1605 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1609 define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1610 ; CHECK-LABEL: stack_fold_paddw_maskz:
1612 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1615 ; CHECK-NEXT: #NO_APP
1616 ; CHECK-NEXT: kmovd %edi, %k1
1617 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1619 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1620 %2 = add <32 x i16> %a0, %a1
1621 %3 = bitcast i32 %mask to <32 x i1>
1622 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1626 define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1627 ; CHECK-LABEL: stack_fold_paddw_maskz_commuted:
1629 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1632 ; CHECK-NEXT: #NO_APP
1633 ; CHECK-NEXT: kmovd %edi, %k1
1634 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1636 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1637 %2 = add <32 x i16> %a1, %a0
1638 %3 = bitcast i32 %mask to <32 x i1>
1639 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1643 define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) {
1644 ; CHECK-LABEL: stack_fold_palignr:
1646 ; CHECK-NEXT: pushq %rax
1647 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1648 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1649 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1652 ; CHECK-NEXT: #NO_APP
1653 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1654 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1655 ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1656 ; CHECK-NEXT: popq %rax
1657 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1660 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1664 define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, ptr %passthru, i64 %mask) {
1665 ; CHECK-LABEL: stack_fold_palignr_mask:
1667 ; CHECK-NEXT: pushq %rax
1668 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1669 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1670 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1673 ; CHECK-NEXT: #NO_APP
1674 ; CHECK-NEXT: kmovq %rsi, %k1
1675 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
1676 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1677 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
1678 ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1679 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1680 ; CHECK-NEXT: popq %rax
1681 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1683 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1684 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1685 %3 = bitcast i64 %mask to <64 x i1>
1686 %4 = load <64 x i8>, ptr %passthru
1687 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1691 define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1692 ; CHECK-LABEL: stack_fold_palignr_maskz:
1694 ; CHECK-NEXT: pushq %rax
1695 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1696 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1697 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1700 ; CHECK-NEXT: #NO_APP
1701 ; CHECK-NEXT: kmovq %rdi, %k1
1702 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1703 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1704 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1705 ; CHECK-NEXT: popq %rax
1706 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1708 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1709 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1710 %3 = bitcast i64 %mask to <64 x i1>
1711 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1715 define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) {
1716 ; CHECK-LABEL: stack_fold_pandd:
1718 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1721 ; CHECK-NEXT: #NO_APP
1722 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1724 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1725 %2 = and <16 x i32> %a0, %a1
1729 define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
1730 ; CHECK-LABEL: stack_fold_pandd_commuted:
1732 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1735 ; CHECK-NEXT: #NO_APP
1736 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1739 %2 = and <16 x i32> %a1, %a0
1743 define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
1744 ; CHECK-LABEL: stack_fold_pandd_mask:
1746 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1747 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
1750 ; CHECK-NEXT: #NO_APP
1751 ; CHECK-NEXT: kmovd %esi, %k1
1752 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
1753 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1755 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1756 %2 = and <16 x i32> %a0, %a1
1757 %3 = bitcast i16 %mask to <16 x i1>
1758 ; load needed to keep the operation from being scheduled about the asm block
1759 %4 = load <16 x i32>, ptr %a2
1760 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1764 define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
1765 ; CHECK-LABEL: stack_fold_pandd_mask_commuted:
1767 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1768 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
1771 ; CHECK-NEXT: #NO_APP
1772 ; CHECK-NEXT: kmovd %esi, %k1
1773 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
1774 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1776 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1777 %2 = and <16 x i32> %a1, %a0
1778 %3 = bitcast i16 %mask to <16 x i1>
1779 ; load needed to keep the operation from being scheduled about the asm block
1780 %4 = load <16 x i32>, ptr %a2
1781 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1785 define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1786 ; CHECK-LABEL: stack_fold_pandd_maskz:
1788 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1791 ; CHECK-NEXT: #NO_APP
1792 ; CHECK-NEXT: kmovd %edi, %k1
1793 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1795 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1796 %2 = and <16 x i32> %a0, %a1
1797 %3 = bitcast i16 %mask to <16 x i1>
1798 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1802 define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1803 ; CHECK-LABEL: stack_fold_pandd_maskz_commuted:
1805 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1808 ; CHECK-NEXT: #NO_APP
1809 ; CHECK-NEXT: kmovd %edi, %k1
1810 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1812 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1813 %2 = and <16 x i32> %a1, %a0
1814 %3 = bitcast i16 %mask to <16 x i1>
1815 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1819 define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) {
1820 ; CHECK-LABEL: stack_fold_pandq:
1822 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1825 ; CHECK-NEXT: #NO_APP
1826 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1829 %2 = and <8 x i64> %a0, %a1
1833 define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1834 ; CHECK-LABEL: stack_fold_pandq_commuted:
1836 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1839 ; CHECK-NEXT: #NO_APP
1840 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1842 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1843 %2 = and <8 x i64> %a1, %a0
1847 define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1848 ; CHECK-LABEL: stack_fold_pandq_mask:
1850 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1851 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
1854 ; CHECK-NEXT: #NO_APP
1855 ; CHECK-NEXT: kmovd %esi, %k1
1856 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
1857 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1859 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1860 %2 = and <8 x i64> %a0, %a1
1861 %3 = bitcast i8 %mask to <8 x i1>
1862 ; load needed to keep the operation from being scheduled about the asm block
1863 %4 = load <8 x i64>, ptr %a2
1864 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1868 define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
1869 ; CHECK-LABEL: stack_fold_pandq_mask_commuted:
1871 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1872 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
1875 ; CHECK-NEXT: #NO_APP
1876 ; CHECK-NEXT: kmovd %esi, %k1
1877 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
1878 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1880 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1881 %2 = and <8 x i64> %a1, %a0
1882 %3 = bitcast i8 %mask to <8 x i1>
1883 ; load needed to keep the operation from being scheduled about the asm block
1884 %4 = load <8 x i64>, ptr %a2
1885 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1889 define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1890 ; CHECK-LABEL: stack_fold_pandq_maskz:
1892 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1895 ; CHECK-NEXT: #NO_APP
1896 ; CHECK-NEXT: kmovd %edi, %k1
1897 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1899 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1900 %2 = and <8 x i64> %a0, %a1
1901 %3 = bitcast i8 %mask to <8 x i1>
1902 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1906 define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1907 ; CHECK-LABEL: stack_fold_pandq_maskz_commuted:
1909 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1912 ; CHECK-NEXT: #NO_APP
1913 ; CHECK-NEXT: kmovd %edi, %k1
1914 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1916 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1917 %2 = and <8 x i64> %a1, %a0
1918 %3 = bitcast i8 %mask to <8 x i1>
1919 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1923 define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) {
1924 ; CHECK-LABEL: stack_fold_vpconflictd:
1926 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1929 ; CHECK-NEXT: #NO_APP
1930 ; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1932 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1933 %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0)
1936 declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
1938 define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) {
1939 ; CHECK-LABEL: stack_fold_vpconflictq:
1941 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1944 ; CHECK-NEXT: #NO_APP
1945 ; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1947 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1948 %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0)
1951 declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
1953 define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) {
1954 ; CHECK-LABEL: stack_fold_pcmpeqb:
1956 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1959 ; CHECK-NEXT: #NO_APP
1960 ; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1961 ; CHECK-NEXT: kmovq %k0, %rax
1962 ; CHECK-NEXT: vzeroupper
1964 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1965 %2 = icmp eq <64 x i8> %a0, %a1
1966 %3 = bitcast <64 x i1> %2 to i64
1970 define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) {
1971 ; CHECK-LABEL: stack_fold_pcmpeqd:
1973 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1976 ; CHECK-NEXT: #NO_APP
1977 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1978 ; CHECK-NEXT: kmovd %k0, %eax
1979 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1980 ; CHECK-NEXT: vzeroupper
1982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1983 %2 = icmp eq <16 x i32> %a0, %a1
1984 %3 = bitcast <16 x i1> %2 to i16
1988 define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) {
1989 ; CHECK-LABEL: stack_fold_pcmpeqq:
1991 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1994 ; CHECK-NEXT: #NO_APP
1995 ; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1996 ; CHECK-NEXT: kmovd %k0, %eax
1997 ; CHECK-NEXT: # kill: def $al killed $al killed $eax
1998 ; CHECK-NEXT: vzeroupper
2000 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2001 %2 = icmp eq <8 x i64> %a0, %a1
2002 %3 = bitcast <8 x i1> %2 to i8
2006 define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) {
2007 ; CHECK-LABEL: stack_fold_pcmpeqw:
2009 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2012 ; CHECK-NEXT: #NO_APP
2013 ; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2014 ; CHECK-NEXT: kmovd %k0, %eax
2015 ; CHECK-NEXT: vzeroupper
2017 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2018 %2 = icmp eq <32 x i16> %a0, %a1
2019 %3 = bitcast <32 x i1> %2 to i32
2023 define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2024 ; CHECK-LABEL: stack_fold_pcmpeqd_mask:
2026 ; CHECK-NEXT: subq $136, %rsp
2027 ; CHECK-NEXT: .cfi_def_cfa_offset 144
2028 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2029 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
2030 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2031 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2034 ; CHECK-NEXT: #NO_APP
2035 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2036 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2037 ; CHECK-NEXT: kmovd %esi, %k1
2038 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2039 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2040 ; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
2041 ; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
2042 ; CHECK-NEXT: addq $136, %rsp
2043 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2045 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2046 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2047 %2 = load <16 x i32>, ptr %a2
2048 %3 = add <16 x i32> %a1, %2
2049 %4 = bitcast i16 %mask to <16 x i1>
2050 %5 = icmp eq <16 x i32> %3, %a0
2051 %6 = and <16 x i1> %4, %5
2052 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2056 define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2057 ; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted:
2059 ; CHECK-NEXT: subq $136, %rsp
2060 ; CHECK-NEXT: .cfi_def_cfa_offset 144
2061 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2062 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
2063 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2064 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2067 ; CHECK-NEXT: #NO_APP
2068 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2069 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2070 ; CHECK-NEXT: kmovd %esi, %k1
2071 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2072 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2073 ; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
2074 ; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
2075 ; CHECK-NEXT: addq $136, %rsp
2076 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2078 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2079 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2080 %2 = load <16 x i32>, ptr %a2
2081 %3 = add <16 x i32> %a1, %2
2082 %4 = bitcast i16 %mask to <16 x i1>
2083 %5 = icmp eq <16 x i32> %a0, %3
2084 %6 = and <16 x i1> %4, %5
2085 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2089 define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2090 ; CHECK-LABEL: stack_fold_pcmpled_mask:
2092 ; CHECK-NEXT: subq $136, %rsp
2093 ; CHECK-NEXT: .cfi_def_cfa_offset 144
2094 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2095 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
2096 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2097 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2100 ; CHECK-NEXT: #NO_APP
2101 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2102 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2103 ; CHECK-NEXT: kmovd %esi, %k1
2104 ; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2105 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2106 ; CHECK-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
2107 ; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
2108 ; CHECK-NEXT: addq $136, %rsp
2109 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2111 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2112 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2113 %2 = load <16 x i32>, ptr %a2
2114 %3 = add <16 x i32> %a1, %2
2115 %4 = bitcast i16 %mask to <16 x i1>
2116 %5 = icmp sge <16 x i32> %a0, %3
2117 %6 = and <16 x i1> %4, %5
2118 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2122 define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
2123 ; CHECK-LABEL: stack_fold_pcmpleud:
2125 ; CHECK-NEXT: pushq %rax
2126 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2127 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2128 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2131 ; CHECK-NEXT: #NO_APP
2132 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2133 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2134 ; CHECK-NEXT: vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2135 ; CHECK-NEXT: kmovd %k0, %eax
2136 ; CHECK-NEXT: andl %esi, %eax
2137 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
2138 ; CHECK-NEXT: popq %rcx
2139 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2140 ; CHECK-NEXT: vzeroupper
2142 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2143 %2 = load <16 x i32>, ptr %a2
2144 %3 = add <16 x i32> %a1, %2
2145 %4 = bitcast i16 %mask to <16 x i1>
2146 %5 = icmp uge <16 x i32> %a0, %3
2147 %6 = and <16 x i1> %5, %4
2148 %7 = bitcast <16 x i1> %6 to i16
2152 define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
2153 ; CHECK-LABEL: stack_fold_permbvar:
2155 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2158 ; CHECK-NEXT: #NO_APP
2159 ; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2161 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2162 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2165 declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly
2167 define <64 x i8> @stack_fold_permbvar_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2168 ; CHECK-LABEL: stack_fold_permbvar_mask:
2170 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2173 ; CHECK-NEXT: #NO_APP
2174 ; CHECK-NEXT: kmovq %rsi, %k1
2175 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2176 ; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2177 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2179 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2180 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2181 %3 = bitcast i64 %mask to <64 x i1>
2182 ; load needed to keep the operation from being scheduled above the asm block
2183 %4 = load <64 x i8>, ptr %passthru
2184 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
2188 define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2189 ; CHECK-LABEL: stack_fold_permbvar_maskz:
2191 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2194 ; CHECK-NEXT: #NO_APP
2195 ; CHECK-NEXT: kmovq %rdi, %k1
2196 ; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2198 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2199 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2200 %3 = bitcast i64 %mask to <64 x i1>
2201 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
2205 define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
2206 ; CHECK-LABEL: stack_fold_permd:
2208 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2211 ; CHECK-NEXT: #NO_APP
2212 ; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2213 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
2214 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
2216 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2217 %2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0)
2218 ; add forces execution domain
2219 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2222 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly
2224 define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2225 ; CHECK-LABEL: stack_fold_vpermi2b:
2227 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2230 ; CHECK-NEXT: #NO_APP
2231 ; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2233 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2234 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2)
2238 define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2239 ; CHECK-LABEL: stack_fold_vpermi2d:
2241 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2244 ; CHECK-NEXT: #NO_APP
2245 ; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2247 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2248 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
2252 define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2253 ; CHECK-LABEL: stack_fold_vpermi2q:
2255 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2258 ; CHECK-NEXT: #NO_APP
2259 ; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2261 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2262 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
2266 define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2267 ; CHECK-LABEL: stack_fold_vpermi2w:
2269 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2272 ; CHECK-NEXT: #NO_APP
2273 ; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2275 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2276 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
2280 define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
2281 ; CHECK-LABEL: stack_fold_permq:
2283 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2286 ; CHECK-NEXT: #NO_APP
2287 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2288 ; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
2289 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
2290 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
2292 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2293 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2294 ; add forces execution domain
2295 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2299 define <8 x i64> @stack_fold_permq_mask(ptr %passthru, <8 x i64> %a0, i8 %mask) {
2300 ; CHECK-LABEL: stack_fold_permq_mask:
2302 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2305 ; CHECK-NEXT: #NO_APP
2306 ; CHECK-NEXT: kmovd %esi, %k1
2307 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
2308 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
2309 ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
2310 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
2311 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
2313 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2314 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2315 %3 = bitcast i8 %mask to <8 x i1>
2316 ; load needed to keep the operation from being scheduled above the asm block
2317 %4 = load <8 x i64>, ptr %passthru
2318 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2319 ; add forces execution domain
2320 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2324 define <8 x i64> @stack_fold_permq_maskz(ptr %passthru, <8 x i64> %a0, i8 %mask) {
2325 ; CHECK-LABEL: stack_fold_permq_maskz:
2327 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2330 ; CHECK-NEXT: #NO_APP
2331 ; CHECK-NEXT: kmovd %esi, %k1
2332 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
2333 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
2335 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2336 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2337 %3 = bitcast i8 %mask to <8 x i1>
2338 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
2342 define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
2343 ; CHECK-LABEL: stack_fold_permqvar:
2345 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2348 ; CHECK-NEXT: #NO_APP
2349 ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2350 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
2351 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
2353 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2354 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2355 ; add forces execution domain
2356 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2359 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly
2361 define <8 x i64> @stack_fold_permqvar_mask(ptr %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2362 ; CHECK-LABEL: stack_fold_permqvar_mask:
2364 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2367 ; CHECK-NEXT: #NO_APP
2368 ; CHECK-NEXT: kmovd %esi, %k1
2369 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2370 ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
2371 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = -1
2372 ; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0
2374 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2375 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2376 %3 = bitcast i8 %mask to <8 x i1>
2377 ; load needed to keep the operation from being scheduled above the asm block
2378 %4 = load <8 x i64>, ptr %passthru
2379 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2380 ; add forces execution domain
2381 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2385 define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2386 ; CHECK-LABEL: stack_fold_vpermt2b:
2388 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2391 ; CHECK-NEXT: #NO_APP
2392 ; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2394 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2395 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
2398 declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>)
2400 define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2401 ; CHECK-LABEL: stack_fold_vpermt2d:
2403 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2406 ; CHECK-NEXT: #NO_APP
2407 ; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2409 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2410 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
2413 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2415 define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2416 ; CHECK-LABEL: stack_fold_vpermt2q:
2418 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2421 ; CHECK-NEXT: #NO_APP
2422 ; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2424 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2425 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
2428 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2430 define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2431 ; CHECK-LABEL: stack_fold_vpermt2w:
2433 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2436 ; CHECK-NEXT: #NO_APP
2437 ; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2439 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2440 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
2443 declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
2445 define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
2446 ; CHECK-LABEL: stack_fold_permwvar:
2448 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2451 ; CHECK-NEXT: #NO_APP
2452 ; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2454 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2455 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2458 declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly
2460 define <32 x i16> @stack_fold_permwvar_mask(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2461 ; CHECK-LABEL: stack_fold_permwvar_mask:
2463 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2466 ; CHECK-NEXT: #NO_APP
2467 ; CHECK-NEXT: kmovd %esi, %k1
2468 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2469 ; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2470 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2472 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2473 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2474 %3 = bitcast i32 %mask to <32 x i1>
2475 ; load needed to keep the operation from being scheduled above the asm block
2476 %4 = load <32 x i16>, ptr %passthru
2477 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2481 define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2482 ; CHECK-LABEL: stack_fold_permwvar_maskz:
2484 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2487 ; CHECK-NEXT: #NO_APP
2488 ; CHECK-NEXT: kmovd %edi, %k1
2489 ; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2491 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2492 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2493 %3 = bitcast i32 %mask to <32 x i1>
2494 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2498 define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
2499 ; CHECK-LABEL: stack_fold_pextrd:
2501 ; CHECK-NEXT: pushq %rbp
2502 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2503 ; CHECK-NEXT: pushq %r15
2504 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2505 ; CHECK-NEXT: pushq %r14
2506 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2507 ; CHECK-NEXT: pushq %r13
2508 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2509 ; CHECK-NEXT: pushq %r12
2510 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2511 ; CHECK-NEXT: pushq %rbx
2512 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2513 ; CHECK-NEXT: .cfi_offset %rbx, -56
2514 ; CHECK-NEXT: .cfi_offset %r12, -48
2515 ; CHECK-NEXT: .cfi_offset %r13, -40
2516 ; CHECK-NEXT: .cfi_offset %r14, -32
2517 ; CHECK-NEXT: .cfi_offset %r15, -24
2518 ; CHECK-NEXT: .cfi_offset %rbp, -16
2519 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2520 ; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
2523 ; CHECK-NEXT: #NO_APP
2524 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2525 ; CHECK-NEXT: popq %rbx
2526 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2527 ; CHECK-NEXT: popq %r12
2528 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2529 ; CHECK-NEXT: popq %r13
2530 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2531 ; CHECK-NEXT: popq %r14
2532 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2533 ; CHECK-NEXT: popq %r15
2534 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2535 ; CHECK-NEXT: popq %rbp
2536 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2538 ; add forces execution domain
2539 %1 = add <4 x i32> %a0, %a1
2540 %2 = extractelement <4 x i32> %1, i32 1
2541 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2545 define i64 @stack_fold_pextrq(<2 x i64> %a0) {
2546 ; CHECK-LABEL: stack_fold_pextrq:
2548 ; CHECK-NEXT: pushq %rbp
2549 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2550 ; CHECK-NEXT: pushq %r15
2551 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2552 ; CHECK-NEXT: pushq %r14
2553 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2554 ; CHECK-NEXT: pushq %r13
2555 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2556 ; CHECK-NEXT: pushq %r12
2557 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2558 ; CHECK-NEXT: pushq %rbx
2559 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2560 ; CHECK-NEXT: .cfi_offset %rbx, -56
2561 ; CHECK-NEXT: .cfi_offset %r12, -48
2562 ; CHECK-NEXT: .cfi_offset %r13, -40
2563 ; CHECK-NEXT: .cfi_offset %r14, -32
2564 ; CHECK-NEXT: .cfi_offset %r15, -24
2565 ; CHECK-NEXT: .cfi_offset %rbp, -16
2566 ; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2569 ; CHECK-NEXT: #NO_APP
2570 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2571 ; CHECK-NEXT: popq %rbx
2572 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2573 ; CHECK-NEXT: popq %r12
2574 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2575 ; CHECK-NEXT: popq %r13
2576 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2577 ; CHECK-NEXT: popq %r14
2578 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2579 ; CHECK-NEXT: popq %r15
2580 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2581 ; CHECK-NEXT: popq %rbp
2582 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2584 %1 = extractelement <2 x i64> %a0, i32 1
2585 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2589 define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
2590 ; CHECK-LABEL: stack_fold_pinsrb:
2592 ; CHECK-NEXT: pushq %rbp
2593 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2594 ; CHECK-NEXT: pushq %r15
2595 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2596 ; CHECK-NEXT: pushq %r14
2597 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2598 ; CHECK-NEXT: pushq %r13
2599 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2600 ; CHECK-NEXT: pushq %r12
2601 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2602 ; CHECK-NEXT: pushq %rbx
2603 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2604 ; CHECK-NEXT: .cfi_offset %rbx, -56
2605 ; CHECK-NEXT: .cfi_offset %r12, -48
2606 ; CHECK-NEXT: .cfi_offset %r13, -40
2607 ; CHECK-NEXT: .cfi_offset %r14, -32
2608 ; CHECK-NEXT: .cfi_offset %r15, -24
2609 ; CHECK-NEXT: .cfi_offset %rbp, -16
2610 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2613 ; CHECK-NEXT: #NO_APP
2614 ; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2615 ; CHECK-NEXT: popq %rbx
2616 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2617 ; CHECK-NEXT: popq %r12
2618 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2619 ; CHECK-NEXT: popq %r13
2620 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2621 ; CHECK-NEXT: popq %r14
2622 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2623 ; CHECK-NEXT: popq %r15
2624 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2625 ; CHECK-NEXT: popq %rbp
2626 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2628 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2629 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
2633 define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
2634 ; CHECK-LABEL: stack_fold_pinsrd:
2636 ; CHECK-NEXT: pushq %rbp
2637 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2638 ; CHECK-NEXT: pushq %r15
2639 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2640 ; CHECK-NEXT: pushq %r14
2641 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2642 ; CHECK-NEXT: pushq %r13
2643 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2644 ; CHECK-NEXT: pushq %r12
2645 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2646 ; CHECK-NEXT: pushq %rbx
2647 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2648 ; CHECK-NEXT: .cfi_offset %rbx, -56
2649 ; CHECK-NEXT: .cfi_offset %r12, -48
2650 ; CHECK-NEXT: .cfi_offset %r13, -40
2651 ; CHECK-NEXT: .cfi_offset %r14, -32
2652 ; CHECK-NEXT: .cfi_offset %r15, -24
2653 ; CHECK-NEXT: .cfi_offset %rbp, -16
2654 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2657 ; CHECK-NEXT: #NO_APP
2658 ; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2659 ; CHECK-NEXT: popq %rbx
2660 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2661 ; CHECK-NEXT: popq %r12
2662 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2663 ; CHECK-NEXT: popq %r13
2664 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2665 ; CHECK-NEXT: popq %r14
2666 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2667 ; CHECK-NEXT: popq %r15
2668 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2669 ; CHECK-NEXT: popq %rbp
2670 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2673 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
2677 define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
2678 ; CHECK-LABEL: stack_fold_pinsrq:
2680 ; CHECK-NEXT: pushq %rbp
2681 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2682 ; CHECK-NEXT: pushq %r15
2683 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2684 ; CHECK-NEXT: pushq %r14
2685 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2686 ; CHECK-NEXT: pushq %r13
2687 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2688 ; CHECK-NEXT: pushq %r12
2689 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2690 ; CHECK-NEXT: pushq %rbx
2691 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2692 ; CHECK-NEXT: .cfi_offset %rbx, -56
2693 ; CHECK-NEXT: .cfi_offset %r12, -48
2694 ; CHECK-NEXT: .cfi_offset %r13, -40
2695 ; CHECK-NEXT: .cfi_offset %r14, -32
2696 ; CHECK-NEXT: .cfi_offset %r15, -24
2697 ; CHECK-NEXT: .cfi_offset %rbp, -16
2698 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2701 ; CHECK-NEXT: #NO_APP
2702 ; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2703 ; CHECK-NEXT: popq %rbx
2704 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2705 ; CHECK-NEXT: popq %r12
2706 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2707 ; CHECK-NEXT: popq %r13
2708 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2709 ; CHECK-NEXT: popq %r14
2710 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2711 ; CHECK-NEXT: popq %r15
2712 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2713 ; CHECK-NEXT: popq %rbp
2714 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2716 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2717 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
2721 define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
2722 ; CHECK-LABEL: stack_fold_pinsrw:
2724 ; CHECK-NEXT: pushq %rbp
2725 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2726 ; CHECK-NEXT: pushq %r15
2727 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2728 ; CHECK-NEXT: pushq %r14
2729 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2730 ; CHECK-NEXT: pushq %r13
2731 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2732 ; CHECK-NEXT: pushq %r12
2733 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2734 ; CHECK-NEXT: pushq %rbx
2735 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2736 ; CHECK-NEXT: .cfi_offset %rbx, -56
2737 ; CHECK-NEXT: .cfi_offset %r12, -48
2738 ; CHECK-NEXT: .cfi_offset %r13, -40
2739 ; CHECK-NEXT: .cfi_offset %r14, -32
2740 ; CHECK-NEXT: .cfi_offset %r15, -24
2741 ; CHECK-NEXT: .cfi_offset %rbp, -16
2742 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2745 ; CHECK-NEXT: #NO_APP
2746 ; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2747 ; CHECK-NEXT: popq %rbx
2748 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2749 ; CHECK-NEXT: popq %r12
2750 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2751 ; CHECK-NEXT: popq %r13
2752 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2753 ; CHECK-NEXT: popq %r14
2754 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2755 ; CHECK-NEXT: popq %r15
2756 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2757 ; CHECK-NEXT: popq %rbp
2758 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2760 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2761 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
2765 define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) {
2766 ; CHECK-LABEL: stack_fold_vplzcntd:
2768 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2771 ; CHECK-NEXT: #NO_APP
2772 ; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2774 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2775 %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false)
2779 define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) {
2780 ; CHECK-LABEL: stack_fold_vplzcntq:
2782 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2785 ; CHECK-NEXT: #NO_APP
2786 ; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2789 %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false)
2793 define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
2794 ; CHECK-LABEL: stack_fold_pmaddubsw_zmm:
2796 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2799 ; CHECK-NEXT: #NO_APP
2800 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2803 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2806 declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) nounwind readnone
2808 define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2809 ; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask:
2811 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2814 ; CHECK-NEXT: #NO_APP
2815 ; CHECK-NEXT: kmovd %esi, %k1
2816 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2817 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2818 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2821 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2822 %3 = bitcast i32 %mask to <32 x i1>
2823 ; load needed to keep the operation from being scheduled about the asm block
2824 %4 = load <32 x i16>, ptr %passthru
2825 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2829 define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2830 ; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz:
2832 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2835 ; CHECK-NEXT: #NO_APP
2836 ; CHECK-NEXT: kmovd %edi, %k1
2837 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2840 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2841 %3 = bitcast i32 %mask to <32 x i1>
2842 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2846 define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
2847 ; CHECK-LABEL: stack_fold_pmaddwd_zmm:
2849 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2852 ; CHECK-NEXT: #NO_APP
2853 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2855 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2856 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2859 declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone
2861 define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) {
2862 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted:
2864 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2867 ; CHECK-NEXT: #NO_APP
2868 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2870 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2871 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2875 define <16 x i32> @stack_fold_pmaddwd_zmm_mask(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2876 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask:
2878 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2881 ; CHECK-NEXT: #NO_APP
2882 ; CHECK-NEXT: kmovd %esi, %k1
2883 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2884 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2885 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2887 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2888 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2889 %3 = bitcast i16 %mask to <16 x i1>
2890 ; load needed to keep the operation from being scheduled about the asm block
2891 %4 = load <16 x i32>, ptr %passthru
2892 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2896 define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2897 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted:
2899 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2902 ; CHECK-NEXT: #NO_APP
2903 ; CHECK-NEXT: kmovd %esi, %k1
2904 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2905 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2906 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2908 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2909 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2910 %3 = bitcast i16 %mask to <16 x i1>
2911 ; load needed to keep the operation from being scheduled about the asm block
2912 %4 = load <16 x i32>, ptr %passthru
2913 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2917 define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2918 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz:
2920 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2923 ; CHECK-NEXT: #NO_APP
2924 ; CHECK-NEXT: kmovd %esi, %k1
2925 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2927 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2928 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2929 %3 = bitcast i16 %mask to <16 x i1>
2930 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2934 define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(ptr %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2935 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted:
2937 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2940 ; CHECK-NEXT: #NO_APP
2941 ; CHECK-NEXT: kmovd %esi, %k1
2942 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2944 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2945 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2946 %3 = bitcast i16 %mask to <16 x i1>
2947 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2951 define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) {
2952 ; CHECK-LABEL: stack_fold_pmaxsb:
2954 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2957 ; CHECK-NEXT: #NO_APP
2958 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2960 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2961 %2 = icmp sgt <64 x i8> %a0, %a1
2962 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2966 define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
2967 ; CHECK-LABEL: stack_fold_pmaxsb_commuted:
2969 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2972 ; CHECK-NEXT: #NO_APP
2973 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2975 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2976 %2 = icmp sgt <64 x i8> %a1, %a0
2977 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
2981 define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
2982 ; CHECK-LABEL: stack_fold_pmaxsb_mask:
2984 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2987 ; CHECK-NEXT: #NO_APP
2988 ; CHECK-NEXT: kmovq %rdi, %k1
2989 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
2990 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2991 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2993 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2994 %2 = icmp sgt <64 x i8> %a0, %a1
2995 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2996 %4 = bitcast i64 %mask to <64 x i1>
2997 ; load needed to keep the operation from being scheduled about the asm block
2998 %5 = load <64 x i8>, ptr %passthru
2999 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3003 define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3004 ; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted:
3006 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3009 ; CHECK-NEXT: #NO_APP
3010 ; CHECK-NEXT: kmovq %rdi, %k1
3011 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3012 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3013 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3015 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3016 %2 = icmp sgt <64 x i8> %a1, %a0
3017 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3018 %4 = bitcast i64 %mask to <64 x i1>
3019 ; load needed to keep the operation from being scheduled about the asm block
3020 %5 = load <64 x i8>, ptr %passthru
3021 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3025 define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3026 ; CHECK-LABEL: stack_fold_pmaxsb_maskz:
3028 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3031 ; CHECK-NEXT: #NO_APP
3032 ; CHECK-NEXT: kmovq %rdi, %k1
3033 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3035 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3036 %2 = icmp sgt <64 x i8> %a0, %a1
3037 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3038 %4 = bitcast i64 %mask to <64 x i1>
3039 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3043 define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3044 ; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted:
3046 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3049 ; CHECK-NEXT: #NO_APP
3050 ; CHECK-NEXT: kmovq %rdi, %k1
3051 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3053 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3054 %2 = icmp sgt <64 x i8> %a1, %a0
3055 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3056 %4 = bitcast i64 %mask to <64 x i1>
3057 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3061 define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
3062 ; CHECK-LABEL: stack_fold_pmaxsd:
3064 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3067 ; CHECK-NEXT: #NO_APP
3068 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3070 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3071 %2 = icmp sgt <16 x i32> %a0, %a1
3072 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3076 define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3077 ; CHECK-LABEL: stack_fold_pmaxsd_commuted:
3079 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3082 ; CHECK-NEXT: #NO_APP
3083 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3085 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3086 %2 = icmp sgt <16 x i32> %a1, %a0
3087 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3091 define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3092 ; CHECK-LABEL: stack_fold_pmaxsd_mask:
3094 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3097 ; CHECK-NEXT: #NO_APP
3098 ; CHECK-NEXT: kmovd %edi, %k1
3099 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3100 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3101 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3103 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3104 %2 = icmp sgt <16 x i32> %a0, %a1
3105 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3106 %4 = bitcast i16 %mask to <16 x i1>
3107 ; load needed to keep the operation from being scheduled about the asm block
3108 %5 = load <16 x i32>, ptr %passthru
3109 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3113 define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3114 ; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted:
3116 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3119 ; CHECK-NEXT: #NO_APP
3120 ; CHECK-NEXT: kmovd %edi, %k1
3121 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3122 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3123 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3125 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3126 %2 = icmp sgt <16 x i32> %a1, %a0
3127 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3128 %4 = bitcast i16 %mask to <16 x i1>
3129 ; load needed to keep the operation from being scheduled about the asm block
3130 %5 = load <16 x i32>, ptr %passthru
3131 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3135 define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3136 ; CHECK-LABEL: stack_fold_pmaxsd_maskz:
3138 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3141 ; CHECK-NEXT: #NO_APP
3142 ; CHECK-NEXT: kmovd %edi, %k1
3143 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3145 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3146 %2 = icmp sgt <16 x i32> %a0, %a1
3147 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3148 %4 = bitcast i16 %mask to <16 x i1>
3149 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3153 define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3154 ; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted:
3156 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3159 ; CHECK-NEXT: #NO_APP
3160 ; CHECK-NEXT: kmovd %edi, %k1
3161 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3164 %2 = icmp sgt <16 x i32> %a1, %a0
3165 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3166 %4 = bitcast i16 %mask to <16 x i1>
3167 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3171 define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) {
3172 ; CHECK-LABEL: stack_fold_pmaxsq:
3174 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3177 ; CHECK-NEXT: #NO_APP
3178 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3180 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3181 %2 = icmp sgt <8 x i64> %a0, %a1
3182 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3186 define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3187 ; CHECK-LABEL: stack_fold_pmaxsq_commuted:
3189 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3192 ; CHECK-NEXT: #NO_APP
3193 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3196 %2 = icmp sgt <8 x i64> %a1, %a0
3197 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3201 define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3202 ; CHECK-LABEL: stack_fold_pmaxsq_mask:
3204 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3207 ; CHECK-NEXT: #NO_APP
3208 ; CHECK-NEXT: kmovd %edi, %k1
3209 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3210 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3211 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3213 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3214 %2 = icmp sgt <8 x i64> %a0, %a1
3215 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3216 %4 = bitcast i8 %mask to <8 x i1>
3217 ; load needed to keep the operation from being scheduled about the asm block
3218 %5 = load <8 x i64>, ptr %passthru
3219 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3223 define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3224 ; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted:
3226 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3229 ; CHECK-NEXT: #NO_APP
3230 ; CHECK-NEXT: kmovd %edi, %k1
3231 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3232 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3233 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3235 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3236 %2 = icmp sgt <8 x i64> %a1, %a0
3237 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3238 %4 = bitcast i8 %mask to <8 x i1>
3239 ; load needed to keep the operation from being scheduled about the asm block
3240 %5 = load <8 x i64>, ptr %passthru
3241 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3245 define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3246 ; CHECK-LABEL: stack_fold_pmaxsq_maskz:
3248 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3251 ; CHECK-NEXT: #NO_APP
3252 ; CHECK-NEXT: kmovd %edi, %k1
3253 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3255 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3256 %2 = icmp sgt <8 x i64> %a0, %a1
3257 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3258 %4 = bitcast i8 %mask to <8 x i1>
3259 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3263 define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3264 ; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted:
3266 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3269 ; CHECK-NEXT: #NO_APP
3270 ; CHECK-NEXT: kmovd %edi, %k1
3271 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3273 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3274 %2 = icmp sgt <8 x i64> %a1, %a0
3275 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3276 %4 = bitcast i8 %mask to <8 x i1>
3277 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3281 define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) {
3282 ; CHECK-LABEL: stack_fold_pmaxsw:
3284 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3287 ; CHECK-NEXT: #NO_APP
3288 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3290 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3291 %2 = icmp sgt <32 x i16> %a0, %a1
3292 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3296 define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3297 ; CHECK-LABEL: stack_fold_pmaxsw_commuted:
3299 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3302 ; CHECK-NEXT: #NO_APP
3303 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3305 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3306 %2 = icmp sgt <32 x i16> %a1, %a0
3307 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3311 define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3312 ; CHECK-LABEL: stack_fold_pmaxsw_mask:
3314 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3317 ; CHECK-NEXT: #NO_APP
3318 ; CHECK-NEXT: kmovd %edi, %k1
3319 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3320 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3321 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3324 %2 = icmp sgt <32 x i16> %a0, %a1
3325 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3326 %4 = bitcast i32 %mask to <32 x i1>
3327 ; load needed to keep the operation from being scheduled about the asm block
3328 %5 = load <32 x i16>, ptr %passthru
3329 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3333 define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3334 ; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted:
3336 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3339 ; CHECK-NEXT: #NO_APP
3340 ; CHECK-NEXT: kmovd %edi, %k1
3341 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3342 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3343 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3345 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3346 %2 = icmp sgt <32 x i16> %a1, %a0
3347 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3348 %4 = bitcast i32 %mask to <32 x i1>
3349 ; load needed to keep the operation from being scheduled about the asm block
3350 %5 = load <32 x i16>, ptr %passthru
3351 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3355 define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3356 ; CHECK-LABEL: stack_fold_pmaxsw_maskz:
3358 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3361 ; CHECK-NEXT: #NO_APP
3362 ; CHECK-NEXT: kmovd %edi, %k1
3363 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3365 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3366 %2 = icmp sgt <32 x i16> %a0, %a1
3367 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3368 %4 = bitcast i32 %mask to <32 x i1>
3369 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3373 define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3374 ; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted:
3376 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3379 ; CHECK-NEXT: #NO_APP
3380 ; CHECK-NEXT: kmovd %edi, %k1
3381 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3383 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3384 %2 = icmp sgt <32 x i16> %a1, %a0
3385 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3386 %4 = bitcast i32 %mask to <32 x i1>
3387 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3391 define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) {
3392 ; CHECK-LABEL: stack_fold_pmaxub:
3394 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3397 ; CHECK-NEXT: #NO_APP
3398 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3400 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3401 %2 = icmp ugt <64 x i8> %a0, %a1
3402 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3406 define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3407 ; CHECK-LABEL: stack_fold_pmaxub_commuted:
3409 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3412 ; CHECK-NEXT: #NO_APP
3413 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3415 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3416 %2 = icmp ugt <64 x i8> %a1, %a0
3417 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3421 define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3422 ; CHECK-LABEL: stack_fold_pmaxub_mask:
3424 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3427 ; CHECK-NEXT: #NO_APP
3428 ; CHECK-NEXT: kmovq %rdi, %k1
3429 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3430 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3431 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3433 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3434 %2 = icmp ugt <64 x i8> %a0, %a1
3435 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3436 %4 = bitcast i64 %mask to <64 x i1>
3437 ; load needed to keep the operation from being scheduled about the asm block
3438 %5 = load <64 x i8>, ptr %passthru
3439 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3443 define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3444 ; CHECK-LABEL: stack_fold_pmaxub_mask_commuted:
3446 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3449 ; CHECK-NEXT: #NO_APP
3450 ; CHECK-NEXT: kmovq %rdi, %k1
3451 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3452 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3453 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3455 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3456 %2 = icmp ugt <64 x i8> %a1, %a0
3457 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3458 %4 = bitcast i64 %mask to <64 x i1>
3459 ; load needed to keep the operation from being scheduled about the asm block
3460 %5 = load <64 x i8>, ptr %passthru
3461 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3465 define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3466 ; CHECK-LABEL: stack_fold_pmaxub_maskz:
3468 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3471 ; CHECK-NEXT: #NO_APP
3472 ; CHECK-NEXT: kmovq %rdi, %k1
3473 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3476 %2 = icmp ugt <64 x i8> %a0, %a1
3477 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3478 %4 = bitcast i64 %mask to <64 x i1>
3479 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3483 define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3484 ; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted:
3486 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3489 ; CHECK-NEXT: #NO_APP
3490 ; CHECK-NEXT: kmovq %rdi, %k1
3491 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3493 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3494 %2 = icmp ugt <64 x i8> %a1, %a0
3495 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3496 %4 = bitcast i64 %mask to <64 x i1>
3497 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3501 define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) {
3502 ; CHECK-LABEL: stack_fold_pmaxud:
3504 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3507 ; CHECK-NEXT: #NO_APP
3508 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3510 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3511 %2 = icmp ugt <16 x i32> %a0, %a1
3512 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3516 define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3517 ; CHECK-LABEL: stack_fold_pmaxud_commuted:
3519 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3522 ; CHECK-NEXT: #NO_APP
3523 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3526 %2 = icmp ugt <16 x i32> %a1, %a0
3527 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3531 define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3532 ; CHECK-LABEL: stack_fold_pmaxud_mask:
3534 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3537 ; CHECK-NEXT: #NO_APP
3538 ; CHECK-NEXT: kmovd %edi, %k1
3539 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3540 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3541 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3544 %2 = icmp ugt <16 x i32> %a0, %a1
3545 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3546 %4 = bitcast i16 %mask to <16 x i1>
3547 ; load needed to keep the operation from being scheduled about the asm block
3548 %5 = load <16 x i32>, ptr %passthru
3549 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3553 define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3554 ; CHECK-LABEL: stack_fold_pmaxud_mask_commuted:
3556 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3559 ; CHECK-NEXT: #NO_APP
3560 ; CHECK-NEXT: kmovd %edi, %k1
3561 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3562 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3563 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3565 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3566 %2 = icmp ugt <16 x i32> %a1, %a0
3567 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3568 %4 = bitcast i16 %mask to <16 x i1>
3569 ; load needed to keep the operation from being scheduled about the asm block
3570 %5 = load <16 x i32>, ptr %passthru
3571 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3575 define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3576 ; CHECK-LABEL: stack_fold_pmaxud_maskz:
3578 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3581 ; CHECK-NEXT: #NO_APP
3582 ; CHECK-NEXT: kmovd %edi, %k1
3583 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3585 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3586 %2 = icmp ugt <16 x i32> %a0, %a1
3587 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3588 %4 = bitcast i16 %mask to <16 x i1>
3589 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3593 define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3594 ; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted:
3596 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3599 ; CHECK-NEXT: #NO_APP
3600 ; CHECK-NEXT: kmovd %edi, %k1
3601 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3603 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3604 %2 = icmp ugt <16 x i32> %a1, %a0
3605 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3606 %4 = bitcast i16 %mask to <16 x i1>
3607 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3611 define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) {
3612 ; CHECK-LABEL: stack_fold_pmaxuq:
3614 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3617 ; CHECK-NEXT: #NO_APP
3618 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3620 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3621 %2 = icmp ugt <8 x i64> %a0, %a1
3622 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3626 define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3627 ; CHECK-LABEL: stack_fold_pmaxuq_commuted:
3629 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3632 ; CHECK-NEXT: #NO_APP
3633 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3635 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3636 %2 = icmp ugt <8 x i64> %a1, %a0
3637 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3641 define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3642 ; CHECK-LABEL: stack_fold_pmaxuq_mask:
3644 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3647 ; CHECK-NEXT: #NO_APP
3648 ; CHECK-NEXT: kmovd %edi, %k1
3649 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3650 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3651 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3653 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3654 %2 = icmp ugt <8 x i64> %a0, %a1
3655 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3656 %4 = bitcast i8 %mask to <8 x i1>
3657 ; load needed to keep the operation from being scheduled about the asm block
3658 %5 = load <8 x i64>, ptr %passthru
3659 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3663 define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
3664 ; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted:
3666 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3669 ; CHECK-NEXT: #NO_APP
3670 ; CHECK-NEXT: kmovd %edi, %k1
3671 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3672 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3673 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3675 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3676 %2 = icmp ugt <8 x i64> %a1, %a0
3677 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3678 %4 = bitcast i8 %mask to <8 x i1>
3679 ; load needed to keep the operation from being scheduled about the asm block
3680 %5 = load <8 x i64>, ptr %passthru
3681 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3685 define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3686 ; CHECK-LABEL: stack_fold_pmaxuq_maskz:
3688 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3691 ; CHECK-NEXT: #NO_APP
3692 ; CHECK-NEXT: kmovd %edi, %k1
3693 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3695 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3696 %2 = icmp ugt <8 x i64> %a0, %a1
3697 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3698 %4 = bitcast i8 %mask to <8 x i1>
3699 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3703 define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3704 ; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted:
3706 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3709 ; CHECK-NEXT: #NO_APP
3710 ; CHECK-NEXT: kmovd %edi, %k1
3711 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3713 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3714 %2 = icmp ugt <8 x i64> %a1, %a0
3715 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3716 %4 = bitcast i8 %mask to <8 x i1>
3717 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3721 define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) {
3722 ; CHECK-LABEL: stack_fold_pmaxuw:
3724 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3727 ; CHECK-NEXT: #NO_APP
3728 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3730 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3731 %2 = icmp ugt <32 x i16> %a0, %a1
3732 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3736 define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3737 ; CHECK-LABEL: stack_fold_pmaxuw_commuted:
3739 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3742 ; CHECK-NEXT: #NO_APP
3743 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3745 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3746 %2 = icmp ugt <32 x i16> %a1, %a0
3747 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3751 define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3752 ; CHECK-LABEL: stack_fold_pmaxuw_mask:
3754 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3757 ; CHECK-NEXT: #NO_APP
3758 ; CHECK-NEXT: kmovd %edi, %k1
3759 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3760 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3761 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3763 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3764 %2 = icmp ugt <32 x i16> %a0, %a1
3765 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3766 %4 = bitcast i32 %mask to <32 x i1>
3767 ; load needed to keep the operation from being scheduled about the asm block
3768 %5 = load <32 x i16>, ptr %passthru
3769 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3773 define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
3774 ; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted:
3776 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3779 ; CHECK-NEXT: #NO_APP
3780 ; CHECK-NEXT: kmovd %edi, %k1
3781 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3782 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3783 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3785 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3786 %2 = icmp ugt <32 x i16> %a1, %a0
3787 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3788 %4 = bitcast i32 %mask to <32 x i1>
3789 ; load needed to keep the operation from being scheduled about the asm block
3790 %5 = load <32 x i16>, ptr %passthru
3791 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3795 define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3796 ; CHECK-LABEL: stack_fold_pmaxuw_maskz:
3798 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3801 ; CHECK-NEXT: #NO_APP
3802 ; CHECK-NEXT: kmovd %edi, %k1
3803 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3806 %2 = icmp ugt <32 x i16> %a0, %a1
3807 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3808 %4 = bitcast i32 %mask to <32 x i1>
3809 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3813 define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3814 ; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted:
3816 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3819 ; CHECK-NEXT: #NO_APP
3820 ; CHECK-NEXT: kmovd %edi, %k1
3821 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3823 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3824 %2 = icmp ugt <32 x i16> %a1, %a0
3825 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3826 %4 = bitcast i32 %mask to <32 x i1>
3827 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3831 define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) {
3832 ; CHECK-LABEL: stack_fold_pminsb:
3834 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3837 ; CHECK-NEXT: #NO_APP
3838 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3840 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3841 %2 = icmp slt <64 x i8> %a0, %a1
3842 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3846 define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3847 ; CHECK-LABEL: stack_fold_pminsb_commuted:
3849 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3852 ; CHECK-NEXT: #NO_APP
3853 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3855 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3856 %2 = icmp slt <64 x i8> %a1, %a0
3857 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3861 define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3862 ; CHECK-LABEL: stack_fold_pminsb_mask:
3864 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3867 ; CHECK-NEXT: #NO_APP
3868 ; CHECK-NEXT: kmovq %rdi, %k1
3869 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3870 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3871 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3873 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3874 %2 = icmp slt <64 x i8> %a0, %a1
3875 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3876 %4 = bitcast i64 %mask to <64 x i1>
3877 ; load needed to keep the operation from being scheduled about the asm block
3878 %5 = load <64 x i8>, ptr %passthru
3879 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3883 define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
3884 ; CHECK-LABEL: stack_fold_pminsb_mask_commuted:
3886 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3889 ; CHECK-NEXT: #NO_APP
3890 ; CHECK-NEXT: kmovq %rdi, %k1
3891 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3892 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3893 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3895 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3896 %2 = icmp slt <64 x i8> %a1, %a0
3897 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3898 %4 = bitcast i64 %mask to <64 x i1>
3899 ; load needed to keep the operation from being scheduled about the asm block
3900 %5 = load <64 x i8>, ptr %passthru
3901 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3905 define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3906 ; CHECK-LABEL: stack_fold_pminsb_maskz:
3908 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3911 ; CHECK-NEXT: #NO_APP
3912 ; CHECK-NEXT: kmovq %rdi, %k1
3913 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3915 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3916 %2 = icmp slt <64 x i8> %a0, %a1
3917 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3918 %4 = bitcast i64 %mask to <64 x i1>
3919 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3923 define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3924 ; CHECK-LABEL: stack_fold_pminsb_maskz_commuted:
3926 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3929 ; CHECK-NEXT: #NO_APP
3930 ; CHECK-NEXT: kmovq %rdi, %k1
3931 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3933 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3934 %2 = icmp slt <64 x i8> %a1, %a0
3935 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3936 %4 = bitcast i64 %mask to <64 x i1>
3937 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3941 define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) {
3942 ; CHECK-LABEL: stack_fold_pminsd:
3944 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3947 ; CHECK-NEXT: #NO_APP
3948 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3950 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3951 %2 = icmp slt <16 x i32> %a0, %a1
3952 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3956 define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3957 ; CHECK-LABEL: stack_fold_pminsd_commuted:
3959 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3962 ; CHECK-NEXT: #NO_APP
3963 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3965 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3966 %2 = icmp slt <16 x i32> %a1, %a0
3967 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3971 define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3972 ; CHECK-LABEL: stack_fold_pminsd_mask:
3974 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3977 ; CHECK-NEXT: #NO_APP
3978 ; CHECK-NEXT: kmovd %edi, %k1
3979 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3980 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3981 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3983 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3984 %2 = icmp slt <16 x i32> %a0, %a1
3985 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3986 %4 = bitcast i16 %mask to <16 x i1>
3987 ; load needed to keep the operation from being scheduled about the asm block
3988 %5 = load <16 x i32>, ptr %passthru
3989 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3993 define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
3994 ; CHECK-LABEL: stack_fold_pminsd_mask_commuted:
3996 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3999 ; CHECK-NEXT: #NO_APP
4000 ; CHECK-NEXT: kmovd %edi, %k1
4001 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4002 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4003 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4005 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4006 %2 = icmp slt <16 x i32> %a1, %a0
4007 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4008 %4 = bitcast i16 %mask to <16 x i1>
4009 ; load needed to keep the operation from being scheduled about the asm block
4010 %5 = load <16 x i32>, ptr %passthru
4011 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4015 define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4016 ; CHECK-LABEL: stack_fold_pminsd_maskz:
4018 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4021 ; CHECK-NEXT: #NO_APP
4022 ; CHECK-NEXT: kmovd %edi, %k1
4023 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4025 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4026 %2 = icmp slt <16 x i32> %a0, %a1
4027 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4028 %4 = bitcast i16 %mask to <16 x i1>
4029 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4033 define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4034 ; CHECK-LABEL: stack_fold_pminsd_maskz_commuted:
4036 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4039 ; CHECK-NEXT: #NO_APP
4040 ; CHECK-NEXT: kmovd %edi, %k1
4041 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4043 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4044 %2 = icmp slt <16 x i32> %a1, %a0
4045 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4046 %4 = bitcast i16 %mask to <16 x i1>
4047 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4051 define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) {
4052 ; CHECK-LABEL: stack_fold_pminsq:
4054 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4057 ; CHECK-NEXT: #NO_APP
4058 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4060 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4061 %2 = icmp slt <8 x i64> %a0, %a1
4062 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4066 define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4067 ; CHECK-LABEL: stack_fold_pminsq_commuted:
4069 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4072 ; CHECK-NEXT: #NO_APP
4073 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4075 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4076 %2 = icmp slt <8 x i64> %a1, %a0
4077 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4081 define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4082 ; CHECK-LABEL: stack_fold_pminsq_mask:
4084 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4087 ; CHECK-NEXT: #NO_APP
4088 ; CHECK-NEXT: kmovd %edi, %k1
4089 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4090 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4091 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4093 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4094 %2 = icmp slt <8 x i64> %a0, %a1
4095 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4096 %4 = bitcast i8 %mask to <8 x i1>
4097 ; load needed to keep the operation from being scheduled about the asm block
4098 %5 = load <8 x i64>, ptr %passthru
4099 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4103 define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4104 ; CHECK-LABEL: stack_fold_pminsq_mask_commuted:
4106 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4109 ; CHECK-NEXT: #NO_APP
4110 ; CHECK-NEXT: kmovd %edi, %k1
4111 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4112 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4113 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4115 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4116 %2 = icmp slt <8 x i64> %a1, %a0
4117 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4118 %4 = bitcast i8 %mask to <8 x i1>
4119 ; load needed to keep the operation from being scheduled about the asm block
4120 %5 = load <8 x i64>, ptr %passthru
4121 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4125 define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4126 ; CHECK-LABEL: stack_fold_pminsq_maskz:
4128 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4131 ; CHECK-NEXT: #NO_APP
4132 ; CHECK-NEXT: kmovd %edi, %k1
4133 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4135 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4136 %2 = icmp slt <8 x i64> %a0, %a1
4137 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4138 %4 = bitcast i8 %mask to <8 x i1>
4139 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4143 define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4144 ; CHECK-LABEL: stack_fold_pminsq_maskz_commuted:
4146 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4149 ; CHECK-NEXT: #NO_APP
4150 ; CHECK-NEXT: kmovd %edi, %k1
4151 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4153 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4154 %2 = icmp slt <8 x i64> %a1, %a0
4155 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4156 %4 = bitcast i8 %mask to <8 x i1>
4157 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4161 define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) {
4162 ; CHECK-LABEL: stack_fold_pminsw:
4164 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4167 ; CHECK-NEXT: #NO_APP
4168 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4170 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4171 %2 = icmp slt <32 x i16> %a0, %a1
4172 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4176 define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4177 ; CHECK-LABEL: stack_fold_pminsw_commuted:
4179 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4182 ; CHECK-NEXT: #NO_APP
4183 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4185 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4186 %2 = icmp slt <32 x i16> %a1, %a0
4187 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4191 define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4192 ; CHECK-LABEL: stack_fold_pminsw_mask:
4194 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4197 ; CHECK-NEXT: #NO_APP
4198 ; CHECK-NEXT: kmovd %edi, %k1
4199 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4200 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4201 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4204 %2 = icmp slt <32 x i16> %a0, %a1
4205 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4206 %4 = bitcast i32 %mask to <32 x i1>
4207 ; load needed to keep the operation from being scheduled about the asm block
4208 %5 = load <32 x i16>, ptr %passthru
4209 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4213 define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4214 ; CHECK-LABEL: stack_fold_pminsw_mask_commuted:
4216 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4219 ; CHECK-NEXT: #NO_APP
4220 ; CHECK-NEXT: kmovd %edi, %k1
4221 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4222 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4223 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4225 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4226 %2 = icmp slt <32 x i16> %a1, %a0
4227 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4228 %4 = bitcast i32 %mask to <32 x i1>
4229 ; load needed to keep the operation from being scheduled about the asm block
4230 %5 = load <32 x i16>, ptr %passthru
4231 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4235 define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4236 ; CHECK-LABEL: stack_fold_pminsw_maskz:
4238 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4241 ; CHECK-NEXT: #NO_APP
4242 ; CHECK-NEXT: kmovd %edi, %k1
4243 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4245 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4246 %2 = icmp slt <32 x i16> %a0, %a1
4247 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4248 %4 = bitcast i32 %mask to <32 x i1>
4249 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4253 define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4254 ; CHECK-LABEL: stack_fold_pminsw_maskz_commuted:
4256 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4259 ; CHECK-NEXT: #NO_APP
4260 ; CHECK-NEXT: kmovd %edi, %k1
4261 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4263 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4264 %2 = icmp slt <32 x i16> %a1, %a0
4265 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4266 %4 = bitcast i32 %mask to <32 x i1>
4267 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4271 define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) {
4272 ; CHECK-LABEL: stack_fold_pminub:
4274 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4277 ; CHECK-NEXT: #NO_APP
4278 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4280 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4281 %2 = icmp ult <64 x i8> %a0, %a1
4282 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4286 define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
4287 ; CHECK-LABEL: stack_fold_pminub_commuted:
4289 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4292 ; CHECK-NEXT: #NO_APP
4293 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4295 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4296 %2 = icmp ult <64 x i8> %a1, %a0
4297 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4301 define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
4302 ; CHECK-LABEL: stack_fold_pminub_mask:
4304 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4307 ; CHECK-NEXT: #NO_APP
4308 ; CHECK-NEXT: kmovq %rdi, %k1
4309 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4310 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4311 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4313 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4314 %2 = icmp ult <64 x i8> %a0, %a1
4315 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4316 %4 = bitcast i64 %mask to <64 x i1>
4317 ; load needed to keep the operation from being scheduled about the asm block
4318 %5 = load <64 x i8>, ptr %passthru
4319 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4323 define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, ptr %passthru) {
4324 ; CHECK-LABEL: stack_fold_pminub_mask_commuted:
4326 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4329 ; CHECK-NEXT: #NO_APP
4330 ; CHECK-NEXT: kmovq %rdi, %k1
4331 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4332 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4333 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4335 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4336 %2 = icmp ult <64 x i8> %a1, %a0
4337 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4338 %4 = bitcast i64 %mask to <64 x i1>
4339 ; load needed to keep the operation from being scheduled about the asm block
4340 %5 = load <64 x i8>, ptr %passthru
4341 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4345 define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4346 ; CHECK-LABEL: stack_fold_pminub_maskz:
4348 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4351 ; CHECK-NEXT: #NO_APP
4352 ; CHECK-NEXT: kmovq %rdi, %k1
4353 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4355 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4356 %2 = icmp ult <64 x i8> %a0, %a1
4357 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4358 %4 = bitcast i64 %mask to <64 x i1>
4359 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4363 define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4364 ; CHECK-LABEL: stack_fold_pminub_maskz_commuted:
4366 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4369 ; CHECK-NEXT: #NO_APP
4370 ; CHECK-NEXT: kmovq %rdi, %k1
4371 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4373 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4374 %2 = icmp ult <64 x i8> %a1, %a0
4375 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4376 %4 = bitcast i64 %mask to <64 x i1>
4377 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4381 define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) {
4382 ; CHECK-LABEL: stack_fold_pminud:
4384 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4387 ; CHECK-NEXT: #NO_APP
4388 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4390 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4391 %2 = icmp ult <16 x i32> %a0, %a1
4392 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4396 define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
4397 ; CHECK-LABEL: stack_fold_pminud_commuted:
4399 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4402 ; CHECK-NEXT: #NO_APP
4403 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4405 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4406 %2 = icmp ult <16 x i32> %a1, %a0
4407 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4411 define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
4412 ; CHECK-LABEL: stack_fold_pminud_mask:
4414 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4417 ; CHECK-NEXT: #NO_APP
4418 ; CHECK-NEXT: kmovd %edi, %k1
4419 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4420 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4421 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4423 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4424 %2 = icmp ult <16 x i32> %a0, %a1
4425 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4426 %4 = bitcast i16 %mask to <16 x i1>
4427 ; load needed to keep the operation from being scheduled about the asm block
4428 %5 = load <16 x i32>, ptr %passthru
4429 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4433 define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, ptr %passthru) {
4434 ; CHECK-LABEL: stack_fold_pminud_mask_commuted:
4436 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4439 ; CHECK-NEXT: #NO_APP
4440 ; CHECK-NEXT: kmovd %edi, %k1
4441 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4442 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4443 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4445 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4446 %2 = icmp ult <16 x i32> %a1, %a0
4447 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4448 %4 = bitcast i16 %mask to <16 x i1>
4449 ; load needed to keep the operation from being scheduled about the asm block
4450 %5 = load <16 x i32>, ptr %passthru
4451 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4455 define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4456 ; CHECK-LABEL: stack_fold_pminud_maskz:
4458 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4461 ; CHECK-NEXT: #NO_APP
4462 ; CHECK-NEXT: kmovd %edi, %k1
4463 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4465 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4466 %2 = icmp ult <16 x i32> %a0, %a1
4467 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4468 %4 = bitcast i16 %mask to <16 x i1>
4469 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4473 define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4474 ; CHECK-LABEL: stack_fold_pminud_maskz_commuted:
4476 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4479 ; CHECK-NEXT: #NO_APP
4480 ; CHECK-NEXT: kmovd %edi, %k1
4481 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4483 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4484 %2 = icmp ult <16 x i32> %a1, %a0
4485 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4486 %4 = bitcast i16 %mask to <16 x i1>
4487 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4491 define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) {
4492 ; CHECK-LABEL: stack_fold_pminuq:
4494 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4497 ; CHECK-NEXT: #NO_APP
4498 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4500 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4501 %2 = icmp ult <8 x i64> %a0, %a1
4502 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4506 define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4507 ; CHECK-LABEL: stack_fold_pminuq_commuted:
4509 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4512 ; CHECK-NEXT: #NO_APP
4513 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4515 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4516 %2 = icmp ult <8 x i64> %a1, %a0
4517 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4521 define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4522 ; CHECK-LABEL: stack_fold_pminuq_mask:
4524 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4527 ; CHECK-NEXT: #NO_APP
4528 ; CHECK-NEXT: kmovd %edi, %k1
4529 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4530 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4531 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4533 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4534 %2 = icmp ult <8 x i64> %a0, %a1
4535 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4536 %4 = bitcast i8 %mask to <8 x i1>
4537 ; load needed to keep the operation from being scheduled about the asm block
4538 %5 = load <8 x i64>, ptr %passthru
4539 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4543 define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, ptr %passthru) {
4544 ; CHECK-LABEL: stack_fold_pminuq_mask_commuted:
4546 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4549 ; CHECK-NEXT: #NO_APP
4550 ; CHECK-NEXT: kmovd %edi, %k1
4551 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4552 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4553 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4555 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4556 %2 = icmp ult <8 x i64> %a1, %a0
4557 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4558 %4 = bitcast i8 %mask to <8 x i1>
4559 ; load needed to keep the operation from being scheduled about the asm block
4560 %5 = load <8 x i64>, ptr %passthru
4561 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4565 define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4566 ; CHECK-LABEL: stack_fold_pminuq_maskz:
4568 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4571 ; CHECK-NEXT: #NO_APP
4572 ; CHECK-NEXT: kmovd %edi, %k1
4573 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4575 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4576 %2 = icmp ult <8 x i64> %a0, %a1
4577 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4578 %4 = bitcast i8 %mask to <8 x i1>
4579 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4583 define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4584 ; CHECK-LABEL: stack_fold_pminuq_maskz_commuted:
4586 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4589 ; CHECK-NEXT: #NO_APP
4590 ; CHECK-NEXT: kmovd %edi, %k1
4591 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4593 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4594 %2 = icmp ult <8 x i64> %a1, %a0
4595 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4596 %4 = bitcast i8 %mask to <8 x i1>
4597 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4601 define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) {
4602 ; CHECK-LABEL: stack_fold_pminuw:
4604 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4607 ; CHECK-NEXT: #NO_APP
4608 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4610 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4611 %2 = icmp ult <32 x i16> %a0, %a1
4612 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4616 define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4617 ; CHECK-LABEL: stack_fold_pminuw_commuted:
4619 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4622 ; CHECK-NEXT: #NO_APP
4623 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4625 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4626 %2 = icmp ult <32 x i16> %a1, %a0
4627 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4631 define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4632 ; CHECK-LABEL: stack_fold_pminuw_mask:
4634 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4637 ; CHECK-NEXT: #NO_APP
4638 ; CHECK-NEXT: kmovd %edi, %k1
4639 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4640 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4641 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4643 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4644 %2 = icmp ult <32 x i16> %a0, %a1
4645 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4646 %4 = bitcast i32 %mask to <32 x i1>
4647 ; load needed to keep the operation from being scheduled about the asm block
4648 %5 = load <32 x i16>, ptr %passthru
4649 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4653 define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, ptr %passthru) {
4654 ; CHECK-LABEL: stack_fold_pminuw_mask_commuted:
4656 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4659 ; CHECK-NEXT: #NO_APP
4660 ; CHECK-NEXT: kmovd %edi, %k1
4661 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4662 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4663 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4665 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4666 %2 = icmp ult <32 x i16> %a1, %a0
4667 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4668 %4 = bitcast i32 %mask to <32 x i1>
4669 ; load needed to keep the operation from being scheduled about the asm block
4670 %5 = load <32 x i16>, ptr %passthru
4671 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4675 define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4676 ; CHECK-LABEL: stack_fold_pminuw_maskz:
4678 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4681 ; CHECK-NEXT: #NO_APP
4682 ; CHECK-NEXT: kmovd %edi, %k1
4683 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4685 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4686 %2 = icmp ult <32 x i16> %a0, %a1
4687 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4688 %4 = bitcast i32 %mask to <32 x i1>
4689 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4693 define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4694 ; CHECK-LABEL: stack_fold_pminuw_maskz_commuted:
4696 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4699 ; CHECK-NEXT: #NO_APP
4700 ; CHECK-NEXT: kmovd %edi, %k1
4701 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4703 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4704 %2 = icmp ult <32 x i16> %a1, %a0
4705 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4706 %4 = bitcast i32 %mask to <32 x i1>
4707 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4711 define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) {
4712 ; CHECK-LABEL: stack_fold_vpmovdb:
4714 ; CHECK-NEXT: vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4717 ; CHECK-NEXT: #NO_APP
4718 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4719 ; CHECK-NEXT: vzeroupper
4721 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4722 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4725 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
4727 define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) {
4728 ; CHECK-LABEL: stack_fold_vpmovdw:
4730 ; CHECK-NEXT: vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4733 ; CHECK-NEXT: #NO_APP
4734 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4736 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4737 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4740 declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
4742 define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
4743 ; CHECK-LABEL: stack_fold_movq_load:
4745 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4748 ; CHECK-NEXT: #NO_APP
4749 ; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4750 ; CHECK-NEXT: # xmm0 = mem[0],zero
4751 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
4752 ; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0
4754 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4755 %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
4756 ; add forces execution domain
4757 %3 = add <2 x i64> %2, <i64 1, i64 1>
4761 define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
4762 ; CHECK-LABEL: stack_fold_vpmovqd:
4764 ; CHECK-NEXT: vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4767 ; CHECK-NEXT: #NO_APP
4768 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4770 %1 = trunc <8 x i64> %a0 to <8 x i32>
4771 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4774 declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
4776 define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) {
4777 ; CHECK-LABEL: stack_fold_vpmovqw:
4779 ; CHECK-NEXT: vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4782 ; CHECK-NEXT: #NO_APP
4783 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4784 ; CHECK-NEXT: vzeroupper
4786 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4787 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4790 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
4792 define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
4793 ; CHECK-LABEL: stack_fold_vpmovwb:
4795 ; CHECK-NEXT: vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4798 ; CHECK-NEXT: #NO_APP
4799 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4801 %1 = trunc <32 x i16> %a0 to <32 x i8>
4802 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4805 declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
4807 define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) {
4808 ; CHECK-LABEL: stack_fold_vpmovsdb:
4810 ; CHECK-NEXT: vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4813 ; CHECK-NEXT: #NO_APP
4814 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4815 ; CHECK-NEXT: vzeroupper
4817 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4818 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4821 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
4823 define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) {
4824 ; CHECK-LABEL: stack_fold_vpmovsdw:
4826 ; CHECK-NEXT: vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4829 ; CHECK-NEXT: #NO_APP
4830 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4832 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4833 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4836 declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
4838 define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) {
4839 ; CHECK-LABEL: stack_fold_vpmovsqd:
4841 ; CHECK-NEXT: vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4844 ; CHECK-NEXT: #NO_APP
4845 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4847 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
4848 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4851 declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
4853 define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) {
4854 ; CHECK-LABEL: stack_fold_vpmovsqw:
4856 ; CHECK-NEXT: vpmovsqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4859 ; CHECK-NEXT: #NO_APP
4860 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4861 ; CHECK-NEXT: vzeroupper
4863 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4864 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4867 declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
4869 define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) {
4870 ; CHECK-LABEL: stack_fold_vpmovswb:
4872 ; CHECK-NEXT: vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4875 ; CHECK-NEXT: #NO_APP
4876 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4878 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
4879 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4882 declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
4884 define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) {
4885 ; CHECK-LABEL: stack_fold_pmovsxbd_zmm:
4887 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4890 ; CHECK-NEXT: #NO_APP
4891 ; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4893 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4894 %2 = sext <16 x i8> %a0 to <16 x i32>
4898 define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) {
4899 ; CHECK-LABEL: stack_fold_pmovsxbq_zmm:
4901 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4904 ; CHECK-NEXT: #NO_APP
4905 ; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4907 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4908 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4909 %3 = sext <8 x i8> %2 to <8 x i64>
4913 define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) {
4914 ; CHECK-LABEL: stack_fold_pmovsxbw_zmm:
4916 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4919 ; CHECK-NEXT: #NO_APP
4920 ; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4922 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4923 %2 = sext <32 x i8> %a0 to <32 x i16>
4927 define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) {
4928 ; CHECK-LABEL: stack_fold_pmovsxdq_zmm:
4930 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4933 ; CHECK-NEXT: #NO_APP
4934 ; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4936 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4937 %2 = sext <8 x i32> %a0 to <8 x i64>
4941 define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) {
4942 ; CHECK-LABEL: stack_fold_pmovsxwd_zmm:
4944 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4947 ; CHECK-NEXT: #NO_APP
4948 ; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4950 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4951 %2 = sext <16 x i16> %a0 to <16 x i32>
4955 define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) {
4956 ; CHECK-LABEL: stack_fold_pmovsxwq_zmm:
4958 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4961 ; CHECK-NEXT: #NO_APP
4962 ; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4964 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4965 %2 = sext <8 x i16> %a0 to <8 x i64>
4969 define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
4970 ; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm:
4972 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4975 ; CHECK-NEXT: #NO_APP
4976 ; CHECK-NEXT: kmovd %edi, %k1
4977 ; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
4979 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4980 %2 = sext <8 x i16> %a0 to <8 x i64>
4981 %3 = bitcast i8 %mask to <8 x i1>
4982 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
4986 define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
4987 ; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm:
4989 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4992 ; CHECK-NEXT: #NO_APP
4993 ; CHECK-NEXT: kmovd %edi, %k1
4994 ; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
4996 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4997 %2 = sext <8 x i16> %a0 to <8 x i64>
4998 %3 = bitcast i8 %mask to <8 x i1>
4999 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5003 define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) {
5004 ; CHECK-LABEL: stack_fold_vpmovusdb:
5006 ; CHECK-NEXT: vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5009 ; CHECK-NEXT: #NO_APP
5010 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5011 ; CHECK-NEXT: vzeroupper
5013 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
5014 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5017 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
5019 define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) {
5020 ; CHECK-LABEL: stack_fold_vpmovusdw:
5022 ; CHECK-NEXT: vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5025 ; CHECK-NEXT: #NO_APP
5026 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5028 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
5029 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5032 declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
5034 define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) {
5035 ; CHECK-LABEL: stack_fold_vpmovusqd:
5037 ; CHECK-NEXT: vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5040 ; CHECK-NEXT: #NO_APP
5041 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5043 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
5044 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5047 declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
5049 define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) {
5050 ; CHECK-LABEL: stack_fold_vpmovusqw:
5052 ; CHECK-NEXT: vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5055 ; CHECK-NEXT: #NO_APP
5056 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5057 ; CHECK-NEXT: vzeroupper
5059 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
5060 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5063 declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
5065 define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) {
5066 ; CHECK-LABEL: stack_fold_vpmovuswb:
5068 ; CHECK-NEXT: vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5071 ; CHECK-NEXT: #NO_APP
5072 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5074 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
5075 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5078 declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
5080 define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) {
5081 ; CHECK-LABEL: stack_fold_pmovzxbd_zmm:
5083 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5086 ; CHECK-NEXT: #NO_APP
5087 ; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5088 ; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
5090 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5091 %2 = zext <16 x i8> %a0 to <16 x i32>
5095 define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) {
5096 ; CHECK-LABEL: stack_fold_pmovzxbq_zmm:
5098 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5101 ; CHECK-NEXT: #NO_APP
5102 ; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5103 ; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
5105 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5106 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5107 %3 = zext <8 x i8> %2 to <8 x i64>
5111 define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) {
5112 ; CHECK-LABEL: stack_fold_pmovzxbw_zmm:
5114 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5117 ; CHECK-NEXT: #NO_APP
5118 ; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5119 ; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
5121 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5122 %2 = zext <32 x i8> %a0 to <32 x i16>
5126 define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) {
5127 ; CHECK-LABEL: stack_fold_pmovzxdq_zmm:
5129 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5132 ; CHECK-NEXT: #NO_APP
5133 ; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5134 ; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
5136 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5137 %2 = zext <8 x i32> %a0 to <8 x i64>
5141 define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) {
5142 ; CHECK-LABEL: stack_fold_pmovzxwd_zmm:
5144 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5147 ; CHECK-NEXT: #NO_APP
5148 ; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5149 ; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
5151 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5152 %2 = zext <16 x i16> %a0 to <16 x i32>
5156 define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) {
5157 ; CHECK-LABEL: stack_fold_pmovzxwq_zmm:
5159 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5162 ; CHECK-NEXT: #NO_APP
5163 ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5164 ; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5166 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5167 %2 = zext <8 x i16> %a0 to <8 x i64>
5171 define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
5172 ; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm:
5174 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5177 ; CHECK-NEXT: #NO_APP
5178 ; CHECK-NEXT: kmovd %edi, %k1
5179 ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
5180 ; CHECK-NEXT: # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5182 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5183 %2 = zext <8 x i16> %a0 to <8 x i64>
5184 %3 = bitcast i8 %mask to <8 x i1>
5185 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
5189 define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
5190 ; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm:
5192 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5195 ; CHECK-NEXT: #NO_APP
5196 ; CHECK-NEXT: kmovd %edi, %k1
5197 ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
5198 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5200 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5201 %2 = zext <8 x i16> %a0 to <8 x i64>
5202 %3 = bitcast i8 %mask to <8 x i1>
5203 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5207 define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) {
5208 ; CHECK-LABEL: stack_fold_pmulld:
5210 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5213 ; CHECK-NEXT: #NO_APP
5214 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5216 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5217 %2 = mul <16 x i32> %a0, %a1
5221 define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5222 ; CHECK-LABEL: stack_fold_pmulld_commuted:
5224 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5227 ; CHECK-NEXT: #NO_APP
5228 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5230 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5231 %2 = mul <16 x i32> %a1, %a0
5235 define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5236 ; CHECK-LABEL: stack_fold_pmulld_mask:
5238 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5239 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5242 ; CHECK-NEXT: #NO_APP
5243 ; CHECK-NEXT: kmovd %esi, %k1
5244 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5245 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5247 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5248 %2 = mul <16 x i32> %a0, %a1
5249 %3 = bitcast i16 %mask to <16 x i1>
5250 ; load needed to keep the operation from being scheduled about the asm block
5251 %4 = load <16 x i32>, ptr %a2
5252 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5256 define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5257 ; CHECK-LABEL: stack_fold_pmulld_mask_commuted:
5259 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5260 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5263 ; CHECK-NEXT: #NO_APP
5264 ; CHECK-NEXT: kmovd %esi, %k1
5265 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5266 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5268 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5269 %2 = mul <16 x i32> %a1, %a0
5270 %3 = bitcast i16 %mask to <16 x i1>
5271 ; load needed to keep the operation from being scheduled about the asm block
5272 %4 = load <16 x i32>, ptr %a2
5273 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5277 define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5278 ; CHECK-LABEL: stack_fold_pmulld_maskz:
5280 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5283 ; CHECK-NEXT: #NO_APP
5284 ; CHECK-NEXT: kmovd %edi, %k1
5285 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5287 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5288 %2 = mul <16 x i32> %a0, %a1
5289 %3 = bitcast i16 %mask to <16 x i1>
5290 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5294 define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5295 ; CHECK-LABEL: stack_fold_pmulld_maskz_commuted:
5297 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5300 ; CHECK-NEXT: #NO_APP
5301 ; CHECK-NEXT: kmovd %edi, %k1
5302 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5305 %2 = mul <16 x i32> %a1, %a0
5306 %3 = bitcast i16 %mask to <16 x i1>
5307 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5311 define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) {
5312 ; CHECK-LABEL: stack_fold_pmullq:
5314 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5317 ; CHECK-NEXT: #NO_APP
5318 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5321 %2 = mul <8 x i64> %a0, %a1
5325 define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5326 ; CHECK-LABEL: stack_fold_pmullq_commuted:
5328 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5331 ; CHECK-NEXT: #NO_APP
5332 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5334 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5335 %2 = mul <8 x i64> %a1, %a0
5339 define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5340 ; CHECK-LABEL: stack_fold_pmullq_mask:
5342 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5343 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5346 ; CHECK-NEXT: #NO_APP
5347 ; CHECK-NEXT: kmovd %esi, %k1
5348 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5349 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5351 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5352 %2 = mul <8 x i64> %a0, %a1
5353 %3 = bitcast i8 %mask to <8 x i1>
5354 ; load needed to keep the operation from being scheduled about the asm block
5355 %4 = load <8 x i64>, ptr %a2
5356 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5360 define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5361 ; CHECK-LABEL: stack_fold_pmullq_mask_commuted:
5363 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5364 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5367 ; CHECK-NEXT: #NO_APP
5368 ; CHECK-NEXT: kmovd %esi, %k1
5369 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5370 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5372 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5373 %2 = mul <8 x i64> %a1, %a0
5374 %3 = bitcast i8 %mask to <8 x i1>
5375 ; load needed to keep the operation from being scheduled about the asm block
5376 %4 = load <8 x i64>, ptr %a2
5377 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5381 define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5382 ; CHECK-LABEL: stack_fold_pmullq_maskz:
5384 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5387 ; CHECK-NEXT: #NO_APP
5388 ; CHECK-NEXT: kmovd %edi, %k1
5389 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5391 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5392 %2 = mul <8 x i64> %a0, %a1
5393 %3 = bitcast i8 %mask to <8 x i1>
5394 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5398 define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5399 ; CHECK-LABEL: stack_fold_pmullq_maskz_commuted:
5401 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5404 ; CHECK-NEXT: #NO_APP
5405 ; CHECK-NEXT: kmovd %edi, %k1
5406 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5408 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5409 %2 = mul <8 x i64> %a1, %a0
5410 %3 = bitcast i8 %mask to <8 x i1>
5411 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5415 define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) {
5416 ; CHECK-LABEL: stack_fold_pmullw:
5418 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5421 ; CHECK-NEXT: #NO_APP
5422 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5424 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5425 %2 = mul <32 x i16> %a0, %a1
5429 define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
5430 ; CHECK-LABEL: stack_fold_pmullw_commuted:
5432 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5435 ; CHECK-NEXT: #NO_APP
5436 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5438 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5439 %2 = mul <32 x i16> %a1, %a0
5443 define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
5444 ; CHECK-LABEL: stack_fold_pmullw_mask:
5446 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5447 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5450 ; CHECK-NEXT: #NO_APP
5451 ; CHECK-NEXT: kmovd %esi, %k1
5452 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5453 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5455 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5456 %2 = mul <32 x i16> %a0, %a1
5457 %3 = bitcast i32 %mask to <32 x i1>
5458 ; load needed to keep the operation from being scheduled about the asm block
5459 %4 = load <32 x i16>, ptr %a2
5460 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5464 define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, ptr %a2, i32 %mask) {
5465 ; CHECK-LABEL: stack_fold_pmullw_mask_commuted:
5467 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5468 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5471 ; CHECK-NEXT: #NO_APP
5472 ; CHECK-NEXT: kmovd %esi, %k1
5473 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5474 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5476 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5477 %2 = mul <32 x i16> %a1, %a0
5478 %3 = bitcast i32 %mask to <32 x i1>
5479 ; load needed to keep the operation from being scheduled about the asm block
5480 %4 = load <32 x i16>, ptr %a2
5481 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5485 define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5486 ; CHECK-LABEL: stack_fold_pmullw_maskz:
5488 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5491 ; CHECK-NEXT: #NO_APP
5492 ; CHECK-NEXT: kmovd %edi, %k1
5493 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5495 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5496 %2 = mul <32 x i16> %a0, %a1
5497 %3 = bitcast i32 %mask to <32 x i1>
5498 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5502 define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5503 ; CHECK-LABEL: stack_fold_pmullw_maskz_commuted:
5505 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5508 ; CHECK-NEXT: #NO_APP
5509 ; CHECK-NEXT: kmovd %edi, %k1
5510 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5513 %2 = mul <32 x i16> %a1, %a0
5514 %3 = bitcast i32 %mask to <32 x i1>
5515 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5519 define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) {
5520 ; CHECK-LABEL: stack_fold_pmuldq:
5522 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5525 ; CHECK-NEXT: #NO_APP
5526 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5528 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5529 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5530 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5531 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5532 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5533 %6 = mul <8 x i64> %3, %5
5537 define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5538 ; CHECK-LABEL: stack_fold_pmuldq_commuted:
5540 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5543 ; CHECK-NEXT: #NO_APP
5544 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5546 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5547 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5548 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5549 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5550 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5551 %6 = mul <8 x i64> %5, %3
5555 define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5556 ; CHECK-LABEL: stack_fold_pmuldq_mask:
5558 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5559 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5562 ; CHECK-NEXT: #NO_APP
5563 ; CHECK-NEXT: kmovd %esi, %k1
5564 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5565 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5567 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5568 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5569 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5570 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5571 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5572 %6 = mul <8 x i64> %3, %5
5573 %7 = bitcast i8 %mask to <8 x i1>
5574 ; load needed to keep the operation from being scheduled about the asm block
5575 %8 = load <8 x i64>, ptr %a2
5576 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5580 define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5581 ; CHECK-LABEL: stack_fold_pmuldq_mask_commuted:
5583 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5584 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5587 ; CHECK-NEXT: #NO_APP
5588 ; CHECK-NEXT: kmovd %esi, %k1
5589 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5590 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5592 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5593 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5594 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5595 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5596 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5597 %6 = mul <8 x i64> %5, %3
5598 %7 = bitcast i8 %mask to <8 x i1>
5599 ; load needed to keep the operation from being scheduled about the asm block
5600 %8 = load <8 x i64>, ptr %a2
5601 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5605 define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5606 ; CHECK-LABEL: stack_fold_pmuldq_maskz:
5608 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5611 ; CHECK-NEXT: #NO_APP
5612 ; CHECK-NEXT: kmovd %edi, %k1
5613 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5615 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5616 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5617 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5618 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5619 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5620 %6 = mul <8 x i64> %3, %5
5621 %7 = bitcast i8 %mask to <8 x i1>
5622 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5626 define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5627 ; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted:
5629 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5632 ; CHECK-NEXT: #NO_APP
5633 ; CHECK-NEXT: kmovd %edi, %k1
5634 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5636 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5637 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5638 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5639 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5640 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5641 %6 = mul <8 x i64> %5, %3
5642 %7 = bitcast i8 %mask to <8 x i1>
5643 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5650 define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) {
5651 ; CHECK-LABEL: stack_fold_pmuludq:
5653 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5656 ; CHECK-NEXT: #NO_APP
5657 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5660 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5661 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5662 %4 = mul <8 x i64> %2, %3
5666 define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5667 ; CHECK-LABEL: stack_fold_pmuludq_commuted:
5669 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5672 ; CHECK-NEXT: #NO_APP
5673 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5675 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5676 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5677 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5678 %4 = mul <8 x i64> %3, %2
5682 define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5683 ; CHECK-LABEL: stack_fold_pmuludq_mask:
5685 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5686 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5689 ; CHECK-NEXT: #NO_APP
5690 ; CHECK-NEXT: kmovd %esi, %k1
5691 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5692 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5694 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5695 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5696 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5697 %4 = mul <8 x i64> %2, %3
5698 %5 = bitcast i8 %mask to <8 x i1>
5699 ; load needed to keep the operation from being scheduled about the asm block
5700 %6 = load <8 x i64>, ptr %a2
5701 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5705 define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5706 ; CHECK-LABEL: stack_fold_pmuludq_mask_commuted:
5708 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5709 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5712 ; CHECK-NEXT: #NO_APP
5713 ; CHECK-NEXT: kmovd %esi, %k1
5714 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5715 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5717 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5718 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5719 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5720 %4 = mul <8 x i64> %3, %2
5721 %5 = bitcast i8 %mask to <8 x i1>
5722 ; load needed to keep the operation from being scheduled about the asm block
5723 %6 = load <8 x i64>, ptr %a2
5724 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5728 define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5729 ; CHECK-LABEL: stack_fold_pmuludq_maskz:
5731 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5734 ; CHECK-NEXT: #NO_APP
5735 ; CHECK-NEXT: kmovd %edi, %k1
5736 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5739 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5740 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5741 %4 = mul <8 x i64> %2, %3
5742 %5 = bitcast i8 %mask to <8 x i1>
5743 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5747 define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5748 ; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted:
5750 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5753 ; CHECK-NEXT: #NO_APP
5754 ; CHECK-NEXT: kmovd %edi, %k1
5755 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5758 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5759 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5760 %4 = mul <8 x i64> %3, %2
5761 %5 = bitcast i8 %mask to <8 x i1>
5762 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5766 define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) {
5767 ; CHECK-LABEL: stack_fold_vpopcntd:
5769 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5772 ; CHECK-NEXT: #NO_APP
5773 ; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5775 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5776 %2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0)
5779 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readonly
5781 define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) {
5782 ; CHECK-LABEL: stack_fold_vpopcntq:
5784 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5787 ; CHECK-NEXT: #NO_APP
5788 ; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5790 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5791 %2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0)
5794 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
5796 define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) {
5797 ; CHECK-LABEL: stack_fold_pord:
5799 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5802 ; CHECK-NEXT: #NO_APP
5803 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5806 %2 = or <16 x i32> %a0, %a1
5810 define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5811 ; CHECK-LABEL: stack_fold_pord_commuted:
5813 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5816 ; CHECK-NEXT: #NO_APP
5817 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5819 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5820 %2 = or <16 x i32> %a1, %a0
5824 define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5825 ; CHECK-LABEL: stack_fold_pord_mask:
5827 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5828 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
5831 ; CHECK-NEXT: #NO_APP
5832 ; CHECK-NEXT: kmovd %esi, %k1
5833 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
5834 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5836 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5837 %2 = or <16 x i32> %a0, %a1
5838 %3 = bitcast i16 %mask to <16 x i1>
5839 ; load needed to keep the operation from being scheduled about the asm block
5840 %4 = load <16 x i32>, ptr %a2
5841 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5845 define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
5846 ; CHECK-LABEL: stack_fold_pord_mask_commuted:
5848 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5849 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
5852 ; CHECK-NEXT: #NO_APP
5853 ; CHECK-NEXT: kmovd %esi, %k1
5854 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
5855 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5857 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5858 %2 = or <16 x i32> %a1, %a0
5859 %3 = bitcast i16 %mask to <16 x i1>
5860 ; load needed to keep the operation from being scheduled about the asm block
5861 %4 = load <16 x i32>, ptr %a2
5862 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5866 define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5867 ; CHECK-LABEL: stack_fold_pord_maskz:
5869 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5872 ; CHECK-NEXT: #NO_APP
5873 ; CHECK-NEXT: kmovd %edi, %k1
5874 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5876 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5877 %2 = or <16 x i32> %a0, %a1
5878 %3 = bitcast i16 %mask to <16 x i1>
5879 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5883 define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5884 ; CHECK-LABEL: stack_fold_pord_maskz_commuted:
5886 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5889 ; CHECK-NEXT: #NO_APP
5890 ; CHECK-NEXT: kmovd %edi, %k1
5891 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5893 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5894 %2 = or <16 x i32> %a1, %a0
5895 %3 = bitcast i16 %mask to <16 x i1>
5896 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5900 define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) {
5901 ; CHECK-LABEL: stack_fold_porq:
5903 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5906 ; CHECK-NEXT: #NO_APP
5907 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5909 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5910 %2 = or <8 x i64> %a0, %a1
5914 define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5915 ; CHECK-LABEL: stack_fold_porq_commuted:
5917 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5920 ; CHECK-NEXT: #NO_APP
5921 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5923 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5924 %2 = or <8 x i64> %a1, %a0
5928 define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5929 ; CHECK-LABEL: stack_fold_porq_mask:
5931 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5932 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
5935 ; CHECK-NEXT: #NO_APP
5936 ; CHECK-NEXT: kmovd %esi, %k1
5937 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
5938 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5940 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5941 %2 = or <8 x i64> %a0, %a1
5942 %3 = bitcast i8 %mask to <8 x i1>
5943 ; load needed to keep the operation from being scheduled about the asm block
5944 %4 = load <8 x i64>, ptr %a2
5945 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5949 define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
5950 ; CHECK-LABEL: stack_fold_porq_mask_commuted:
5952 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5953 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
5956 ; CHECK-NEXT: #NO_APP
5957 ; CHECK-NEXT: kmovd %esi, %k1
5958 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
5959 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5961 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5962 %2 = or <8 x i64> %a1, %a0
5963 %3 = bitcast i8 %mask to <8 x i1>
5964 ; load needed to keep the operation from being scheduled about the asm block
5965 %4 = load <8 x i64>, ptr %a2
5966 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5970 define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5971 ; CHECK-LABEL: stack_fold_porq_maskz:
5973 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5976 ; CHECK-NEXT: #NO_APP
5977 ; CHECK-NEXT: kmovd %edi, %k1
5978 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5980 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5981 %2 = or <8 x i64> %a0, %a1
5982 %3 = bitcast i8 %mask to <8 x i1>
5983 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5987 define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5988 ; CHECK-LABEL: stack_fold_porq_maskz_commuted:
5990 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5993 ; CHECK-NEXT: #NO_APP
5994 ; CHECK-NEXT: kmovd %edi, %k1
5995 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5997 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5998 %2 = or <8 x i64> %a1, %a0
5999 %3 = bitcast i8 %mask to <8 x i1>
6000 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
6004 define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) {
6005 ; CHECK-LABEL: stack_fold_psadbw:
6007 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6010 ; CHECK-NEXT: #NO_APP
6011 ; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6013 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6014 %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1)
6017 declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone
6019 define <8 x i64> @stack_fold_psadbw_commute(<64 x i8> %a0, <64 x i8> %a1) {
6020 ; CHECK-LABEL: stack_fold_psadbw_commute:
6022 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6025 ; CHECK-NEXT: #NO_APP
6026 ; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6028 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6029 %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a1, <64 x i8> %a0)
6033 define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
6034 ; CHECK-LABEL: stack_fold_pshufb_zmm:
6036 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6039 ; CHECK-NEXT: #NO_APP
6040 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6043 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6046 declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
6048 define <64 x i8> @stack_fold_pshufb_zmm_mask(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6049 ; CHECK-LABEL: stack_fold_pshufb_zmm_mask:
6051 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6054 ; CHECK-NEXT: #NO_APP
6055 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
6056 ; CHECK-NEXT: kmovq %rsi, %k1
6057 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6058 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
6060 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6061 %2 = load <64 x i8>, ptr %passthru
6062 %3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6063 %4 = bitcast i64 %mask to <64 x i1>
6064 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %2
6068 define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6069 ; CHECK-LABEL: stack_fold_pshufb_zmm_maskz:
6071 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6074 ; CHECK-NEXT: #NO_APP
6075 ; CHECK-NEXT: kmovq %rdi, %k1
6076 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6078 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6079 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6080 %3 = bitcast i64 %mask to <64 x i1>
6081 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
6085 define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) {
6086 ; CHECK-LABEL: stack_fold_pshufd_zmm:
6088 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6091 ; CHECK-NEXT: #NO_APP
6092 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6093 ; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6094 ; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
6095 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
6097 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6098 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6099 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
6103 define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
6104 ; CHECK-LABEL: stack_fold_pshufd_zmm_mask:
6106 ; CHECK-NEXT: pushq %rax
6107 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6108 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6109 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6112 ; CHECK-NEXT: #NO_APP
6113 ; CHECK-NEXT: kmovd %edi, %k1
6114 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6115 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6116 ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6117 ; CHECK-NEXT: popq %rax
6118 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6120 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6121 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6122 %3 = bitcast i16 %mask to <16 x i1>
6123 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %passthru
6127 define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) {
6128 ; CHECK-LABEL: stack_fold_pshufd_zmm_maskz:
6130 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6133 ; CHECK-NEXT: #NO_APP
6134 ; CHECK-NEXT: kmovd %edi, %k1
6135 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6136 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6138 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6139 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6140 %3 = bitcast i16 %mask to <16 x i1>
6141 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6145 define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) {
6146 ; CHECK-LABEL: stack_fold_pshufhw_zmm:
6148 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6151 ; CHECK-NEXT: #NO_APP
6152 ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6153 ; CHECK-NEXT: # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6155 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6156 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6160 define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6161 ; CHECK-LABEL: stack_fold_pshufhw_zmm_mask:
6163 ; CHECK-NEXT: pushq %rax
6164 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6165 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6166 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6169 ; CHECK-NEXT: #NO_APP
6170 ; CHECK-NEXT: kmovd %edi, %k1
6171 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6172 ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6173 ; CHECK-NEXT: # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6174 ; CHECK-NEXT: popq %rax
6175 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6177 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6178 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6179 %3 = bitcast i32 %mask to <32 x i1>
6180 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6184 define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6185 ; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz:
6187 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6190 ; CHECK-NEXT: #NO_APP
6191 ; CHECK-NEXT: kmovd %edi, %k1
6192 ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6193 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6196 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6197 %3 = bitcast i32 %mask to <32 x i1>
6198 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6202 define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) {
6203 ; CHECK-LABEL: stack_fold_pshuflw_zmm:
6205 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6208 ; CHECK-NEXT: #NO_APP
6209 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6210 ; CHECK-NEXT: # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6212 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6213 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6217 define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6218 ; CHECK-LABEL: stack_fold_pshuflw_zmm_mask:
6220 ; CHECK-NEXT: pushq %rax
6221 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6222 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6223 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6226 ; CHECK-NEXT: #NO_APP
6227 ; CHECK-NEXT: kmovd %edi, %k1
6228 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6229 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6230 ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6231 ; CHECK-NEXT: popq %rax
6232 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6234 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6235 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6236 %3 = bitcast i32 %mask to <32 x i1>
6237 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6241 define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6242 ; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz:
6244 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6247 ; CHECK-NEXT: #NO_APP
6248 ; CHECK-NEXT: kmovd %edi, %k1
6249 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6250 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6252 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6253 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6254 %3 = bitcast i32 %mask to <32 x i1>
6255 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6259 define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) {
6260 ; CHECK-LABEL: stack_fold_pslld:
6262 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6265 ; CHECK-NEXT: #NO_APP
6266 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6268 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6269 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6272 declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6274 define <16 x i32> @stack_fold_pslld_mask(ptr %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6275 ; CHECK-LABEL: stack_fold_pslld_mask:
6277 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6280 ; CHECK-NEXT: #NO_APP
6281 ; CHECK-NEXT: kmovd %esi, %k1
6282 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
6283 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
6284 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
6286 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6287 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6288 %3 = bitcast i16 %mask to <16 x i1>
6289 %4 = load <16 x i32>, ptr %passthru
6290 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6294 define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6295 ; CHECK-LABEL: stack_fold_pslld_maskz:
6297 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6300 ; CHECK-NEXT: #NO_APP
6301 ; CHECK-NEXT: kmovd %edi, %k1
6302 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload
6304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6305 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6306 %3 = bitcast i16 %mask to <16 x i1>
6307 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6311 define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) {
6312 ; CHECK-LABEL: stack_fold_pslldi:
6314 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6317 ; CHECK-NEXT: #NO_APP
6318 ; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6321 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6324 declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
6326 define <16 x i32> @stack_fold_pslldi_mask(ptr %passthru, <16 x i32> %a0, i16 %mask) {
6327 ; CHECK-LABEL: stack_fold_pslldi_mask:
6329 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6332 ; CHECK-NEXT: #NO_APP
6333 ; CHECK-NEXT: kmovd %esi, %k1
6334 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
6335 ; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
6336 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
6338 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6339 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6340 %3 = bitcast i16 %mask to <16 x i1>
6341 %4 = load <16 x i32>, ptr %passthru
6342 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6346 define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) {
6347 ; CHECK-LABEL: stack_fold_pslldi_maskz:
6349 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6352 ; CHECK-NEXT: #NO_APP
6353 ; CHECK-NEXT: kmovd %edi, %k1
6354 ; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6356 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6357 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6358 %3 = bitcast i16 %mask to <16 x i1>
6359 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6363 define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) {
6364 ; CHECK-LABEL: stack_fold_pslldq:
6366 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6369 ; CHECK-NEXT: #NO_APP
6370 ; CHECK-NEXT: vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6371 ; CHECK-NEXT: # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
6373 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6374 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
6378 define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) {
6379 ; CHECK-LABEL: stack_fold_psllq:
6381 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6384 ; CHECK-NEXT: #NO_APP
6385 ; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6387 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6388 %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1)
6391 declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6393 define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) {
6394 ; CHECK-LABEL: stack_fold_psllqi:
6396 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6399 ; CHECK-NEXT: #NO_APP
6400 ; CHECK-NEXT: vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6402 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6403 %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1)
6406 declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
6408 define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) {
6409 ; CHECK-LABEL: stack_fold_psllvd:
6411 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6414 ; CHECK-NEXT: #NO_APP
6415 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6417 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6418 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6421 declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6423 define <16 x i32> @stack_fold_psllvd_mask(ptr %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6424 ; CHECK-LABEL: stack_fold_psllvd_mask:
6426 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6429 ; CHECK-NEXT: #NO_APP
6430 ; CHECK-NEXT: kmovd %esi, %k1
6431 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
6432 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6433 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
6435 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6436 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6437 %3 = bitcast i16 %mask to <16 x i1>
6438 %4 = load <16 x i32>, ptr %passthru
6439 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6443 define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6444 ; CHECK-LABEL: stack_fold_psllvd_maskz:
6446 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6449 ; CHECK-NEXT: #NO_APP
6450 ; CHECK-NEXT: kmovd %edi, %k1
6451 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6453 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6454 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6455 %3 = bitcast i16 %mask to <16 x i1>
6456 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6460 define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) {
6461 ; CHECK-LABEL: stack_fold_psllvq:
6463 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6466 ; CHECK-NEXT: #NO_APP
6467 ; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6469 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6470 %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6473 declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6475 define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) {
6476 ; CHECK-LABEL: stack_fold_psllvw:
6478 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6481 ; CHECK-NEXT: #NO_APP
6482 ; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6484 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6485 %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6488 declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6490 define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) {
6491 ; CHECK-LABEL: stack_fold_psllw:
6493 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6496 ; CHECK-NEXT: #NO_APP
6497 ; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6499 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6500 %2 = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1)
6503 declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6505 define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) {
6506 ; CHECK-LABEL: stack_fold_psllwi:
6508 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6511 ; CHECK-NEXT: #NO_APP
6512 ; CHECK-NEXT: vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6514 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6515 %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1)
6518 declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone
6520 define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) {
6521 ; CHECK-LABEL: stack_fold_psrad:
6523 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6526 ; CHECK-NEXT: #NO_APP
6527 ; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6529 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6530 %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1)
6533 declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6535 define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) {
6536 ; CHECK-LABEL: stack_fold_psradi:
6538 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6541 ; CHECK-NEXT: #NO_APP
6542 ; CHECK-NEXT: vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6544 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6545 %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1)
6548 declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
6550 define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) {
6551 ; CHECK-LABEL: stack_fold_psraq:
6553 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6556 ; CHECK-NEXT: #NO_APP
6557 ; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6559 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6560 %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1)
6563 declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6565 define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) {
6566 ; CHECK-LABEL: stack_fold_psraqi:
6568 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6571 ; CHECK-NEXT: #NO_APP
6572 ; CHECK-NEXT: vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6574 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6575 %2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1)
6578 declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
6580 define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) {
6581 ; CHECK-LABEL: stack_fold_psravd:
6583 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6586 ; CHECK-NEXT: #NO_APP
6587 ; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6589 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6590 %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
6593 declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6595 define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) {
6596 ; CHECK-LABEL: stack_fold_psravq:
6598 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6601 ; CHECK-NEXT: #NO_APP
6602 ; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6604 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6605 %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
6608 declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6610 define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) {
6611 ; CHECK-LABEL: stack_fold_psravw:
6613 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6616 ; CHECK-NEXT: #NO_APP
6617 ; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6619 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6620 %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1)
6623 declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6625 define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) {
6626 ; CHECK-LABEL: stack_fold_psraw:
6628 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6631 ; CHECK-NEXT: #NO_APP
6632 ; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6634 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6635 %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1)
6638 declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6640 define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) {
6641 ; CHECK-LABEL: stack_fold_psrawi:
6643 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6646 ; CHECK-NEXT: #NO_APP
6647 ; CHECK-NEXT: vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6649 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6650 %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1)
6653 declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone
6655 define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) {
6656 ; CHECK-LABEL: stack_fold_psrld:
6658 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6661 ; CHECK-NEXT: #NO_APP
6662 ; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6664 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6665 %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1)
6668 declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6670 define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) {
6671 ; CHECK-LABEL: stack_fold_psrldi:
6673 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6676 ; CHECK-NEXT: #NO_APP
6677 ; CHECK-NEXT: vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6679 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6680 %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1)
6683 declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
6685 define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) {
6686 ; CHECK-LABEL: stack_fold_psrldq:
6688 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6691 ; CHECK-NEXT: #NO_APP
6692 ; CHECK-NEXT: vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6693 ; CHECK-NEXT: # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
6695 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6696 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
6700 define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) {
6701 ; CHECK-LABEL: stack_fold_psrlq:
6703 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6706 ; CHECK-NEXT: #NO_APP
6707 ; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6709 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6710 %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1)
6713 declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6715 define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) {
6716 ; CHECK-LABEL: stack_fold_psrlqi:
6718 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6721 ; CHECK-NEXT: #NO_APP
6722 ; CHECK-NEXT: vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6724 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6725 %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1)
6728 declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
6730 define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) {
6731 ; CHECK-LABEL: stack_fold_psrlvd:
6733 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6736 ; CHECK-NEXT: #NO_APP
6737 ; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6739 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6740 %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6743 declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6745 define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) {
6746 ; CHECK-LABEL: stack_fold_psrlvq:
6748 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6751 ; CHECK-NEXT: #NO_APP
6752 ; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6754 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6755 %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6758 declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6760 define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) {
6761 ; CHECK-LABEL: stack_fold_psrlvw:
6763 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6766 ; CHECK-NEXT: #NO_APP
6767 ; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6769 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6770 %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6773 declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6775 define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) {
6776 ; CHECK-LABEL: stack_fold_psrlw:
6778 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6781 ; CHECK-NEXT: #NO_APP
6782 ; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6784 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6785 %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1)
6788 declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6790 define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) {
6791 ; CHECK-LABEL: stack_fold_psrlwi:
6793 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6796 ; CHECK-NEXT: #NO_APP
6797 ; CHECK-NEXT: vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6800 %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1)
6803 declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone
6805 define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) {
6806 ; CHECK-LABEL: stack_fold_psubb:
6808 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6811 ; CHECK-NEXT: #NO_APP
6812 ; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6814 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6815 %2 = sub <64 x i8> %a0, %a1
6819 define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) {
6820 ; CHECK-LABEL: stack_fold_psubd:
6822 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6825 ; CHECK-NEXT: #NO_APP
6826 ; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6829 %2 = sub <16 x i32> %a0, %a1
6833 define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) {
6834 ; CHECK-LABEL: stack_fold_psubq:
6836 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6839 ; CHECK-NEXT: #NO_APP
6840 ; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6842 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6843 %2 = sub <8 x i64> %a0, %a1
6847 define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) {
6848 ; CHECK-LABEL: stack_fold_psubsb:
6850 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6853 ; CHECK-NEXT: #NO_APP
6854 ; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6856 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6857 %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6861 define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) {
6862 ; CHECK-LABEL: stack_fold_psubsw:
6864 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6867 ; CHECK-NEXT: #NO_APP
6868 ; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6870 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6871 %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6875 define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) {
6876 ; CHECK-LABEL: stack_fold_psubusb:
6878 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6881 ; CHECK-NEXT: #NO_APP
6882 ; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6884 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6885 %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6889 define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) {
6890 ; CHECK-LABEL: stack_fold_psubusw:
6892 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6895 ; CHECK-NEXT: #NO_APP
6896 ; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6898 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6899 %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6903 define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) {
6904 ; CHECK-LABEL: stack_fold_psubw:
6906 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6909 ; CHECK-NEXT: #NO_APP
6910 ; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6912 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6913 %2 = sub <32 x i16> %a0, %a1
6917 define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) {
6918 ; CHECK-LABEL: stack_fold_shufi64x2:
6920 ; CHECK-NEXT: pushq %rax
6921 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6922 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6923 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6926 ; CHECK-NEXT: #NO_APP
6927 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6928 ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6929 ; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
6930 ; CHECK-NEXT: popq %rax
6931 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6933 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6934 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6938 define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, ptr %passthru) {
6939 ; CHECK-LABEL: stack_fold_shufi64x2_mask:
6941 ; CHECK-NEXT: pushq %rax
6942 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6943 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6944 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6947 ; CHECK-NEXT: #NO_APP
6948 ; CHECK-NEXT: kmovd %edi, %k1
6949 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1
6950 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6951 ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
6952 ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
6953 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
6954 ; CHECK-NEXT: popq %rax
6955 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6958 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6959 %3 = bitcast i8 %mask to <8 x i1>
6960 ; load needed to keep the operation from being scheduled above the asm block
6961 %4 = load <8 x i64>, ptr %passthru
6962 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
6966 define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, ptr %passthru) {
6967 ; CHECK-LABEL: stack_fold_shufi64x2_maskz:
6969 ; CHECK-NEXT: pushq %rax
6970 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6971 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6972 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6975 ; CHECK-NEXT: #NO_APP
6976 ; CHECK-NEXT: kmovd %edi, %k1
6977 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6978 ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6979 ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
6980 ; CHECK-NEXT: popq %rax
6981 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6983 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6984 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6985 %3 = bitcast i8 %mask to <8 x i1>
6986 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
6990 define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, ptr %passthru) {
6991 ; CHECK-LABEL: stack_fold_shufi32x4_mask:
6993 ; CHECK-NEXT: pushq %rax
6994 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6995 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6996 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6999 ; CHECK-NEXT: #NO_APP
7000 ; CHECK-NEXT: kmovd %edi, %k1
7001 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1
7002 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7003 ; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
7004 ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
7005 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
7006 ; CHECK-NEXT: popq %rax
7007 ; CHECK-NEXT: .cfi_def_cfa_offset 8
7009 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7010 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
7011 %3 = bitcast i16 %mask to <16 x i1>
7012 ; load needed to keep the operation from being scheduled above the asm block
7013 %4 = load <16 x i32>, ptr %passthru
7014 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7018 define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
7019 ; CHECK-LABEL: stack_fold_shufi32x4_maskz:
7021 ; CHECK-NEXT: pushq %rax
7022 ; CHECK-NEXT: .cfi_def_cfa_offset 16
7023 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7024 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7027 ; CHECK-NEXT: #NO_APP
7028 ; CHECK-NEXT: kmovd %edi, %k1
7029 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7030 ; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7031 ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
7032 ; CHECK-NEXT: popq %rax
7033 ; CHECK-NEXT: .cfi_def_cfa_offset 8
7035 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7036 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
7037 %3 = bitcast i16 %mask to <16 x i1>
7038 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7042 define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
7043 ; CHECK-LABEL: stack_fold_ternlogd:
7045 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7048 ; CHECK-NEXT: #NO_APP
7049 ; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7050 ; CHECK-NEXT: # zmm0 = ~(zmm1 | (zmm0 ^ mem))
7052 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7053 %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
7056 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
7058 define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
7059 ; CHECK-LABEL: stack_fold_ternlogq:
7061 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7064 ; CHECK-NEXT: #NO_APP
7065 ; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7066 ; CHECK-NEXT: # zmm0 = ~(zmm1 | (zmm0 ^ mem))
7068 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7069 %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
7073 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
7075 define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
7076 ; CHECK-LABEL: stack_fold_punpckhbw_zmm:
7078 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7081 ; CHECK-NEXT: #NO_APP
7082 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7083 ; CHECK-NEXT: # zmm0 = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7085 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7086 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7090 define <64 x i8> @stack_fold_punpckhbw_mask_zmm(ptr %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7091 ; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm:
7093 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7096 ; CHECK-NEXT: #NO_APP
7097 ; CHECK-NEXT: kmovq %rsi, %k1
7098 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
7099 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
7100 ; CHECK-NEXT: # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7101 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
7103 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7104 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7105 %3 = bitcast i64 %mask to <64 x i1>
7106 ; load needed to keep the operation from being scheduled about the asm block
7107 %4 = load <64 x i8>, ptr %passthru
7108 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
7112 define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7113 ; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm:
7115 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7118 ; CHECK-NEXT: #NO_APP
7119 ; CHECK-NEXT: kmovq %rdi, %k1
7120 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7121 ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7123 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7124 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7125 %3 = bitcast i64 %mask to <64 x i1>
7126 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
7130 define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) {
7131 ; CHECK-LABEL: stack_fold_pxord:
7133 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7136 ; CHECK-NEXT: #NO_APP
7137 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7139 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7140 %2 = xor <16 x i32> %a0, %a1
7144 define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
7145 ; CHECK-LABEL: stack_fold_pxord_commuted:
7147 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7150 ; CHECK-NEXT: #NO_APP
7151 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7153 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7154 %2 = xor <16 x i32> %a1, %a0
7158 define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
7159 ; CHECK-LABEL: stack_fold_pxord_mask:
7161 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7162 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
7165 ; CHECK-NEXT: #NO_APP
7166 ; CHECK-NEXT: kmovd %esi, %k1
7167 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
7168 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7170 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7171 %2 = xor <16 x i32> %a0, %a1
7172 %3 = bitcast i16 %mask to <16 x i1>
7173 ; load needed to keep the operation from being scheduled about the asm block
7174 %4 = load <16 x i32>, ptr %a2
7175 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7179 define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, ptr %a2, i16 %mask) {
7180 ; CHECK-LABEL: stack_fold_pxord_mask_commuted:
7182 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7183 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
7186 ; CHECK-NEXT: #NO_APP
7187 ; CHECK-NEXT: kmovd %esi, %k1
7188 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
7189 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7191 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7192 %2 = xor <16 x i32> %a1, %a0
7193 %3 = bitcast i16 %mask to <16 x i1>
7194 ; load needed to keep the operation from being scheduled about the asm block
7195 %4 = load <16 x i32>, ptr %a2
7196 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7200 define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7201 ; CHECK-LABEL: stack_fold_pxord_maskz:
7203 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7206 ; CHECK-NEXT: #NO_APP
7207 ; CHECK-NEXT: kmovd %edi, %k1
7208 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7211 %2 = xor <16 x i32> %a0, %a1
7212 %3 = bitcast i16 %mask to <16 x i1>
7213 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7217 define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7218 ; CHECK-LABEL: stack_fold_pxord_maskz_commuted:
7220 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7223 ; CHECK-NEXT: #NO_APP
7224 ; CHECK-NEXT: kmovd %edi, %k1
7225 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7227 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7228 %2 = xor <16 x i32> %a1, %a0
7229 %3 = bitcast i16 %mask to <16 x i1>
7230 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7234 define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) {
7235 ; CHECK-LABEL: stack_fold_pxorq:
7237 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7240 ; CHECK-NEXT: #NO_APP
7241 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7243 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7244 %2 = xor <8 x i64> %a0, %a1
7248 define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
7249 ; CHECK-LABEL: stack_fold_pxorq_commuted:
7251 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7254 ; CHECK-NEXT: #NO_APP
7255 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7257 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7258 %2 = xor <8 x i64> %a1, %a0
7262 define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
7263 ; CHECK-LABEL: stack_fold_pxorq_mask:
7265 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7266 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
7269 ; CHECK-NEXT: #NO_APP
7270 ; CHECK-NEXT: kmovd %esi, %k1
7271 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
7272 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7274 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7275 %2 = xor <8 x i64> %a0, %a1
7276 %3 = bitcast i8 %mask to <8 x i1>
7277 ; load needed to keep the operation from being scheduled about the asm block
7278 %4 = load <8 x i64>, ptr %a2
7279 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7283 define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, ptr %a2, i8 %mask) {
7284 ; CHECK-LABEL: stack_fold_pxorq_mask_commuted:
7286 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7287 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
7290 ; CHECK-NEXT: #NO_APP
7291 ; CHECK-NEXT: kmovd %esi, %k1
7292 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
7293 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7295 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7296 %2 = xor <8 x i64> %a1, %a0
7297 %3 = bitcast i8 %mask to <8 x i1>
7298 ; load needed to keep the operation from being scheduled about the asm block
7299 %4 = load <8 x i64>, ptr %a2
7300 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7304 define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7305 ; CHECK-LABEL: stack_fold_pxorq_maskz:
7307 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7310 ; CHECK-NEXT: #NO_APP
7311 ; CHECK-NEXT: kmovd %edi, %k1
7312 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7314 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7315 %2 = xor <8 x i64> %a0, %a1
7316 %3 = bitcast i8 %mask to <8 x i1>
7317 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7321 define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7322 ; CHECK-LABEL: stack_fold_pxorq_maskz_commuted:
7324 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7327 ; CHECK-NEXT: #NO_APP
7328 ; CHECK-NEXT: kmovd %edi, %k1
7329 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7331 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7332 %2 = xor <8 x i64> %a1, %a0
7333 %3 = bitcast i8 %mask to <8 x i1>
7334 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7338 declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
7339 declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)
7340 declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
7341 declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>)
7342 declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>)
7343 declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>)
7344 declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
7345 declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1)
7346 declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
7347 declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)
7348 declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
7349 declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)