1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
12 define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
13 ; CHECK-LABEL: stack_fold_valignd:
15 ; CHECK-NEXT: subq $56, %rsp
16 ; CHECK-NEXT: .cfi_def_cfa_offset 64
17 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23 ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
24 ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
25 ; CHECK-NEXT: addq $56, %rsp
26 ; CHECK-NEXT: .cfi_def_cfa_offset 8
28 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
29 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
33 define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) {
34 ; CHECK-LABEL: stack_fold_valignd_mask:
36 ; CHECK-NEXT: subq $56, %rsp
37 ; CHECK-NEXT: .cfi_def_cfa_offset 64
38 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
39 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
43 ; CHECK-NEXT: kmovd %esi, %k1
44 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
45 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
46 ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
47 ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
48 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
49 ; CHECK-NEXT: addq $56, %rsp
50 ; CHECK-NEXT: .cfi_def_cfa_offset 8
52 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
53 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
54 %3 = bitcast i16 %mask to <16 x i1>
55 %4 = load <16 x i32>, <16 x i32>* %passthru
56 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
60 define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
61 ; CHECK-LABEL: stack_fold_valignd_maskz:
63 ; CHECK-NEXT: subq $56, %rsp
64 ; CHECK-NEXT: .cfi_def_cfa_offset 64
65 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
66 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
70 ; CHECK-NEXT: kmovd %edi, %k1
71 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
72 ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
73 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
74 ; CHECK-NEXT: addq $56, %rsp
75 ; CHECK-NEXT: .cfi_def_cfa_offset 8
77 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
78 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
79 %3 = bitcast i16 %mask to <16 x i1>
80 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
84 define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
85 ; CHECK-LABEL: stack_fold_valignq:
87 ; CHECK-NEXT: subq $56, %rsp
88 ; CHECK-NEXT: .cfi_def_cfa_offset 64
89 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
90 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
94 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
95 ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
96 ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0]
97 ; CHECK-NEXT: addq $56, %rsp
98 ; CHECK-NEXT: .cfi_def_cfa_offset 8
100 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
101 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
105 define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) {
106 ; CHECK-LABEL: stack_fold_valignq_mask:
108 ; CHECK-NEXT: subq $56, %rsp
109 ; CHECK-NEXT: .cfi_def_cfa_offset 64
110 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
111 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
114 ; CHECK-NEXT: #NO_APP
115 ; CHECK-NEXT: kmovd %esi, %k1
116 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
117 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
118 ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
119 ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0]
120 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
121 ; CHECK-NEXT: addq $56, %rsp
122 ; CHECK-NEXT: .cfi_def_cfa_offset 8
124 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
125 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
126 %3 = bitcast i8 %mask to <8 x i1>
127 %4 = load <8 x i64>, <8 x i64>* %passthru
128 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
132 define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
133 ; CHECK-LABEL: stack_fold_valignq_maskz:
135 ; CHECK-NEXT: subq $56, %rsp
136 ; CHECK-NEXT: .cfi_def_cfa_offset 64
137 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
138 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
141 ; CHECK-NEXT: #NO_APP
142 ; CHECK-NEXT: kmovd %edi, %k1
143 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
144 ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
145 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0]
146 ; CHECK-NEXT: addq $56, %rsp
147 ; CHECK-NEXT: .cfi_def_cfa_offset 8
149 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
150 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
151 %3 = bitcast i8 %mask to <8 x i1>
152 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
156 define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
157 ; CHECK-LABEL: stack_fold_pavgb:
159 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
162 ; CHECK-NEXT: #NO_APP
163 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
165 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
166 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
169 declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)
171 define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
172 ; CHECK-LABEL: stack_fold_pavgb_commuted:
174 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
177 ; CHECK-NEXT: #NO_APP
178 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
180 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
181 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
185 define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
186 ; CHECK-LABEL: stack_fold_pavgb_mask:
188 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
189 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
192 ; CHECK-NEXT: #NO_APP
193 ; CHECK-NEXT: kmovq %rsi, %k1
194 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
195 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
197 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
198 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
199 %3 = bitcast i64 %mask to <64 x i1>
200 ; load needed to keep the operation from being scheduled about the asm block
201 %4 = load <64 x i8>, <64 x i8>* %a2
202 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
206 define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
207 ; CHECK-LABEL: stack_fold_pavgb_mask_commuted:
209 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
210 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
213 ; CHECK-NEXT: #NO_APP
214 ; CHECK-NEXT: kmovq %rsi, %k1
215 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
216 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
218 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
219 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
220 %3 = bitcast i64 %mask to <64 x i1>
221 ; load needed to keep the operation from being scheduled about the asm block
222 %4 = load <64 x i8>, <64 x i8>* %a2
223 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
227 define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
228 ; CHECK-LABEL: stack_fold_pavgb_maskz:
230 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
233 ; CHECK-NEXT: #NO_APP
234 ; CHECK-NEXT: kmovq %rdi, %k1
235 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
237 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
238 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
239 %3 = bitcast i64 %mask to <64 x i1>
240 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
244 define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
245 ; CHECK-LABEL: stack_fold_pavgb_maskz_commuted:
247 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
250 ; CHECK-NEXT: #NO_APP
251 ; CHECK-NEXT: kmovq %rdi, %k1
252 ; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
255 %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
256 %3 = bitcast i64 %mask to <64 x i1>
257 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
261 define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
262 ; CHECK-LABEL: stack_fold_pavgw:
264 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
267 ; CHECK-NEXT: #NO_APP
268 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
270 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
271 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
274 declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)
276 define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
277 ; CHECK-LABEL: stack_fold_pavgw_commuted:
279 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
282 ; CHECK-NEXT: #NO_APP
283 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
285 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
286 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
290 define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
291 ; CHECK-LABEL: stack_fold_pavgw_mask:
293 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
294 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
297 ; CHECK-NEXT: #NO_APP
298 ; CHECK-NEXT: kmovd %esi, %k1
299 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
300 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
303 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
304 %3 = bitcast i32 %mask to <32 x i1>
305 ; load needed to keep the operation from being scheduled about the asm block
306 %4 = load <32 x i16>, <32 x i16>* %a2
307 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
311 define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
312 ; CHECK-LABEL: stack_fold_pavgw_mask_commuted:
314 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
315 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
318 ; CHECK-NEXT: #NO_APP
319 ; CHECK-NEXT: kmovd %esi, %k1
320 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
321 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
324 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
325 %3 = bitcast i32 %mask to <32 x i1>
326 ; load needed to keep the operation from being scheduled about the asm block
327 %4 = load <32 x i16>, <32 x i16>* %a2
328 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
332 define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
333 ; CHECK-LABEL: stack_fold_pavgw_maskz:
335 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
338 ; CHECK-NEXT: #NO_APP
339 ; CHECK-NEXT: kmovd %edi, %k1
340 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
343 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
344 %3 = bitcast i32 %mask to <32 x i1>
345 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
349 define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
350 ; CHECK-LABEL: stack_fold_pavgw_maskz_commuted:
352 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
355 ; CHECK-NEXT: #NO_APP
356 ; CHECK-NEXT: kmovd %edi, %k1
357 ; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
359 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
360 %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
361 %3 = bitcast i32 %mask to <32 x i1>
362 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
366 define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
367 ; CHECK-LABEL: stack_fold_extracti32x4:
369 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
370 ; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
373 ; CHECK-NEXT: #NO_APP
374 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
375 ; CHECK-NEXT: vzeroupper
377 ; zext forces execution domain
378 %1 = zext <16 x i16> %a0 to <16 x i32>
379 %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
380 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
384 define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
385 ; CHECK-LABEL: stack_fold_extracti64x2:
387 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
388 ; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
391 ; CHECK-NEXT: #NO_APP
392 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
393 ; CHECK-NEXT: vzeroupper
395 ; zext forces execution domain
396 %1 = zext <8 x i32> %a0 to <8 x i64>
397 %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
398 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
402 define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
403 ; CHECK-LABEL: stack_fold_extracti32x8:
405 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
406 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
409 ; CHECK-NEXT: #NO_APP
410 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
412 ; zext forces execution domain
413 %1 = zext <16 x i16> %a0 to <16 x i32>
414 %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
415 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
419 define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
420 ; CHECK-LABEL: stack_fold_extracti64x4:
422 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
423 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
426 ; CHECK-NEXT: #NO_APP
427 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
429 ; zext forces execution domain
430 %1 = zext <8 x i32> %a0 to <8 x i64>
431 %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
432 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
436 define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
437 ; CHECK-LABEL: stack_fold_inserti32x8:
439 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
440 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
443 ; CHECK-NEXT: #NO_APP
444 ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
445 ; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
446 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
448 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
449 %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
450 ; add forces execution domain
451 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
455 define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) {
456 ; CHECK-LABEL: stack_fold_inserti64x4:
458 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
459 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
462 ; CHECK-NEXT: #NO_APP
463 ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
464 ; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
465 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
467 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
468 %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
469 ; add forces execution domain
470 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
474 define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) {
475 ; CHECK-LABEL: stack_fold_pabsb:
477 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
480 ; CHECK-NEXT: #NO_APP
481 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
483 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
484 %2 = icmp sgt <64 x i8> %a0, zeroinitializer
485 %3 = sub <64 x i8> zeroinitializer, %a0
486 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
490 define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) {
491 ; CHECK-LABEL: stack_fold_pabsb_mask:
493 ; CHECK-NEXT: subq $56, %rsp
494 ; CHECK-NEXT: .cfi_def_cfa_offset 64
495 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
496 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
499 ; CHECK-NEXT: #NO_APP
500 ; CHECK-NEXT: kmovq %rdi, %k1
501 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
502 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
503 ; CHECK-NEXT: addq $56, %rsp
504 ; CHECK-NEXT: .cfi_def_cfa_offset 8
506 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
507 %2 = icmp sgt <64 x i8> %a0, zeroinitializer
508 %3 = sub <64 x i8> zeroinitializer, %a0
509 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
510 %5 = bitcast i64 %mask to <64 x i1>
511 %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> %passthru
515 define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) {
516 ; CHECK-LABEL: stack_fold_pabsb_maskz:
518 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
521 ; CHECK-NEXT: #NO_APP
522 ; CHECK-NEXT: kmovq %rdi, %k1
523 ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
526 %2 = icmp sgt <64 x i8> %a0, zeroinitializer
527 %3 = sub <64 x i8> zeroinitializer, %a0
528 %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
529 %5 = bitcast i64 %mask to <64 x i1>
530 %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
534 define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) {
535 ; CHECK-LABEL: stack_fold_pabsd:
537 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
540 ; CHECK-NEXT: #NO_APP
541 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
544 %2 = icmp sgt <16 x i32> %a0, zeroinitializer
545 %3 = sub <16 x i32> zeroinitializer, %a0
546 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
550 define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
551 ; CHECK-LABEL: stack_fold_pabsd_mask:
553 ; CHECK-NEXT: subq $56, %rsp
554 ; CHECK-NEXT: .cfi_def_cfa_offset 64
555 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
556 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
559 ; CHECK-NEXT: #NO_APP
560 ; CHECK-NEXT: kmovd %edi, %k1
561 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
562 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
563 ; CHECK-NEXT: addq $56, %rsp
564 ; CHECK-NEXT: .cfi_def_cfa_offset 8
566 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
567 %2 = icmp sgt <16 x i32> %a0, zeroinitializer
568 %3 = sub <16 x i32> zeroinitializer, %a0
569 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
570 %5 = bitcast i16 %mask to <16 x i1>
571 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> %passthru
575 define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) {
576 ; CHECK-LABEL: stack_fold_pabsd_maskz:
578 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
581 ; CHECK-NEXT: #NO_APP
582 ; CHECK-NEXT: kmovd %edi, %k1
583 ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
585 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
586 %2 = icmp sgt <16 x i32> %a0, zeroinitializer
587 %3 = sub <16 x i32> zeroinitializer, %a0
588 %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
589 %5 = bitcast i16 %mask to <16 x i1>
590 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
594 define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) {
595 ; CHECK-LABEL: stack_fold_pabsq:
597 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
600 ; CHECK-NEXT: #NO_APP
601 ; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
603 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
604 %2 = icmp sgt <8 x i64> %a0, zeroinitializer
605 %3 = sub <8 x i64> zeroinitializer, %a0
606 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
610 define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) {
611 ; CHECK-LABEL: stack_fold_pabsq_mask:
613 ; CHECK-NEXT: subq $56, %rsp
614 ; CHECK-NEXT: .cfi_def_cfa_offset 64
615 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
616 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
619 ; CHECK-NEXT: #NO_APP
620 ; CHECK-NEXT: kmovd %edi, %k1
621 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
622 ; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
623 ; CHECK-NEXT: addq $56, %rsp
624 ; CHECK-NEXT: .cfi_def_cfa_offset 8
626 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
627 %2 = icmp sgt <8 x i64> %a0, zeroinitializer
628 %3 = sub <8 x i64> zeroinitializer, %a0
629 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
630 %5 = bitcast i8 %mask to <8 x i1>
631 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %passthru
635 define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) {
636 ; CHECK-LABEL: stack_fold_pabsq_maskz:
638 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
641 ; CHECK-NEXT: #NO_APP
642 ; CHECK-NEXT: kmovd %edi, %k1
643 ; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
645 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
646 %2 = icmp sgt <8 x i64> %a0, zeroinitializer
647 %3 = sub <8 x i64> zeroinitializer, %a0
648 %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
649 %5 = bitcast i8 %mask to <8 x i1>
650 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
654 define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) {
655 ; CHECK-LABEL: stack_fold_pabsw:
657 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
660 ; CHECK-NEXT: #NO_APP
661 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
663 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
664 %2 = icmp sgt <32 x i16> %a0, zeroinitializer
665 %3 = sub <32 x i16> zeroinitializer, %a0
666 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
670 define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
671 ; CHECK-LABEL: stack_fold_pabsw_mask:
673 ; CHECK-NEXT: subq $56, %rsp
674 ; CHECK-NEXT: .cfi_def_cfa_offset 64
675 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
676 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
679 ; CHECK-NEXT: #NO_APP
680 ; CHECK-NEXT: kmovd %edi, %k1
681 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
682 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
683 ; CHECK-NEXT: addq $56, %rsp
684 ; CHECK-NEXT: .cfi_def_cfa_offset 8
686 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
687 %2 = icmp sgt <32 x i16> %a0, zeroinitializer
688 %3 = sub <32 x i16> zeroinitializer, %a0
689 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
690 %5 = bitcast i32 %mask to <32 x i1>
691 %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> %passthru
695 define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) {
696 ; CHECK-LABEL: stack_fold_pabsw_maskz:
698 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
701 ; CHECK-NEXT: #NO_APP
702 ; CHECK-NEXT: kmovd %edi, %k1
703 ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
705 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
706 %2 = icmp sgt <32 x i16> %a0, zeroinitializer
707 %3 = sub <32 x i16> zeroinitializer, %a0
708 %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
709 %5 = bitcast i32 %mask to <32 x i1>
710 %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer
714 define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) {
715 ; CHECK-LABEL: stack_fold_packssdw:
717 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
720 ; CHECK-NEXT: #NO_APP
721 ; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
723 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
724 %2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1)
727 declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
729 define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) {
730 ; CHECK-LABEL: stack_fold_packsswb:
732 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
735 ; CHECK-NEXT: #NO_APP
736 ; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
739 %2 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1)
742 declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
744 define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) {
745 ; CHECK-LABEL: stack_fold_packusdw:
747 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
750 ; CHECK-NEXT: #NO_APP
751 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
753 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
754 %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
757 declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
759 define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
760 ; CHECK-LABEL: stack_fold_packusdw_mask:
762 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
765 ; CHECK-NEXT: #NO_APP
766 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
767 ; CHECK-NEXT: kmovd %esi, %k1
768 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
769 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
772 %2 = load <32 x i16>, <32 x i16>* %passthru
773 %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
774 %4 = bitcast i32 %mask to <32 x i1>
775 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %2
779 define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
780 ; CHECK-LABEL: stack_fold_packusdw_maskz:
782 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
785 ; CHECK-NEXT: #NO_APP
786 ; CHECK-NEXT: kmovd %edi, %k1
787 ; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
789 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
790 %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
791 %3 = bitcast i32 %mask to <32 x i1>
792 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
796 define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) {
797 ; CHECK-LABEL: stack_fold_packuswb:
799 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
802 ; CHECK-NEXT: #NO_APP
803 ; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
805 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
806 %2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1)
809 declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
811 define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
812 ; CHECK-LABEL: stack_fold_paddb:
814 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
817 ; CHECK-NEXT: #NO_APP
818 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
821 %2 = add <64 x i8> %a0, %a1
825 define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
826 ; CHECK-LABEL: stack_fold_paddb_commuted:
828 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
831 ; CHECK-NEXT: #NO_APP
832 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
834 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
835 %2 = add <64 x i8> %a1, %a0
839 define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
840 ; CHECK-LABEL: stack_fold_paddb_mask:
842 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
843 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
846 ; CHECK-NEXT: #NO_APP
847 ; CHECK-NEXT: kmovq %rsi, %k1
848 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
849 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
851 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
852 %2 = add <64 x i8> %a0, %a1
853 %3 = bitcast i64 %mask to <64 x i1>
854 ; load needed to keep the operation from being scheduled about the asm block
855 %4 = load <64 x i8>, <64 x i8>* %a2
856 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
860 define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
861 ; CHECK-LABEL: stack_fold_paddb_mask_commuted:
863 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
864 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
867 ; CHECK-NEXT: #NO_APP
868 ; CHECK-NEXT: kmovq %rsi, %k1
869 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
870 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
872 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
873 %2 = add <64 x i8> %a1, %a0
874 %3 = bitcast i64 %mask to <64 x i1>
875 ; load needed to keep the operation from being scheduled about the asm block
876 %4 = load <64 x i8>, <64 x i8>* %a2
877 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
881 define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
882 ; CHECK-LABEL: stack_fold_paddb_maskz:
884 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
887 ; CHECK-NEXT: #NO_APP
888 ; CHECK-NEXT: kmovq %rdi, %k1
889 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
891 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
892 %2 = add <64 x i8> %a0, %a1
893 %3 = bitcast i64 %mask to <64 x i1>
894 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
898 define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
899 ; CHECK-LABEL: stack_fold_paddb_maskz_commuted:
901 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
904 ; CHECK-NEXT: #NO_APP
905 ; CHECK-NEXT: kmovq %rdi, %k1
906 ; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
908 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
909 %2 = add <64 x i8> %a1, %a0
910 %3 = bitcast i64 %mask to <64 x i1>
911 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
915 define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) {
916 ; CHECK-LABEL: stack_fold_paddd:
918 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
921 ; CHECK-NEXT: #NO_APP
922 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
924 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
925 %2 = add <16 x i32> %a0, %a1
929 define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
930 ; CHECK-LABEL: stack_fold_paddd_commuted:
932 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
935 ; CHECK-NEXT: #NO_APP
936 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
938 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
939 %2 = add <16 x i32> %a1, %a0
943 define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
944 ; CHECK-LABEL: stack_fold_paddd_mask:
946 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
947 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
950 ; CHECK-NEXT: #NO_APP
951 ; CHECK-NEXT: kmovd %esi, %k1
952 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
953 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
955 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
956 %2 = add <16 x i32> %a0, %a1
957 %3 = bitcast i16 %mask to <16 x i1>
958 ; load needed to keep the operation from being scheduled about the asm block
959 %4 = load <16 x i32>, <16 x i32>* %a2
960 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
964 define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
965 ; CHECK-LABEL: stack_fold_paddd_mask_commuted:
967 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
968 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
971 ; CHECK-NEXT: #NO_APP
972 ; CHECK-NEXT: kmovd %esi, %k1
973 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
974 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
976 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
977 %2 = add <16 x i32> %a1, %a0
978 %3 = bitcast i16 %mask to <16 x i1>
979 ; load needed to keep the operation from being scheduled about the asm block
980 %4 = load <16 x i32>, <16 x i32>* %a2
981 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
985 define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
986 ; CHECK-LABEL: stack_fold_paddd_maskz:
988 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
991 ; CHECK-NEXT: #NO_APP
992 ; CHECK-NEXT: kmovd %edi, %k1
993 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
995 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
996 %2 = add <16 x i32> %a0, %a1
997 %3 = bitcast i16 %mask to <16 x i1>
998 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1002 define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1003 ; CHECK-LABEL: stack_fold_paddd_maskz_commuted:
1005 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1008 ; CHECK-NEXT: #NO_APP
1009 ; CHECK-NEXT: kmovd %edi, %k1
1010 ; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1012 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1013 %2 = add <16 x i32> %a1, %a0
1014 %3 = bitcast i16 %mask to <16 x i1>
1015 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1019 define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) {
1020 ; CHECK-LABEL: stack_fold_paddq:
1022 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1025 ; CHECK-NEXT: #NO_APP
1026 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1028 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1029 %2 = add <8 x i64> %a0, %a1
1033 define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1034 ; CHECK-LABEL: stack_fold_paddq_commuted:
1036 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1039 ; CHECK-NEXT: #NO_APP
1040 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1042 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1043 %2 = add <8 x i64> %a1, %a0
1047 define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1048 ; CHECK-LABEL: stack_fold_paddq_mask:
1050 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1051 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1054 ; CHECK-NEXT: #NO_APP
1055 ; CHECK-NEXT: kmovd %esi, %k1
1056 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1057 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1059 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1060 %2 = add <8 x i64> %a0, %a1
1061 %3 = bitcast i8 %mask to <8 x i1>
1062 ; load needed to keep the operation from being scheduled about the asm block
1063 %4 = load <8 x i64>, <8 x i64>* %a2
1064 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1068 define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1069 ; CHECK-LABEL: stack_fold_paddq_mask_commuted:
1071 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1072 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1075 ; CHECK-NEXT: #NO_APP
1076 ; CHECK-NEXT: kmovd %esi, %k1
1077 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1078 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1080 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1081 %2 = add <8 x i64> %a1, %a0
1082 %3 = bitcast i8 %mask to <8 x i1>
1083 ; load needed to keep the operation from being scheduled about the asm block
1084 %4 = load <8 x i64>, <8 x i64>* %a2
1085 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1089 define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1090 ; CHECK-LABEL: stack_fold_paddq_maskz:
1092 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1095 ; CHECK-NEXT: #NO_APP
1096 ; CHECK-NEXT: kmovd %edi, %k1
1097 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1099 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1100 %2 = add <8 x i64> %a0, %a1
1101 %3 = bitcast i8 %mask to <8 x i1>
1102 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1106 define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1107 ; CHECK-LABEL: stack_fold_paddq_maskz_commuted:
1109 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1112 ; CHECK-NEXT: #NO_APP
1113 ; CHECK-NEXT: kmovd %edi, %k1
1114 ; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1116 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1117 %2 = add <8 x i64> %a1, %a0
1118 %3 = bitcast i8 %mask to <8 x i1>
1119 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1123 define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) {
1124 ; CHECK-LABEL: stack_fold_paddsb:
1126 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1129 ; CHECK-NEXT: #NO_APP
1130 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1132 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1133 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1137 define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1138 ; CHECK-LABEL: stack_fold_paddsb_commuted:
1140 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1143 ; CHECK-NEXT: #NO_APP
1144 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1146 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1147 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1151 define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1152 ; CHECK-LABEL: stack_fold_paddsb_mask:
1154 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1155 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1158 ; CHECK-NEXT: #NO_APP
1159 ; CHECK-NEXT: kmovq %rsi, %k1
1160 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1161 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1164 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1165 %3 = bitcast i64 %mask to <64 x i1>
1166 ; load needed to keep the operation from being scheduled about the asm block
1167 %4 = load <64 x i8>, <64 x i8>* %a2
1168 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1172 define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1173 ; CHECK-LABEL: stack_fold_paddsb_mask_commuted:
1175 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1176 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1179 ; CHECK-NEXT: #NO_APP
1180 ; CHECK-NEXT: kmovq %rsi, %k1
1181 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1182 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1184 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1185 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1186 %3 = bitcast i64 %mask to <64 x i1>
1187 ; load needed to keep the operation from being scheduled about the asm block
1188 %4 = load <64 x i8>, <64 x i8>* %a2
1189 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1193 define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1194 ; CHECK-LABEL: stack_fold_paddsb_maskz:
1196 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1199 ; CHECK-NEXT: #NO_APP
1200 ; CHECK-NEXT: kmovq %rdi, %k1
1201 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1204 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1205 %3 = bitcast i64 %mask to <64 x i1>
1206 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1210 define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1211 ; CHECK-LABEL: stack_fold_paddsb_maskz_commuted:
1213 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1216 ; CHECK-NEXT: #NO_APP
1217 ; CHECK-NEXT: kmovq %rdi, %k1
1218 ; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1220 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1221 %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1222 %3 = bitcast i64 %mask to <64 x i1>
1223 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1227 define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) {
1228 ; CHECK-LABEL: stack_fold_paddsw:
1230 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1233 ; CHECK-NEXT: #NO_APP
1234 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1236 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1237 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1241 define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1242 ; CHECK-LABEL: stack_fold_paddsw_commuted:
1244 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1247 ; CHECK-NEXT: #NO_APP
1248 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1250 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1251 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1255 define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1256 ; CHECK-LABEL: stack_fold_paddsw_mask:
1258 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1259 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1262 ; CHECK-NEXT: #NO_APP
1263 ; CHECK-NEXT: kmovd %esi, %k1
1264 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1265 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1267 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1268 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1269 %3 = bitcast i32 %mask to <32 x i1>
1270 ; load needed to keep the operation from being scheduled about the asm block
1271 %4 = load <32 x i16>, <32 x i16>* %a2
1272 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1276 define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1277 ; CHECK-LABEL: stack_fold_paddsw_mask_commuted:
1279 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1280 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1283 ; CHECK-NEXT: #NO_APP
1284 ; CHECK-NEXT: kmovd %esi, %k1
1285 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1286 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1289 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1290 %3 = bitcast i32 %mask to <32 x i1>
1291 ; load needed to keep the operation from being scheduled about the asm block
1292 %4 = load <32 x i16>, <32 x i16>* %a2
1293 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1297 define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1298 ; CHECK-LABEL: stack_fold_paddsw_maskz:
1300 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1303 ; CHECK-NEXT: #NO_APP
1304 ; CHECK-NEXT: kmovd %edi, %k1
1305 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1307 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1308 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1309 %3 = bitcast i32 %mask to <32 x i1>
1310 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1314 define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1315 ; CHECK-LABEL: stack_fold_paddsw_maskz_commuted:
1317 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1320 ; CHECK-NEXT: #NO_APP
1321 ; CHECK-NEXT: kmovd %edi, %k1
1322 ; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1324 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1325 %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1326 %3 = bitcast i32 %mask to <32 x i1>
1327 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1331 define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) {
1332 ; CHECK-LABEL: stack_fold_paddusb:
1334 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1337 ; CHECK-NEXT: #NO_APP
1338 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1340 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1341 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1345 define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1346 ; CHECK-LABEL: stack_fold_paddusb_commuted:
1348 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1351 ; CHECK-NEXT: #NO_APP
1352 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1354 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1355 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1359 define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1360 ; CHECK-LABEL: stack_fold_paddusb_mask:
1362 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1363 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1366 ; CHECK-NEXT: #NO_APP
1367 ; CHECK-NEXT: kmovq %rsi, %k1
1368 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1369 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1372 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1373 %3 = bitcast i64 %mask to <64 x i1>
1374 ; load needed to keep the operation from being scheduled about the asm block
1375 %4 = load <64 x i8>, <64 x i8>* %a2
1376 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1380 define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1381 ; CHECK-LABEL: stack_fold_paddusb_mask_commuted:
1383 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1384 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1387 ; CHECK-NEXT: #NO_APP
1388 ; CHECK-NEXT: kmovq %rsi, %k1
1389 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1390 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1392 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1393 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1394 %3 = bitcast i64 %mask to <64 x i1>
1395 ; load needed to keep the operation from being scheduled about the asm block
1396 %4 = load <64 x i8>, <64 x i8>* %a2
1397 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1401 define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1402 ; CHECK-LABEL: stack_fold_paddusb_maskz:
1404 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1407 ; CHECK-NEXT: #NO_APP
1408 ; CHECK-NEXT: kmovq %rdi, %k1
1409 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1411 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1412 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1413 %3 = bitcast i64 %mask to <64 x i1>
1414 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1418 define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1419 ; CHECK-LABEL: stack_fold_paddusb_maskz_commuted:
1421 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1424 ; CHECK-NEXT: #NO_APP
1425 ; CHECK-NEXT: kmovq %rdi, %k1
1426 ; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1428 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1429 %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1430 %3 = bitcast i64 %mask to <64 x i1>
1431 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1435 define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) {
1436 ; CHECK-LABEL: stack_fold_paddusw:
1438 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1441 ; CHECK-NEXT: #NO_APP
1442 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1444 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1445 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1449 define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1450 ; CHECK-LABEL: stack_fold_paddusw_commuted:
1452 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1455 ; CHECK-NEXT: #NO_APP
1456 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1458 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1459 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1463 define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1464 ; CHECK-LABEL: stack_fold_paddusw_mask:
1466 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1467 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1470 ; CHECK-NEXT: #NO_APP
1471 ; CHECK-NEXT: kmovd %esi, %k1
1472 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1473 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1476 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1477 %3 = bitcast i32 %mask to <32 x i1>
1478 ; load needed to keep the operation from being scheduled about the asm block
1479 %4 = load <32 x i16>, <32 x i16>* %a2
1480 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1484 define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1485 ; CHECK-LABEL: stack_fold_paddusw_mask_commuted:
1487 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1488 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1491 ; CHECK-NEXT: #NO_APP
1492 ; CHECK-NEXT: kmovd %esi, %k1
1493 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1494 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1496 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1497 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1498 %3 = bitcast i32 %mask to <32 x i1>
1499 ; load needed to keep the operation from being scheduled about the asm block
1500 %4 = load <32 x i16>, <32 x i16>* %a2
1501 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1505 define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1506 ; CHECK-LABEL: stack_fold_paddusw_maskz:
1508 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1511 ; CHECK-NEXT: #NO_APP
1512 ; CHECK-NEXT: kmovd %edi, %k1
1513 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1515 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1516 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1517 %3 = bitcast i32 %mask to <32 x i1>
1518 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1522 define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1523 ; CHECK-LABEL: stack_fold_paddusw_maskz_commuted:
1525 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1528 ; CHECK-NEXT: #NO_APP
1529 ; CHECK-NEXT: kmovd %edi, %k1
1530 ; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1533 %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1534 %3 = bitcast i32 %mask to <32 x i1>
1535 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1539 define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) {
1540 ; CHECK-LABEL: stack_fold_paddw:
1542 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1545 ; CHECK-NEXT: #NO_APP
1546 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1548 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1549 %2 = add <32 x i16> %a0, %a1
1553 define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1554 ; CHECK-LABEL: stack_fold_paddw_commuted:
1556 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1559 ; CHECK-NEXT: #NO_APP
1560 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1562 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1563 %2 = add <32 x i16> %a1, %a0
1567 define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1568 ; CHECK-LABEL: stack_fold_paddw_mask:
1570 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1571 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1574 ; CHECK-NEXT: #NO_APP
1575 ; CHECK-NEXT: kmovd %esi, %k1
1576 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1577 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1579 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1580 %2 = add <32 x i16> %a0, %a1
1581 %3 = bitcast i32 %mask to <32 x i1>
1582 ; load needed to keep the operation from being scheduled about the asm block
1583 %4 = load <32 x i16>, <32 x i16>* %a2
1584 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1588 define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1589 ; CHECK-LABEL: stack_fold_paddw_mask_commuted:
1591 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1592 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
1595 ; CHECK-NEXT: #NO_APP
1596 ; CHECK-NEXT: kmovd %esi, %k1
1597 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
1598 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1600 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1601 %2 = add <32 x i16> %a1, %a0
1602 %3 = bitcast i32 %mask to <32 x i1>
1603 ; load needed to keep the operation from being scheduled about the asm block
1604 %4 = load <32 x i16>, <32 x i16>* %a2
1605 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1609 define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1610 ; CHECK-LABEL: stack_fold_paddw_maskz:
1612 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1615 ; CHECK-NEXT: #NO_APP
1616 ; CHECK-NEXT: kmovd %edi, %k1
1617 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1619 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1620 %2 = add <32 x i16> %a0, %a1
1621 %3 = bitcast i32 %mask to <32 x i1>
1622 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1626 define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1627 ; CHECK-LABEL: stack_fold_paddw_maskz_commuted:
1629 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1632 ; CHECK-NEXT: #NO_APP
1633 ; CHECK-NEXT: kmovd %edi, %k1
1634 ; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1636 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1637 %2 = add <32 x i16> %a1, %a0
1638 %3 = bitcast i32 %mask to <32 x i1>
1639 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1643 define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) {
1644 ; CHECK-LABEL: stack_fold_palignr:
1646 ; CHECK-NEXT: subq $56, %rsp
1647 ; CHECK-NEXT: .cfi_def_cfa_offset 64
1648 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1649 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1652 ; CHECK-NEXT: #NO_APP
1653 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1654 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1655 ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1656 ; CHECK-NEXT: addq $56, %rsp
1657 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1660 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1664 define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) {
1665 ; CHECK-LABEL: stack_fold_palignr_mask:
1667 ; CHECK-NEXT: subq $56, %rsp
1668 ; CHECK-NEXT: .cfi_def_cfa_offset 64
1669 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1670 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1673 ; CHECK-NEXT: #NO_APP
1674 ; CHECK-NEXT: kmovq %rsi, %k1
1675 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
1676 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1677 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
1678 ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1679 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1680 ; CHECK-NEXT: addq $56, %rsp
1681 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1683 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1684 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1685 %3 = bitcast i64 %mask to <64 x i1>
1686 %4 = load <64 x i8>, <64 x i8>* %passthru
1687 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1691 define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1692 ; CHECK-LABEL: stack_fold_palignr_maskz:
1694 ; CHECK-NEXT: subq $56, %rsp
1695 ; CHECK-NEXT: .cfi_def_cfa_offset 64
1696 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1697 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1700 ; CHECK-NEXT: #NO_APP
1701 ; CHECK-NEXT: kmovq %rdi, %k1
1702 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1703 ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1704 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1705 ; CHECK-NEXT: addq $56, %rsp
1706 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1708 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1709 %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1710 %3 = bitcast i64 %mask to <64 x i1>
1711 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1715 define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) {
1716 ; CHECK-LABEL: stack_fold_pandd:
1718 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1721 ; CHECK-NEXT: #NO_APP
1722 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1724 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1725 %2 = and <16 x i32> %a0, %a1
1729 define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
1730 ; CHECK-LABEL: stack_fold_pandd_commuted:
1732 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1735 ; CHECK-NEXT: #NO_APP
1736 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1738 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1739 %2 = and <16 x i32> %a1, %a0
1743 define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
1744 ; CHECK-LABEL: stack_fold_pandd_mask:
1746 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1747 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
1750 ; CHECK-NEXT: #NO_APP
1751 ; CHECK-NEXT: kmovd %esi, %k1
1752 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
1753 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1755 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1756 %2 = and <16 x i32> %a0, %a1
1757 %3 = bitcast i16 %mask to <16 x i1>
1758 ; load needed to keep the operation from being scheduled about the asm block
1759 %4 = load <16 x i32>, <16 x i32>* %a2
1760 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1764 define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
1765 ; CHECK-LABEL: stack_fold_pandd_mask_commuted:
1767 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1768 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
1771 ; CHECK-NEXT: #NO_APP
1772 ; CHECK-NEXT: kmovd %esi, %k1
1773 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
1774 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1776 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1777 %2 = and <16 x i32> %a1, %a0
1778 %3 = bitcast i16 %mask to <16 x i1>
1779 ; load needed to keep the operation from being scheduled about the asm block
1780 %4 = load <16 x i32>, <16 x i32>* %a2
1781 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1785 define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1786 ; CHECK-LABEL: stack_fold_pandd_maskz:
1788 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1791 ; CHECK-NEXT: #NO_APP
1792 ; CHECK-NEXT: kmovd %edi, %k1
1793 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1795 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1796 %2 = and <16 x i32> %a0, %a1
1797 %3 = bitcast i16 %mask to <16 x i1>
1798 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1802 define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1803 ; CHECK-LABEL: stack_fold_pandd_maskz_commuted:
1805 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1808 ; CHECK-NEXT: #NO_APP
1809 ; CHECK-NEXT: kmovd %edi, %k1
1810 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1812 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1813 %2 = and <16 x i32> %a1, %a0
1814 %3 = bitcast i16 %mask to <16 x i1>
1815 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1819 define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) {
1820 ; CHECK-LABEL: stack_fold_pandq:
1822 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1825 ; CHECK-NEXT: #NO_APP
1826 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1829 %2 = and <8 x i64> %a0, %a1
1833 define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1834 ; CHECK-LABEL: stack_fold_pandq_commuted:
1836 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1839 ; CHECK-NEXT: #NO_APP
1840 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1842 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1843 %2 = and <8 x i64> %a1, %a0
1847 define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1848 ; CHECK-LABEL: stack_fold_pandq_mask:
1850 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1851 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
1854 ; CHECK-NEXT: #NO_APP
1855 ; CHECK-NEXT: kmovd %esi, %k1
1856 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
1857 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1859 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1860 %2 = and <8 x i64> %a0, %a1
1861 %3 = bitcast i8 %mask to <8 x i1>
1862 ; load needed to keep the operation from being scheduled about the asm block
1863 %4 = load <8 x i64>, <8 x i64>* %a2
1864 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1868 define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1869 ; CHECK-LABEL: stack_fold_pandq_mask_commuted:
1871 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1872 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
1875 ; CHECK-NEXT: #NO_APP
1876 ; CHECK-NEXT: kmovd %esi, %k1
1877 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
1878 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1880 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1881 %2 = and <8 x i64> %a1, %a0
1882 %3 = bitcast i8 %mask to <8 x i1>
1883 ; load needed to keep the operation from being scheduled about the asm block
1884 %4 = load <8 x i64>, <8 x i64>* %a2
1885 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1889 define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1890 ; CHECK-LABEL: stack_fold_pandq_maskz:
1892 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1895 ; CHECK-NEXT: #NO_APP
1896 ; CHECK-NEXT: kmovd %edi, %k1
1897 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1899 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1900 %2 = and <8 x i64> %a0, %a1
1901 %3 = bitcast i8 %mask to <8 x i1>
1902 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1906 define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1907 ; CHECK-LABEL: stack_fold_pandq_maskz_commuted:
1909 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1912 ; CHECK-NEXT: #NO_APP
1913 ; CHECK-NEXT: kmovd %edi, %k1
1914 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1916 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1917 %2 = and <8 x i64> %a1, %a0
1918 %3 = bitcast i8 %mask to <8 x i1>
1919 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1923 define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) {
1924 ; CHECK-LABEL: stack_fold_vpconflictd:
1926 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1929 ; CHECK-NEXT: #NO_APP
1930 ; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1932 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1933 %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0)
1936 declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
1938 define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) {
1939 ; CHECK-LABEL: stack_fold_vpconflictq:
1941 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1944 ; CHECK-NEXT: #NO_APP
1945 ; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1947 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1948 %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0)
1951 declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
1953 define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) {
1954 ; CHECK-LABEL: stack_fold_pcmpeqb:
1956 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1959 ; CHECK-NEXT: #NO_APP
1960 ; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1961 ; CHECK-NEXT: kmovq %k0, %rax
1962 ; CHECK-NEXT: vzeroupper
1964 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1965 %2 = icmp eq <64 x i8> %a0, %a1
1966 %3 = bitcast <64 x i1> %2 to i64
1970 define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) {
1971 ; CHECK-LABEL: stack_fold_pcmpeqd:
1973 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1976 ; CHECK-NEXT: #NO_APP
1977 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1978 ; CHECK-NEXT: kmovd %k0, %eax
1979 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1980 ; CHECK-NEXT: vzeroupper
1982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1983 %2 = icmp eq <16 x i32> %a0, %a1
1984 %3 = bitcast <16 x i1> %2 to i16
1988 define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) {
1989 ; CHECK-LABEL: stack_fold_pcmpeqq:
1991 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1994 ; CHECK-NEXT: #NO_APP
1995 ; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1996 ; CHECK-NEXT: kmovd %k0, %eax
1997 ; CHECK-NEXT: # kill: def $al killed $al killed $eax
1998 ; CHECK-NEXT: vzeroupper
2000 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2001 %2 = icmp eq <8 x i64> %a0, %a1
2002 %3 = bitcast <8 x i1> %2 to i8
2006 define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) {
2007 ; CHECK-LABEL: stack_fold_pcmpeqw:
2009 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2012 ; CHECK-NEXT: #NO_APP
2013 ; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2014 ; CHECK-NEXT: kmovd %k0, %eax
2015 ; CHECK-NEXT: vzeroupper
2017 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2018 %2 = icmp eq <32 x i16> %a0, %a1
2019 %3 = bitcast <32 x i1> %2 to i32
2023 define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2024 ; CHECK-LABEL: stack_fold_pcmpeqd_mask:
2026 ; CHECK-NEXT: subq $184, %rsp
2027 ; CHECK-NEXT: .cfi_def_cfa_offset 192
2028 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2029 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
2030 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2031 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2034 ; CHECK-NEXT: #NO_APP
2035 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2036 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2037 ; CHECK-NEXT: kmovd %esi, %k1
2038 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2039 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2040 ; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
2041 ; CHECK-NEXT: addq $184, %rsp
2042 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2044 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2045 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2046 %2 = load <16 x i32>, <16 x i32>* %a2
2047 %3 = add <16 x i32> %a1, %2
2048 %4 = bitcast i16 %mask to <16 x i1>
2049 %5 = icmp eq <16 x i32> %3, %a0
2050 %6 = and <16 x i1> %4, %5
2051 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2055 define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2056 ; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted:
2058 ; CHECK-NEXT: subq $184, %rsp
2059 ; CHECK-NEXT: .cfi_def_cfa_offset 192
2060 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2061 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
2062 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2063 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2066 ; CHECK-NEXT: #NO_APP
2067 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2068 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2069 ; CHECK-NEXT: kmovd %esi, %k1
2070 ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2071 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2072 ; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
2073 ; CHECK-NEXT: addq $184, %rsp
2074 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2076 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2077 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2078 %2 = load <16 x i32>, <16 x i32>* %a2
2079 %3 = add <16 x i32> %a1, %2
2080 %4 = bitcast i16 %mask to <16 x i1>
2081 %5 = icmp eq <16 x i32> %a0, %3
2082 %6 = and <16 x i1> %4, %5
2083 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2087 define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2088 ; CHECK-LABEL: stack_fold_pcmpled_mask:
2090 ; CHECK-NEXT: subq $184, %rsp
2091 ; CHECK-NEXT: .cfi_def_cfa_offset 192
2092 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2093 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
2094 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2095 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2098 ; CHECK-NEXT: #NO_APP
2099 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2100 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2101 ; CHECK-NEXT: kmovd %esi, %k1
2102 ; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2103 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2104 ; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
2105 ; CHECK-NEXT: addq $184, %rsp
2106 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2108 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2109 ; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
2110 %2 = load <16 x i32>, <16 x i32>* %a2
2111 %3 = add <16 x i32> %a1, %2
2112 %4 = bitcast i16 %mask to <16 x i1>
2113 %5 = icmp sge <16 x i32> %a0, %3
2114 %6 = and <16 x i1> %4, %5
2115 %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2119 define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
2120 ; CHECK-LABEL: stack_fold_pcmpleud:
2122 ; CHECK-NEXT: subq $56, %rsp
2123 ; CHECK-NEXT: .cfi_def_cfa_offset 64
2124 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2125 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2128 ; CHECK-NEXT: #NO_APP
2129 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2130 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2131 ; CHECK-NEXT: vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2132 ; CHECK-NEXT: kmovd %k0, %eax
2133 ; CHECK-NEXT: andl %esi, %eax
2134 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
2135 ; CHECK-NEXT: addq $56, %rsp
2136 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2137 ; CHECK-NEXT: vzeroupper
2139 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2140 %2 = load <16 x i32>, <16 x i32>* %a2
2141 %3 = add <16 x i32> %a1, %2
2142 %4 = bitcast i16 %mask to <16 x i1>
2143 %5 = icmp uge <16 x i32> %a0, %3
2144 %6 = and <16 x i1> %5, %4
2145 %7 = bitcast <16 x i1> %6 to i16
2149 define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
2150 ; CHECK-LABEL: stack_fold_permbvar:
2152 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2155 ; CHECK-NEXT: #NO_APP
2156 ; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2158 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2159 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2162 declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly
2164 define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2165 ; CHECK-LABEL: stack_fold_permbvar_mask:
2167 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2170 ; CHECK-NEXT: #NO_APP
2171 ; CHECK-NEXT: kmovq %rsi, %k1
2172 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2173 ; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2174 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2176 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2177 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2178 %3 = bitcast i64 %mask to <64 x i1>
2179 ; load needed to keep the operation from being scheduled above the asm block
2180 %4 = load <64 x i8>, <64 x i8>* %passthru
2181 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
2185 define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2186 ; CHECK-LABEL: stack_fold_permbvar_maskz:
2188 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2191 ; CHECK-NEXT: #NO_APP
2192 ; CHECK-NEXT: kmovq %rdi, %k1
2193 ; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2196 %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2197 %3 = bitcast i64 %mask to <64 x i1>
2198 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
2202 define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
2203 ; CHECK-LABEL: stack_fold_permd:
2205 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2208 ; CHECK-NEXT: #NO_APP
2209 ; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2210 ; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
2211 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
2213 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2214 %2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0)
2215 ; add forces execution domain
2216 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2219 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly
2221 define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2222 ; CHECK-LABEL: stack_fold_vpermi2b:
2224 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2227 ; CHECK-NEXT: #NO_APP
2228 ; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2230 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2231 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2)
2235 define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2236 ; CHECK-LABEL: stack_fold_vpermi2d:
2238 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2241 ; CHECK-NEXT: #NO_APP
2242 ; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2244 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2245 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
2249 define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2250 ; CHECK-LABEL: stack_fold_vpermi2q:
2252 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2255 ; CHECK-NEXT: #NO_APP
2256 ; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2258 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2259 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
2263 define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2264 ; CHECK-LABEL: stack_fold_vpermi2w:
2266 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2269 ; CHECK-NEXT: #NO_APP
2270 ; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2272 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2273 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
2277 define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
2278 ; CHECK-LABEL: stack_fold_permq:
2280 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2283 ; CHECK-NEXT: #NO_APP
2284 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2285 ; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
2286 ; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
2287 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
2289 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2290 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2291 ; add forces execution domain
2292 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2296 define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
2297 ; CHECK-LABEL: stack_fold_permq_mask:
2299 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2302 ; CHECK-NEXT: #NO_APP
2303 ; CHECK-NEXT: kmovd %esi, %k1
2304 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
2305 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
2306 ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
2307 ; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
2308 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
2310 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2311 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2312 %3 = bitcast i8 %mask to <8 x i1>
2313 ; load needed to keep the operation from being scheduled above the asm block
2314 %4 = load <8 x i64>, <8 x i64>* %passthru
2315 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2316 ; add forces execution domain
2317 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2321 define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
2322 ; CHECK-LABEL: stack_fold_permq_maskz:
2324 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2327 ; CHECK-NEXT: #NO_APP
2328 ; CHECK-NEXT: kmovd %esi, %k1
2329 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
2330 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
2332 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2333 %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2334 %3 = bitcast i8 %mask to <8 x i1>
2335 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
2339 define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
2340 ; CHECK-LABEL: stack_fold_permqvar:
2342 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2345 ; CHECK-NEXT: #NO_APP
2346 ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2347 ; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
2348 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
2350 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2351 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2352 ; add forces execution domain
2353 %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2356 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly
2358 define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2359 ; CHECK-LABEL: stack_fold_permqvar_mask:
2361 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2364 ; CHECK-NEXT: #NO_APP
2365 ; CHECK-NEXT: kmovd %esi, %k1
2366 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
2367 ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
2368 ; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
2369 ; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0
2371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2372 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2373 %3 = bitcast i8 %mask to <8 x i1>
2374 ; load needed to keep the operation from being scheduled above the asm block
2375 %4 = load <8 x i64>, <8 x i64>* %passthru
2376 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2377 ; add forces execution domain
2378 %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2382 define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2383 ; CHECK-LABEL: stack_fold_vpermt2b:
2385 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2388 ; CHECK-NEXT: #NO_APP
2389 ; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2391 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2392 %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
2395 declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>)
2397 define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2398 ; CHECK-LABEL: stack_fold_vpermt2d:
2400 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2403 ; CHECK-NEXT: #NO_APP
2404 ; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2406 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2407 %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
2410 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2412 define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2413 ; CHECK-LABEL: stack_fold_vpermt2q:
2415 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2418 ; CHECK-NEXT: #NO_APP
2419 ; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2421 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2422 %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
2425 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2427 define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2428 ; CHECK-LABEL: stack_fold_vpermt2w:
2430 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2433 ; CHECK-NEXT: #NO_APP
2434 ; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2436 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2437 %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
2440 declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
2442 define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
2443 ; CHECK-LABEL: stack_fold_permwvar:
2445 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2448 ; CHECK-NEXT: #NO_APP
2449 ; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2451 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2452 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2455 declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly
2457 define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2458 ; CHECK-LABEL: stack_fold_permwvar_mask:
2460 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2463 ; CHECK-NEXT: #NO_APP
2464 ; CHECK-NEXT: kmovd %esi, %k1
2465 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2466 ; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2467 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2470 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2471 %3 = bitcast i32 %mask to <32 x i1>
2472 ; load needed to keep the operation from being scheduled above the asm block
2473 %4 = load <32 x i16>, <32 x i16>* %passthru
2474 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2478 define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2479 ; CHECK-LABEL: stack_fold_permwvar_maskz:
2481 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2484 ; CHECK-NEXT: #NO_APP
2485 ; CHECK-NEXT: kmovd %edi, %k1
2486 ; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2488 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2489 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2490 %3 = bitcast i32 %mask to <32 x i1>
2491 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2495 define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
2496 ; CHECK-LABEL: stack_fold_pextrd:
2498 ; CHECK-NEXT: pushq %rbp
2499 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2500 ; CHECK-NEXT: pushq %r15
2501 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2502 ; CHECK-NEXT: pushq %r14
2503 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2504 ; CHECK-NEXT: pushq %r13
2505 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2506 ; CHECK-NEXT: pushq %r12
2507 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2508 ; CHECK-NEXT: pushq %rbx
2509 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2510 ; CHECK-NEXT: .cfi_offset %rbx, -56
2511 ; CHECK-NEXT: .cfi_offset %r12, -48
2512 ; CHECK-NEXT: .cfi_offset %r13, -40
2513 ; CHECK-NEXT: .cfi_offset %r14, -32
2514 ; CHECK-NEXT: .cfi_offset %r15, -24
2515 ; CHECK-NEXT: .cfi_offset %rbp, -16
2516 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2517 ; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
2520 ; CHECK-NEXT: #NO_APP
2521 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2522 ; CHECK-NEXT: popq %rbx
2523 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2524 ; CHECK-NEXT: popq %r12
2525 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2526 ; CHECK-NEXT: popq %r13
2527 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2528 ; CHECK-NEXT: popq %r14
2529 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2530 ; CHECK-NEXT: popq %r15
2531 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2532 ; CHECK-NEXT: popq %rbp
2533 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2535 ; add forces execution domain
2536 %1 = add <4 x i32> %a0, %a1
2537 %2 = extractelement <4 x i32> %1, i32 1
2538 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2542 define i64 @stack_fold_pextrq(<2 x i64> %a0) {
2543 ; CHECK-LABEL: stack_fold_pextrq:
2545 ; CHECK-NEXT: pushq %rbp
2546 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2547 ; CHECK-NEXT: pushq %r15
2548 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2549 ; CHECK-NEXT: pushq %r14
2550 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2551 ; CHECK-NEXT: pushq %r13
2552 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2553 ; CHECK-NEXT: pushq %r12
2554 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2555 ; CHECK-NEXT: pushq %rbx
2556 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2557 ; CHECK-NEXT: .cfi_offset %rbx, -56
2558 ; CHECK-NEXT: .cfi_offset %r12, -48
2559 ; CHECK-NEXT: .cfi_offset %r13, -40
2560 ; CHECK-NEXT: .cfi_offset %r14, -32
2561 ; CHECK-NEXT: .cfi_offset %r15, -24
2562 ; CHECK-NEXT: .cfi_offset %rbp, -16
2563 ; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2566 ; CHECK-NEXT: #NO_APP
2567 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2568 ; CHECK-NEXT: popq %rbx
2569 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2570 ; CHECK-NEXT: popq %r12
2571 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2572 ; CHECK-NEXT: popq %r13
2573 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2574 ; CHECK-NEXT: popq %r14
2575 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2576 ; CHECK-NEXT: popq %r15
2577 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2578 ; CHECK-NEXT: popq %rbp
2579 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2581 %1 = extractelement <2 x i64> %a0, i32 1
2582 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2586 define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
2587 ; CHECK-LABEL: stack_fold_pinsrb:
2589 ; CHECK-NEXT: pushq %rbp
2590 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2591 ; CHECK-NEXT: pushq %r15
2592 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2593 ; CHECK-NEXT: pushq %r14
2594 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2595 ; CHECK-NEXT: pushq %r13
2596 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2597 ; CHECK-NEXT: pushq %r12
2598 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2599 ; CHECK-NEXT: pushq %rbx
2600 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2601 ; CHECK-NEXT: .cfi_offset %rbx, -56
2602 ; CHECK-NEXT: .cfi_offset %r12, -48
2603 ; CHECK-NEXT: .cfi_offset %r13, -40
2604 ; CHECK-NEXT: .cfi_offset %r14, -32
2605 ; CHECK-NEXT: .cfi_offset %r15, -24
2606 ; CHECK-NEXT: .cfi_offset %rbp, -16
2607 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2610 ; CHECK-NEXT: #NO_APP
2611 ; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2612 ; CHECK-NEXT: popq %rbx
2613 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2614 ; CHECK-NEXT: popq %r12
2615 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2616 ; CHECK-NEXT: popq %r13
2617 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2618 ; CHECK-NEXT: popq %r14
2619 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2620 ; CHECK-NEXT: popq %r15
2621 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2622 ; CHECK-NEXT: popq %rbp
2623 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2625 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2626 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
2630 define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
2631 ; CHECK-LABEL: stack_fold_pinsrd:
2633 ; CHECK-NEXT: pushq %rbp
2634 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2635 ; CHECK-NEXT: pushq %r15
2636 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2637 ; CHECK-NEXT: pushq %r14
2638 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2639 ; CHECK-NEXT: pushq %r13
2640 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2641 ; CHECK-NEXT: pushq %r12
2642 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2643 ; CHECK-NEXT: pushq %rbx
2644 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2645 ; CHECK-NEXT: .cfi_offset %rbx, -56
2646 ; CHECK-NEXT: .cfi_offset %r12, -48
2647 ; CHECK-NEXT: .cfi_offset %r13, -40
2648 ; CHECK-NEXT: .cfi_offset %r14, -32
2649 ; CHECK-NEXT: .cfi_offset %r15, -24
2650 ; CHECK-NEXT: .cfi_offset %rbp, -16
2651 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2654 ; CHECK-NEXT: #NO_APP
2655 ; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2656 ; CHECK-NEXT: popq %rbx
2657 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2658 ; CHECK-NEXT: popq %r12
2659 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2660 ; CHECK-NEXT: popq %r13
2661 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2662 ; CHECK-NEXT: popq %r14
2663 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2664 ; CHECK-NEXT: popq %r15
2665 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2666 ; CHECK-NEXT: popq %rbp
2667 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2669 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2670 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
2674 define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
2675 ; CHECK-LABEL: stack_fold_pinsrq:
2677 ; CHECK-NEXT: pushq %rbp
2678 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2679 ; CHECK-NEXT: pushq %r15
2680 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2681 ; CHECK-NEXT: pushq %r14
2682 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2683 ; CHECK-NEXT: pushq %r13
2684 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2685 ; CHECK-NEXT: pushq %r12
2686 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2687 ; CHECK-NEXT: pushq %rbx
2688 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2689 ; CHECK-NEXT: .cfi_offset %rbx, -56
2690 ; CHECK-NEXT: .cfi_offset %r12, -48
2691 ; CHECK-NEXT: .cfi_offset %r13, -40
2692 ; CHECK-NEXT: .cfi_offset %r14, -32
2693 ; CHECK-NEXT: .cfi_offset %r15, -24
2694 ; CHECK-NEXT: .cfi_offset %rbp, -16
2695 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2698 ; CHECK-NEXT: #NO_APP
2699 ; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2700 ; CHECK-NEXT: popq %rbx
2701 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2702 ; CHECK-NEXT: popq %r12
2703 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2704 ; CHECK-NEXT: popq %r13
2705 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2706 ; CHECK-NEXT: popq %r14
2707 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2708 ; CHECK-NEXT: popq %r15
2709 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2710 ; CHECK-NEXT: popq %rbp
2711 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2713 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2714 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
2718 define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
2719 ; CHECK-LABEL: stack_fold_pinsrw:
2721 ; CHECK-NEXT: pushq %rbp
2722 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2723 ; CHECK-NEXT: pushq %r15
2724 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2725 ; CHECK-NEXT: pushq %r14
2726 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2727 ; CHECK-NEXT: pushq %r13
2728 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2729 ; CHECK-NEXT: pushq %r12
2730 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2731 ; CHECK-NEXT: pushq %rbx
2732 ; CHECK-NEXT: .cfi_def_cfa_offset 56
2733 ; CHECK-NEXT: .cfi_offset %rbx, -56
2734 ; CHECK-NEXT: .cfi_offset %r12, -48
2735 ; CHECK-NEXT: .cfi_offset %r13, -40
2736 ; CHECK-NEXT: .cfi_offset %r14, -32
2737 ; CHECK-NEXT: .cfi_offset %r15, -24
2738 ; CHECK-NEXT: .cfi_offset %rbp, -16
2739 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2742 ; CHECK-NEXT: #NO_APP
2743 ; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2744 ; CHECK-NEXT: popq %rbx
2745 ; CHECK-NEXT: .cfi_def_cfa_offset 48
2746 ; CHECK-NEXT: popq %r12
2747 ; CHECK-NEXT: .cfi_def_cfa_offset 40
2748 ; CHECK-NEXT: popq %r13
2749 ; CHECK-NEXT: .cfi_def_cfa_offset 32
2750 ; CHECK-NEXT: popq %r14
2751 ; CHECK-NEXT: .cfi_def_cfa_offset 24
2752 ; CHECK-NEXT: popq %r15
2753 ; CHECK-NEXT: .cfi_def_cfa_offset 16
2754 ; CHECK-NEXT: popq %rbp
2755 ; CHECK-NEXT: .cfi_def_cfa_offset 8
2757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2758 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
2762 define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) {
2763 ; CHECK-LABEL: stack_fold_vplzcntd:
2765 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2768 ; CHECK-NEXT: #NO_APP
2769 ; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2772 %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false)
2776 define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) {
2777 ; CHECK-LABEL: stack_fold_vplzcntq:
2779 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2782 ; CHECK-NEXT: #NO_APP
2783 ; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2785 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2786 %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false)
2790 define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
2791 ; CHECK-LABEL: stack_fold_pmaddubsw_zmm:
2793 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2796 ; CHECK-NEXT: #NO_APP
2797 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2800 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2803 declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) nounwind readnone
2805 define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2806 ; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask:
2808 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2811 ; CHECK-NEXT: #NO_APP
2812 ; CHECK-NEXT: kmovd %esi, %k1
2813 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2814 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2815 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2817 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2818 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2819 %3 = bitcast i32 %mask to <32 x i1>
2820 ; load needed to keep the operation from being scheduled about the asm block
2821 %4 = load <32 x i16>, <32 x i16>* %passthru
2822 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2826 define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2827 ; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz:
2829 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2832 ; CHECK-NEXT: #NO_APP
2833 ; CHECK-NEXT: kmovd %edi, %k1
2834 ; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2836 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2837 %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2838 %3 = bitcast i32 %mask to <32 x i1>
2839 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2843 define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
2844 ; CHECK-LABEL: stack_fold_pmaddwd_zmm:
2846 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2849 ; CHECK-NEXT: #NO_APP
2850 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2852 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2853 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2856 declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone
2858 define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) {
2859 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted:
2861 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2864 ; CHECK-NEXT: #NO_APP
2865 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2868 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2872 define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2873 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask:
2875 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2878 ; CHECK-NEXT: #NO_APP
2879 ; CHECK-NEXT: kmovd %esi, %k1
2880 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2881 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2882 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2884 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2885 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2886 %3 = bitcast i16 %mask to <16 x i1>
2887 ; load needed to keep the operation from being scheduled about the asm block
2888 %4 = load <16 x i32>, <16 x i32>* %passthru
2889 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2893 define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2894 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted:
2896 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2899 ; CHECK-NEXT: #NO_APP
2900 ; CHECK-NEXT: kmovd %esi, %k1
2901 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
2902 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2903 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2905 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2906 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2907 %3 = bitcast i16 %mask to <16 x i1>
2908 ; load needed to keep the operation from being scheduled about the asm block
2909 %4 = load <16 x i32>, <16 x i32>* %passthru
2910 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2914 define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2915 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz:
2917 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2920 ; CHECK-NEXT: #NO_APP
2921 ; CHECK-NEXT: kmovd %esi, %k1
2922 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2924 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2925 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2926 %3 = bitcast i16 %mask to <16 x i1>
2927 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2931 define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2932 ; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted:
2934 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2937 ; CHECK-NEXT: #NO_APP
2938 ; CHECK-NEXT: kmovd %esi, %k1
2939 ; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2941 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2942 %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2943 %3 = bitcast i16 %mask to <16 x i1>
2944 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2948 define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) {
2949 ; CHECK-LABEL: stack_fold_pmaxsb:
2951 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2954 ; CHECK-NEXT: #NO_APP
2955 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2958 %2 = icmp sgt <64 x i8> %a0, %a1
2959 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2963 define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
2964 ; CHECK-LABEL: stack_fold_pmaxsb_commuted:
2966 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2969 ; CHECK-NEXT: #NO_APP
2970 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2972 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2973 %2 = icmp sgt <64 x i8> %a1, %a0
2974 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
2978 define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
2979 ; CHECK-LABEL: stack_fold_pmaxsb_mask:
2981 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2984 ; CHECK-NEXT: #NO_APP
2985 ; CHECK-NEXT: kmovq %rdi, %k1
2986 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
2987 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2988 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2990 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2991 %2 = icmp sgt <64 x i8> %a0, %a1
2992 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2993 %4 = bitcast i64 %mask to <64 x i1>
2994 ; load needed to keep the operation from being scheduled about the asm block
2995 %5 = load <64 x i8>, <64 x i8>* %passthru
2996 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3000 define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3001 ; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted:
3003 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3006 ; CHECK-NEXT: #NO_APP
3007 ; CHECK-NEXT: kmovq %rdi, %k1
3008 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3009 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3010 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3012 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3013 %2 = icmp sgt <64 x i8> %a1, %a0
3014 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3015 %4 = bitcast i64 %mask to <64 x i1>
3016 ; load needed to keep the operation from being scheduled about the asm block
3017 %5 = load <64 x i8>, <64 x i8>* %passthru
3018 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3022 define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3023 ; CHECK-LABEL: stack_fold_pmaxsb_maskz:
3025 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3028 ; CHECK-NEXT: #NO_APP
3029 ; CHECK-NEXT: kmovq %rdi, %k1
3030 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3032 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3033 %2 = icmp sgt <64 x i8> %a0, %a1
3034 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3035 %4 = bitcast i64 %mask to <64 x i1>
3036 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3040 define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3041 ; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted:
3043 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3046 ; CHECK-NEXT: #NO_APP
3047 ; CHECK-NEXT: kmovq %rdi, %k1
3048 ; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3050 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3051 %2 = icmp sgt <64 x i8> %a1, %a0
3052 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3053 %4 = bitcast i64 %mask to <64 x i1>
3054 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3058 define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
3059 ; CHECK-LABEL: stack_fold_pmaxsd:
3061 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3064 ; CHECK-NEXT: #NO_APP
3065 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3067 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3068 %2 = icmp sgt <16 x i32> %a0, %a1
3069 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3073 define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3074 ; CHECK-LABEL: stack_fold_pmaxsd_commuted:
3076 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3079 ; CHECK-NEXT: #NO_APP
3080 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3082 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3083 %2 = icmp sgt <16 x i32> %a1, %a0
3084 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3088 define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3089 ; CHECK-LABEL: stack_fold_pmaxsd_mask:
3091 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3094 ; CHECK-NEXT: #NO_APP
3095 ; CHECK-NEXT: kmovd %edi, %k1
3096 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3097 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3098 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3100 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3101 %2 = icmp sgt <16 x i32> %a0, %a1
3102 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3103 %4 = bitcast i16 %mask to <16 x i1>
3104 ; load needed to keep the operation from being scheduled about the asm block
3105 %5 = load <16 x i32>, <16 x i32>* %passthru
3106 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3110 define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3111 ; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted:
3113 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3116 ; CHECK-NEXT: #NO_APP
3117 ; CHECK-NEXT: kmovd %edi, %k1
3118 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3119 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3120 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3122 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3123 %2 = icmp sgt <16 x i32> %a1, %a0
3124 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3125 %4 = bitcast i16 %mask to <16 x i1>
3126 ; load needed to keep the operation from being scheduled about the asm block
3127 %5 = load <16 x i32>, <16 x i32>* %passthru
3128 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3132 define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3133 ; CHECK-LABEL: stack_fold_pmaxsd_maskz:
3135 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3138 ; CHECK-NEXT: #NO_APP
3139 ; CHECK-NEXT: kmovd %edi, %k1
3140 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3142 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3143 %2 = icmp sgt <16 x i32> %a0, %a1
3144 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3145 %4 = bitcast i16 %mask to <16 x i1>
3146 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3150 define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3151 ; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted:
3153 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3156 ; CHECK-NEXT: #NO_APP
3157 ; CHECK-NEXT: kmovd %edi, %k1
3158 ; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3160 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3161 %2 = icmp sgt <16 x i32> %a1, %a0
3162 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3163 %4 = bitcast i16 %mask to <16 x i1>
3164 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3168 define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) {
3169 ; CHECK-LABEL: stack_fold_pmaxsq:
3171 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3174 ; CHECK-NEXT: #NO_APP
3175 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3177 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3178 %2 = icmp sgt <8 x i64> %a0, %a1
3179 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3183 define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3184 ; CHECK-LABEL: stack_fold_pmaxsq_commuted:
3186 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3189 ; CHECK-NEXT: #NO_APP
3190 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3192 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3193 %2 = icmp sgt <8 x i64> %a1, %a0
3194 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3198 define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3199 ; CHECK-LABEL: stack_fold_pmaxsq_mask:
3201 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3204 ; CHECK-NEXT: #NO_APP
3205 ; CHECK-NEXT: kmovd %edi, %k1
3206 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3207 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3208 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3211 %2 = icmp sgt <8 x i64> %a0, %a1
3212 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3213 %4 = bitcast i8 %mask to <8 x i1>
3214 ; load needed to keep the operation from being scheduled about the asm block
3215 %5 = load <8 x i64>, <8 x i64>* %passthru
3216 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3220 define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3221 ; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted:
3223 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3226 ; CHECK-NEXT: #NO_APP
3227 ; CHECK-NEXT: kmovd %edi, %k1
3228 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3229 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3230 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3232 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3233 %2 = icmp sgt <8 x i64> %a1, %a0
3234 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3235 %4 = bitcast i8 %mask to <8 x i1>
3236 ; load needed to keep the operation from being scheduled about the asm block
3237 %5 = load <8 x i64>, <8 x i64>* %passthru
3238 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3242 define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3243 ; CHECK-LABEL: stack_fold_pmaxsq_maskz:
3245 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3248 ; CHECK-NEXT: #NO_APP
3249 ; CHECK-NEXT: kmovd %edi, %k1
3250 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3252 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3253 %2 = icmp sgt <8 x i64> %a0, %a1
3254 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3255 %4 = bitcast i8 %mask to <8 x i1>
3256 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3260 define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3261 ; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted:
3263 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3266 ; CHECK-NEXT: #NO_APP
3267 ; CHECK-NEXT: kmovd %edi, %k1
3268 ; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3270 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3271 %2 = icmp sgt <8 x i64> %a1, %a0
3272 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3273 %4 = bitcast i8 %mask to <8 x i1>
3274 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3278 define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) {
3279 ; CHECK-LABEL: stack_fold_pmaxsw:
3281 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3284 ; CHECK-NEXT: #NO_APP
3285 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3287 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3288 %2 = icmp sgt <32 x i16> %a0, %a1
3289 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3293 define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3294 ; CHECK-LABEL: stack_fold_pmaxsw_commuted:
3296 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3299 ; CHECK-NEXT: #NO_APP
3300 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3303 %2 = icmp sgt <32 x i16> %a1, %a0
3304 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3308 define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3309 ; CHECK-LABEL: stack_fold_pmaxsw_mask:
3311 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3314 ; CHECK-NEXT: #NO_APP
3315 ; CHECK-NEXT: kmovd %edi, %k1
3316 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3317 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3318 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3321 %2 = icmp sgt <32 x i16> %a0, %a1
3322 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3323 %4 = bitcast i32 %mask to <32 x i1>
3324 ; load needed to keep the operation from being scheduled about the asm block
3325 %5 = load <32 x i16>, <32 x i16>* %passthru
3326 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3330 define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3331 ; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted:
3333 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3336 ; CHECK-NEXT: #NO_APP
3337 ; CHECK-NEXT: kmovd %edi, %k1
3338 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3339 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3340 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3343 %2 = icmp sgt <32 x i16> %a1, %a0
3344 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3345 %4 = bitcast i32 %mask to <32 x i1>
3346 ; load needed to keep the operation from being scheduled about the asm block
3347 %5 = load <32 x i16>, <32 x i16>* %passthru
3348 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3352 define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3353 ; CHECK-LABEL: stack_fold_pmaxsw_maskz:
3355 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3358 ; CHECK-NEXT: #NO_APP
3359 ; CHECK-NEXT: kmovd %edi, %k1
3360 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3362 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3363 %2 = icmp sgt <32 x i16> %a0, %a1
3364 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3365 %4 = bitcast i32 %mask to <32 x i1>
3366 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3370 define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3371 ; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted:
3373 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3376 ; CHECK-NEXT: #NO_APP
3377 ; CHECK-NEXT: kmovd %edi, %k1
3378 ; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3380 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3381 %2 = icmp sgt <32 x i16> %a1, %a0
3382 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3383 %4 = bitcast i32 %mask to <32 x i1>
3384 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3388 define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) {
3389 ; CHECK-LABEL: stack_fold_pmaxub:
3391 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3394 ; CHECK-NEXT: #NO_APP
3395 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3397 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3398 %2 = icmp ugt <64 x i8> %a0, %a1
3399 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3403 define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3404 ; CHECK-LABEL: stack_fold_pmaxub_commuted:
3406 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3409 ; CHECK-NEXT: #NO_APP
3410 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3412 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3413 %2 = icmp ugt <64 x i8> %a1, %a0
3414 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3418 define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3419 ; CHECK-LABEL: stack_fold_pmaxub_mask:
3421 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3424 ; CHECK-NEXT: #NO_APP
3425 ; CHECK-NEXT: kmovq %rdi, %k1
3426 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3427 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3428 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3430 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3431 %2 = icmp ugt <64 x i8> %a0, %a1
3432 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3433 %4 = bitcast i64 %mask to <64 x i1>
3434 ; load needed to keep the operation from being scheduled about the asm block
3435 %5 = load <64 x i8>, <64 x i8>* %passthru
3436 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3440 define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3441 ; CHECK-LABEL: stack_fold_pmaxub_mask_commuted:
3443 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3446 ; CHECK-NEXT: #NO_APP
3447 ; CHECK-NEXT: kmovq %rdi, %k1
3448 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3449 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3450 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3452 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3453 %2 = icmp ugt <64 x i8> %a1, %a0
3454 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3455 %4 = bitcast i64 %mask to <64 x i1>
3456 ; load needed to keep the operation from being scheduled about the asm block
3457 %5 = load <64 x i8>, <64 x i8>* %passthru
3458 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3462 define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3463 ; CHECK-LABEL: stack_fold_pmaxub_maskz:
3465 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3468 ; CHECK-NEXT: #NO_APP
3469 ; CHECK-NEXT: kmovq %rdi, %k1
3470 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3472 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3473 %2 = icmp ugt <64 x i8> %a0, %a1
3474 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3475 %4 = bitcast i64 %mask to <64 x i1>
3476 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3480 define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3481 ; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted:
3483 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3486 ; CHECK-NEXT: #NO_APP
3487 ; CHECK-NEXT: kmovq %rdi, %k1
3488 ; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3490 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3491 %2 = icmp ugt <64 x i8> %a1, %a0
3492 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3493 %4 = bitcast i64 %mask to <64 x i1>
3494 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3498 define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) {
3499 ; CHECK-LABEL: stack_fold_pmaxud:
3501 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3504 ; CHECK-NEXT: #NO_APP
3505 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3507 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3508 %2 = icmp ugt <16 x i32> %a0, %a1
3509 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3513 define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3514 ; CHECK-LABEL: stack_fold_pmaxud_commuted:
3516 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3519 ; CHECK-NEXT: #NO_APP
3520 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3522 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3523 %2 = icmp ugt <16 x i32> %a1, %a0
3524 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3528 define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3529 ; CHECK-LABEL: stack_fold_pmaxud_mask:
3531 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3534 ; CHECK-NEXT: #NO_APP
3535 ; CHECK-NEXT: kmovd %edi, %k1
3536 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3537 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3538 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3540 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3541 %2 = icmp ugt <16 x i32> %a0, %a1
3542 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3543 %4 = bitcast i16 %mask to <16 x i1>
3544 ; load needed to keep the operation from being scheduled about the asm block
3545 %5 = load <16 x i32>, <16 x i32>* %passthru
3546 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3550 define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3551 ; CHECK-LABEL: stack_fold_pmaxud_mask_commuted:
3553 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3556 ; CHECK-NEXT: #NO_APP
3557 ; CHECK-NEXT: kmovd %edi, %k1
3558 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3559 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3560 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3562 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3563 %2 = icmp ugt <16 x i32> %a1, %a0
3564 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3565 %4 = bitcast i16 %mask to <16 x i1>
3566 ; load needed to keep the operation from being scheduled about the asm block
3567 %5 = load <16 x i32>, <16 x i32>* %passthru
3568 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3572 define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3573 ; CHECK-LABEL: stack_fold_pmaxud_maskz:
3575 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3578 ; CHECK-NEXT: #NO_APP
3579 ; CHECK-NEXT: kmovd %edi, %k1
3580 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3582 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3583 %2 = icmp ugt <16 x i32> %a0, %a1
3584 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3585 %4 = bitcast i16 %mask to <16 x i1>
3586 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3590 define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3591 ; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted:
3593 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3596 ; CHECK-NEXT: #NO_APP
3597 ; CHECK-NEXT: kmovd %edi, %k1
3598 ; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3600 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3601 %2 = icmp ugt <16 x i32> %a1, %a0
3602 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3603 %4 = bitcast i16 %mask to <16 x i1>
3604 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3608 define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) {
3609 ; CHECK-LABEL: stack_fold_pmaxuq:
3611 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3614 ; CHECK-NEXT: #NO_APP
3615 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3617 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3618 %2 = icmp ugt <8 x i64> %a0, %a1
3619 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3623 define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3624 ; CHECK-LABEL: stack_fold_pmaxuq_commuted:
3626 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3629 ; CHECK-NEXT: #NO_APP
3630 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3632 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3633 %2 = icmp ugt <8 x i64> %a1, %a0
3634 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3638 define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3639 ; CHECK-LABEL: stack_fold_pmaxuq_mask:
3641 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3644 ; CHECK-NEXT: #NO_APP
3645 ; CHECK-NEXT: kmovd %edi, %k1
3646 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3647 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3648 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3650 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3651 %2 = icmp ugt <8 x i64> %a0, %a1
3652 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3653 %4 = bitcast i8 %mask to <8 x i1>
3654 ; load needed to keep the operation from being scheduled about the asm block
3655 %5 = load <8 x i64>, <8 x i64>* %passthru
3656 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3660 define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3661 ; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted:
3663 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3666 ; CHECK-NEXT: #NO_APP
3667 ; CHECK-NEXT: kmovd %edi, %k1
3668 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3669 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3670 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3673 %2 = icmp ugt <8 x i64> %a1, %a0
3674 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3675 %4 = bitcast i8 %mask to <8 x i1>
3676 ; load needed to keep the operation from being scheduled about the asm block
3677 %5 = load <8 x i64>, <8 x i64>* %passthru
3678 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3682 define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3683 ; CHECK-LABEL: stack_fold_pmaxuq_maskz:
3685 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3688 ; CHECK-NEXT: #NO_APP
3689 ; CHECK-NEXT: kmovd %edi, %k1
3690 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3692 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3693 %2 = icmp ugt <8 x i64> %a0, %a1
3694 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3695 %4 = bitcast i8 %mask to <8 x i1>
3696 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3700 define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3701 ; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted:
3703 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3706 ; CHECK-NEXT: #NO_APP
3707 ; CHECK-NEXT: kmovd %edi, %k1
3708 ; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3710 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3711 %2 = icmp ugt <8 x i64> %a1, %a0
3712 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3713 %4 = bitcast i8 %mask to <8 x i1>
3714 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3718 define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) {
3719 ; CHECK-LABEL: stack_fold_pmaxuw:
3721 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3724 ; CHECK-NEXT: #NO_APP
3725 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3727 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3728 %2 = icmp ugt <32 x i16> %a0, %a1
3729 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3733 define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3734 ; CHECK-LABEL: stack_fold_pmaxuw_commuted:
3736 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3739 ; CHECK-NEXT: #NO_APP
3740 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3742 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3743 %2 = icmp ugt <32 x i16> %a1, %a0
3744 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3748 define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3749 ; CHECK-LABEL: stack_fold_pmaxuw_mask:
3751 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3754 ; CHECK-NEXT: #NO_APP
3755 ; CHECK-NEXT: kmovd %edi, %k1
3756 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3757 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3758 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3760 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3761 %2 = icmp ugt <32 x i16> %a0, %a1
3762 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3763 %4 = bitcast i32 %mask to <32 x i1>
3764 ; load needed to keep the operation from being scheduled about the asm block
3765 %5 = load <32 x i16>, <32 x i16>* %passthru
3766 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3770 define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3771 ; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted:
3773 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3776 ; CHECK-NEXT: #NO_APP
3777 ; CHECK-NEXT: kmovd %edi, %k1
3778 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3779 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3780 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3782 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3783 %2 = icmp ugt <32 x i16> %a1, %a0
3784 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3785 %4 = bitcast i32 %mask to <32 x i1>
3786 ; load needed to keep the operation from being scheduled about the asm block
3787 %5 = load <32 x i16>, <32 x i16>* %passthru
3788 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3792 define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3793 ; CHECK-LABEL: stack_fold_pmaxuw_maskz:
3795 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3798 ; CHECK-NEXT: #NO_APP
3799 ; CHECK-NEXT: kmovd %edi, %k1
3800 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3803 %2 = icmp ugt <32 x i16> %a0, %a1
3804 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3805 %4 = bitcast i32 %mask to <32 x i1>
3806 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3810 define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3811 ; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted:
3813 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3816 ; CHECK-NEXT: #NO_APP
3817 ; CHECK-NEXT: kmovd %edi, %k1
3818 ; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3820 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3821 %2 = icmp ugt <32 x i16> %a1, %a0
3822 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3823 %4 = bitcast i32 %mask to <32 x i1>
3824 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3828 define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) {
3829 ; CHECK-LABEL: stack_fold_pminsb:
3831 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3834 ; CHECK-NEXT: #NO_APP
3835 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3837 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3838 %2 = icmp slt <64 x i8> %a0, %a1
3839 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3843 define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3844 ; CHECK-LABEL: stack_fold_pminsb_commuted:
3846 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3849 ; CHECK-NEXT: #NO_APP
3850 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3852 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3853 %2 = icmp slt <64 x i8> %a1, %a0
3854 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3858 define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3859 ; CHECK-LABEL: stack_fold_pminsb_mask:
3861 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3864 ; CHECK-NEXT: #NO_APP
3865 ; CHECK-NEXT: kmovq %rdi, %k1
3866 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3867 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3868 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3870 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3871 %2 = icmp slt <64 x i8> %a0, %a1
3872 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3873 %4 = bitcast i64 %mask to <64 x i1>
3874 ; load needed to keep the operation from being scheduled about the asm block
3875 %5 = load <64 x i8>, <64 x i8>* %passthru
3876 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3880 define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3881 ; CHECK-LABEL: stack_fold_pminsb_mask_commuted:
3883 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3886 ; CHECK-NEXT: #NO_APP
3887 ; CHECK-NEXT: kmovq %rdi, %k1
3888 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3889 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3890 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3892 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3893 %2 = icmp slt <64 x i8> %a1, %a0
3894 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3895 %4 = bitcast i64 %mask to <64 x i1>
3896 ; load needed to keep the operation from being scheduled about the asm block
3897 %5 = load <64 x i8>, <64 x i8>* %passthru
3898 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3902 define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3903 ; CHECK-LABEL: stack_fold_pminsb_maskz:
3905 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3908 ; CHECK-NEXT: #NO_APP
3909 ; CHECK-NEXT: kmovq %rdi, %k1
3910 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3912 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3913 %2 = icmp slt <64 x i8> %a0, %a1
3914 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3915 %4 = bitcast i64 %mask to <64 x i1>
3916 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3920 define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3921 ; CHECK-LABEL: stack_fold_pminsb_maskz_commuted:
3923 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3926 ; CHECK-NEXT: #NO_APP
3927 ; CHECK-NEXT: kmovq %rdi, %k1
3928 ; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3930 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3931 %2 = icmp slt <64 x i8> %a1, %a0
3932 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3933 %4 = bitcast i64 %mask to <64 x i1>
3934 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3938 define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) {
3939 ; CHECK-LABEL: stack_fold_pminsd:
3941 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3944 ; CHECK-NEXT: #NO_APP
3945 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3947 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3948 %2 = icmp slt <16 x i32> %a0, %a1
3949 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3953 define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3954 ; CHECK-LABEL: stack_fold_pminsd_commuted:
3956 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3959 ; CHECK-NEXT: #NO_APP
3960 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3962 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3963 %2 = icmp slt <16 x i32> %a1, %a0
3964 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3968 define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3969 ; CHECK-LABEL: stack_fold_pminsd_mask:
3971 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3974 ; CHECK-NEXT: #NO_APP
3975 ; CHECK-NEXT: kmovd %edi, %k1
3976 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3977 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3978 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
3980 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3981 %2 = icmp slt <16 x i32> %a0, %a1
3982 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3983 %4 = bitcast i16 %mask to <16 x i1>
3984 ; load needed to keep the operation from being scheduled about the asm block
3985 %5 = load <16 x i32>, <16 x i32>* %passthru
3986 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3990 define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3991 ; CHECK-LABEL: stack_fold_pminsd_mask_commuted:
3993 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3996 ; CHECK-NEXT: #NO_APP
3997 ; CHECK-NEXT: kmovd %edi, %k1
3998 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
3999 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4000 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4002 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4003 %2 = icmp slt <16 x i32> %a1, %a0
4004 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4005 %4 = bitcast i16 %mask to <16 x i1>
4006 ; load needed to keep the operation from being scheduled about the asm block
4007 %5 = load <16 x i32>, <16 x i32>* %passthru
4008 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4012 define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4013 ; CHECK-LABEL: stack_fold_pminsd_maskz:
4015 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4018 ; CHECK-NEXT: #NO_APP
4019 ; CHECK-NEXT: kmovd %edi, %k1
4020 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4022 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4023 %2 = icmp slt <16 x i32> %a0, %a1
4024 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4025 %4 = bitcast i16 %mask to <16 x i1>
4026 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4030 define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4031 ; CHECK-LABEL: stack_fold_pminsd_maskz_commuted:
4033 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4036 ; CHECK-NEXT: #NO_APP
4037 ; CHECK-NEXT: kmovd %edi, %k1
4038 ; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4040 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4041 %2 = icmp slt <16 x i32> %a1, %a0
4042 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4043 %4 = bitcast i16 %mask to <16 x i1>
4044 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4048 define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) {
4049 ; CHECK-LABEL: stack_fold_pminsq:
4051 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4054 ; CHECK-NEXT: #NO_APP
4055 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4057 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4058 %2 = icmp slt <8 x i64> %a0, %a1
4059 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4063 define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4064 ; CHECK-LABEL: stack_fold_pminsq_commuted:
4066 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4069 ; CHECK-NEXT: #NO_APP
4070 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4072 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4073 %2 = icmp slt <8 x i64> %a1, %a0
4074 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4078 define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4079 ; CHECK-LABEL: stack_fold_pminsq_mask:
4081 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4084 ; CHECK-NEXT: #NO_APP
4085 ; CHECK-NEXT: kmovd %edi, %k1
4086 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4087 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4088 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4090 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4091 %2 = icmp slt <8 x i64> %a0, %a1
4092 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4093 %4 = bitcast i8 %mask to <8 x i1>
4094 ; load needed to keep the operation from being scheduled about the asm block
4095 %5 = load <8 x i64>, <8 x i64>* %passthru
4096 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4100 define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4101 ; CHECK-LABEL: stack_fold_pminsq_mask_commuted:
4103 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4106 ; CHECK-NEXT: #NO_APP
4107 ; CHECK-NEXT: kmovd %edi, %k1
4108 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4109 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4110 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4112 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4113 %2 = icmp slt <8 x i64> %a1, %a0
4114 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4115 %4 = bitcast i8 %mask to <8 x i1>
4116 ; load needed to keep the operation from being scheduled about the asm block
4117 %5 = load <8 x i64>, <8 x i64>* %passthru
4118 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4122 define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4123 ; CHECK-LABEL: stack_fold_pminsq_maskz:
4125 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4128 ; CHECK-NEXT: #NO_APP
4129 ; CHECK-NEXT: kmovd %edi, %k1
4130 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4132 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4133 %2 = icmp slt <8 x i64> %a0, %a1
4134 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4135 %4 = bitcast i8 %mask to <8 x i1>
4136 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4140 define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4141 ; CHECK-LABEL: stack_fold_pminsq_maskz_commuted:
4143 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4146 ; CHECK-NEXT: #NO_APP
4147 ; CHECK-NEXT: kmovd %edi, %k1
4148 ; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4150 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4151 %2 = icmp slt <8 x i64> %a1, %a0
4152 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4153 %4 = bitcast i8 %mask to <8 x i1>
4154 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4158 define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) {
4159 ; CHECK-LABEL: stack_fold_pminsw:
4161 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4164 ; CHECK-NEXT: #NO_APP
4165 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4167 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4168 %2 = icmp slt <32 x i16> %a0, %a1
4169 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4173 define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4174 ; CHECK-LABEL: stack_fold_pminsw_commuted:
4176 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4179 ; CHECK-NEXT: #NO_APP
4180 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4182 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4183 %2 = icmp slt <32 x i16> %a1, %a0
4184 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4188 define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4189 ; CHECK-LABEL: stack_fold_pminsw_mask:
4191 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4194 ; CHECK-NEXT: #NO_APP
4195 ; CHECK-NEXT: kmovd %edi, %k1
4196 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4197 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4198 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4200 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4201 %2 = icmp slt <32 x i16> %a0, %a1
4202 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4203 %4 = bitcast i32 %mask to <32 x i1>
4204 ; load needed to keep the operation from being scheduled about the asm block
4205 %5 = load <32 x i16>, <32 x i16>* %passthru
4206 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4210 define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4211 ; CHECK-LABEL: stack_fold_pminsw_mask_commuted:
4213 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4216 ; CHECK-NEXT: #NO_APP
4217 ; CHECK-NEXT: kmovd %edi, %k1
4218 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4219 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4220 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4222 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4223 %2 = icmp slt <32 x i16> %a1, %a0
4224 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4225 %4 = bitcast i32 %mask to <32 x i1>
4226 ; load needed to keep the operation from being scheduled about the asm block
4227 %5 = load <32 x i16>, <32 x i16>* %passthru
4228 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4232 define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4233 ; CHECK-LABEL: stack_fold_pminsw_maskz:
4235 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4238 ; CHECK-NEXT: #NO_APP
4239 ; CHECK-NEXT: kmovd %edi, %k1
4240 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4242 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4243 %2 = icmp slt <32 x i16> %a0, %a1
4244 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4245 %4 = bitcast i32 %mask to <32 x i1>
4246 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4250 define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4251 ; CHECK-LABEL: stack_fold_pminsw_maskz_commuted:
4253 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4256 ; CHECK-NEXT: #NO_APP
4257 ; CHECK-NEXT: kmovd %edi, %k1
4258 ; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4260 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4261 %2 = icmp slt <32 x i16> %a1, %a0
4262 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4263 %4 = bitcast i32 %mask to <32 x i1>
4264 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4268 define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) {
4269 ; CHECK-LABEL: stack_fold_pminub:
4271 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4274 ; CHECK-NEXT: #NO_APP
4275 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4277 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4278 %2 = icmp ult <64 x i8> %a0, %a1
4279 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4283 define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
4284 ; CHECK-LABEL: stack_fold_pminub_commuted:
4286 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4289 ; CHECK-NEXT: #NO_APP
4290 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4292 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4293 %2 = icmp ult <64 x i8> %a1, %a0
4294 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4298 define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
4299 ; CHECK-LABEL: stack_fold_pminub_mask:
4301 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4304 ; CHECK-NEXT: #NO_APP
4305 ; CHECK-NEXT: kmovq %rdi, %k1
4306 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4307 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4308 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4310 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4311 %2 = icmp ult <64 x i8> %a0, %a1
4312 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4313 %4 = bitcast i64 %mask to <64 x i1>
4314 ; load needed to keep the operation from being scheduled about the asm block
4315 %5 = load <64 x i8>, <64 x i8>* %passthru
4316 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4320 define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
4321 ; CHECK-LABEL: stack_fold_pminub_mask_commuted:
4323 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4326 ; CHECK-NEXT: #NO_APP
4327 ; CHECK-NEXT: kmovq %rdi, %k1
4328 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4329 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4330 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4332 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4333 %2 = icmp ult <64 x i8> %a1, %a0
4334 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4335 %4 = bitcast i64 %mask to <64 x i1>
4336 ; load needed to keep the operation from being scheduled about the asm block
4337 %5 = load <64 x i8>, <64 x i8>* %passthru
4338 %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4342 define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4343 ; CHECK-LABEL: stack_fold_pminub_maskz:
4345 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4348 ; CHECK-NEXT: #NO_APP
4349 ; CHECK-NEXT: kmovq %rdi, %k1
4350 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4353 %2 = icmp ult <64 x i8> %a0, %a1
4354 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4355 %4 = bitcast i64 %mask to <64 x i1>
4356 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4360 define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4361 ; CHECK-LABEL: stack_fold_pminub_maskz_commuted:
4363 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4366 ; CHECK-NEXT: #NO_APP
4367 ; CHECK-NEXT: kmovq %rdi, %k1
4368 ; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4370 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4371 %2 = icmp ult <64 x i8> %a1, %a0
4372 %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4373 %4 = bitcast i64 %mask to <64 x i1>
4374 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4378 define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) {
4379 ; CHECK-LABEL: stack_fold_pminud:
4381 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4384 ; CHECK-NEXT: #NO_APP
4385 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4387 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4388 %2 = icmp ult <16 x i32> %a0, %a1
4389 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4393 define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
4394 ; CHECK-LABEL: stack_fold_pminud_commuted:
4396 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4399 ; CHECK-NEXT: #NO_APP
4400 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4402 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4403 %2 = icmp ult <16 x i32> %a1, %a0
4404 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4408 define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
4409 ; CHECK-LABEL: stack_fold_pminud_mask:
4411 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4414 ; CHECK-NEXT: #NO_APP
4415 ; CHECK-NEXT: kmovd %edi, %k1
4416 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4417 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4418 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4420 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4421 %2 = icmp ult <16 x i32> %a0, %a1
4422 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4423 %4 = bitcast i16 %mask to <16 x i1>
4424 ; load needed to keep the operation from being scheduled about the asm block
4425 %5 = load <16 x i32>, <16 x i32>* %passthru
4426 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4430 define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
4431 ; CHECK-LABEL: stack_fold_pminud_mask_commuted:
4433 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4436 ; CHECK-NEXT: #NO_APP
4437 ; CHECK-NEXT: kmovd %edi, %k1
4438 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4439 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4440 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4442 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4443 %2 = icmp ult <16 x i32> %a1, %a0
4444 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4445 %4 = bitcast i16 %mask to <16 x i1>
4446 ; load needed to keep the operation from being scheduled about the asm block
4447 %5 = load <16 x i32>, <16 x i32>* %passthru
4448 %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4452 define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4453 ; CHECK-LABEL: stack_fold_pminud_maskz:
4455 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4458 ; CHECK-NEXT: #NO_APP
4459 ; CHECK-NEXT: kmovd %edi, %k1
4460 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4462 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4463 %2 = icmp ult <16 x i32> %a0, %a1
4464 %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4465 %4 = bitcast i16 %mask to <16 x i1>
4466 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4470 define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4471 ; CHECK-LABEL: stack_fold_pminud_maskz_commuted:
4473 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4476 ; CHECK-NEXT: #NO_APP
4477 ; CHECK-NEXT: kmovd %edi, %k1
4478 ; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4480 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4481 %2 = icmp ult <16 x i32> %a1, %a0
4482 %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4483 %4 = bitcast i16 %mask to <16 x i1>
4484 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4488 define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) {
4489 ; CHECK-LABEL: stack_fold_pminuq:
4491 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4494 ; CHECK-NEXT: #NO_APP
4495 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4497 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4498 %2 = icmp ult <8 x i64> %a0, %a1
4499 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4503 define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4504 ; CHECK-LABEL: stack_fold_pminuq_commuted:
4506 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4509 ; CHECK-NEXT: #NO_APP
4510 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4513 %2 = icmp ult <8 x i64> %a1, %a0
4514 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4518 define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4519 ; CHECK-LABEL: stack_fold_pminuq_mask:
4521 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4524 ; CHECK-NEXT: #NO_APP
4525 ; CHECK-NEXT: kmovd %edi, %k1
4526 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4527 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4528 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4530 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4531 %2 = icmp ult <8 x i64> %a0, %a1
4532 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4533 %4 = bitcast i8 %mask to <8 x i1>
4534 ; load needed to keep the operation from being scheduled about the asm block
4535 %5 = load <8 x i64>, <8 x i64>* %passthru
4536 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4540 define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4541 ; CHECK-LABEL: stack_fold_pminuq_mask_commuted:
4543 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4546 ; CHECK-NEXT: #NO_APP
4547 ; CHECK-NEXT: kmovd %edi, %k1
4548 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4549 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4550 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4552 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4553 %2 = icmp ult <8 x i64> %a1, %a0
4554 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4555 %4 = bitcast i8 %mask to <8 x i1>
4556 ; load needed to keep the operation from being scheduled about the asm block
4557 %5 = load <8 x i64>, <8 x i64>* %passthru
4558 %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4562 define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4563 ; CHECK-LABEL: stack_fold_pminuq_maskz:
4565 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4568 ; CHECK-NEXT: #NO_APP
4569 ; CHECK-NEXT: kmovd %edi, %k1
4570 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4572 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4573 %2 = icmp ult <8 x i64> %a0, %a1
4574 %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4575 %4 = bitcast i8 %mask to <8 x i1>
4576 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4580 define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4581 ; CHECK-LABEL: stack_fold_pminuq_maskz_commuted:
4583 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4586 ; CHECK-NEXT: #NO_APP
4587 ; CHECK-NEXT: kmovd %edi, %k1
4588 ; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4590 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4591 %2 = icmp ult <8 x i64> %a1, %a0
4592 %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4593 %4 = bitcast i8 %mask to <8 x i1>
4594 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4598 define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) {
4599 ; CHECK-LABEL: stack_fold_pminuw:
4601 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4604 ; CHECK-NEXT: #NO_APP
4605 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4607 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4608 %2 = icmp ult <32 x i16> %a0, %a1
4609 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4613 define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4614 ; CHECK-LABEL: stack_fold_pminuw_commuted:
4616 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4619 ; CHECK-NEXT: #NO_APP
4620 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4622 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4623 %2 = icmp ult <32 x i16> %a1, %a0
4624 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4628 define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4629 ; CHECK-LABEL: stack_fold_pminuw_mask:
4631 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4634 ; CHECK-NEXT: #NO_APP
4635 ; CHECK-NEXT: kmovd %edi, %k1
4636 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4637 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4638 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4640 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4641 %2 = icmp ult <32 x i16> %a0, %a1
4642 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4643 %4 = bitcast i32 %mask to <32 x i1>
4644 ; load needed to keep the operation from being scheduled about the asm block
4645 %5 = load <32 x i16>, <32 x i16>* %passthru
4646 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4650 define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4651 ; CHECK-LABEL: stack_fold_pminuw_mask_commuted:
4653 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4656 ; CHECK-NEXT: #NO_APP
4657 ; CHECK-NEXT: kmovd %edi, %k1
4658 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
4659 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4660 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
4662 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4663 %2 = icmp ult <32 x i16> %a1, %a0
4664 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4665 %4 = bitcast i32 %mask to <32 x i1>
4666 ; load needed to keep the operation from being scheduled about the asm block
4667 %5 = load <32 x i16>, <32 x i16>* %passthru
4668 %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4672 define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4673 ; CHECK-LABEL: stack_fold_pminuw_maskz:
4675 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4678 ; CHECK-NEXT: #NO_APP
4679 ; CHECK-NEXT: kmovd %edi, %k1
4680 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4682 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4683 %2 = icmp ult <32 x i16> %a0, %a1
4684 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4685 %4 = bitcast i32 %mask to <32 x i1>
4686 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4690 define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4691 ; CHECK-LABEL: stack_fold_pminuw_maskz_commuted:
4693 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4696 ; CHECK-NEXT: #NO_APP
4697 ; CHECK-NEXT: kmovd %edi, %k1
4698 ; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4700 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4701 %2 = icmp ult <32 x i16> %a1, %a0
4702 %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4703 %4 = bitcast i32 %mask to <32 x i1>
4704 %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4708 define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) {
4709 ; CHECK-LABEL: stack_fold_vpmovdb:
4711 ; CHECK-NEXT: vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4714 ; CHECK-NEXT: #NO_APP
4715 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4716 ; CHECK-NEXT: vzeroupper
4718 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4719 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4722 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
4724 define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) {
4725 ; CHECK-LABEL: stack_fold_vpmovdw:
4727 ; CHECK-NEXT: vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4730 ; CHECK-NEXT: #NO_APP
4731 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4733 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4734 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4737 declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
4739 define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
4740 ; CHECK-LABEL: stack_fold_movq_load:
4742 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4745 ; CHECK-NEXT: #NO_APP
4746 ; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4747 ; CHECK-NEXT: # xmm0 = mem[0],zero
4748 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
4749 ; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0
4751 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4752 %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
4753 ; add forces execution domain
4754 %3 = add <2 x i64> %2, <i64 1, i64 1>
4758 define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
4759 ; CHECK-LABEL: stack_fold_vpmovqd:
4761 ; CHECK-NEXT: vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4764 ; CHECK-NEXT: #NO_APP
4765 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4767 %1 = trunc <8 x i64> %a0 to <8 x i32>
4768 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4771 declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
4773 define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) {
4774 ; CHECK-LABEL: stack_fold_vpmovqw:
4776 ; CHECK-NEXT: vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4779 ; CHECK-NEXT: #NO_APP
4780 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4781 ; CHECK-NEXT: vzeroupper
4783 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4784 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4787 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
4789 define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
4790 ; CHECK-LABEL: stack_fold_vpmovwb:
4792 ; CHECK-NEXT: vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4795 ; CHECK-NEXT: #NO_APP
4796 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4798 %1 = trunc <32 x i16> %a0 to <32 x i8>
4799 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4802 declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
4804 define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) {
4805 ; CHECK-LABEL: stack_fold_vpmovsdb:
4807 ; CHECK-NEXT: vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4810 ; CHECK-NEXT: #NO_APP
4811 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4812 ; CHECK-NEXT: vzeroupper
4814 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4815 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4818 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
4820 define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) {
4821 ; CHECK-LABEL: stack_fold_vpmovsdw:
4823 ; CHECK-NEXT: vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4826 ; CHECK-NEXT: #NO_APP
4827 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4829 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4830 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4833 declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
4835 define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) {
4836 ; CHECK-LABEL: stack_fold_vpmovsqd:
4838 ; CHECK-NEXT: vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4841 ; CHECK-NEXT: #NO_APP
4842 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4844 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
4845 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4848 declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
4850 define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) {
4851 ; CHECK-LABEL: stack_fold_vpmovsqw:
4853 ; CHECK-NEXT: vpmovsqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4856 ; CHECK-NEXT: #NO_APP
4857 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4858 ; CHECK-NEXT: vzeroupper
4860 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4861 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4864 declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
4866 define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) {
4867 ; CHECK-LABEL: stack_fold_vpmovswb:
4869 ; CHECK-NEXT: vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4872 ; CHECK-NEXT: #NO_APP
4873 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4875 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
4876 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4879 declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
4881 define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) {
4882 ; CHECK-LABEL: stack_fold_pmovsxbd_zmm:
4884 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4887 ; CHECK-NEXT: #NO_APP
4888 ; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4890 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4891 %2 = sext <16 x i8> %a0 to <16 x i32>
4895 define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) {
4896 ; CHECK-LABEL: stack_fold_pmovsxbq_zmm:
4898 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4901 ; CHECK-NEXT: #NO_APP
4902 ; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4904 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4905 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4906 %3 = sext <8 x i8> %2 to <8 x i64>
4910 define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) {
4911 ; CHECK-LABEL: stack_fold_pmovsxbw_zmm:
4913 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4916 ; CHECK-NEXT: #NO_APP
4917 ; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4919 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4920 %2 = sext <32 x i8> %a0 to <32 x i16>
4924 define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) {
4925 ; CHECK-LABEL: stack_fold_pmovsxdq_zmm:
4927 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4930 ; CHECK-NEXT: #NO_APP
4931 ; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4933 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4934 %2 = sext <8 x i32> %a0 to <8 x i64>
4938 define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) {
4939 ; CHECK-LABEL: stack_fold_pmovsxwd_zmm:
4941 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4944 ; CHECK-NEXT: #NO_APP
4945 ; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4947 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4948 %2 = sext <16 x i16> %a0 to <16 x i32>
4952 define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) {
4953 ; CHECK-LABEL: stack_fold_pmovsxwq_zmm:
4955 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4958 ; CHECK-NEXT: #NO_APP
4959 ; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4961 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4962 %2 = sext <8 x i16> %a0 to <8 x i64>
4966 define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
4967 ; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm:
4969 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4972 ; CHECK-NEXT: #NO_APP
4973 ; CHECK-NEXT: kmovd %edi, %k1
4974 ; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
4976 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4977 %2 = sext <8 x i16> %a0 to <8 x i64>
4978 %3 = bitcast i8 %mask to <8 x i1>
4979 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
4983 define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
4984 ; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm:
4986 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4989 ; CHECK-NEXT: #NO_APP
4990 ; CHECK-NEXT: kmovd %edi, %k1
4991 ; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
4993 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4994 %2 = sext <8 x i16> %a0 to <8 x i64>
4995 %3 = bitcast i8 %mask to <8 x i1>
4996 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5000 define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) {
5001 ; CHECK-LABEL: stack_fold_vpmovusdb:
5003 ; CHECK-NEXT: vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5006 ; CHECK-NEXT: #NO_APP
5007 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5008 ; CHECK-NEXT: vzeroupper
5010 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
5011 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5014 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
5016 define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) {
5017 ; CHECK-LABEL: stack_fold_vpmovusdw:
5019 ; CHECK-NEXT: vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5022 ; CHECK-NEXT: #NO_APP
5023 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5025 %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
5026 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5029 declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
5031 define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) {
5032 ; CHECK-LABEL: stack_fold_vpmovusqd:
5034 ; CHECK-NEXT: vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5037 ; CHECK-NEXT: #NO_APP
5038 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5040 %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
5041 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5044 declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
5046 define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) {
5047 ; CHECK-LABEL: stack_fold_vpmovusqw:
5049 ; CHECK-NEXT: vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5052 ; CHECK-NEXT: #NO_APP
5053 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5054 ; CHECK-NEXT: vzeroupper
5056 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
5057 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5060 declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
5062 define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) {
5063 ; CHECK-LABEL: stack_fold_vpmovuswb:
5065 ; CHECK-NEXT: vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5068 ; CHECK-NEXT: #NO_APP
5069 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5071 %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
5072 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5075 declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
5077 define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) {
5078 ; CHECK-LABEL: stack_fold_pmovzxbd_zmm:
5080 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5083 ; CHECK-NEXT: #NO_APP
5084 ; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5085 ; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
5087 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5088 %2 = zext <16 x i8> %a0 to <16 x i32>
5092 define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) {
5093 ; CHECK-LABEL: stack_fold_pmovzxbq_zmm:
5095 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5098 ; CHECK-NEXT: #NO_APP
5099 ; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5100 ; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
5102 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5103 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5104 %3 = zext <8 x i8> %2 to <8 x i64>
5108 define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) {
5109 ; CHECK-LABEL: stack_fold_pmovzxbw_zmm:
5111 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5114 ; CHECK-NEXT: #NO_APP
5115 ; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5116 ; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
5118 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5119 %2 = zext <32 x i8> %a0 to <32 x i16>
5123 define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) {
5124 ; CHECK-LABEL: stack_fold_pmovzxdq_zmm:
5126 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5129 ; CHECK-NEXT: #NO_APP
5130 ; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5131 ; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
5133 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5134 %2 = zext <8 x i32> %a0 to <8 x i64>
5138 define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) {
5139 ; CHECK-LABEL: stack_fold_pmovzxwd_zmm:
5141 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5144 ; CHECK-NEXT: #NO_APP
5145 ; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5146 ; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
5148 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5149 %2 = zext <16 x i16> %a0 to <16 x i32>
5153 define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) {
5154 ; CHECK-LABEL: stack_fold_pmovzxwq_zmm:
5156 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5159 ; CHECK-NEXT: #NO_APP
5160 ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5161 ; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5164 %2 = zext <8 x i16> %a0 to <8 x i64>
5168 define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
5169 ; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm:
5171 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5174 ; CHECK-NEXT: #NO_APP
5175 ; CHECK-NEXT: kmovd %edi, %k1
5176 ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
5177 ; CHECK-NEXT: # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5179 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5180 %2 = zext <8 x i16> %a0 to <8 x i64>
5181 %3 = bitcast i8 %mask to <8 x i1>
5182 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
5186 define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
5187 ; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm:
5189 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5192 ; CHECK-NEXT: #NO_APP
5193 ; CHECK-NEXT: kmovd %edi, %k1
5194 ; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
5195 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5197 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5198 %2 = zext <8 x i16> %a0 to <8 x i64>
5199 %3 = bitcast i8 %mask to <8 x i1>
5200 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5204 define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) {
5205 ; CHECK-LABEL: stack_fold_pmulld:
5207 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5210 ; CHECK-NEXT: #NO_APP
5211 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5213 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5214 %2 = mul <16 x i32> %a0, %a1
5218 define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5219 ; CHECK-LABEL: stack_fold_pmulld_commuted:
5221 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5224 ; CHECK-NEXT: #NO_APP
5225 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5227 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5228 %2 = mul <16 x i32> %a1, %a0
5232 define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5233 ; CHECK-LABEL: stack_fold_pmulld_mask:
5235 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5236 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5239 ; CHECK-NEXT: #NO_APP
5240 ; CHECK-NEXT: kmovd %esi, %k1
5241 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5242 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5244 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5245 %2 = mul <16 x i32> %a0, %a1
5246 %3 = bitcast i16 %mask to <16 x i1>
5247 ; load needed to keep the operation from being scheduled about the asm block
5248 %4 = load <16 x i32>, <16 x i32>* %a2
5249 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5253 define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5254 ; CHECK-LABEL: stack_fold_pmulld_mask_commuted:
5256 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5257 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5260 ; CHECK-NEXT: #NO_APP
5261 ; CHECK-NEXT: kmovd %esi, %k1
5262 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5263 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5265 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5266 %2 = mul <16 x i32> %a1, %a0
5267 %3 = bitcast i16 %mask to <16 x i1>
5268 ; load needed to keep the operation from being scheduled about the asm block
5269 %4 = load <16 x i32>, <16 x i32>* %a2
5270 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5274 define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5275 ; CHECK-LABEL: stack_fold_pmulld_maskz:
5277 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5280 ; CHECK-NEXT: #NO_APP
5281 ; CHECK-NEXT: kmovd %edi, %k1
5282 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5284 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5285 %2 = mul <16 x i32> %a0, %a1
5286 %3 = bitcast i16 %mask to <16 x i1>
5287 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5291 define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5292 ; CHECK-LABEL: stack_fold_pmulld_maskz_commuted:
5294 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5297 ; CHECK-NEXT: #NO_APP
5298 ; CHECK-NEXT: kmovd %edi, %k1
5299 ; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5301 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5302 %2 = mul <16 x i32> %a1, %a0
5303 %3 = bitcast i16 %mask to <16 x i1>
5304 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5308 define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) {
5309 ; CHECK-LABEL: stack_fold_pmullq:
5311 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5314 ; CHECK-NEXT: #NO_APP
5315 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5317 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5318 %2 = mul <8 x i64> %a0, %a1
5322 define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5323 ; CHECK-LABEL: stack_fold_pmullq_commuted:
5325 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5328 ; CHECK-NEXT: #NO_APP
5329 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5331 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5332 %2 = mul <8 x i64> %a1, %a0
5336 define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5337 ; CHECK-LABEL: stack_fold_pmullq_mask:
5339 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5340 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5343 ; CHECK-NEXT: #NO_APP
5344 ; CHECK-NEXT: kmovd %esi, %k1
5345 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5346 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5348 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5349 %2 = mul <8 x i64> %a0, %a1
5350 %3 = bitcast i8 %mask to <8 x i1>
5351 ; load needed to keep the operation from being scheduled about the asm block
5352 %4 = load <8 x i64>, <8 x i64>* %a2
5353 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5357 define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5358 ; CHECK-LABEL: stack_fold_pmullq_mask_commuted:
5360 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5361 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5364 ; CHECK-NEXT: #NO_APP
5365 ; CHECK-NEXT: kmovd %esi, %k1
5366 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5367 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5369 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5370 %2 = mul <8 x i64> %a1, %a0
5371 %3 = bitcast i8 %mask to <8 x i1>
5372 ; load needed to keep the operation from being scheduled about the asm block
5373 %4 = load <8 x i64>, <8 x i64>* %a2
5374 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5378 define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5379 ; CHECK-LABEL: stack_fold_pmullq_maskz:
5381 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5384 ; CHECK-NEXT: #NO_APP
5385 ; CHECK-NEXT: kmovd %edi, %k1
5386 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5388 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5389 %2 = mul <8 x i64> %a0, %a1
5390 %3 = bitcast i8 %mask to <8 x i1>
5391 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5395 define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5396 ; CHECK-LABEL: stack_fold_pmullq_maskz_commuted:
5398 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5401 ; CHECK-NEXT: #NO_APP
5402 ; CHECK-NEXT: kmovd %edi, %k1
5403 ; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5405 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5406 %2 = mul <8 x i64> %a1, %a0
5407 %3 = bitcast i8 %mask to <8 x i1>
5408 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5412 define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) {
5413 ; CHECK-LABEL: stack_fold_pmullw:
5415 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5418 ; CHECK-NEXT: #NO_APP
5419 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5421 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5422 %2 = mul <32 x i16> %a0, %a1
5426 define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
5427 ; CHECK-LABEL: stack_fold_pmullw_commuted:
5429 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5432 ; CHECK-NEXT: #NO_APP
5433 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5435 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5436 %2 = mul <32 x i16> %a1, %a0
5440 define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
5441 ; CHECK-LABEL: stack_fold_pmullw_mask:
5443 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5444 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5447 ; CHECK-NEXT: #NO_APP
5448 ; CHECK-NEXT: kmovd %esi, %k1
5449 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5450 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5452 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5453 %2 = mul <32 x i16> %a0, %a1
5454 %3 = bitcast i32 %mask to <32 x i1>
5455 ; load needed to keep the operation from being scheduled about the asm block
5456 %4 = load <32 x i16>, <32 x i16>* %a2
5457 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5461 define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
5462 ; CHECK-LABEL: stack_fold_pmullw_mask_commuted:
5464 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5465 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5468 ; CHECK-NEXT: #NO_APP
5469 ; CHECK-NEXT: kmovd %esi, %k1
5470 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5471 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5473 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5474 %2 = mul <32 x i16> %a1, %a0
5475 %3 = bitcast i32 %mask to <32 x i1>
5476 ; load needed to keep the operation from being scheduled about the asm block
5477 %4 = load <32 x i16>, <32 x i16>* %a2
5478 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5482 define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5483 ; CHECK-LABEL: stack_fold_pmullw_maskz:
5485 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5488 ; CHECK-NEXT: #NO_APP
5489 ; CHECK-NEXT: kmovd %edi, %k1
5490 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5492 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5493 %2 = mul <32 x i16> %a0, %a1
5494 %3 = bitcast i32 %mask to <32 x i1>
5495 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5499 define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5500 ; CHECK-LABEL: stack_fold_pmullw_maskz_commuted:
5502 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5505 ; CHECK-NEXT: #NO_APP
5506 ; CHECK-NEXT: kmovd %edi, %k1
5507 ; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5509 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5510 %2 = mul <32 x i16> %a1, %a0
5511 %3 = bitcast i32 %mask to <32 x i1>
5512 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5516 define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) {
5517 ; CHECK-LABEL: stack_fold_pmuldq:
5519 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5522 ; CHECK-NEXT: #NO_APP
5523 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5525 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5526 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5527 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5528 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5529 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5530 %6 = mul <8 x i64> %3, %5
5534 define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5535 ; CHECK-LABEL: stack_fold_pmuldq_commuted:
5537 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5540 ; CHECK-NEXT: #NO_APP
5541 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5544 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5545 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5546 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5547 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5548 %6 = mul <8 x i64> %5, %3
5552 define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5553 ; CHECK-LABEL: stack_fold_pmuldq_mask:
5555 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5556 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5559 ; CHECK-NEXT: #NO_APP
5560 ; CHECK-NEXT: kmovd %esi, %k1
5561 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5562 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5564 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5565 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5566 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5567 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5568 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5569 %6 = mul <8 x i64> %3, %5
5570 %7 = bitcast i8 %mask to <8 x i1>
5571 ; load needed to keep the operation from being scheduled about the asm block
5572 %8 = load <8 x i64>, <8 x i64>* %a2
5573 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5577 define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5578 ; CHECK-LABEL: stack_fold_pmuldq_mask_commuted:
5580 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5581 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5584 ; CHECK-NEXT: #NO_APP
5585 ; CHECK-NEXT: kmovd %esi, %k1
5586 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5587 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5589 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5590 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5591 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5592 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5593 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5594 %6 = mul <8 x i64> %5, %3
5595 %7 = bitcast i8 %mask to <8 x i1>
5596 ; load needed to keep the operation from being scheduled about the asm block
5597 %8 = load <8 x i64>, <8 x i64>* %a2
5598 %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5602 define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5603 ; CHECK-LABEL: stack_fold_pmuldq_maskz:
5605 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5608 ; CHECK-NEXT: #NO_APP
5609 ; CHECK-NEXT: kmovd %edi, %k1
5610 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5612 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5613 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5614 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5615 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5616 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5617 %6 = mul <8 x i64> %3, %5
5618 %7 = bitcast i8 %mask to <8 x i1>
5619 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5623 define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5624 ; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted:
5626 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5629 ; CHECK-NEXT: #NO_APP
5630 ; CHECK-NEXT: kmovd %edi, %k1
5631 ; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5633 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5634 %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5635 %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5636 %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5637 %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5638 %6 = mul <8 x i64> %5, %3
5639 %7 = bitcast i8 %mask to <8 x i1>
5640 %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5647 define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) {
5648 ; CHECK-LABEL: stack_fold_pmuludq:
5650 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5653 ; CHECK-NEXT: #NO_APP
5654 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5656 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5657 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5658 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5659 %4 = mul <8 x i64> %2, %3
5663 define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5664 ; CHECK-LABEL: stack_fold_pmuludq_commuted:
5666 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5669 ; CHECK-NEXT: #NO_APP
5670 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5672 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5673 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5674 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5675 %4 = mul <8 x i64> %3, %2
5679 define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5680 ; CHECK-LABEL: stack_fold_pmuludq_mask:
5682 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5683 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5686 ; CHECK-NEXT: #NO_APP
5687 ; CHECK-NEXT: kmovd %esi, %k1
5688 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5689 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5691 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5692 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5693 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5694 %4 = mul <8 x i64> %2, %3
5695 %5 = bitcast i8 %mask to <8 x i1>
5696 ; load needed to keep the operation from being scheduled about the asm block
5697 %6 = load <8 x i64>, <8 x i64>* %a2
5698 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5702 define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5703 ; CHECK-LABEL: stack_fold_pmuludq_mask_commuted:
5705 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5706 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
5709 ; CHECK-NEXT: #NO_APP
5710 ; CHECK-NEXT: kmovd %esi, %k1
5711 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
5712 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5714 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5715 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5716 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5717 %4 = mul <8 x i64> %3, %2
5718 %5 = bitcast i8 %mask to <8 x i1>
5719 ; load needed to keep the operation from being scheduled about the asm block
5720 %6 = load <8 x i64>, <8 x i64>* %a2
5721 %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5725 define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5726 ; CHECK-LABEL: stack_fold_pmuludq_maskz:
5728 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5731 ; CHECK-NEXT: #NO_APP
5732 ; CHECK-NEXT: kmovd %edi, %k1
5733 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5735 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5736 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5737 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5738 %4 = mul <8 x i64> %2, %3
5739 %5 = bitcast i8 %mask to <8 x i1>
5740 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5744 define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5745 ; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted:
5747 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5750 ; CHECK-NEXT: #NO_APP
5751 ; CHECK-NEXT: kmovd %edi, %k1
5752 ; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5754 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5755 %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5756 %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5757 %4 = mul <8 x i64> %3, %2
5758 %5 = bitcast i8 %mask to <8 x i1>
5759 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5763 define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) {
5764 ; CHECK-LABEL: stack_fold_vpopcntd:
5766 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5769 ; CHECK-NEXT: #NO_APP
5770 ; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5772 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5773 %2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0)
5776 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readonly
5778 define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) {
5779 ; CHECK-LABEL: stack_fold_vpopcntq:
5781 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5784 ; CHECK-NEXT: #NO_APP
5785 ; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5788 %2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0)
5791 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
5793 define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) {
5794 ; CHECK-LABEL: stack_fold_pord:
5796 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5799 ; CHECK-NEXT: #NO_APP
5800 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5803 %2 = or <16 x i32> %a0, %a1
5807 define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5808 ; CHECK-LABEL: stack_fold_pord_commuted:
5810 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5813 ; CHECK-NEXT: #NO_APP
5814 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5816 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5817 %2 = or <16 x i32> %a1, %a0
5821 define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5822 ; CHECK-LABEL: stack_fold_pord_mask:
5824 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5825 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
5828 ; CHECK-NEXT: #NO_APP
5829 ; CHECK-NEXT: kmovd %esi, %k1
5830 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
5831 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5833 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5834 %2 = or <16 x i32> %a0, %a1
5835 %3 = bitcast i16 %mask to <16 x i1>
5836 ; load needed to keep the operation from being scheduled about the asm block
5837 %4 = load <16 x i32>, <16 x i32>* %a2
5838 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5842 define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5843 ; CHECK-LABEL: stack_fold_pord_mask_commuted:
5845 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5846 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
5849 ; CHECK-NEXT: #NO_APP
5850 ; CHECK-NEXT: kmovd %esi, %k1
5851 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
5852 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5854 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5855 %2 = or <16 x i32> %a1, %a0
5856 %3 = bitcast i16 %mask to <16 x i1>
5857 ; load needed to keep the operation from being scheduled about the asm block
5858 %4 = load <16 x i32>, <16 x i32>* %a2
5859 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5863 define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5864 ; CHECK-LABEL: stack_fold_pord_maskz:
5866 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5869 ; CHECK-NEXT: #NO_APP
5870 ; CHECK-NEXT: kmovd %edi, %k1
5871 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5873 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5874 %2 = or <16 x i32> %a0, %a1
5875 %3 = bitcast i16 %mask to <16 x i1>
5876 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5880 define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5881 ; CHECK-LABEL: stack_fold_pord_maskz_commuted:
5883 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5886 ; CHECK-NEXT: #NO_APP
5887 ; CHECK-NEXT: kmovd %edi, %k1
5888 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5890 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5891 %2 = or <16 x i32> %a1, %a0
5892 %3 = bitcast i16 %mask to <16 x i1>
5893 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5897 define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) {
5898 ; CHECK-LABEL: stack_fold_porq:
5900 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5903 ; CHECK-NEXT: #NO_APP
5904 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5906 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5907 %2 = or <8 x i64> %a0, %a1
5911 define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5912 ; CHECK-LABEL: stack_fold_porq_commuted:
5914 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5917 ; CHECK-NEXT: #NO_APP
5918 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5920 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5921 %2 = or <8 x i64> %a1, %a0
5925 define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5926 ; CHECK-LABEL: stack_fold_porq_mask:
5928 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5929 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
5932 ; CHECK-NEXT: #NO_APP
5933 ; CHECK-NEXT: kmovd %esi, %k1
5934 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
5935 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5937 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5938 %2 = or <8 x i64> %a0, %a1
5939 %3 = bitcast i8 %mask to <8 x i1>
5940 ; load needed to keep the operation from being scheduled about the asm block
5941 %4 = load <8 x i64>, <8 x i64>* %a2
5942 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5946 define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5947 ; CHECK-LABEL: stack_fold_porq_mask_commuted:
5949 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5950 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
5953 ; CHECK-NEXT: #NO_APP
5954 ; CHECK-NEXT: kmovd %esi, %k1
5955 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
5956 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5958 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5959 %2 = or <8 x i64> %a1, %a0
5960 %3 = bitcast i8 %mask to <8 x i1>
5961 ; load needed to keep the operation from being scheduled about the asm block
5962 %4 = load <8 x i64>, <8 x i64>* %a2
5963 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5967 define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5968 ; CHECK-LABEL: stack_fold_porq_maskz:
5970 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5973 ; CHECK-NEXT: #NO_APP
5974 ; CHECK-NEXT: kmovd %edi, %k1
5975 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5977 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5978 %2 = or <8 x i64> %a0, %a1
5979 %3 = bitcast i8 %mask to <8 x i1>
5980 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5984 define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5985 ; CHECK-LABEL: stack_fold_porq_maskz_commuted:
5987 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5990 ; CHECK-NEXT: #NO_APP
5991 ; CHECK-NEXT: kmovd %edi, %k1
5992 ; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5994 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5995 %2 = or <8 x i64> %a1, %a0
5996 %3 = bitcast i8 %mask to <8 x i1>
5997 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
6001 define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) {
6002 ; CHECK-LABEL: stack_fold_psadbw:
6004 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6007 ; CHECK-NEXT: #NO_APP
6008 ; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6010 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6011 %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1)
6014 declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone
6016 define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
6017 ; CHECK-LABEL: stack_fold_pshufb_zmm:
6019 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6022 ; CHECK-NEXT: #NO_APP
6023 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6025 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6026 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6029 declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
6031 define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6032 ; CHECK-LABEL: stack_fold_pshufb_zmm_mask:
6034 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6037 ; CHECK-NEXT: #NO_APP
6038 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
6039 ; CHECK-NEXT: kmovq %rsi, %k1
6040 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6041 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
6043 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6044 %2 = load <64 x i8>, <64 x i8>* %passthru
6045 %3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6046 %4 = bitcast i64 %mask to <64 x i1>
6047 %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %2
6051 define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6052 ; CHECK-LABEL: stack_fold_pshufb_zmm_maskz:
6054 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6057 ; CHECK-NEXT: #NO_APP
6058 ; CHECK-NEXT: kmovq %rdi, %k1
6059 ; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6061 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6062 %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6063 %3 = bitcast i64 %mask to <64 x i1>
6064 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
6068 define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) {
6069 ; CHECK-LABEL: stack_fold_pshufd_zmm:
6071 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6074 ; CHECK-NEXT: #NO_APP
6075 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6076 ; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6077 ; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
6078 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
6080 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6081 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6082 %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
6086 define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
6087 ; CHECK-LABEL: stack_fold_pshufd_zmm_mask:
6089 ; CHECK-NEXT: subq $56, %rsp
6090 ; CHECK-NEXT: .cfi_def_cfa_offset 64
6091 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6092 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6095 ; CHECK-NEXT: #NO_APP
6096 ; CHECK-NEXT: kmovd %edi, %k1
6097 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6098 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6099 ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6100 ; CHECK-NEXT: addq $56, %rsp
6101 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6103 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6104 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6105 %3 = bitcast i16 %mask to <16 x i1>
6106 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %passthru
6110 define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) {
6111 ; CHECK-LABEL: stack_fold_pshufd_zmm_maskz:
6113 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6116 ; CHECK-NEXT: #NO_APP
6117 ; CHECK-NEXT: kmovd %edi, %k1
6118 ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6119 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6121 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6122 %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6123 %3 = bitcast i16 %mask to <16 x i1>
6124 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6128 define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) {
6129 ; CHECK-LABEL: stack_fold_pshufhw_zmm:
6131 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6134 ; CHECK-NEXT: #NO_APP
6135 ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6136 ; CHECK-NEXT: # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6138 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6139 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6143 define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6144 ; CHECK-LABEL: stack_fold_pshufhw_zmm_mask:
6146 ; CHECK-NEXT: subq $56, %rsp
6147 ; CHECK-NEXT: .cfi_def_cfa_offset 64
6148 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6149 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6152 ; CHECK-NEXT: #NO_APP
6153 ; CHECK-NEXT: kmovd %edi, %k1
6154 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6155 ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6156 ; CHECK-NEXT: # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6157 ; CHECK-NEXT: addq $56, %rsp
6158 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6160 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6161 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6162 %3 = bitcast i32 %mask to <32 x i1>
6163 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6167 define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6168 ; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz:
6170 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6173 ; CHECK-NEXT: #NO_APP
6174 ; CHECK-NEXT: kmovd %edi, %k1
6175 ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6176 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6178 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6179 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6180 %3 = bitcast i32 %mask to <32 x i1>
6181 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6185 define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) {
6186 ; CHECK-LABEL: stack_fold_pshuflw_zmm:
6188 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6191 ; CHECK-NEXT: #NO_APP
6192 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6193 ; CHECK-NEXT: # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6196 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6200 define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6201 ; CHECK-LABEL: stack_fold_pshuflw_zmm_mask:
6203 ; CHECK-NEXT: subq $56, %rsp
6204 ; CHECK-NEXT: .cfi_def_cfa_offset 64
6205 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6206 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6209 ; CHECK-NEXT: #NO_APP
6210 ; CHECK-NEXT: kmovd %edi, %k1
6211 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6212 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6213 ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6214 ; CHECK-NEXT: addq $56, %rsp
6215 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6217 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6218 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6219 %3 = bitcast i32 %mask to <32 x i1>
6220 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6224 define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6225 ; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz:
6227 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6230 ; CHECK-NEXT: #NO_APP
6231 ; CHECK-NEXT: kmovd %edi, %k1
6232 ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6233 ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6235 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6236 %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6237 %3 = bitcast i32 %mask to <32 x i1>
6238 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6242 define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) {
6243 ; CHECK-LABEL: stack_fold_pslld:
6245 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6248 ; CHECK-NEXT: #NO_APP
6249 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6251 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6252 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6255 declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6257 define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6258 ; CHECK-LABEL: stack_fold_pslld_mask:
6260 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6263 ; CHECK-NEXT: #NO_APP
6264 ; CHECK-NEXT: kmovd %esi, %k1
6265 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
6266 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
6267 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
6269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6270 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6271 %3 = bitcast i16 %mask to <16 x i1>
6272 %4 = load <16 x i32>, <16 x i32>* %passthru
6273 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6277 define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6278 ; CHECK-LABEL: stack_fold_pslld_maskz:
6280 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6283 ; CHECK-NEXT: #NO_APP
6284 ; CHECK-NEXT: kmovd %edi, %k1
6285 ; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload
6287 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6288 %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6289 %3 = bitcast i16 %mask to <16 x i1>
6290 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6294 define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) {
6295 ; CHECK-LABEL: stack_fold_pslldi:
6297 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6300 ; CHECK-NEXT: #NO_APP
6301 ; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6303 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6304 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6307 declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
6309 define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, i16 %mask) {
6310 ; CHECK-LABEL: stack_fold_pslldi_mask:
6312 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6315 ; CHECK-NEXT: #NO_APP
6316 ; CHECK-NEXT: kmovd %esi, %k1
6317 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
6318 ; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
6319 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
6321 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6322 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6323 %3 = bitcast i16 %mask to <16 x i1>
6324 %4 = load <16 x i32>, <16 x i32>* %passthru
6325 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6329 define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) {
6330 ; CHECK-LABEL: stack_fold_pslldi_maskz:
6332 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6335 ; CHECK-NEXT: #NO_APP
6336 ; CHECK-NEXT: kmovd %edi, %k1
6337 ; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6339 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6340 %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6341 %3 = bitcast i16 %mask to <16 x i1>
6342 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6346 define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) {
6347 ; CHECK-LABEL: stack_fold_pslldq:
6349 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6352 ; CHECK-NEXT: #NO_APP
6353 ; CHECK-NEXT: vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6354 ; CHECK-NEXT: # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
6356 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6357 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
6361 define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) {
6362 ; CHECK-LABEL: stack_fold_psllq:
6364 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6367 ; CHECK-NEXT: #NO_APP
6368 ; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6370 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6371 %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1)
6374 declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6376 define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) {
6377 ; CHECK-LABEL: stack_fold_psllqi:
6379 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6382 ; CHECK-NEXT: #NO_APP
6383 ; CHECK-NEXT: vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6385 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6386 %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1)
6389 declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
6391 define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) {
6392 ; CHECK-LABEL: stack_fold_psllvd:
6394 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6397 ; CHECK-NEXT: #NO_APP
6398 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6400 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6401 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6404 declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6406 define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6407 ; CHECK-LABEL: stack_fold_psllvd_mask:
6409 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6412 ; CHECK-NEXT: #NO_APP
6413 ; CHECK-NEXT: kmovd %esi, %k1
6414 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
6415 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6416 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
6418 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6419 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6420 %3 = bitcast i16 %mask to <16 x i1>
6421 %4 = load <16 x i32>, <16 x i32>* %passthru
6422 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6426 define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6427 ; CHECK-LABEL: stack_fold_psllvd_maskz:
6429 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6432 ; CHECK-NEXT: #NO_APP
6433 ; CHECK-NEXT: kmovd %edi, %k1
6434 ; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6436 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6437 %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6438 %3 = bitcast i16 %mask to <16 x i1>
6439 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6443 define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) {
6444 ; CHECK-LABEL: stack_fold_psllvq:
6446 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6449 ; CHECK-NEXT: #NO_APP
6450 ; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6452 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6453 %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6456 declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6458 define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) {
6459 ; CHECK-LABEL: stack_fold_psllvw:
6461 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6464 ; CHECK-NEXT: #NO_APP
6465 ; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6467 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6468 %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6471 declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6473 define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) {
6474 ; CHECK-LABEL: stack_fold_psllw:
6476 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6479 ; CHECK-NEXT: #NO_APP
6480 ; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6482 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6483 %2 = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1)
6486 declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6488 define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) {
6489 ; CHECK-LABEL: stack_fold_psllwi:
6491 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6494 ; CHECK-NEXT: #NO_APP
6495 ; CHECK-NEXT: vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6497 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6498 %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1)
6501 declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone
6503 define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) {
6504 ; CHECK-LABEL: stack_fold_psrad:
6506 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6509 ; CHECK-NEXT: #NO_APP
6510 ; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6513 %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1)
6516 declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6518 define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) {
6519 ; CHECK-LABEL: stack_fold_psradi:
6521 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6524 ; CHECK-NEXT: #NO_APP
6525 ; CHECK-NEXT: vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6527 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6528 %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1)
6531 declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
6533 define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) {
6534 ; CHECK-LABEL: stack_fold_psraq:
6536 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6539 ; CHECK-NEXT: #NO_APP
6540 ; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6542 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6543 %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1)
6546 declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6548 define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) {
6549 ; CHECK-LABEL: stack_fold_psraqi:
6551 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6554 ; CHECK-NEXT: #NO_APP
6555 ; CHECK-NEXT: vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6557 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6558 %2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1)
6561 declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
6563 define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) {
6564 ; CHECK-LABEL: stack_fold_psravd:
6566 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6569 ; CHECK-NEXT: #NO_APP
6570 ; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6572 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6573 %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
6576 declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6578 define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) {
6579 ; CHECK-LABEL: stack_fold_psravq:
6581 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6584 ; CHECK-NEXT: #NO_APP
6585 ; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6587 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6588 %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
6591 declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6593 define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) {
6594 ; CHECK-LABEL: stack_fold_psravw:
6596 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6599 ; CHECK-NEXT: #NO_APP
6600 ; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6602 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6603 %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1)
6606 declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6608 define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) {
6609 ; CHECK-LABEL: stack_fold_psraw:
6611 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6614 ; CHECK-NEXT: #NO_APP
6615 ; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6617 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6618 %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1)
6621 declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6623 define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) {
6624 ; CHECK-LABEL: stack_fold_psrawi:
6626 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6629 ; CHECK-NEXT: #NO_APP
6630 ; CHECK-NEXT: vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6632 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6633 %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1)
6636 declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone
6638 define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) {
6639 ; CHECK-LABEL: stack_fold_psrld:
6641 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6644 ; CHECK-NEXT: #NO_APP
6645 ; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6647 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6648 %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1)
6651 declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6653 define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) {
6654 ; CHECK-LABEL: stack_fold_psrldi:
6656 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6659 ; CHECK-NEXT: #NO_APP
6660 ; CHECK-NEXT: vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6662 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6663 %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1)
6666 declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
6668 define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) {
6669 ; CHECK-LABEL: stack_fold_psrldq:
6671 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6674 ; CHECK-NEXT: #NO_APP
6675 ; CHECK-NEXT: vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6676 ; CHECK-NEXT: # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
6678 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6679 %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
6683 define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) {
6684 ; CHECK-LABEL: stack_fold_psrlq:
6686 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6689 ; CHECK-NEXT: #NO_APP
6690 ; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6692 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6693 %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1)
6696 declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6698 define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) {
6699 ; CHECK-LABEL: stack_fold_psrlqi:
6701 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6704 ; CHECK-NEXT: #NO_APP
6705 ; CHECK-NEXT: vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6707 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6708 %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1)
6711 declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
6713 define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) {
6714 ; CHECK-LABEL: stack_fold_psrlvd:
6716 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6719 ; CHECK-NEXT: #NO_APP
6720 ; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6722 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6723 %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6726 declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6728 define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) {
6729 ; CHECK-LABEL: stack_fold_psrlvq:
6731 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6734 ; CHECK-NEXT: #NO_APP
6735 ; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6737 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6738 %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6741 declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6743 define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) {
6744 ; CHECK-LABEL: stack_fold_psrlvw:
6746 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6749 ; CHECK-NEXT: #NO_APP
6750 ; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6752 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6753 %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6756 declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6758 define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) {
6759 ; CHECK-LABEL: stack_fold_psrlw:
6761 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6764 ; CHECK-NEXT: #NO_APP
6765 ; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6767 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6768 %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1)
6771 declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6773 define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) {
6774 ; CHECK-LABEL: stack_fold_psrlwi:
6776 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6779 ; CHECK-NEXT: #NO_APP
6780 ; CHECK-NEXT: vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6782 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6783 %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1)
6786 declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone
6788 define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) {
6789 ; CHECK-LABEL: stack_fold_psubb:
6791 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6794 ; CHECK-NEXT: #NO_APP
6795 ; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6797 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6798 %2 = sub <64 x i8> %a0, %a1
6802 define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) {
6803 ; CHECK-LABEL: stack_fold_psubd:
6805 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6808 ; CHECK-NEXT: #NO_APP
6809 ; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6812 %2 = sub <16 x i32> %a0, %a1
6816 define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) {
6817 ; CHECK-LABEL: stack_fold_psubq:
6819 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6822 ; CHECK-NEXT: #NO_APP
6823 ; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6825 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6826 %2 = sub <8 x i64> %a0, %a1
6830 define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) {
6831 ; CHECK-LABEL: stack_fold_psubsb:
6833 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6836 ; CHECK-NEXT: #NO_APP
6837 ; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6840 %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6844 define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) {
6845 ; CHECK-LABEL: stack_fold_psubsw:
6847 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6850 ; CHECK-NEXT: #NO_APP
6851 ; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6853 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6854 %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6858 define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) {
6859 ; CHECK-LABEL: stack_fold_psubusb:
6861 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6864 ; CHECK-NEXT: #NO_APP
6865 ; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6868 %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6872 define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) {
6873 ; CHECK-LABEL: stack_fold_psubusw:
6875 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6878 ; CHECK-NEXT: #NO_APP
6879 ; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6881 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6882 %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6886 define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) {
6887 ; CHECK-LABEL: stack_fold_psubw:
6889 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6892 ; CHECK-NEXT: #NO_APP
6893 ; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6895 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6896 %2 = sub <32 x i16> %a0, %a1
6900 define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) {
6901 ; CHECK-LABEL: stack_fold_shufi64x2:
6903 ; CHECK-NEXT: subq $56, %rsp
6904 ; CHECK-NEXT: .cfi_def_cfa_offset 64
6905 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6906 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6909 ; CHECK-NEXT: #NO_APP
6910 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6911 ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6912 ; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
6913 ; CHECK-NEXT: addq $56, %rsp
6914 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6916 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6917 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6921 define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) {
6922 ; CHECK-LABEL: stack_fold_shufi64x2_mask:
6924 ; CHECK-NEXT: subq $56, %rsp
6925 ; CHECK-NEXT: .cfi_def_cfa_offset 64
6926 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6927 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6930 ; CHECK-NEXT: #NO_APP
6931 ; CHECK-NEXT: kmovd %edi, %k1
6932 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1
6933 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6934 ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
6935 ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
6936 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
6937 ; CHECK-NEXT: addq $56, %rsp
6938 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6940 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6941 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6942 %3 = bitcast i8 %mask to <8 x i1>
6943 ; load needed to keep the operation from being scheduled above the asm block
6944 %4 = load <8 x i64>, <8 x i64>* %passthru
6945 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
6949 define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) {
6950 ; CHECK-LABEL: stack_fold_shufi64x2_maskz:
6952 ; CHECK-NEXT: subq $56, %rsp
6953 ; CHECK-NEXT: .cfi_def_cfa_offset 64
6954 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6955 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6958 ; CHECK-NEXT: #NO_APP
6959 ; CHECK-NEXT: kmovd %edi, %k1
6960 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6961 ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6962 ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
6963 ; CHECK-NEXT: addq $56, %rsp
6964 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6966 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6967 %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6968 %3 = bitcast i8 %mask to <8 x i1>
6969 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
6973 define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, <16 x i32>* %passthru) {
6974 ; CHECK-LABEL: stack_fold_shufi32x4_mask:
6976 ; CHECK-NEXT: subq $56, %rsp
6977 ; CHECK-NEXT: .cfi_def_cfa_offset 64
6978 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6979 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6982 ; CHECK-NEXT: #NO_APP
6983 ; CHECK-NEXT: kmovd %edi, %k1
6984 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1
6985 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6986 ; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
6987 ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
6988 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
6989 ; CHECK-NEXT: addq $56, %rsp
6990 ; CHECK-NEXT: .cfi_def_cfa_offset 8
6992 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6993 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
6994 %3 = bitcast i16 %mask to <16 x i1>
6995 ; load needed to keep the operation from being scheduled above the asm block
6996 %4 = load <16 x i32>, <16 x i32>* %passthru
6997 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7001 define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
7002 ; CHECK-LABEL: stack_fold_shufi32x4_maskz:
7004 ; CHECK-NEXT: subq $56, %rsp
7005 ; CHECK-NEXT: .cfi_def_cfa_offset 64
7006 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7007 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7010 ; CHECK-NEXT: #NO_APP
7011 ; CHECK-NEXT: kmovd %edi, %k1
7012 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7013 ; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7014 ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
7015 ; CHECK-NEXT: addq $56, %rsp
7016 ; CHECK-NEXT: .cfi_def_cfa_offset 8
7018 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7019 %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
7020 %3 = bitcast i16 %mask to <16 x i1>
7021 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7025 define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
7026 ; CHECK-LABEL: stack_fold_ternlogd:
7028 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7031 ; CHECK-NEXT: #NO_APP
7032 ; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7034 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7035 %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
7038 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
7040 define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
7041 ; CHECK-LABEL: stack_fold_ternlogq:
7043 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7046 ; CHECK-NEXT: #NO_APP
7047 ; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7049 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7050 %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
7054 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
7056 define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
7057 ; CHECK-LABEL: stack_fold_punpckhbw_zmm:
7059 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7062 ; CHECK-NEXT: #NO_APP
7063 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7064 ; CHECK-NEXT: # zmm0 = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7066 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7067 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7071 define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7072 ; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm:
7074 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7077 ; CHECK-NEXT: #NO_APP
7078 ; CHECK-NEXT: kmovq %rsi, %k1
7079 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
7080 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
7081 ; CHECK-NEXT: # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7082 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
7084 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7085 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7086 %3 = bitcast i64 %mask to <64 x i1>
7087 ; load needed to keep the operation from being scheduled about the asm block
7088 %4 = load <64 x i8>, <64 x i8>* %passthru
7089 %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
7093 define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7094 ; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm:
7096 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7099 ; CHECK-NEXT: #NO_APP
7100 ; CHECK-NEXT: kmovq %rdi, %k1
7101 ; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7102 ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7104 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7105 %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7106 %3 = bitcast i64 %mask to <64 x i1>
7107 %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
7111 define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) {
7112 ; CHECK-LABEL: stack_fold_pxord:
7114 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7117 ; CHECK-NEXT: #NO_APP
7118 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7120 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7121 %2 = xor <16 x i32> %a0, %a1
7125 define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
7126 ; CHECK-LABEL: stack_fold_pxord_commuted:
7128 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7131 ; CHECK-NEXT: #NO_APP
7132 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7134 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7135 %2 = xor <16 x i32> %a1, %a0
7139 define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
7140 ; CHECK-LABEL: stack_fold_pxord_mask:
7142 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7143 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
7146 ; CHECK-NEXT: #NO_APP
7147 ; CHECK-NEXT: kmovd %esi, %k1
7148 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
7149 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7151 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7152 %2 = xor <16 x i32> %a0, %a1
7153 %3 = bitcast i16 %mask to <16 x i1>
7154 ; load needed to keep the operation from being scheduled about the asm block
7155 %4 = load <16 x i32>, <16 x i32>* %a2
7156 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7160 define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
7161 ; CHECK-LABEL: stack_fold_pxord_mask_commuted:
7163 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7164 ; CHECK-NEXT: vmovaps %zmm0, %zmm1
7167 ; CHECK-NEXT: #NO_APP
7168 ; CHECK-NEXT: kmovd %esi, %k1
7169 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
7170 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7172 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7173 %2 = xor <16 x i32> %a1, %a0
7174 %3 = bitcast i16 %mask to <16 x i1>
7175 ; load needed to keep the operation from being scheduled about the asm block
7176 %4 = load <16 x i32>, <16 x i32>* %a2
7177 %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7181 define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7182 ; CHECK-LABEL: stack_fold_pxord_maskz:
7184 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7187 ; CHECK-NEXT: #NO_APP
7188 ; CHECK-NEXT: kmovd %edi, %k1
7189 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7191 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7192 %2 = xor <16 x i32> %a0, %a1
7193 %3 = bitcast i16 %mask to <16 x i1>
7194 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7198 define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7199 ; CHECK-LABEL: stack_fold_pxord_maskz_commuted:
7201 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7204 ; CHECK-NEXT: #NO_APP
7205 ; CHECK-NEXT: kmovd %edi, %k1
7206 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7208 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7209 %2 = xor <16 x i32> %a1, %a0
7210 %3 = bitcast i16 %mask to <16 x i1>
7211 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7215 define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) {
7216 ; CHECK-LABEL: stack_fold_pxorq:
7218 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7221 ; CHECK-NEXT: #NO_APP
7222 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7224 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7225 %2 = xor <8 x i64> %a0, %a1
7229 define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
7230 ; CHECK-LABEL: stack_fold_pxorq_commuted:
7232 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7235 ; CHECK-NEXT: #NO_APP
7236 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7238 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7239 %2 = xor <8 x i64> %a1, %a0
7243 define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
7244 ; CHECK-LABEL: stack_fold_pxorq_mask:
7246 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7247 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
7250 ; CHECK-NEXT: #NO_APP
7251 ; CHECK-NEXT: kmovd %esi, %k1
7252 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
7253 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7255 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7256 %2 = xor <8 x i64> %a0, %a1
7257 %3 = bitcast i8 %mask to <8 x i1>
7258 ; load needed to keep the operation from being scheduled about the asm block
7259 %4 = load <8 x i64>, <8 x i64>* %a2
7260 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7264 define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
7265 ; CHECK-LABEL: stack_fold_pxorq_mask_commuted:
7267 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7268 ; CHECK-NEXT: vmovapd %zmm0, %zmm1
7271 ; CHECK-NEXT: #NO_APP
7272 ; CHECK-NEXT: kmovd %esi, %k1
7273 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
7274 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7276 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7277 %2 = xor <8 x i64> %a1, %a0
7278 %3 = bitcast i8 %mask to <8 x i1>
7279 ; load needed to keep the operation from being scheduled about the asm block
7280 %4 = load <8 x i64>, <8 x i64>* %a2
7281 %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7285 define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7286 ; CHECK-LABEL: stack_fold_pxorq_maskz:
7288 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7291 ; CHECK-NEXT: #NO_APP
7292 ; CHECK-NEXT: kmovd %edi, %k1
7293 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7295 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7296 %2 = xor <8 x i64> %a0, %a1
7297 %3 = bitcast i8 %mask to <8 x i1>
7298 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7302 define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7303 ; CHECK-LABEL: stack_fold_pxorq_maskz_commuted:
7305 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7308 ; CHECK-NEXT: #NO_APP
7309 ; CHECK-NEXT: kmovd %edi, %k1
7310 ; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7312 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7313 %2 = xor <8 x i64> %a1, %a0
7314 %3 = bitcast i8 %mask to <8 x i1>
7315 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7319 declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
7320 declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)
7321 declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
7322 declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>)
7323 declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>)
7324 declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>)
7325 declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
7326 declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1)
7327 declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
7328 declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)
7329 declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
7330 declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)