; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
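;
; (A note on the mechanism, as far as it can be read off the RUN line and the
; asm below: the "nop" returns in an xmm register and clobbers xmm2-xmm31 and
; the flags, so with %a0 and %a1 live in zmm0/zmm1 across the call the
; allocator has no spare vector register and must spill an operand to the
; stack; the CHECK lines then assert that the reload is folded into the ALU
; instruction as a memory operand. -disable-peephole presumably ensures the
; fold comes from the spill-slot folding path rather than the peephole load
; folder.)
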
define <8 x double> @stack_fold_addpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_addpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <8 x double> @stack_fold_addpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_addpd_zmm_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_addps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_addps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a0, %a1
  ret <16 x float> %2
}

define <16 x float> @stack_fold_addps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_addps_zmm_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_addps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_addps_zmm_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_addps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_addps_zmm_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define double @stack_fold_addsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_addsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_addsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fadd double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_addss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_addss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_addss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fadd float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_andnpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_andnpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = xor <8 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %5 = and <8 x i64> %4, %3
  %6 = bitcast <8 x i64> %5 to <8 x double>
  ; fadd forces execution domain
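  ; (Without the add, the and/xor of the bitcast <8 x i64> values could just
  ;  as well be selected in the integer domain, e.g. as vpandnq; the fp add of
  ;  zero pins the value to the fp domain, so the vandnpd form checked above
  ;  is the expected lowering.)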
  %7 = fadd <8 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %7
}

define <16 x float> @stack_fold_andnps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_andnps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = xor <16 x i32> %2, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %5 = and <16 x i32> %4, %3
  %6 = bitcast <16 x i32> %5 to <16 x float>
  ; fadd forces execution domain
  %7 = fadd <16 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %7
}

define <8 x double> @stack_fold_andpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_andpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = and <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_andps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_andps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = and <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}

define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_cmppd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %2 = bitcast <8 x i1> %res to i8
  ret i8 %2
}
declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)

define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
; CHECK-LABEL: stack_fold_cmppd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 144
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kandb %k0, %k1, %k1
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
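  ; (Two loads are candidates for folding at this point: the %a2 load below,
  ;  which the checks expect folded into the vaddpd (%rdi), and the spill slot
  ;  of %a0, which is the one that should fold into the vcmpeqpd.)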
  %2 = load <8 x double>, <8 x double>* %a2
  %3 = fadd <8 x double> %a1, %2
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <8 x i1> %4, %5
  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
  ret <8 x double> %7
}

define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
; CHECK-LABEL: stack_fold_cmppd_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 144
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kandb %k0, %k1, %k1
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
  %2 = load <8 x double>, <8 x double>* %a2
  %3 = fadd <8 x double> %a1, %2
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <8 x i1> %4, %5
  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
  ret <8 x double> %7
}

define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %2 = bitcast <16 x i1> %res to i16
  ret i16 %2
}
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)

define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
; CHECK-LABEL: stack_fold_cmpps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 144
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kandw %k0, %k1, %k1
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
  %2 = load <16 x float>, <16 x float>* %a2
  %3 = fadd <16 x float> %a1, %2
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <16 x i1> %4, %5
  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
  ret <16 x float> %7
}

define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
; CHECK-LABEL: stack_fold_cmpps_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 144
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kandw %k0, %k1, %k1
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $136, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
  %2 = load <16 x float>, <16 x float>* %a2
  %3 = fadd <16 x float> %a1, %2
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <16 x i1> %4, %5
  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
  ret <16 x float> %7
}

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fdiv double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_divss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_divss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_divss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fdiv float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_cvtdq2pd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtdq2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = sitofp <8 x i32> %a0 to <8 x double>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_cvtudq2pd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtudq2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtudq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = uitofp <8 x i32> %a0 to <8 x double>
  ret <8 x double> %2
}

define <8 x float> @stack_fold_cvtpd2ps(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtpd2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtpd2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fptrunc <8 x double> %a0 to <8 x float>
  ret <8 x float> %2
}

define <16 x float> @stack_fold_cvtph2ps(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_cvtph2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly

define <16 x i16> @stack_fold_cvtps2ph(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtps2ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2ph $0, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 0, <16 x i16> undef, i16 -1)
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <16 x i16> %1
}
declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly

define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_insertps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinsertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = zero,mem[0],xmm0[2,3]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_maxpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone

define <8 x double> @stack_fold_maxpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_maxps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone

define <16 x float> @stack_fold_maxps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_maxps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_maxps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_minpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone

define <8 x double> @stack_fold_minpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

define <8 x double> @stack_fold_minpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_minpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_minps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone

define <16 x float> @stack_fold_minps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_minps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_minps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_minps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_mulpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_mulpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <8 x double> @stack_fold_mulpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_mulpd_zmm_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_mulpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_mulpd_zmm_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_mulpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_mulpd_zmm_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_mulps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_mulps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a0, %a1
  ret <16 x float> %2
}

define <16 x float> @stack_fold_mulps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_mulps_zmm_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_mulps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_mulps_zmm_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_mulps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_mulps_zmm_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

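; The scalar tests below spill only the scalar value (8-byte vmovsd /
; 4-byte vmovss spills), while the *_int variants operate on the whole
; 128-bit register and therefore use 16-byte spill slots.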
define double @stack_fold_mulsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_mulsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_mulsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fmul double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_mulss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_mulss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_mulss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fmul float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

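; The bitwise or/xor tests bitcast to integer vectors in the IR but add a
; zero vector afterwards ("fadd forces execution domain") so the compiler
; keeps the operation in the FP domain and folds the reload into
; vorpd/vorps/vxorpd/vxorps rather than an integer-domain equivalent.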
define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_orpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = or <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_orps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = or <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}

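; For vshuff64x2/vshuff32x4, each 2-bit immediate field selects a 128-bit
; lane (low two fields from the register source, high two from memory).
; $24 is 0b00011000, giving zmm0[0,1,4,5],mem[2,3,0,1] for f64 elements,
; and $20 is 0b00010100, giving zmm0[0-7],mem[4,5,6,7,0,1,2,3] for f32,
; matching the shuffle comments emitted below.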
define <8 x double> @stack_fold_shuff64x2(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: stack_fold_shuff64x2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_shuff64x2_mask(<8 x double> %a, <8 x double> %b, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_shuff64x2_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm1
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_shuff64x2_maskz(<8 x double> %a, <8 x double> %b, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_shuff64x2_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_shuff32x4_mask(<16 x float> %a, <16 x float> %b, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_shuff32x4_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm1
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_shuff32x4_maskz(<16 x float> %a, <16 x float> %b, i16 %mask) {
; CHECK-LABEL: stack_fold_shuff32x4_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_subpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_subpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <16 x float> @stack_fold_subps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_subps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <16 x float> %a0, %a1
  ret <16 x float> %2
}

define double @stack_fold_subsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_subsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_subsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fsub double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_subss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_subss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_subss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fsub float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_xorpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = xor <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_xorps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = xor <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}

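; stack_fold_extractps clobbers the general-purpose registers rather than
; the vector registers: the extracted scalar is live across the asm block,
; so it must be spilled with a folded vextractps store and reloaded with a
; plain movl afterwards.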
define i32 @stack_fold_extractps(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_extractps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    vextractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = extractelement <4 x float> %a0, i32 1
  %2 = bitcast float %1 to i32
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}

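; The subvector extract tests spill the high subvector directly with a
; folded vextract store; after the asm block only a plain xmm/ymm reload
; remains, with vzeroupper emitted where the result is returned in xmm.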
define <4 x float> @stack_fold_extracti32x4(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_extracti32x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = shufflevector <16 x float> %a0, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x float> %1
}

define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_extractf64x2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = shufflevector <8 x double> %a0, <8 x double> undef, <2 x i32> <i32 6, i32 7>
  %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <2 x double> %1
}

define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_extracti32x8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  %1 = shufflevector <16 x float> %a0, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <8 x float> %1
}

define <4 x double> @stack_fold_extractf64x4(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_extractf64x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  %1 = shufflevector <8 x double> %a0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x double> %1
}

define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_insertf32x8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %2
}

define <8 x double> @stack_fold_insertf64x4(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_insertf64x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_insertf64x4_mask(<8 x double>* %passthru, <4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_insertf64x4_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm2
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_insertf64x4_maskz(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_insertf64x4_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

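; In the two-source permute tests below, vpermt2* overwrites its first
; data operand with the result, while vpermi2* overwrites the index
; operand; both forms fold the second table operand from the stack.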
define <16 x float> @stack_fold_vpermt2ps(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
; CHECK-LABEL: stack_fold_vpermt2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2) {
; CHECK-LABEL: stack_fold_vpermi2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, <16 x i32>* %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermi2ps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x1b = load <16 x i32>, <16 x i32>* %x1
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1b, <16 x float> %x2)
  %3 = bitcast <16 x i32> %x1b to <16 x float>
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %2, <16 x float> %3
  ret <16 x float> %5
}

define <16 x float> @stack_fold_vpermt2ps_mask(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermt2ps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x0b = load <16 x i32>, <16 x i32>* %x0
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %x1
  ret <16 x float> %4
}

define <16 x float> @stack_fold_vpermt2ps_maskz(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermt2ps_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x0b = load <16 x i32>, <16 x i32>* %x0
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
; CHECK-LABEL: stack_fold_vpermt2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermt2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
  %3 = bitcast <8 x i64> %x1 to <8 x double>
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @stack_fold_vpermi2pd(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2) {
; CHECK-LABEL: stack_fold_vpermi2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
  ret <8 x double> %2
}

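; For the immediate vpermpd form below, the 8-bit control is applied to
; each 256-bit half: $235 is 0b11101011, i.e. the per-half pattern
; [3,2,2,3], which yields the [3,2,2,3,7,6,6,7] order in the shuffle
; comments.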
define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  ; fadd forces execution domain
  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %3
}

define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permpd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm0
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permpd_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_permpdvar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a1, <8 x i64> %a0)
  ; fadd forces execution domain
  %3 = fadd <8 x double> %2, zeroinitializer
  ret <8 x double> %3
}
declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) nounwind readonly

define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_permps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) nounwind readonly

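; vpermilpd/vpermilps immediates select elements within each 128-bit lane:
; $85 (0b01010101) swaps each f64 pair, and $27 (0b00011011) reverses the
; four f32s of every lane, matching the shuffle comments below.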
define <8 x double> @stack_fold_permilpd_zmm(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_permilpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[1,0,3,2,5,4,7,6]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_permilpd_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpd_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = mem[1,0,3,2,5,4,7,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_permilpd_zmm_maskz(<8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpd_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,0,3,2,5,4,7,6]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <8 x double> @stack_fold_permilpdvar_zmm(<8 x double> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) nounwind readnone

define <8 x double> @stack_fold_permilpdvar_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm2
; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_permilpdvar_zmm_maskz(<8 x double> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_permilps_zmm(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_permilps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x float> %2
}

define <16 x float> @stack_fold_permilps_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_permilps_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_permilps_zmm_maskz(<16 x float> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_permilps_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <16 x float> @stack_fold_permilpsvar_zmm(<16 x float> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>) nounwind readnone

define <16 x float> @stack_fold_permilpsvar_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

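; Attribute #0 keeps "unsafe-fp-math" disabled so the zero-vector fadd used
; to pin the execution domain is not optimized away; #1 is left for tests
; that opt into relaxed FP math.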
attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }