; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

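;
; As an illustrative sketch of the pattern (assumed function name; the full
; clobber list is elided here), each test below has this shape:
;
;   define <8 x double> @sketch(<8 x double> %a0, <8 x double> %a1) {
;     ; The asm block clobbers xmm2-xmm31, so %a1, which is live across it,
;     ; must be spilled; the reload should then fold into the arithmetic
;     ; instruction itself instead of being emitted as a separate vmovups.
;     %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},<...>,~{xmm31},~{flags}"()
;     %2 = fadd <8 x double> %a0, %a1
;     ret <8 x double> %2
;   }
;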
define <8 x double> @stack_fold_addpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_addpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <8 x double> @stack_fold_addpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_addpd_zmm_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_addps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_addps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a0, %a1
  ret <16 x float> %2
}

define <16 x float> @stack_fold_addps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_addps_zmm_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_addps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_addps_zmm_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_addps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_addps_zmm_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define double @stack_fold_addsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_addsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_addsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fadd double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_addss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_addss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_addss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fadd float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_andnpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_andnpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = xor <8 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %5 = and <8 x i64> %4, %3
  %6 = bitcast <8 x i64> %5 to <8 x double>
  ; fadd forces execution domain
  %7 = fadd <8 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %7
}

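; Why the trailing fadd in these bitwise-op tests: without a use that pins the
; result to the FP execution domain, the and/andn could legally be lowered
; with an integer-domain instruction, and the vandnpd/vandpd folding under
; test would not be exercised. An illustrative sketch of the two possible
; lowerings (operands assumed, not taken from the generated checks):
;
;   vandnpd (%rsp), %zmm0, %zmm0   ; FP domain, what these tests expect
;   vpandnq (%rsp), %zmm0, %zmm0   ; integer domain, possible without the fadd
;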
define <16 x float> @stack_fold_andnps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_andnps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = xor <16 x i32> %2, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %5 = and <16 x i32> %4, %3
  %6 = bitcast <16 x i32> %5 to <16 x float>
  ; fadd forces execution domain
  %7 = fadd <16 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %7
}

define <8 x double> @stack_fold_andpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_andpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = and <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_andps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_andps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = and <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}

define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_cmppd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, i32 4)
  %2 = bitcast <8 x i1> %res to i8
  ret i8 %2
}

declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)

define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
; CHECK-LABEL: stack_fold_cmppd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 192
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <8 x double>, <8 x double>* %a2
  %3 = fadd <8 x double> %a1, %2
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, i32 4)
  %6 = and <8 x i1> %4, %5
  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
  ret <8 x double> %7
}

define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
; CHECK-LABEL: stack_fold_cmppd_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 192
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <8 x double>, <8 x double>* %a2
  %3 = fadd <8 x double> %a1, %2
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, i32 4)
  %6 = and <8 x i1> %4, %5
  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
  ret <8 x double> %7
}

define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, i32 4)
  %2 = bitcast <16 x i1> %res to i16
  ret i16 %2
}

declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)

define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
; CHECK-LABEL: stack_fold_cmpps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 192
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <16 x float>, <16 x float>* %a2
  %3 = fadd <16 x float> %a1, %2
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, i32 4)
  %6 = and <16 x i1> %4, %5
  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
  ret <16 x float> %7
}

define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
; CHECK-LABEL: stack_fold_cmpps_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 192
; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vaddps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    addq $184, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <16 x float>, <16 x float>* %a2
  %3 = fadd <16 x float> %a1, %2
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, i32 4)
  %6 = and <16 x i1> %4, %5
  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
  ret <16 x float> %7
}

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fdiv double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_divss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_divss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_divss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fdiv float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_cvtdq2pd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtdq2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = sitofp <8 x i32> %a0 to <8 x double>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_cvtudq2pd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtudq2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtudq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = uitofp <8 x i32> %a0 to <8 x double>
  ret <8 x double> %2
}

define <8 x float> @stack_fold_cvtpd2ps(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtpd2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtpd2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fptrunc <8 x double> %a0 to <8 x float>
  ret <8 x float> %2
}

define <16 x float> @stack_fold_cvtph2ps(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_cvtph2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly

define <16 x i16> @stack_fold_cvtps2ph(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtps2ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2ph $0, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 0, <16 x i16> undef, i16 -1)
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <16 x i16> %1
}

declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly

define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_insertps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinsertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = zero,mem[0],xmm0[2,3]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_maxpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone

define <8 x double> @stack_fold_maxpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_maxps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone

define <16 x float> @stack_fold_maxps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_maxps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_maxps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_minpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone

define <8 x double> @stack_fold_minpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

define <8 x double> @stack_fold_minpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_minpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_minps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone

define <16 x float> @stack_fold_minps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_minps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_minps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm2
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_minps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_mulpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_mulpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <8 x double> @stack_fold_mulpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_mulpd_zmm_k:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_mulpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_mulpd_zmm_k_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm2
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_mulpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_mulpd_zmm_kz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

1019 define <16 x float> @stack_fold_mulps_zmm(<16 x float> %a0, <16 x float> %a1) {
1020 ; CHECK-LABEL: stack_fold_mulps_zmm:
1022 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1025 ; CHECK-NEXT: #NO_APP
1026 ; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1028 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1029 %2 = fmul <16 x float> %a0, %a1
define <16 x float> @stack_fold_mulps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_mulps_zmm_k:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_mulps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_mulps_zmm_k_commuted:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_mulps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_mulps_zmm_kz:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}
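; Scalar tests: vmulsd/vmulss fold an 8-/4-byte spill slot directly. The _int
; variants build the scalar op out of extractelement/insertelement, so the
; whole 16-byte xmm register is spilled and only lane 0 is recomputed.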
define double @stack_fold_mulsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_mulsd:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_mulsd_int:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fmul double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_mulss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_mulss:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_mulss_int:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fmul float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
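; Logic-op tests: the or/xor is performed on bitcast integer vectors, and the
; trailing fadd of zero pins the result to the FP execution domain so the
; folded reload stays vorpd/vorps rather than the integer-domain vpor forms.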
define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_orpd_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = or <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_orps_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = or <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}
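; vshuff64x2 selects 128-bit chunks: imm $24 = 0b00011000 takes chunks 0 and 2
; from the first source and chunks 1 and 0 from the spilled source, i.e. the
; zmm0[0,1,4,5],mem[2,3,0,1] decode in the assertions below.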
define <8 x double> @stack_fold_shuff64x2(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: stack_fold_shuff64x2:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_shuff64x2_mask(<8 x double> %a, <8 x double> %b, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_shuff64x2_mask:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm1
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_shuff64x2_maskz(<8 x double> %a, <8 x double> %b, i8 %mask, <8 x double>* %passthru) {
; CHECK-LABEL: stack_fold_shuff64x2_maskz:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}
define <16 x float> @stack_fold_shuff32x4_mask(<16 x float> %a, <16 x float> %b, i16 %mask, <16 x float>* %passthru) {
; CHECK-LABEL: stack_fold_shuff32x4_mask:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_shuff32x4_maskz(<16 x float> %a, <16 x float> %b, i16 %mask) {
; CHECK-LABEL: stack_fold_shuff32x4_maskz:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}
define <8 x double> @stack_fold_subpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_subpd_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <16 x float> @stack_fold_subps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_subps_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <16 x float> %a0, %a1
  ret <16 x float> %2
}

define double @stack_fold_subsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_subsd:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_subsd_int:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fsub double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_subss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_subss:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_subss_int:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fsub float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_xorpd_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = xor <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_xorps_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = xor <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}
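; stack_fold_extractps clobbers every general-purpose register rather than the
; vector registers, so the extracted lane cannot be kept in a GPR across the
; asm block: the vextractps is folded as a 4-byte store to a stack slot and
; reloaded with a plain movl afterwards.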
define i32 @stack_fold_extractps(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_extractps:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: vextractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
  %1 = extractelement <4 x float> %a0, i32 1
  %2 = bitcast float %1 to i32
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}
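; The subvector extract tests below fold on the spill side: the extract is
; emitted directly as a store to the stack slot. For the 128-bit extracts a
; vzeroupper follows, since no wide register state remains live across the
; reload.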
define <4 x float> @stack_fold_extracti32x4(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_extracti32x4:
; CHECK-NEXT: vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vzeroupper
  %1 = shufflevector <16 x float> %a0, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x float> %1
}

define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_extractf64x2:
; CHECK-NEXT: vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: vzeroupper
  %1 = shufflevector <8 x double> %a0, <8 x double> undef, <2 x i32> <i32 6, i32 7>
  %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <2 x double> %1
}

define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_extracti32x8:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
  %1 = shufflevector <16 x float> %a0, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <8 x float> %1
}

define <4 x double> @stack_fold_extractf64x4(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_extractf64x4:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
  %1 = shufflevector <8 x double> %a0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x double> %1
}
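; In the insert tests the "# kill" comment marks the ymm0 argument as living
; in the full zmm0 register. Note the float insert also lowers to
; vinsertf64x4: for a whole 256-bit lane the f32x8 and f64x4 forms are
; bit-identical.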
define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_insertf32x8:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %2
}

define <8 x double> @stack_fold_insertf64x4(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_insertf64x4:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_insertf64x4_mask(<8 x double>* %passthru, <4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_insertf64x4_mask:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_insertf64x4_maskz(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_insertf64x4_maskz:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}
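; vpermt2ps and vpermi2ps share the @llvm.x86.avx512.vpermi2var.ps.512
; intrinsic; which form is emitted depends on which operand the result
; register reuses: vpermt2 keeps the indices live in a register and folds the
; spilled second table, while vpermi2 overwrites the index register itself.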
define <16 x float> @stack_fold_vpermt2ps(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
; CHECK-LABEL: stack_fold_vpermt2ps:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2) {
; CHECK-LABEL: stack_fold_vpermi2ps:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
  ret <16 x float> %2
}
define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, <16 x i32>* %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermi2ps_mask:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x1b = load <16 x i32>, <16 x i32>* %x1
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1b, <16 x float> %x2)
  %3 = bitcast <16 x i32> %x1b to <16 x float>
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %2, <16 x float> %3
  ret <16 x float> %5
}

define <16 x float> @stack_fold_vpermt2ps_mask(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermt2ps_mask:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x0b = load <16 x i32>, <16 x i32>* %x0
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %x1
  ret <16 x float> %4
}

define <16 x float> @stack_fold_vpermt2ps_maskz(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermt2ps_maskz:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x0b = load <16 x i32>, <16 x i32>* %x0
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}
define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
; CHECK-LABEL: stack_fold_vpermt2pd:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermt2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
  %3 = bitcast <8 x i64> %x1 to <8 x double>
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @stack_fold_vpermi2pd(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2) {
; CHECK-LABEL: stack_fold_vpermi2pd:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermi2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
  ret <8 x double> %2
}
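; vpermpd with an immediate permutes within each 256-bit half: $235 =
; 0b11101011 encodes the selectors [3,2,2,3] per half, giving the
; [3,2,2,3,7,6,6,7] pattern in the decode comments below.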
define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_permpd:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  ; fadd forces execution domain
  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %3
}

define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permpd_mask:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm0
; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permpd_maskz:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}
define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_permpdvar:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a1, <8 x i64> %a0)
  ; fadd forces execution domain
  %3 = fadd <8 x double> %2, zeroinitializer
  ret <8 x double> %3
}
declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) nounwind readonly
define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_permps:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) nounwind readonly
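; vpermilpd's immediate uses one bit per element: $85 = 0b01010101 swaps each
; even/odd pair within its 128-bit lane, i.e. mem[1,0,3,2,5,4,7,6].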
define <8 x double> @stack_fold_permilpd_zmm(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_permilpd_zmm:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 = mem[1,0,3,2,5,4,7,6]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_permilpd_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpd_zmm_mask:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: # zmm1 {%k1} = mem[1,0,3,2,5,4,7,6]
; CHECK-NEXT: vmovapd %zmm1, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_permilpd_zmm_maskz(<8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpd_zmm_maskz:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,0,3,2,5,4,7,6]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}
define <8 x double> @stack_fold_permilpdvar_zmm(<8 x double> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) nounwind readnone

define <8 x double> @stack_fold_permilpdvar_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm_mask:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, <8 x double>* %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_permilpdvar_zmm_maskz(<8 x double> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm_maskz:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}
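; vpermilps uses a 2-bit selector per element, replicated across 128-bit
; lanes: $27 = 0b00011011 reverses each group of four floats, i.e.
; mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12].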
define <16 x float> @stack_fold_permilps_zmm(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_permilps_zmm:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x float> %2
}

define <16 x float> @stack_fold_permilps_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_permilps_zmm_mask:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: # zmm1 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: vmovaps %zmm1, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_permilps_zmm_maskz(<16 x float> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_permilps_zmm_maskz:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}
define <16 x float> @stack_fold_permilpsvar_zmm(<16 x float> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>) nounwind readnone

define <16 x float> @stack_fold_permilpsvar_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_mask:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, <16 x float>* %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_maskz:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}
attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }