; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
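;
; For example, in @stack_fold_addpd_zmm below the inline asm clobbers xmm2-xmm31,
; so %a1 (passed in zmm1) cannot be kept in a register across the nop and must be
; spilled; the assertions then verify that the 64-byte reload is folded directly
; into vaddpd's memory operand instead of being reloaded by a separate vmovups.
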
define <8 x double> @stack_fold_addpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_addpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  ret <8 x double> %2
}

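; The _k variants below check masked folding with a passthru value loaded from
; memory, the _k_commuted variants swap the fadd operands to verify the fold
; survives commutation, and the _kz variants use zero-masking.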
define <8 x double> @stack_fold_addpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addpd_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_addpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_addpd_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_addps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_addps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a0, %a1
  ret <16 x float> %2
}

define <16 x float> @stack_fold_addps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addps_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_addps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_addps_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_addps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_addps_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define double @stack_fold_addsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_addsd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_addsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fadd double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_addss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_addss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_addss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fadd float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_andnpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_andnpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = xor <8 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %5 = and <8 x i64> %4, %3
  %6 = bitcast <8 x i64> %5 to <8 x double>
  ; fadd forces execution domain
  %7 = fadd <8 x double> %6, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %7
}

define <16 x float> @stack_fold_andnps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_andnps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = xor <16 x i32> %2, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %5 = and <16 x i32> %4, %3
  %6 = bitcast <16 x i32> %5 to <16 x float>
  ; fadd forces execution domain
  %7 = fadd <16 x float> %6, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %7
}

define <8 x double> @stack_fold_andpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_andpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = and <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_andps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_andps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = and <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}

define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_cmppd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %2 = bitcast <8 x i1> %res to i8
  ret i8 %2
}
declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)

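; The masked compare tests below combine the folded vcmpeqpd/vcmpeqps result
; with the incoming mask via kandb/kandw before blending %b0 and %b1.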
define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, ptr %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
; CHECK-LABEL: stack_fold_cmppd_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandb %k0, %k1, %k1
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload
; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <8 x double>, ptr %a2
  %3 = fadd <8 x double> %a1, %2
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <8 x i1> %4, %5
  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
  ret <8 x double> %7
}

define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, ptr %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
; CHECK-LABEL: stack_fold_cmppd_mask_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandb %k0, %k1, %k1
; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vmovupd (%rsp), %zmm1 # 64-byte Reload
; CHECK-NEXT: vmovapd %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <8 x double>, ptr %a2
  %3 = fadd <8 x double> %a1, %2
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <8 x i1> %4, %5
  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
  ret <8 x double> %7
}

define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_cmpps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %2 = bitcast <16 x i1> %res to i16
  ret i16 %2
}
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)

define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, ptr %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
; CHECK-LABEL: stack_fold_cmpps_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandw %k0, %k1, %k1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <16 x float>, ptr %a2
  %3 = fadd <16 x float> %a1, %2
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <16 x i1> %4, %5
  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
  ret <16 x float> %7
}

define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, ptr %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
; CHECK-LABEL: stack_fold_cmpps_mask_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm0
; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kandw %k0, %k1, %k1
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
; CHECK-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; CHECK-NEXT: addq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
  %2 = load <16 x float>, ptr %a2
  %3 = fadd <16 x float> %a1, %2
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %6 = and <16 x i1> %4, %5
  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
  ret <16 x float> %7
}

define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_divsd_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fdiv double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_divss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_divss:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fdiv float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_divss_int:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fdiv float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_cvtdq2pd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtdq2pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = sitofp <8 x i32> %a0 to <8 x double>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_cvtudq2pd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_cvtudq2pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtudq2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = uitofp <8 x i32> %a0 to <8 x double>
  ret <8 x double> %2
}

define <8 x float> @stack_fold_cvtpd2ps(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtpd2ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtpd2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fptrunc <8 x double> %a0 to <8 x float>
  ret <8 x float> %2
}

define <16 x float> @stack_fold_cvtph2ps(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_cvtph2ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly

define <16 x i16> @stack_fold_cvtps2ph(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtps2ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtps2ph $0, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT: retq
  %1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 0, <16 x i16> undef, i16 -1)
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <16 x i16> %1
}
declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly

define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_insertps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vinsertps $17, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = zero,mem[0],xmm0[2,3]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_maxpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone

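; The _commutable variants are built with attribute set #1, which (unlike #0)
; enables unsafe FP math in the full test file, letting the compiler commute
; vmaxpd/vminpd even though IEEE max/min are not commutative in general
; (NaN and signed-zero handling).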
define <8 x double> @stack_fold_maxpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_maxps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone

define <16 x float> @stack_fold_maxps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_maxps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_maxps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
; CHECK-LABEL: stack_fold_maxps_zmm_commutable_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_minpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) nounwind readnone

define <8 x double> @stack_fold_minpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %2
}

define <8 x double> @stack_fold_minpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_minpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 {
; CHECK-LABEL: stack_fold_minpd_zmm_commutable_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vminpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a1, <8 x double> %a0, i32 4)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_minps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) nounwind readnone

define <16 x float> @stack_fold_minps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_minps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_minps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_minps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 {
; CHECK-LABEL: stack_fold_minps_zmm_commutable_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a1, <16 x float> %a0, i32 4)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_mulpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_mulpd_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <8 x double> @stack_fold_mulpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_mulpd_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a0, %a1
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_mulpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_mulpd_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd (%rsi), %zmm2
; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovapd %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_mulpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_mulpd_zmm_kz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x double> %a1, %a0
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_mulps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_mulps_zmm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a0, %a1
  ret <16 x float> %2
}

define <16 x float> @stack_fold_mulps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_mulps_zmm_k:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_mulps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_mulps_zmm_k_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps (%rsi), %zmm2
; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x float> %a1, %a0
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

1081 define <16 x float> @stack_fold_mulps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1082 ; CHECK-LABEL: stack_fold_mulps_zmm_kz:
1084 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1087 ; CHECK-NEXT: #NO_APP
1088 ; CHECK-NEXT: kmovw %edi, %k1
1089 ; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1091 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1092 %2 = fmul <16 x float> %a1, %a0
1093 %3 = bitcast i16 %mask to <16 x i1>
1094 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
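
; Scalar multiply tests spill/reload a 4- or 8-byte slot; the *_int variants
; go through extractelement/insertelement, so the fold must leave the upper
; elements of %a0 intact.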

define double @stack_fold_mulsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_mulsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_mulsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fmul double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_mulss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_mulss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_mulss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fmul float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}
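
; The bitwise logic tests below operate on integer bitcasts; the trailing
; fadd with zero keeps the result in the floating-point execution domain so
; the vorpd/vorps (and later vxorpd/vxorps) forms are selected.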

define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_orpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = or <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_orps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = or <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}
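
; vshuff64x2 $24 (0b00011000) selects 128-bit blocks 0 and 2 of the first
; source and blocks 1 and 0 of the second, i.e. the
; zmm0[0,1,4,5],mem[2,3,0,1] double pattern checked below.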

define <8 x double> @stack_fold_shuff64x2(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: stack_fold_shuff64x2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_shuff64x2_mask(<8 x double> %a, <8 x double> %b, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_shuff64x2_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovapd (%rsi), %zmm1
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_shuff64x2_maskz(<8 x double> %a, <8 x double> %b, i8 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_shuff64x2_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @stack_fold_shuff32x4_mask(<16 x float> %a, <16 x float> %b, i16 %mask, ptr %passthru) {
; CHECK-LABEL: stack_fold_shuff32x4_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps (%rsi), %zmm1
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_shuff32x4_maskz(<16 x float> %a, <16 x float> %b, i16 %mask) {
; CHECK-LABEL: stack_fold_shuff32x4_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; CHECK-NEXT:    vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}
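
; Subtraction tests mirror the add/mul patterns but have no *_commuted
; variants: fsub is not commutative, so only the right-hand operand can be
; folded from the stack.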

define <8 x double> @stack_fold_subpd_zmm(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_subpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <8 x double> %a0, %a1
  ret <8 x double> %2
}

define <16 x float> @stack_fold_subps_zmm(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_subps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <16 x float> %a0, %a1
  ret <16 x float> %2
}

define double @stack_fold_subsd(double %a0, double %a1) {
; CHECK-LABEL: stack_fold_subsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub double %a0, %a1
  ret double %2
}

define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: stack_fold_subsd_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <2 x double> %a0, i32 0
  %3 = extractelement <2 x double> %a1, i32 0
  %4 = fsub double %2, %3
  %5 = insertelement <2 x double> %a0, double %4, i32 0
  ret <2 x double> %5
}

define float @stack_fold_subss(float %a0, float %a1) {
; CHECK-LABEL: stack_fold_subss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub float %a0, %a1
  ret float %2
}

define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_subss_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = extractelement <4 x float> %a0, i32 0
  %3 = extractelement <4 x float> %a1, i32 0
  %4 = fsub float %2, %3
  %5 = insertelement <4 x float> %a0, float %4, i32 0
  ret <4 x float> %5
}

define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 {
; CHECK-LABEL: stack_fold_xorpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <8 x double> %a0 to <8 x i64>
  %3 = bitcast <8 x double> %a1 to <8 x i64>
  %4 = xor <8 x i64> %2, %3
  %5 = bitcast <8 x i64> %4 to <8 x double>
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) #0 {
; CHECK-LABEL: stack_fold_xorps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast <16 x float> %a0 to <16 x i32>
  %3 = bitcast <16 x float> %a1 to <16 x i32>
  %4 = xor <16 x i32> %2, %3
  %5 = bitcast <16 x i32> %4 to <16 x float>
  ; fadd forces execution domain
  %6 = fadd <16 x float> %5, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <16 x float> %6
}
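
; stack_fold_extractps clobbers every general-purpose register instead of the
; vector registers, so the extracted lane must survive in a stack slot across
; the asm block; the folded operation here is the vextractps store itself.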

define i32 @stack_fold_extractps(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_extractps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    vextractps $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = extractelement <4 x float> %a0, i32 1
  %2 = bitcast float %1 to i32
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}
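
; Subvector extract tests: the shufflevector folds into a vextract* store
; before the asm block and a plain reload afterwards. Despite the i32 names,
; the payloads are float vectors, so the vextractf forms are expected.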

define <4 x float> @stack_fold_extracti32x4(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_extracti32x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = shufflevector <16 x float> %a0, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x float> %1
}

define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_extractf64x2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = shufflevector <8 x double> %a0, <8 x double> undef, <2 x i32> <i32 6, i32 7>
  %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <2 x double> %1
}

define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_extracti32x8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  %1 = shufflevector <16 x float> %a0, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <8 x float> %1
}

define <4 x double> @stack_fold_extractf64x4(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_extractf64x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-NEXT:    retq
  %1 = shufflevector <8 x double> %a0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ret <4 x double> %1
}
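
; Subvector insert tests: the spilled ymm operand folds directly into
; vinsertf64x4; the "kill" annotation records that the ymm0 argument is
; implicitly widened to zmm0.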

define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_insertf32x8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %2
}

define <8 x double> @stack_fold_insertf64x4(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: stack_fold_insertf64x4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_insertf64x4_mask(ptr %passthru, <4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_insertf64x4_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm2
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_insertf64x4_maskz(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_insertf64x4_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}
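
; Two-source permute tests. vpermt2ps overwrites the first data operand and
; vpermi2ps overwrites the index operand; in both cases the second data
; source is the one folded from its 64-byte stack slot.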

define <16 x float> @stack_fold_vpermt2ps(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
; CHECK-LABEL: stack_fold_vpermt2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2) {
; CHECK-LABEL: stack_fold_vpermi2ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
  ret <16 x float> %2
}

define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, ptr %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermi2ps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x1b = load <16 x i32>, ptr %x1
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1b, <16 x float> %x2)
  %3 = bitcast <16 x i32> %x1b to <16 x float>
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %2, <16 x float> %3
  ret <16 x float> %5
}

define <16 x float> @stack_fold_vpermt2ps_mask(ptr %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermt2ps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x0b = load <16 x i32>, ptr %x0
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %x1
  ret <16 x float> %4
}

define <16 x float> @stack_fold_vpermt2ps_maskz(ptr %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: stack_fold_vpermt2ps_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %x0b = load <16 x i32>, ptr %x0
  %2 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0b, <16 x float> %x2)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
; CHECK-LABEL: stack_fold_vpermt2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermt2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
  %3 = bitcast <8 x i64> %x1 to <8 x double>
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @stack_fold_vpermi2pd(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2) {
; CHECK-LABEL: stack_fold_vpermi2pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermi2pd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
  ret <8 x double> %2
}
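
; vpermpd $235 (0b11101011) permutes the doubles within each 256-bit half as
; 3,2,2,3, giving the [3,2,2,3,7,6,6,7] pattern checked below.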

define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  ; fadd forces execution domain
  %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %3
}

define <8 x double> @stack_fold_permpd_mask(ptr %passthru, <8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permpd_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm0
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ; fadd forces execution domain
  %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
  ret <8 x double> %6
}

define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permpd_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
; CHECK-LABEL: stack_fold_permpdvar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a1, <8 x i64> %a0)
  ; fadd forces execution domain
  %3 = fadd <8 x double> %2, zeroinitializer
  ret <8 x double> %3
}
declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) nounwind readonly

define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_permps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) nounwind readonly
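
; vpermilpd $85 (0b01010101) picks the high/low double within each 128-bit
; lane, swapping each pair: [1,0,3,2,5,4,7,6].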

define <8 x double> @stack_fold_permilpd_zmm(<8 x double> %a0) {
; CHECK-LABEL: stack_fold_permilpd_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[1,0,3,2,5,4,7,6]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x double> %2
}

define <8 x double> @stack_fold_permilpd_zmm_mask(ptr %passthru, <8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpd_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = mem[1,0,3,2,5,4,7,6]
; CHECK-NEXT:    vmovapd %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_permilpd_zmm_maskz(<8 x double> %a0, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpd_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilpd $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,0,3,2,5,4,7,6]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <8 x double> @stack_fold_permilpdvar_zmm(<8 x double> %a0, <8 x i64> %a1) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) nounwind readnone

define <8 x double> @stack_fold_permilpdvar_zmm_mask(ptr %passthru, <8 x double> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm2
; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovapd %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x double>, ptr %passthru
  %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
  ret <8 x double> %5
}

define <8 x double> @stack_fold_permilpdvar_zmm_maskz(<8 x double> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: stack_fold_permilpdvar_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %a1)
  %3 = bitcast i8 %mask to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}
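
; vpermilps $27 (0b00011011) reverses the four floats within each 128-bit
; lane: [3,2,1,0] repeated across the vector.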

define <16 x float> @stack_fold_permilps_zmm(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_permilps_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x float> %2
}

define <16 x float> @stack_fold_permilps_zmm_mask(ptr %passthru, <16 x float> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_permilps_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm1 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_permilps_zmm_maskz(<16 x float> %a0, i16 %mask) {
; CHECK-LABEL: stack_fold_permilps_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <16 x float> @stack_fold_permilpsvar_zmm(<16 x float> %a0, <16 x i32> %a1) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>) nounwind readnone

define <16 x float> @stack_fold_permilpsvar_zmm_mask(ptr %passthru, <16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x float>, ptr %passthru
  %5 = select <16 x i1> %3, <16 x float> %2, <16 x float> %4
  ret <16 x float> %5
}

define <16 x float> @stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: stack_fold_permilpsvar_zmm_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %a1)
  %3 = bitcast i16 %mask to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }