1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
12 define <32 x half> @stack_fold_addph_zmm(<32 x half> %a0, <32 x half> %a1) {
13 ; CHECK-LABEL: stack_fold_addph_zmm:
15 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19 ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
22 %2 = fadd <32 x half> %a0, %a1
26 define <32 x half> @stack_fold_addph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
27 ; CHECK-LABEL: stack_fold_addph_zmm_k:
29 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
33 ; CHECK-NEXT: kmovd %edi, %k1
34 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
35 ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
36 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
38 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
39 %2 = fadd <32 x half> %a0, %a1
40 %3 = bitcast i32 %mask to <32 x i1>
41 %4 = load <32 x half>, ptr %passthru
42 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
46 define <32 x half> @stack_fold_addph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
47 ; CHECK-LABEL: stack_fold_addph_zmm_k_commuted:
49 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
53 ; CHECK-NEXT: kmovd %edi, %k1
54 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
55 ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
56 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
58 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
59 %2 = fadd <32 x half> %a1, %a0
60 %3 = bitcast i32 %mask to <32 x i1>
61 %4 = load <32 x half>, ptr %passthru
62 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
66 define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
67 ; CHECK-LABEL: stack_fold_addph_zmm_kz:
69 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
73 ; CHECK-NEXT: kmovd %edi, %k1
74 ; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
76 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
77 %2 = fadd <32 x half> %a1, %a0
78 %3 = bitcast i32 %mask to <32 x i1>
79 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
83 define half @stack_fold_addsh(half %a0, half %a1) {
84 ; CHECK-LABEL: stack_fold_addsh:
86 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
90 ; CHECK-NEXT: vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
92 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
93 %2 = fadd half %a0, %a1
97 define <8 x half> @stack_fold_addsh_int(<8 x half> %a0, <8 x half> %a1) {
98 ; CHECK-LABEL: stack_fold_addsh_int:
100 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
103 ; CHECK-NEXT: #NO_APP
104 ; CHECK-NEXT: vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
106 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
107 %2 = extractelement <8 x half> %a0, i32 0
108 %3 = extractelement <8 x half> %a1, i32 0
109 %4 = fadd half %2, %3
110 %5 = insertelement <8 x half> %a0, half %4, i32 0
114 define i32 @stack_fold_cmpph(<32 x half> %a0, <32 x half> %a1) {
115 ; CHECK-LABEL: stack_fold_cmpph:
117 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
120 ; CHECK-NEXT: #NO_APP
121 ; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
122 ; CHECK-NEXT: kmovd %k0, %eax
123 ; CHECK-NEXT: vzeroupper
125 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
126 %res = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
127 %2 = bitcast <32 x i1> %res to i32
130 declare <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half>, <32 x half>, i32, <32 x i1>, i32)
132 define <32 x half> @stack_fold_cmpph_mask(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) {
133 ; CHECK-LABEL: stack_fold_cmpph_mask:
135 ; CHECK-NEXT: subq $136, %rsp
136 ; CHECK-NEXT: .cfi_def_cfa_offset 144
137 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
138 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
139 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
140 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
143 ; CHECK-NEXT: #NO_APP
144 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
145 ; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0
146 ; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
147 ; CHECK-NEXT: kmovd %esi, %k1
148 ; CHECK-NEXT: kandd %k0, %k1, %k1
149 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
150 ; CHECK-NEXT: vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload
151 ; CHECK-NEXT: addq $136, %rsp
152 ; CHECK-NEXT: .cfi_def_cfa_offset 8
154 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
155 ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
156 %2 = load <32 x half>, ptr %a2
157 %3 = fadd <32 x half> %a1, %2
158 %4 = bitcast i32 %mask to <32 x i1>
159 %5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %3, <32 x half> %a0, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
160 %6 = and <32 x i1> %4, %5
161 %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1
165 define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) {
166 ; CHECK-LABEL: stack_fold_cmpph_mask_commuted:
168 ; CHECK-NEXT: subq $136, %rsp
169 ; CHECK-NEXT: .cfi_def_cfa_offset 144
170 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
171 ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
172 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
173 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
176 ; CHECK-NEXT: #NO_APP
177 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
178 ; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0
179 ; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
180 ; CHECK-NEXT: kmovd %esi, %k1
181 ; CHECK-NEXT: kandd %k0, %k1, %k1
182 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
183 ; CHECK-NEXT: vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload
184 ; CHECK-NEXT: addq $136, %rsp
185 ; CHECK-NEXT: .cfi_def_cfa_offset 8
187 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
188 ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
189 %2 = load <32 x half>, ptr %a2
190 %3 = fadd <32 x half> %a1, %2
191 %4 = bitcast i32 %mask to <32 x i1>
192 %5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %3, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
193 %6 = and <32 x i1> %4, %5
194 %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1
198 define half @stack_fold_divsh(half %a0, half %a1) {
199 ; CHECK-LABEL: stack_fold_divsh:
201 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
204 ; CHECK-NEXT: #NO_APP
205 ; CHECK-NEXT: vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
207 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
208 %2 = fdiv half %a0, %a1
212 define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) {
213 ; CHECK-LABEL: stack_fold_divsh_int:
215 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
218 ; CHECK-NEXT: #NO_APP
219 ; CHECK-NEXT: vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
221 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
222 %2 = extractelement <8 x half> %a0, i32 0
223 %3 = extractelement <8 x half> %a1, i32 0
224 %4 = fdiv half %2, %3
225 %5 = insertelement <8 x half> %a0, half %4, i32 0
229 define i32 @stack_fold_fpclassph(<32 x half> %a0) {
230 ; CHECK-LABEL: stack_fold_fpclassph:
232 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
235 ; CHECK-NEXT: #NO_APP
236 ; CHECK-NEXT: vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 64-byte Folded Reload
237 ; CHECK-NEXT: # k0 = isNegativeZero(mem)
238 ; CHECK-NEXT: kmovd %k0, %eax
239 ; CHECK-NEXT: vzeroupper
241 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
242 %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
243 %3 = bitcast <32 x i1> %2 to i32
246 declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32)
248 define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, ptr %p) {
249 ; CHECK-LABEL: stack_fold_fpclassph_mask:
251 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
254 ; CHECK-NEXT: #NO_APP
255 ; CHECK-NEXT: kmovd (%rdi), %k1
256 ; CHECK-NEXT: vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 64-byte Folded Reload
257 ; CHECK-NEXT: # k0 {%k1} = isNegativeZero(mem)
258 ; CHECK-NEXT: kmovd %k0, %eax
259 ; CHECK-NEXT: vzeroupper
261 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
262 %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
263 %mask = load <32 x i1>, ptr %p
264 %3 = and <32 x i1> %2, %mask
265 %4 = bitcast <32 x i1> %3 to i32
269 define i8 @stack_fold_fpclasssh(<8 x half> %a0) {
270 ; CHECK-LABEL: stack_fold_fpclasssh:
272 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
275 ; CHECK-NEXT: #NO_APP
276 ; CHECK-NEXT: vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 16-byte Folded Reload
277 ; CHECK-NEXT: # k0 = isNegativeZero(mem)
278 ; CHECK-NEXT: kmovd %k0, %eax
279 ; CHECK-NEXT: # kill: def $al killed $al killed $eax
281 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
282 %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 -1)
285 declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8)
287 define i8 @stack_fold_fpclasssh_mask(<8 x half> %a0, ptr %p) {
288 ; CHECK-LABEL: stack_fold_fpclasssh_mask:
290 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
293 ; CHECK-NEXT: #NO_APP
294 ; CHECK-NEXT: kmovb (%rdi), %k1
295 ; CHECK-NEXT: vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 16-byte Folded Reload
296 ; CHECK-NEXT: # k0 {%k1} = isNegativeZero(mem)
297 ; CHECK-NEXT: kmovd %k0, %eax
298 ; CHECK-NEXT: # kill: def $al killed $al killed $eax
300 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
301 %mask = load i8, ptr %p
302 %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 %mask)
306 define <32 x half> @stack_fold_getexpph(<32 x half> %a0) {
307 ; CHECK-LABEL: stack_fold_getexpph:
309 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
312 ; CHECK-NEXT: #NO_APP
313 ; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
315 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
316 %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1, i32 4)
319 declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32)
321 define <32 x half> @stack_fold_getexpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
322 ; CHECK-LABEL: stack_fold_getexpph_mask:
324 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
325 ; CHECK-NEXT: kmovd %esi, %k1
328 ; CHECK-NEXT: #NO_APP
329 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
330 ; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
331 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
333 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
334 %2 = load <32 x half>, ptr %passthru
335 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask, i32 4)
339 define <32 x half> @stack_fold_getexpph_maskz(<32 x half> %a0, ptr %mask) {
340 ; CHECK-LABEL: stack_fold_getexpph_maskz:
342 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
345 ; CHECK-NEXT: #NO_APP
346 ; CHECK-NEXT: kmovd (%rdi), %k1
347 ; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
349 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
350 %2 = load i32, ptr %mask
351 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2, i32 4)
355 define <8 x half> @stack_fold_getexpsh(<8 x half> %a0, <8 x half> %a1) {
356 ; CHECK-LABEL: stack_fold_getexpsh:
358 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
361 ; CHECK-NEXT: #NO_APP
362 ; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
364 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
365 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
368 declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
370 define <8 x half> @stack_fold_getexpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
371 ; CHECK-LABEL: stack_fold_getexpsh_mask:
373 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
374 ; CHECK-NEXT: kmovd %esi, %k1
377 ; CHECK-NEXT: #NO_APP
378 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
379 ; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
380 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
382 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
383 %2 = load <8 x half>, ptr %passthru
384 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
388 define <8 x half> @stack_fold_getexpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
389 ; CHECK-LABEL: stack_fold_getexpsh_maskz:
391 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
394 ; CHECK-NEXT: #NO_APP
395 ; CHECK-NEXT: kmovb (%rdi), %k1
396 ; CHECK-NEXT: vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
398 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
399 %2 = load i8, ptr %mask
400 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
404 define <32 x half> @stack_fold_getmantph(<32 x half> %a0) {
405 ; CHECK-LABEL: stack_fold_getmantph:
407 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
410 ; CHECK-NEXT: #NO_APP
411 ; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
413 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
414 %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
417 declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
419 define <32 x half> @stack_fold_getmantph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
420 ; CHECK-LABEL: stack_fold_getmantph_mask:
422 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
423 ; CHECK-NEXT: kmovd %esi, %k1
426 ; CHECK-NEXT: #NO_APP
427 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
428 ; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
429 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
431 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
432 %2 = load <32 x half>, ptr %passthru
433 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
437 define <32 x half> @stack_fold_getmantph_maskz(<32 x half> %a0, ptr %mask) {
438 ; CHECK-LABEL: stack_fold_getmantph_maskz:
440 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
443 ; CHECK-NEXT: #NO_APP
444 ; CHECK-NEXT: kmovd (%rdi), %k1
445 ; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
447 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
448 %2 = load i32, ptr %mask
449 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
453 define <8 x half> @stack_fold_getmantsh(<8 x half> %a0, <8 x half> %a1) {
454 ; CHECK-LABEL: stack_fold_getmantsh:
456 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
459 ; CHECK-NEXT: #NO_APP
460 ; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
462 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
463 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> undef, i8 -1, i32 4)
466 declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32)
468 define <8 x half> @stack_fold_getmantsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
469 ; CHECK-LABEL: stack_fold_getmantsh_mask:
471 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
472 ; CHECK-NEXT: kmovd %esi, %k1
475 ; CHECK-NEXT: #NO_APP
476 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
477 ; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
478 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
480 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
481 %2 = load <8 x half>, ptr %passthru
482 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> %2, i8 %mask, i32 4)
486 define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
487 ; CHECK-LABEL: stack_fold_getmantsh_maskz:
489 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
492 ; CHECK-NEXT: #NO_APP
493 ; CHECK-NEXT: kmovb (%rdi), %k1
494 ; CHECK-NEXT: vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
496 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
497 %2 = load i8, ptr %mask
498 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> zeroinitializer, i8 %2, i32 4)
502 define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
503 ; CHECK-LABEL: stack_fold_maxph_zmm:
505 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
508 ; CHECK-NEXT: #NO_APP
509 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
511 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
512 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
515 declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
517 define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
518 ; CHECK-LABEL: stack_fold_maxph_zmm_commuted:
520 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
523 ; CHECK-NEXT: #NO_APP
524 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
525 ; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0
527 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
528 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
532 define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
533 ; CHECK-LABEL: stack_fold_maxph_zmm_k:
535 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
538 ; CHECK-NEXT: #NO_APP
539 ; CHECK-NEXT: kmovd %edi, %k1
540 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
541 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
542 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
544 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
545 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
546 %3 = bitcast i32 %mask to <32 x i1>
547 %4 = load <32 x half>, ptr %passthru
548 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
552 define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
553 ; CHECK-LABEL: stack_fold_maxph_zmm_k_commuted:
555 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
558 ; CHECK-NEXT: #NO_APP
559 ; CHECK-NEXT: kmovd %edi, %k1
560 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
561 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
562 ; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm2 {%k1}
563 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
565 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
566 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
567 %3 = bitcast i32 %mask to <32 x i1>
568 %4 = load <32 x half>, ptr %passthru
569 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
573 define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
574 ; CHECK-LABEL: stack_fold_maxph_zmm_kz:
576 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
579 ; CHECK-NEXT: #NO_APP
580 ; CHECK-NEXT: kmovd %edi, %k1
581 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
583 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
584 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
585 %3 = bitcast i32 %mask to <32 x i1>
586 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
590 define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
591 ; CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted:
593 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
596 ; CHECK-NEXT: #NO_APP
597 ; CHECK-NEXT: kmovd %edi, %k1
598 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
599 ; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0 {%k1} {z}
601 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
602 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
603 %3 = bitcast i32 %mask to <32 x i1>
604 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
608 define <32 x half> @stack_fold_maxph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 {
609 ; CHECK-LABEL: stack_fold_maxph_zmm_commutable:
611 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
614 ; CHECK-NEXT: #NO_APP
615 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
617 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
618 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
622 define <32 x half> @stack_fold_maxph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 {
623 ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_commuted:
625 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
628 ; CHECK-NEXT: #NO_APP
629 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
631 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
632 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
636 define <32 x half> @stack_fold_maxph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
637 ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k:
639 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
642 ; CHECK-NEXT: #NO_APP
643 ; CHECK-NEXT: kmovd %edi, %k1
644 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
645 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
646 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
648 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
649 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
650 %3 = bitcast i32 %mask to <32 x i1>
651 %4 = load <32 x half>, ptr %passthru
652 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
656 define <32 x half> @stack_fold_maxph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
657 ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k_commuted:
659 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
662 ; CHECK-NEXT: #NO_APP
663 ; CHECK-NEXT: kmovd %edi, %k1
664 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
665 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
666 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
668 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
669 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
670 %3 = bitcast i32 %mask to <32 x i1>
671 %4 = load <32 x half>, ptr %passthru
672 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
676 define <32 x half> @stack_fold_maxph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
677 ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz:
679 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
682 ; CHECK-NEXT: #NO_APP
683 ; CHECK-NEXT: kmovd %edi, %k1
684 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
686 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
687 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
688 %3 = bitcast i32 %mask to <32 x i1>
689 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
693 define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
694 ; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz_commuted:
696 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
699 ; CHECK-NEXT: #NO_APP
700 ; CHECK-NEXT: kmovd %edi, %k1
701 ; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
703 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
704 %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
705 %3 = bitcast i32 %mask to <32 x i1>
706 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
710 define half @stack_fold_maxsh(half %a0, half %a1) #0 {
711 ; CHECK-LABEL: stack_fold_maxsh:
713 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
716 ; CHECK-NEXT: #NO_APP
717 ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
719 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
720 %2 = fcmp ogt half %a0, %a1
721 %3 = select i1 %2, half %a0, half %a1
725 define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
726 ; CHECK-LABEL: stack_fold_maxsh_commuted:
728 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
731 ; CHECK-NEXT: #NO_APP
732 ; CHECK-NEXT: vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
733 ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
734 ; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0
736 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
737 %2 = fcmp ogt half %a1, %a0
738 %3 = select i1 %2, half %a1, half %a0
742 define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
743 ; CHECK-LABEL: stack_fold_maxsh_commutable:
745 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
748 ; CHECK-NEXT: #NO_APP
749 ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
751 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
752 %2 = fcmp ogt half %a0, %a1
753 %3 = select i1 %2, half %a0, half %a1
757 define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
758 ; CHECK-LABEL: stack_fold_maxsh_commutable_commuted:
760 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
763 ; CHECK-NEXT: #NO_APP
764 ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
766 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
767 %2 = fcmp ogt half %a1, %a0
768 %3 = select i1 %2, half %a1, half %a0
772 define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
773 ; CHECK-LABEL: stack_fold_maxsh_int:
775 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
778 ; CHECK-NEXT: #NO_APP
779 ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
781 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
782 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
785 declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32)
787 define <8 x half> @stack_fold_maxsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) {
788 ; CHECK-LABEL: stack_fold_maxsh_mask:
790 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
791 ; CHECK-NEXT: kmovd %edi, %k1
794 ; CHECK-NEXT: #NO_APP
795 ; CHECK-NEXT: vmovaps (%rsi), %xmm2
796 ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
797 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
800 %2 = load <8 x half>, ptr %passthru
801 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
805 define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) {
806 ; CHECK-LABEL: stack_fold_maxsh_maskz:
808 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
809 ; CHECK-NEXT: kmovd %edi, %k1
812 ; CHECK-NEXT: #NO_APP
813 ; CHECK-NEXT: vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
815 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
816 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4)
820 define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
821 ; CHECK-LABEL: stack_fold_minph_zmm:
823 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
826 ; CHECK-NEXT: #NO_APP
827 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
829 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
830 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
833 declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
835 define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
836 ; CHECK-LABEL: stack_fold_minph_zmm_commuted:
838 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
841 ; CHECK-NEXT: #NO_APP
842 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
843 ; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0
845 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
846 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
850 define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
851 ; CHECK-LABEL: stack_fold_minph_zmm_k:
853 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
856 ; CHECK-NEXT: #NO_APP
857 ; CHECK-NEXT: kmovd %edi, %k1
858 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
859 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
860 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
862 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
863 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
864 %3 = bitcast i32 %mask to <32 x i1>
865 %4 = load <32 x half>, ptr %passthru
866 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
870 define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
871 ; CHECK-LABEL: stack_fold_minph_zmm_k_commuted:
873 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
876 ; CHECK-NEXT: #NO_APP
877 ; CHECK-NEXT: kmovd %edi, %k1
878 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
879 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
880 ; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm2 {%k1}
881 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
883 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
884 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
885 %3 = bitcast i32 %mask to <32 x i1>
886 %4 = load <32 x half>, ptr %passthru
887 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
891 define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
892 ; CHECK-LABEL: stack_fold_minph_zmm_kz:
894 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
897 ; CHECK-NEXT: #NO_APP
898 ; CHECK-NEXT: kmovd %edi, %k1
899 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
901 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
902 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
903 %3 = bitcast i32 %mask to <32 x i1>
904 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
908 define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
909 ; CHECK-LABEL: stack_fold_minph_zmm_kz_commuted:
911 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
914 ; CHECK-NEXT: #NO_APP
915 ; CHECK-NEXT: kmovd %edi, %k1
916 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
917 ; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0 {%k1} {z}
919 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
920 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
921 %3 = bitcast i32 %mask to <32 x i1>
922 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
926 define <32 x half> @stack_fold_minph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 {
927 ; CHECK-LABEL: stack_fold_minph_zmm_commutable:
929 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
932 ; CHECK-NEXT: #NO_APP
933 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
935 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
936 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
940 define <32 x half> @stack_fold_minph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 {
941 ; CHECK-LABEL: stack_fold_minph_zmm_commutable_commuted:
943 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
946 ; CHECK-NEXT: #NO_APP
947 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
949 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
950 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
954 define <32 x half> @stack_fold_minph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
955 ; CHECK-LABEL: stack_fold_minph_zmm_commutable_k:
957 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
960 ; CHECK-NEXT: #NO_APP
961 ; CHECK-NEXT: kmovd %edi, %k1
962 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
963 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
964 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
966 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
967 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
968 %3 = bitcast i32 %mask to <32 x i1>
969 %4 = load <32 x half>, ptr %passthru
970 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
974 define <32 x half> @stack_fold_minph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
975 ; CHECK-LABEL: stack_fold_minph_zmm_commutable_k_commuted:
977 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
980 ; CHECK-NEXT: #NO_APP
981 ; CHECK-NEXT: kmovd %edi, %k1
982 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
983 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
984 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
986 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
987 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
988 %3 = bitcast i32 %mask to <32 x i1>
989 %4 = load <32 x half>, ptr %passthru
990 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
994 define <32 x half> @stack_fold_minph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
995 ; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz:
997 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1000 ; CHECK-NEXT: #NO_APP
1001 ; CHECK-NEXT: kmovd %edi, %k1
1002 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1004 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1005 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
1006 %3 = bitcast i32 %mask to <32 x i1>
1007 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
1011 define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
1012 ; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz_commuted:
1014 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1017 ; CHECK-NEXT: #NO_APP
1018 ; CHECK-NEXT: kmovd %edi, %k1
1019 ; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1021 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1022 %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
1023 %3 = bitcast i32 %mask to <32 x i1>
1024 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
1028 define half @stack_fold_minsh(half %a0, half %a1) #0 {
1029 ; CHECK-LABEL: stack_fold_minsh:
1031 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1034 ; CHECK-NEXT: #NO_APP
1035 ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1037 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1038 %2 = fcmp olt half %a0, %a1
1039 %3 = select i1 %2, half %a0, half %a1
1043 define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
1044 ; CHECK-LABEL: stack_fold_minsh_commuted:
1046 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1049 ; CHECK-NEXT: #NO_APP
1050 ; CHECK-NEXT: vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
1051 ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
1052 ; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm0
1054 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1055 %2 = fcmp olt half %a1, %a0
1056 %3 = select i1 %2, half %a1, half %a0
1060 define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
1061 ; CHECK-LABEL: stack_fold_minsh_commutable:
1063 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1066 ; CHECK-NEXT: #NO_APP
1067 ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1069 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1070 %2 = fcmp olt half %a0, %a1
1071 %3 = select i1 %2, half %a0, half %a1
1075 define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
1076 ; CHECK-LABEL: stack_fold_minsh_commutable_commuted:
1078 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1081 ; CHECK-NEXT: #NO_APP
1082 ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1084 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1085 %2 = fcmp olt half %a1, %a0
1086 %3 = select i1 %2, half %a1, half %a0
1090 define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
1091 ; CHECK-LABEL: stack_fold_minsh_int:
1093 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1096 ; CHECK-NEXT: #NO_APP
1097 ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1099 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1100 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
1103 declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32)
1105 define <8 x half> @stack_fold_minsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) {
1106 ; CHECK-LABEL: stack_fold_minsh_mask:
1108 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1109 ; CHECK-NEXT: kmovd %edi, %k1
1112 ; CHECK-NEXT: #NO_APP
1113 ; CHECK-NEXT: vmovaps (%rsi), %xmm2
1114 ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1115 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
1117 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1118 %2 = load <8 x half>, ptr %passthru
1119 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
1123 define <8 x half> @stack_fold_minsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) {
1124 ; CHECK-LABEL: stack_fold_minsh_maskz:
1126 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1127 ; CHECK-NEXT: kmovd %edi, %k1
1130 ; CHECK-NEXT: #NO_APP
1131 ; CHECK-NEXT: vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1133 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1134 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4)
1138 define <32 x half> @stack_fold_mulph_zmm(<32 x half> %a0, <32 x half> %a1) {
1139 ; CHECK-LABEL: stack_fold_mulph_zmm:
1141 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1144 ; CHECK-NEXT: #NO_APP
1145 ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1147 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1148 %2 = fmul <32 x half> %a0, %a1
1152 define <32 x half> @stack_fold_mulph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
1153 ; CHECK-LABEL: stack_fold_mulph_zmm_k:
1155 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1158 ; CHECK-NEXT: #NO_APP
1159 ; CHECK-NEXT: kmovd %edi, %k1
1160 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
1161 ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
1162 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1164 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1165 %2 = fmul <32 x half> %a0, %a1
1166 %3 = bitcast i32 %mask to <32 x i1>
1167 %4 = load <32 x half>, ptr %passthru
1168 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
1172 define <32 x half> @stack_fold_mulph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
1173 ; CHECK-LABEL: stack_fold_mulph_zmm_k_commuted:
1175 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1178 ; CHECK-NEXT: #NO_APP
1179 ; CHECK-NEXT: kmovd %edi, %k1
1180 ; CHECK-NEXT: vmovaps (%rsi), %zmm2
1181 ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
1182 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1184 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1185 %2 = fmul <32 x half> %a1, %a0
1186 %3 = bitcast i32 %mask to <32 x i1>
1187 %4 = load <32 x half>, ptr %passthru
1188 %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4
1192 define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
1193 ; CHECK-LABEL: stack_fold_mulph_zmm_kz:
1195 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1198 ; CHECK-NEXT: #NO_APP
1199 ; CHECK-NEXT: kmovd %edi, %k1
1200 ; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1202 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1203 %2 = fmul <32 x half> %a1, %a0
1204 %3 = bitcast i32 %mask to <32 x i1>
1205 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer
1209 define half @stack_fold_mulsh(half %a0, half %a1) {
1210 ; CHECK-LABEL: stack_fold_mulsh:
1212 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1215 ; CHECK-NEXT: #NO_APP
1216 ; CHECK-NEXT: vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1218 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1219 %2 = fmul half %a0, %a1
1223 define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) {
1224 ; CHECK-LABEL: stack_fold_mulsh_int:
1226 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1229 ; CHECK-NEXT: #NO_APP
1230 ; CHECK-NEXT: vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1232 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1233 %2 = extractelement <8 x half> %a0, i32 0
1234 %3 = extractelement <8 x half> %a1, i32 0
1235 %4 = fmul half %2, %3
1236 %5 = insertelement <8 x half> %a0, half %4, i32 0
1240 define <32 x half> @stack_fold_rcpph(<32 x half> %a0) {
1241 ; CHECK-LABEL: stack_fold_rcpph:
1243 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1246 ; CHECK-NEXT: #NO_APP
1247 ; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1249 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1250 %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
1253 declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32)
1255 define <32 x half> @stack_fold_rcpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
1256 ; CHECK-LABEL: stack_fold_rcpph_mask:
1258 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1259 ; CHECK-NEXT: kmovd %esi, %k1
1262 ; CHECK-NEXT: #NO_APP
1263 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
1264 ; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
1265 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1267 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1268 %2 = load <32 x half>, ptr %passthru
1269 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
1273 define <32 x half> @stack_fold_rcpph_maskz(<32 x half> %a0, ptr %mask) {
1274 ; CHECK-LABEL: stack_fold_rcpph_maskz:
1276 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1279 ; CHECK-NEXT: #NO_APP
1280 ; CHECK-NEXT: kmovd (%rdi), %k1
1281 ; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
1283 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1284 %2 = load i32, ptr %mask
1285 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
1289 define <8 x half> @stack_fold_rcpsh(<8 x half> %a0, <8 x half> %a1) {
1290 ; CHECK-LABEL: stack_fold_rcpsh:
1292 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1295 ; CHECK-NEXT: #NO_APP
1296 ; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1298 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1299 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
1302 declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8)
1304 define <8 x half> @stack_fold_rcpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
1305 ; CHECK-LABEL: stack_fold_rcpsh_mask:
1307 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1308 ; CHECK-NEXT: kmovd %esi, %k1
1311 ; CHECK-NEXT: #NO_APP
1312 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
1313 ; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1314 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
1316 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1317 %2 = load <8 x half>, ptr %passthru
1318 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
1322 define <8 x half> @stack_fold_rcpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
1323 ; CHECK-LABEL: stack_fold_rcpsh_maskz:
1325 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1328 ; CHECK-NEXT: #NO_APP
1329 ; CHECK-NEXT: kmovb (%rdi), %k1
1330 ; CHECK-NEXT: vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1332 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1333 %2 = load i8, ptr %mask
1334 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
1338 define <32 x half> @stack_fold_reduceph(<32 x half> %a0) {
1339 ; CHECK-LABEL: stack_fold_reduceph:
1341 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1344 ; CHECK-NEXT: #NO_APP
1345 ; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1347 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1348 %2 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
1351 declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
1353 define <32 x half> @stack_fold_reduceph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
1354 ; CHECK-LABEL: stack_fold_reduceph_mask:
1356 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1357 ; CHECK-NEXT: kmovd %esi, %k1
1360 ; CHECK-NEXT: #NO_APP
1361 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
1362 ; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
1363 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1365 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1366 %2 = load <32 x half>, ptr %passthru
1367 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
1371 define <32 x half> @stack_fold_reduceph_maskz(<32 x half> %a0, ptr %mask) {
1372 ; CHECK-LABEL: stack_fold_reduceph_maskz:
1374 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1377 ; CHECK-NEXT: #NO_APP
1378 ; CHECK-NEXT: kmovd (%rdi), %k1
1379 ; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
1381 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1382 %2 = load i32, ptr %mask
1383 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
1387 define <8 x half> @stack_fold_reducesh(<8 x half> %a0, <8 x half> %a1) {
1388 ; CHECK-LABEL: stack_fold_reducesh:
1390 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1393 ; CHECK-NEXT: #NO_APP
1394 ; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1396 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1397 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
1400 declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
1402 define <8 x half> @stack_fold_reducesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
1403 ; CHECK-LABEL: stack_fold_reducesh_mask:
1405 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1406 ; CHECK-NEXT: kmovd %esi, %k1
1409 ; CHECK-NEXT: #NO_APP
1410 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
1411 ; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1412 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
1414 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1415 %2 = load <8 x half>, ptr %passthru
1416 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
1420 define <8 x half> @stack_fold_reducesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
1421 ; CHECK-LABEL: stack_fold_reducesh_maskz:
1423 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1426 ; CHECK-NEXT: #NO_APP
1427 ; CHECK-NEXT: kmovb (%rdi), %k1
1428 ; CHECK-NEXT: vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1430 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1431 %2 = load i8, ptr %mask
1432 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
1436 define <32 x half> @stack_fold_rndscaleph(<32 x half> %a0) {
1437 ; CHECK-LABEL: stack_fold_rndscaleph:
1439 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1442 ; CHECK-NEXT: #NO_APP
1443 ; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1445 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1446 %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
1449 declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
1451 define <32 x half> @stack_fold_rndscaleph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
1452 ; CHECK-LABEL: stack_fold_rndscaleph_mask:
1454 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1455 ; CHECK-NEXT: kmovd %esi, %k1
1458 ; CHECK-NEXT: #NO_APP
1459 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
1460 ; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
1461 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1463 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1464 %2 = load <32 x half>, ptr %passthru
1465 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
1469 define <32 x half> @stack_fold_rndscaleph_maskz(<32 x half> %a0, ptr %mask) {
1470 ; CHECK-LABEL: stack_fold_rndscaleph_maskz:
1472 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1475 ; CHECK-NEXT: #NO_APP
1476 ; CHECK-NEXT: kmovd (%rdi), %k1
1477 ; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
1479 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1480 %2 = load i32, ptr %mask
1481 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
1485 define <8 x half> @stack_fold_rndscalesh(<8 x half> %a0, <8 x half> %a1) {
1486 ; CHECK-LABEL: stack_fold_rndscalesh:
1488 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1491 ; CHECK-NEXT: #NO_APP
1492 ; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1494 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1495 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
1498 declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
1500 define <8 x half> @stack_fold_rndscalesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
1501 ; CHECK-LABEL: stack_fold_rndscalesh_mask:
1503 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1504 ; CHECK-NEXT: kmovd %esi, %k1
1507 ; CHECK-NEXT: #NO_APP
1508 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
1509 ; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1510 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
1512 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1513 %2 = load <8 x half>, ptr %passthru
1514 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
1518 define <8 x half> @stack_fold_rndscalesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
1519 ; CHECK-LABEL: stack_fold_rndscalesh_maskz:
1521 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1524 ; CHECK-NEXT: #NO_APP
1525 ; CHECK-NEXT: kmovb (%rdi), %k1
1526 ; CHECK-NEXT: vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1528 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1529 %2 = load i8, ptr %mask
1530 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
1534 define <32 x half> @stack_fold_rsqrtph(<32 x half> %a0) {
1535 ; CHECK-LABEL: stack_fold_rsqrtph:
1537 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1540 ; CHECK-NEXT: #NO_APP
1541 ; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1543 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1544 %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
1547 declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32)
1549 define <32 x half> @stack_fold_rsqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
1550 ; CHECK-LABEL: stack_fold_rsqrtph_mask:
1552 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1553 ; CHECK-NEXT: kmovd %esi, %k1
1556 ; CHECK-NEXT: #NO_APP
1557 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
1558 ; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
1559 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1561 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1562 %2 = load <32 x half>, ptr %passthru
1563 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
1567 define <32 x half> @stack_fold_rsqrtph_maskz(<32 x half> %a0, ptr %mask) {
1568 ; CHECK-LABEL: stack_fold_rsqrtph_maskz:
1570 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1573 ; CHECK-NEXT: #NO_APP
1574 ; CHECK-NEXT: kmovd (%rdi), %k1
1575 ; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
1577 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1578 %2 = load i32, ptr %mask
1579 %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
1583 define <8 x half> @stack_fold_rsqrtsh(<8 x half> %a0, <8 x half> %a1) {
1584 ; CHECK-LABEL: stack_fold_rsqrtsh:
1586 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1589 ; CHECK-NEXT: #NO_APP
1590 ; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1592 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1593 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
1596 declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8)
1598 define <8 x half> @stack_fold_rsqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
1599 ; CHECK-LABEL: stack_fold_rsqrtsh_mask:
1601 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1602 ; CHECK-NEXT: kmovd %esi, %k1
1605 ; CHECK-NEXT: #NO_APP
1606 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
1607 ; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1608 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
1610 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1611 %2 = load <8 x half>, ptr %passthru
1612 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
1616 define <8 x half> @stack_fold_rsqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
1617 ; CHECK-LABEL: stack_fold_rsqrtsh_maskz:
1619 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1622 ; CHECK-NEXT: #NO_APP
1623 ; CHECK-NEXT: kmovb (%rdi), %k1
1624 ; CHECK-NEXT: vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1626 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1627 %2 = load i8, ptr %mask
1628 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
1632 define <32 x half> @stack_fold_sqrtph(<32 x half> %a0) {
1633 ; CHECK-LABEL: stack_fold_sqrtph:
1635 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1638 ; CHECK-NEXT: #NO_APP
1639 ; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1641 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1642 %2 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
1645 declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
1647 define <32 x half> @stack_fold_sqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
1648 ; CHECK-LABEL: stack_fold_sqrtph_mask:
1650 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1653 ; CHECK-NEXT: #NO_APP
1654 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
1655 ; CHECK-NEXT: kmovd %esi, %k1
1656 ; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
1657 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1660 %2 = load <32 x half>, ptr %passthru
1661 %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
1662 %4 = bitcast i32 %mask to <32 x i1>
1663 %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %2
1667 define <32 x half> @stack_fold_sqrtph_maskz(<32 x half> %a0, ptr %mask) {
1668 ; CHECK-LABEL: stack_fold_sqrtph_maskz:
1670 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1673 ; CHECK-NEXT: #NO_APP
1674 ; CHECK-NEXT: kmovd (%rdi), %k1
1675 ; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
1677 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1678 %2 = load i32, ptr %mask
1679 %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
1680 %4 = bitcast i32 %2 to <32 x i1>
1681 %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> zeroinitializer
1685 define <8 x half> @stack_fold_sqrtsh(<8 x half> %a0, <8 x half> %a1) {
1686 ; CHECK-LABEL: stack_fold_sqrtsh:
1688 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1691 ; CHECK-NEXT: #NO_APP
1692 ; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1694 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1695 %2 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
1698 declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
1700 define <8 x half> @stack_fold_sqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
1701 ; CHECK-LABEL: stack_fold_sqrtsh_mask:
1703 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1704 ; CHECK-NEXT: kmovd %esi, %k1
1707 ; CHECK-NEXT: #NO_APP
1708 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
1709 ; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
1710 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
1712 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1713 %2 = load <8 x half>, ptr %passthru
1714 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
1718 define <8 x half> @stack_fold_sqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
1719 ; CHECK-LABEL: stack_fold_sqrtsh_maskz:
1721 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1724 ; CHECK-NEXT: #NO_APP
1725 ; CHECK-NEXT: kmovb (%rdi), %k1
1726 ; CHECK-NEXT: vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
1728 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1729 %2 = load i8, ptr %mask
1730 %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
1734 define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) {
1735 ; CHECK-LABEL: stack_fold_subph_zmm:
1737 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1740 ; CHECK-NEXT: #NO_APP
1741 ; CHECK-NEXT: vsubph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1743 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1744 %2 = fsub <32 x half> %a0, %a1
1748 define half @stack_fold_subsh(half %a0, half %a1) {
1749 ; CHECK-LABEL: stack_fold_subsh:
1751 ; CHECK-NEXT: vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1754 ; CHECK-NEXT: #NO_APP
1755 ; CHECK-NEXT: vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
1757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1758 %2 = fsub half %a0, %a1
1762 define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) {
1763 ; CHECK-LABEL: stack_fold_subsh_int:
1765 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1768 ; CHECK-NEXT: #NO_APP
1769 ; CHECK-NEXT: vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1772 %2 = extractelement <8 x half> %a0, i32 0
1773 %3 = extractelement <8 x half> %a1, i32 0
1774 %4 = fsub half %2, %3
1775 %5 = insertelement <8 x half> %a0, half %4, i32 0
1779 define <16 x float> @stack_fold_fmulcph(<16 x float> %a0, <16 x float> %a1) {
1780 ; CHECK-LABEL: stack_fold_fmulcph:
1782 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1785 ; CHECK-NEXT: #NO_APP
1786 ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
1787 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1789 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1790 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
1794 define <16 x float> @stack_fold_fmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
1795 ; CHECK-LABEL: stack_fold_fmulcph_commute:
1797 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1800 ; CHECK-NEXT: #NO_APP
1801 ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
1802 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1804 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1805 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
1808 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
1810 define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) {
1811 ; CHECK-LABEL: stack_fold_fmulcph_mask:
1813 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1814 ; CHECK-NEXT: kmovd %esi, %k1
1817 ; CHECK-NEXT: #NO_APP
1818 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
1819 ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
1820 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1822 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1823 %2 = load <16 x float>, ptr %passthru
1824 %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
1828 define <16 x float> @stack_fold_fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
1829 ; CHECK-LABEL: stack_fold_fmulcph_maskz:
1831 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1834 ; CHECK-NEXT: #NO_APP
1835 ; CHECK-NEXT: kmovw (%rdi), %k1
1836 ; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
1837 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1839 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1840 %2 = load i16, ptr %mask
1841 %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
1845 define <16 x float> @stack_fold_fcmulcph(<16 x float> %a0, <16 x float> %a1) {
1846 ; CHECK-LABEL: stack_fold_fcmulcph:
1848 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1851 ; CHECK-NEXT: #NO_APP
1852 ; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
1853 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1855 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1856 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
1860 define <16 x float> @stack_fold_fcmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
1861 ; CHECK-LABEL: stack_fold_fcmulcph_commute:
1863 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1866 ; CHECK-NEXT: #NO_APP
1867 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1868 ; CHECK-NEXT: vfcmulcph %zmm0, %zmm1, %zmm2
1869 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1871 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1872 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
1875 declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
1877 define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) {
1878 ; CHECK-LABEL: stack_fold_fcmulcph_mask:
1880 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1881 ; CHECK-NEXT: kmovd %esi, %k1
1884 ; CHECK-NEXT: #NO_APP
1885 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
1886 ; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
1887 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1889 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1890 %2 = load <16 x float>, ptr %passthru
1891 %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
1895 define <16 x float> @stack_fold_fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
1896 ; CHECK-LABEL: stack_fold_fcmulcph_maskz:
1898 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1901 ; CHECK-NEXT: #NO_APP
1902 ; CHECK-NEXT: kmovw (%rdi), %k1
1903 ; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
1904 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1906 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1907 %2 = load i16, ptr %mask
1908 %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
1912 define <16 x float> @stack_fold_fmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
1913 ; CHECK-LABEL: stack_fold_fmaddcph:
1915 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1918 ; CHECK-NEXT: #NO_APP
1919 ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
1921 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1922 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
1926 define <16 x float> @stack_fold_fmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
1927 ; CHECK-LABEL: stack_fold_fmaddcph_commute:
1929 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1932 ; CHECK-NEXT: #NO_APP
1933 ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
1935 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1936 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
1939 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
1941 define <16 x float> @stack_fold_fmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
1942 ; CHECK-LABEL: stack_fold_fmaddcph_mask:
1944 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1945 ; CHECK-NEXT: kmovd %esi, %k1
1948 ; CHECK-NEXT: #NO_APP
1949 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
1950 ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
1951 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1953 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1954 %a0 = load <16 x float>, ptr %p
1955 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
1959 define <16 x float> @stack_fold_fmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) {
1960 ; CHECK-LABEL: stack_fold_fmaddcph_maskz:
1962 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1965 ; CHECK-NEXT: #NO_APP
1966 ; CHECK-NEXT: kmovw (%rdi), %k1
1967 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
1968 ; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
1970 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1971 %2 = load i16, ptr %mask
1972 %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
1975 declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
1977 define <16 x float> @stack_fold_fcmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
1978 ; CHECK-LABEL: stack_fold_fcmaddcph:
1980 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1983 ; CHECK-NEXT: #NO_APP
1984 ; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
1986 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1987 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
1991 define <16 x float> @stack_fold_fcmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
1992 ; CHECK-LABEL: stack_fold_fcmaddcph_commute:
1994 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1997 ; CHECK-NEXT: #NO_APP
1998 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
1999 ; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
2001 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2002 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
2005 declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
2007 define <16 x float> @stack_fold_fcmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
2008 ; CHECK-LABEL: stack_fold_fcmaddcph_mask:
2010 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2011 ; CHECK-NEXT: kmovd %esi, %k1
2014 ; CHECK-NEXT: #NO_APP
2015 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
2016 ; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2017 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2019 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2020 %a0 = load <16 x float>, ptr %p
2021 %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
2025 define <16 x float> @stack_fold_fcmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) {
2026 ; CHECK-LABEL: stack_fold_fcmaddcph_maskz:
2028 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2031 ; CHECK-NEXT: #NO_APP
2032 ; CHECK-NEXT: kmovw (%rdi), %k1
2033 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
2034 ; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
2036 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2037 %2 = load i16, ptr %mask
2038 %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
2041 declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
2043 define <4 x float> @stack_fold_fmulcsh(<4 x float> %a0, <4 x float> %a1) {
2044 ; CHECK-LABEL: stack_fold_fmulcsh:
2046 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2049 ; CHECK-NEXT: #NO_APP
2050 ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
2051 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2053 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2054 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
2058 define <4 x float> @stack_fold_fmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
2059 ; CHECK-LABEL: stack_fold_fmulcsh_commute:
2061 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2064 ; CHECK-NEXT: #NO_APP
2065 ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
2066 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2068 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2069 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
2072 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
2074 define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
2075 ; CHECK-LABEL: stack_fold_fmulcsh_mask:
2077 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2078 ; CHECK-NEXT: kmovd %esi, %k1
2081 ; CHECK-NEXT: #NO_APP
2082 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2083 ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
2084 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2086 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2087 %2 = load <4 x float>, ptr %passthru
2088 %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
2092 define <4 x float> @stack_fold_fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
2093 ; CHECK-LABEL: stack_fold_fmulcsh_maskz:
2095 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2098 ; CHECK-NEXT: #NO_APP
2099 ; CHECK-NEXT: kmovb (%rdi), %k1
2100 ; CHECK-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
2101 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2103 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2104 %2 = load i8, ptr %mask
2105 %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
2109 define <4 x float> @stack_fold_fcmulcsh(<4 x float> %a0, <4 x float> %a1) {
2110 ; CHECK-LABEL: stack_fold_fcmulcsh:
2112 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2115 ; CHECK-NEXT: #NO_APP
2116 ; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
2117 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2119 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2120 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
2124 define <4 x float> @stack_fold_fcmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
2125 ; CHECK-LABEL: stack_fold_fcmulcsh_commute:
2127 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2130 ; CHECK-NEXT: #NO_APP
2131 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2132 ; CHECK-NEXT: vfcmulcsh %xmm0, %xmm1, %xmm2
2133 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2135 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2136 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
2139 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
2141 define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
2142 ; CHECK-LABEL: stack_fold_fcmulcsh_mask:
2144 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2145 ; CHECK-NEXT: kmovd %esi, %k1
2148 ; CHECK-NEXT: #NO_APP
2149 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2150 ; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
2151 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2153 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2154 %2 = load <4 x float>, ptr %passthru
2155 %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
2159 define <4 x float> @stack_fold_fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
2160 ; CHECK-LABEL: stack_fold_fcmulcsh_maskz:
2162 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2165 ; CHECK-NEXT: #NO_APP
2166 ; CHECK-NEXT: kmovb (%rdi), %k1
2167 ; CHECK-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
2168 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2170 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2171 %2 = load i8, ptr %mask
2172 %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
2176 define <4 x float> @stack_fold_fmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
2177 ; CHECK-LABEL: stack_fold_fmaddcsh:
2179 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2182 ; CHECK-NEXT: #NO_APP
2183 ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2185 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2186 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
2190 define <4 x float> @stack_fold_fmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
2191 ; CHECK-LABEL: stack_fold_fmaddcsh_commute:
2193 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2196 ; CHECK-NEXT: #NO_APP
2197 ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2199 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2200 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
2203 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
2205 define <4 x float> @stack_fold_fmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2206 ; CHECK-LABEL: stack_fold_fmaddcsh_mask:
2208 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2209 ; CHECK-NEXT: kmovd %esi, %k1
2212 ; CHECK-NEXT: #NO_APP
2213 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2214 ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
2215 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2217 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2218 %a0 = load <4 x float>, ptr %p
2219 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4)
2223 define <4 x float> @stack_fold_fmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
2224 ; CHECK-LABEL: stack_fold_fmaddcsh_maskz:
2226 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2229 ; CHECK-NEXT: #NO_APP
2230 ; CHECK-NEXT: kmovb (%rdi), %k1
2231 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
2232 ; CHECK-NEXT: vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
2234 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2235 %2 = load i8, ptr %mask
2236 %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)
2239 declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
2241 define <4 x float> @stack_fold_fcmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
2242 ; CHECK-LABEL: stack_fold_fcmaddcsh:
2244 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2247 ; CHECK-NEXT: #NO_APP
2248 ; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2250 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2251 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
2255 define <4 x float> @stack_fold_fcmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
2256 ; CHECK-LABEL: stack_fold_fcmaddcsh_commute:
2258 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2261 ; CHECK-NEXT: #NO_APP
2262 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2263 ; CHECK-NEXT: vfcmaddcsh %xmm1, %xmm2, %xmm0
2265 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2266 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
2269 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
2271 define <4 x float> @stack_fold_fcmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2272 ; CHECK-LABEL: stack_fold_fcmaddcsh_mask:
2274 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2275 ; CHECK-NEXT: kmovd %esi, %k1
2278 ; CHECK-NEXT: #NO_APP
2279 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2280 ; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
2281 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
2283 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2284 %a0 = load <4 x float>, ptr %p
2285 %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4)
2289 define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
2290 ; CHECK-LABEL: stack_fold_fcmaddcsh_maskz:
2292 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2295 ; CHECK-NEXT: #NO_APP
2296 ; CHECK-NEXT: kmovb (%rdi), %k1
2297 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
2298 ; CHECK-NEXT: vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
2300 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2301 %2 = load i8, ptr %mask
2302 %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)
2305 declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
2307 attributes #0 = { "unsafe-fp-math"="false" }
2308 attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }