; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512fp16 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
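;
; Concretely: the inline asm in each test clobbers every xmm register except the ones
; holding the operands under test, so any operand that is live across the asm must be
; spilled to a stack slot. The CHECK lines then verify that no separate reload
; instruction is emitted; the spill slot ({{[-0-9]+}}(%r{{[sb]}}p)) appears directly as
; the memory operand of the instruction under test, marked by the "Folded Reload"
; assembly comment.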

define <8 x half> @stack_fold_addph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_addph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_addph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_addph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x half> %a0, %a1
  ret <16 x half> %2
}

define i8 @stack_fold_cmpph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_cmpph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half> %a0, <8 x half> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %2 = bitcast <8 x i1> %res to i8
  ret i8 %2
}
declare <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half>, <8 x half>, i32, <8 x i1>)

define i16 @stack_fold_cmpph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_cmpph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half> %a0, <16 x half> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %2 = bitcast <16 x i1> %res to i16
  ret i16 %2
}
declare <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half>, <16 x half>, i32, <16 x i1>)

define <8 x half> @stack_fold_divph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_divph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fdiv <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_divph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_divph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vdivph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fdiv <16 x half> %a0, %a1
  ret <16 x half> %2
}

define i8 @stack_fold_fpclassph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_fpclassph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfpclassphx $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 16-byte Folded Reload
; CHECK-NEXT: # k0 = isNegativeZero(mem)
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}
declare <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half>, i32)

define i8 @stack_fold_fpclassph_mask(<8 x half> %a0, ptr %p) {
; CHECK-LABEL: stack_fold_fpclassph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfpclassphx $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
  %mask = load <8 x i1>, ptr %p
  %3 = and <8 x i1> %2, %mask
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define i16 @stack_fold_fpclassph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_fpclassph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfpclassphy $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 32-byte Folded Reload
; CHECK-NEXT: # k0 = isNegativeZero(mem)
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
  %3 = bitcast <16 x i1> %2 to i16
  ret i16 %3
}
declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32)

define i16 @stack_fold_fpclassph_mask_ymm(<16 x half> %a0, ptr %p) {
; CHECK-LABEL: stack_fold_fpclassph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vfpclassphy $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
  %mask = load <16 x i1>, ptr %p
  %3 = and <16 x i1> %2, %mask
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define <8 x half> @stack_fold_getexpph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_getexpph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half>, <8 x half>, i8)

define <8 x half> @stack_fold_getexpph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_getexpph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_getexpph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getexpph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_getexpph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_getexpph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half>, <16 x half>, i16)

define <16 x half> @stack_fold_getexpph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_getexpph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_getexpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getexpph_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_getmantph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_getmantph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half>, i32, <8 x half>, i8)

define <8 x half> @stack_fold_getmantph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_getmantph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_getmantph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getmantph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_getmantph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_getmantph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half>, i32, <16 x half>, i16)

define <16 x half> @stack_fold_getmantph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_getmantph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getmantph_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_maxph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone
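
; The _commutable variants below are identical except for their function attributes
; (#0 vs #1, defined at the end of the file); presumably #1 carries fast-math-style
; attributes that allow the operands to be commuted when folding, which is not legal
; by default since vmaxph/vminph are sensitive to operand order for signed zeros and
; NaNs.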

define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_maxph_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}

define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_maxph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone

define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_maxph_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}

define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_minph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone

define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_minph_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}

define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_minph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone

define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_minph_ymm_commutable:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vminph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}

define <8 x half> @stack_fold_mulph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_mulph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_mulph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_mulph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmulph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x half> %a0, %a1
  ret <16 x half> %2
}

define <8 x half> @stack_fold_rcpph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_rcpph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half>, <8 x half>, i8)

define <8 x half> @stack_fold_rcpph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rcpph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_rcpph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rcpph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_rcpph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_rcpph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half>, <16 x half>, i16)

define <16 x half> @stack_fold_rcpph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_rcpph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_rcpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rcpph_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_reduceph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_reduceph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half>, i32, <8 x half>, i8)

define <8 x half> @stack_fold_reduceph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_reduceph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_reduceph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_reduceph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_reduceph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_reduceph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half>, i32, <16 x half>, i16)

define <16 x half> @stack_fold_reduceph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_reduceph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_reduceph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_reduceph_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_rndscaleph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_rndscaleph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half>, i32, <8 x half>, i8)

define <8 x half> @stack_fold_rndscaleph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_rndscaleph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_rndscaleph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_rndscaleph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half>, i32, <16 x half>, i16)

define <16 x half> @stack_fold_rndscaleph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_rndscaleph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_rsqrtph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_rsqrtph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half>, <8 x half>, i8)

define <8 x half> @stack_fold_rsqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_rsqrtph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_rsqrtph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_rsqrtph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half>, <16 x half>, i16)

define <16 x half> @stack_fold_rsqrtph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_rsqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_sqrtph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_sqrtph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
  ret <8 x half> %2
}
declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)

define <8 x half> @stack_fold_sqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_sqrtph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm1
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %2
  ret <8 x half> %5
}

define <8 x half> @stack_fold_sqrtph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_sqrtph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
  %4 = bitcast i8 %2 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <16 x half> @stack_fold_sqrtph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_sqrtph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
  ret <16 x half> %2
}
declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)

define <16 x half> @stack_fold_sqrtph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_sqrtph_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %2
  ret <16 x half> %5
}

define <16 x half> @stack_fold_sqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_sqrtph_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovw (%rdi), %k1
; CHECK-NEXT: vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
  %4 = bitcast i16 %2 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <8 x half> @stack_fold_subph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_subph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_subph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_subph_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vsubph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <16 x half> %a0, %a1
  ret <16 x half> %2
}
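
; Complex multiply tests (vfmulcph/vfcmulcph). Unlike the tests above, the expected
; asm produces the result in a third register (%xmm2) and copies it back to %xmm0:
; the destination of the complex-multiply instructions may not alias either source
; register, so the fold cannot reuse %xmm0 in place.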

define <4 x float> @stack_fold_fmulc(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulc:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fmulc_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x float>, ptr %passthru
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
  ret <4 x float> %3
}

define <4 x float> @stack_fold_fmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fmulc_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}
define <4 x float> @stack_fold_fcmulc(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulc:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fcmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmulc_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x float>, ptr %passthru
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
  ret <4 x float> %3
}

define <4 x float> @stack_fold_fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmulc_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}

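; vfmaddcph/vfcmaddcph read the destination register as the accumulator, so the
; intrinsic's third operand (%a0, a passthru load, or zeroinitializer) ends up
; in the destination while the reloaded operand folds into the multiply-add.
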
define <4 x float> @stack_fold_fmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddc:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fmaddc_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmaddc_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <4 x float>, ptr %p
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
  ret <4 x float> %2
}

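; For the zero-masked multiply-add tests below, the accumulator must start at
; zero; since the instruction reads its destination, a vxorps is expected to
; clear the destination register ahead of the masked vfmaddcph/vfcmaddcph.
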
define <4 x float> @stack_fold_fmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddc_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fcmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddc:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fcmaddc_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %xmm2
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <4 x float>, ptr %p
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
  ret <4 x float> %2
}

define <4 x float> @stack_fold_fcmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

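; The remaining tests repeat the complex multiply and multiply-add patterns
; with 256-bit (ymm) operands, so spills and folded reloads are 32 bytes.
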
define <8 x float> @stack_fold_fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulc_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fmulc_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm2
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x float>, ptr %passthru
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fmulc_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulc_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fcmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmulc_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm2
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x float>, ptr %passthru
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmulc_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddc_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmaddc_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm2
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x float>, ptr %p
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
  ret <8 x float> %2
}

define <8 x float> @stack_fold_fmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddc_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fcmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddc_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fcmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_mask_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %ymm2
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x float>, ptr %p
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
  ret <8 x float> %2
}

define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_maskz_ymm:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }