; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512fp16 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

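; For example, a minimal sketch of the pattern every test below instantiates
; (illustrative only; the real assertions are autogenerated): the inline asm
; returns a value in an xmm register and clobbers all the remaining vector
; registers, so the register allocator has to spill %a1 across the nop, and
; the folding peephole is expected to rewrite the reload as a memory operand:
;
;   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},...,~{flags}"()
;   %2 = fadd <8 x half> %a0, %a1
;   ; expect: vaddph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
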
define <8 x half> @stack_fold_addph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_addph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_addph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_addph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fadd <16 x half> %a0, %a1
  ret <16 x half> %2
}

define i8 @stack_fold_cmpph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_cmpph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half> %a0, <8 x half> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %2 = bitcast <8 x i1> %res to i8
  ret i8 %2
}
declare <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half>, <8 x half>, i32, <8 x i1>)

define i16 @stack_fold_cmpph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_cmpph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %res = call <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half> %a0, <16 x half> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %2 = bitcast <16 x i1> %res to i16
  ret i16 %2
}
declare <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half>, <16 x half>, i32, <16 x i1>)

define <8 x half> @stack_fold_divph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_divph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fdiv <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_divph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_divph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdivph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fdiv <16 x half> %a0, %a1
  ret <16 x half> %2
}

define i8 @stack_fold_fpclassph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_fpclassph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfpclassphx $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 16-byte Folded Reload
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}
declare <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half>, i32)

define i8 @stack_fold_fpclassph_mask(<8 x half> %a0, ptr %p) {
; CHECK-LABEL: stack_fold_fpclassph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfpclassphx $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
  %mask = load <8 x i1>, ptr %p
  %3 = and <8 x i1> %2, %mask
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define i16 @stack_fold_fpclassph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_fpclassph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfpclassphy $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 32-byte Folded Reload
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
  %3 = bitcast <16 x i1> %2 to i16
  ret i16 %3
}
declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32)

define i16 @stack_fold_fpclassph_mask_ymm(<16 x half> %a0, ptr %p) {
; CHECK-LABEL: stack_fold_fpclassph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vfpclassphy $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
  %mask = load <16 x i1>, ptr %p
  %3 = and <16 x i1> %2, %mask
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define <8 x half> @stack_fold_getexpph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_getexpph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half>, <8 x half>, i8)

define <8 x half> @stack_fold_getexpph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_getexpph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_getexpph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getexpph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_getexpph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_getexpph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half>, <16 x half>, i16)

define <16 x half> @stack_fold_getexpph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_getexpph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_getexpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getexpph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_getmantph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_getmantph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half>, i32, <8 x half>, i8)

define <8 x half> @stack_fold_getmantph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_getmantph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_getmantph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getmantph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_getmantph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_getmantph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half>, i32, <16 x half>, i16)

define <16 x half> @stack_fold_getmantph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_getmantph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_getmantph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_maxph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone

define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_maxph_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}

define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_maxph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone

define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_maxph_ymm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}

define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_minph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone

define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_minph_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1)
  ret <8 x half> %2
}

define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
; CHECK-LABEL: stack_fold_minph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone

define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
; CHECK-LABEL: stack_fold_minph_ymm_commutable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1)
  ret <16 x half> %2
}

define <8 x half> @stack_fold_mulph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_mulph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_mulph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_mulph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fmul <16 x half> %a0, %a1
  ret <16 x half> %2
}

define <8 x half> @stack_fold_rcpph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_rcpph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half>, <8 x half>, i8)

define <8 x half> @stack_fold_rcpph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rcpph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_rcpph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rcpph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_rcpph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_rcpph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half>, <16 x half>, i16)

define <16 x half> @stack_fold_rcpph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_rcpph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_rcpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rcpph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_reduceph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_reduceph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half>, i32, <8 x half>, i8)

define <8 x half> @stack_fold_reduceph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_reduceph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_reduceph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_reduceph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_reduceph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_reduceph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half>, i32, <16 x half>, i16)

define <16 x half> @stack_fold_reduceph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_reduceph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_reduceph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_reduceph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_rndscaleph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_rndscaleph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half>, i32, <8 x half>, i8)

define <8 x half> @stack_fold_rndscaleph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_rndscaleph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_rndscaleph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_rndscaleph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half>, i32, <16 x half>, i16)

define <16 x half> @stack_fold_rndscaleph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_rndscaleph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rndscaleph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_rsqrtph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_rsqrtph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
  ret <8 x half> %2
}
declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half>, <8 x half>, i8)

define <8 x half> @stack_fold_rsqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
  ret <8 x half> %3
}

define <8 x half> @stack_fold_rsqrtph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
  ret <8 x half> %3
}

define <16 x half> @stack_fold_rsqrtph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_rsqrtph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
  ret <16 x half> %2
}
declare <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half>, <16 x half>, i16)

define <16 x half> @stack_fold_rsqrtph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
  ret <16 x half> %3
}

define <16 x half> @stack_fold_rsqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_rsqrtph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
  ret <16 x half> %3
}

define <8 x half> @stack_fold_sqrtph(<8 x half> %a0) {
; CHECK-LABEL: stack_fold_sqrtph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
  ret <8 x half> %2
}
declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)

define <8 x half> @stack_fold_sqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_sqrtph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x half>, ptr %passthru
  %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %2
  ret <8 x half> %5
}

define <8 x half> @stack_fold_sqrtph_maskz(<8 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_sqrtph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
  %4 = bitcast i8 %2 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> zeroinitializer
  ret <8 x half> %5
}

define <16 x half> @stack_fold_sqrtph_ymm(<16 x half> %a0) {
; CHECK-LABEL: stack_fold_sqrtph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
  ret <16 x half> %2
}
declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)

define <16 x half> @stack_fold_sqrtph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
; CHECK-LABEL: stack_fold_sqrtph_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x half>, ptr %passthru
  %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
  %4 = bitcast i16 %mask to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %2
  ret <16 x half> %5
}

define <16 x half> @stack_fold_sqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
; CHECK-LABEL: stack_fold_sqrtph_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
  %4 = bitcast i16 %2 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> zeroinitializer
  ret <16 x half> %5
}

define <8 x half> @stack_fold_subph(<8 x half> %a0, <8 x half> %a1) {
; CHECK-LABEL: stack_fold_subph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <8 x half> %a0, %a1
  ret <8 x half> %2
}

define <16 x half> @stack_fold_subph_ymm(<16 x half> %a0, <16 x half> %a1) {
; CHECK-LABEL: stack_fold_subph_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vsubph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fsub <16 x half> %a0, %a1
  ret <16 x half> %2
}

define <4 x float> @stack_fold_fmulc(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fmulc_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x float>, ptr %passthru
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
  ret <4 x float> %3
}


define <4 x float> @stack_fold_fmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fmulc_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}
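
; The fcmulc tests use vfcmulcph, the conjugating form of the complex
; multiply: one source operand is conjugated before the multiplication.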

define <4 x float> @stack_fold_fcmulc(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fcmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmulc_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x float>, ptr %passthru
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
  ret <4 x float> %3
}

define <4 x float> @stack_fold_fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmulc_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}
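
; In the complex FMA tests the accumulator is %a0, passed as the third
; intrinsic operand; the destination register doubles as the accumulator
; input, which is why the maskz variants clear it with vxorps before the
; folded vfmaddcph/vfcmaddcph.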

define <4 x float> @stack_fold_fmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fmaddc_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmaddc_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <4 x float>, ptr %p
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
  ret <4 x float> %2
}

define <4 x float> @stack_fold_fmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddc_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fcmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @stack_fold_fcmaddc_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <4 x float>, ptr %p
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
  ret <4 x float> %2
}

define <4 x float> @stack_fold_fcmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
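
; The same complex multiply and FMA tests, repeated for the 256-bit (ymm)
; forms of the intrinsics.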

define <8 x float> @stack_fold_fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_fmulc_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fmulc_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x float>, ptr %passthru
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fmulc_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_fcmulc_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fcmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmulc_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x float>, ptr %passthru
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmulc_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

define <8 x float> @stack_fold_fmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; CHECK-LABEL: stack_fold_fmaddc_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fmaddc_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x float>, ptr %p
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
  ret <8 x float> %2
}

define <8 x float> @stack_fold_fmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddc_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fcmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; CHECK-LABEL: stack_fold_fcmaddc_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @stack_fold_fcmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <8 x float>, ptr %p
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
  ret <8 x float> %2
}

define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fcmaddc_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovb (%rdi), %k1
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)

attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }