; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
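
; These tests exercise dependency-breaking for the AVX512-FP16 complex
; multiply instructions (VF[C]MULCPH / VF[C]MULCSH). Their destination
; register is read as well as written (the result merges under a mask), so an
; otherwise-undefined destination carries a false dependency. With
; +false-deps-mulc the compiler is expected to break it by zeroing the
; destination first (the vpxor/vxorps in the ENABLE checks); with
; -false-deps-mulc no idiom is inserted, and a reload can instead stay folded
; as a memory operand (DISABLE). The inline asm "nop" clobbers nearly all
; vector registers so that any destination the allocator picks holds
; unrelated contents.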
define <16 x float> @fmulcph(<16 x float> %a0, <16 x float> %a1) {
; ENABLE-LABEL: fmulcph:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %zmm1, %zmm0, %zmm2
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
;
; DISABLE-LABEL: fmulcph:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

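; The _mem, _broadcast and _maskz variants below repeat the same check with
; the second operand folded from memory, broadcast from a single scalar, and
; with zero-masking, respectively.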
define <16 x float> @fmulcph_mem(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulcph_mem:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi), %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
;
; DISABLE-LABEL: fmulcph_mem:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi), %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <16 x float>, ptr %p1, align 64
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fmulcph_broadcast(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulcph_broadcast:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi){1to16}, %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
;
; DISABLE-LABEL: fmulcph_broadcast:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi){1to16}, %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <16 x float> undef, float %v1, i64 0
  %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulcph_maskz:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovw (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
;
; DISABLE-LABEL: fmulcph_maskz:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovw (%rdi), %k1
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
  ret <16 x float> %3
}

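; The same set of tests for the conjugating form, VFCMULCPH.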
define <16 x float> @fcmulcph(<16 x float> %a0, <16 x float> %a1) {
; ENABLE-LABEL: fcmulcph:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %zmm1, %zmm0, %zmm2
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
;
; DISABLE-LABEL: fcmulcph:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fcmulcph_mem(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulcph_mem:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi), %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
;
; DISABLE-LABEL: fcmulcph_mem:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi), %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <16 x float>, ptr %p1, align 64
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fcmulcph_broadcast(<16 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulcph_broadcast:
; ENABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi){1to16}, %zmm0, %zmm1
; ENABLE-NEXT:    vmovaps %zmm1, %zmm0
;
; DISABLE-LABEL: fcmulcph_broadcast:
; DISABLE-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi){1to16}, %zmm0, %zmm1
; DISABLE-NEXT:    vmovaps %zmm1, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <16 x float> undef, float %v1, i64 0
  %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
  %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
  ret <16 x float> %2
}

define <16 x float> @fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulcph_maskz:
; ENABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovw (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; ENABLE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %zmm1, %zmm0, %zmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %zmm2, %zmm0
;
; DISABLE-LABEL: fcmulcph_maskz:
; DISABLE-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovw (%rdi), %k1
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
; DISABLE-NEXT:    vmovaps %zmm2, %zmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i16, ptr %mask
  %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
  ret <16 x float> %3
}

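; 128-bit (xmm) variants; here the dependency-breaking idiom is vxorps.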
define <4 x float> @fmulc(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fmulc:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fmulc:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fmulc_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_mem:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
;
; DISABLE-LABEL: fmulc_mem:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fmulc_broadcast(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_broadcast:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi){1to4}, %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
;
; DISABLE-LABEL: fmulc_broadcast:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfmulcph (%rdi){1to4}, %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <4 x float> undef, float %v1, i64 0
  %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulc_maskz:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fmulc_maskz:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}

define <4 x float> @fcmulc(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fcmulc:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fcmulc:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fcmulc_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_mem:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
;
; DISABLE-LABEL: fcmulc_mem:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fcmulc_broadcast(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_broadcast:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi){1to4}, %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
;
; DISABLE-LABEL: fcmulc_broadcast:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfcmulcph (%rdi){1to4}, %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <4 x float> undef, float %v1, i64 0
  %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
  ret <4 x float> %2
}

define <4 x float> @fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulc_maskz:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fcmulc_maskz:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
  ret <4 x float> %3
}

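; 256-bit (ymm) variants. In the _mem and _broadcast tests the inline asm
; clobbers only xmm2 and above, so its output can land in xmm1 and the input
; in ymm0 survives without a spill/reload.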
define <8 x float> @fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; ENABLE-LABEL: fmulc_ymm:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %ymm1, %ymm0, %ymm2
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
;
; DISABLE-LABEL: fmulc_ymm:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fmulc_ymm_mem(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_ymm_mem:
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi), %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
;
; DISABLE-LABEL: fmulc_ymm_mem:
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph (%rdi), %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <8 x float>, ptr %p1, align 64
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fmulc_ymm_broadcast(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulc_ymm_broadcast:
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcph (%rdi){1to8}, %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
;
; DISABLE-LABEL: fmulc_ymm_broadcast:
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcph (%rdi){1to8}, %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <8 x float> undef, float %v1, i64 0
  %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulc_maskz_ymm:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
;
; DISABLE-LABEL: fmulc_maskz_ymm:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

define <8 x float> @fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
; ENABLE-LABEL: fcmulc_ymm:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %ymm1, %ymm0, %ymm2
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
;
; DISABLE-LABEL: fcmulc_ymm:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fcmulc_ymm_mem(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_ymm_mem:
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi), %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
;
; DISABLE-LABEL: fcmulc_ymm_mem:
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph (%rdi), %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <8 x float>, ptr %p1, align 64
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fcmulc_ymm_broadcast(<8 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulc_ymm_broadcast:
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcph (%rdi){1to8}, %ymm0, %ymm1
; ENABLE-NEXT:    vmovaps %ymm1, %ymm0
;
; DISABLE-LABEL: fcmulc_ymm_broadcast:
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcph (%rdi){1to8}, %ymm0, %ymm1
; DISABLE-NEXT:    vmovaps %ymm1, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load float, ptr %p1, align 4
  %t0 = insertelement <8 x float> undef, float %v1, i64 0
  %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
  ret <8 x float> %2
}

define <8 x float> @fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulc_maskz_ymm:
; ENABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcph %ymm1, %ymm0, %ymm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %ymm2, %ymm0
;
; DISABLE-LABEL: fcmulc_maskz_ymm:
; DISABLE-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
; DISABLE-NEXT:    vmovaps %ymm2, %ymm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
  ret <8 x float> %3
}

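; Scalar complex multiply variants, VF[C]MULCSH.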
define <4 x float> @fmulcsh(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fmulcsh:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcsh %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fmulcsh:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fmulcsh_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fmulcsh_mem:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfmulcsh (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
;
; DISABLE-LABEL: fmulcsh_mem:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfmulcsh (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fmulcsh_maskz:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fmulcsh_maskz:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
  ret <4 x float> %3
}

define <4 x float> @fcmulcsh(<4 x float> %a0, <4 x float> %a1) {
; ENABLE-LABEL: fcmulcsh:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcsh %xmm1, %xmm0, %xmm2
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fcmulcsh:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fcmulcsh_mem(<4 x float> %a0, ptr %p1) {
; ENABLE-LABEL: fcmulcsh_mem:
; ENABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT:    vfcmulcsh (%rdi), %xmm0, %xmm1
; ENABLE-NEXT:    vmovaps %xmm1, %xmm0
;
; DISABLE-LABEL: fcmulcsh_mem:
; DISABLE-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; DISABLE-NEXT:    vfcmulcsh (%rdi), %xmm0, %xmm1
; DISABLE-NEXT:    vmovaps %xmm1, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x float>, ptr %p1, align 64
  %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %2
}

define <4 x float> @fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
; ENABLE-LABEL: fcmulcsh_maskz:
; ENABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT:    #NO_APP
; ENABLE-NEXT:    kmovb (%rdi), %k1
; ENABLE-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; ENABLE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ENABLE-NEXT:    vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z}
; ENABLE-NEXT:    vmovaps %xmm2, %xmm0
;
; DISABLE-LABEL: fcmulcsh_maskz:
; DISABLE-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT:    #NO_APP
; DISABLE-NEXT:    kmovb (%rdi), %k1
; DISABLE-NEXT:    vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
; DISABLE-NEXT:    vmovaps %xmm2, %xmm0
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load i8, ptr %mask
  %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
  ret <4 x float> %3
}

declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)