1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16,+avx512vl < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
12 define <32 x i16> @stack_fold_cvtne2ps2bf16(<16 x float> %a0, <16 x float> %a1) {
13 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16:
15 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
21 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
22 %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
25 declare <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>)
27 define <32 x i16> @stack_fold_cvtne2ps2bf16_mask(<16 x float> %a0, <16 x float> %a1, <32 x i16>* %passthru, i32 %U) {
28 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask:
30 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
34 ; CHECK-NEXT: kmovd %esi, %k1
35 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
36 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
37 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
39 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
40 %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
41 %3 = bitcast i32 %U to <32 x i1>
42 ; load needed to keep the operation from being scheduled above the asm block
43 %4 = load <32 x i16>, <32 x i16>* %passthru
44 %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
48 define <32 x i16> @stack_fold_cvtne2ps2bf16_maskz(<16 x float> %a0, <16 x float> %a1, i32 %U) {
49 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz:
51 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
55 ; CHECK-NEXT: kmovd %edi, %k1
56 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
58 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
59 %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
60 %3 = bitcast i32 %U to <32 x i1>
61 %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
65 define <16 x i16> @stack_fold_cvtneps2bf16(<16 x float> %a0) {
66 ; CHECK-LABEL: stack_fold_cvtneps2bf16:
68 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
72 ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload
74 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
75 %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
78 declare <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>)
80 define <16 x i16> @stack_fold_cvtneps2bf16_mask(<16 x float> %a0, <16 x i16>* %passthru, i16 %U) {
81 ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask:
83 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
87 ; CHECK-NEXT: kmovd %esi, %k1
88 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
89 ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 64-byte Folded Reload
90 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
92 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
93 %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
94 %3 = bitcast i16 %U to <16 x i1>
95 ; load needed to keep the operation from being scheduled above the asm block
96 %4 = load <16 x i16>, <16 x i16>* %passthru
97 %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
101 define <16 x i16> @stack_fold_cvtneps2bf16_maskz(<16 x float> %a0, i16 %U) {
102 ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz:
104 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
107 ; CHECK-NEXT: #NO_APP
108 ; CHECK-NEXT: kmovd %edi, %k1
109 ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 64-byte Folded Reload
111 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
112 %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
113 %3 = bitcast i16 %U to <16 x i1>
114 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
118 define <16 x float> @stack_fold_vdpbf16ps(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2) {
119 ; CHECK-LABEL: stack_fold_vdpbf16ps:
121 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
124 ; CHECK-NEXT: #NO_APP
125 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
127 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
128 %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
131 declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <16 x i32>, <16 x i32>)
133 define <16 x float> @stack_fold_vdpbf16ps_mask(<16 x float>* %a0, <16 x i32> %a1, <16 x i32> %a2, <16 x float>* %passthru, i16 %U) {
134 ; CHECK-LABEL: stack_fold_vdpbf16ps_mask:
136 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
139 ; CHECK-NEXT: #NO_APP
140 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
141 ; CHECK-NEXT: kmovd %edx, %k1
142 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
143 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
145 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
146 ; load needed to keep the operation from being scheduled above the asm block
147 %2 = load <16 x float>, <16 x float>* %a0
148 %3 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %2, <16 x i32> %a1, <16 x i32> %a2)
149 %4 = bitcast i16 %U to <16 x i1>
150 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %2
154 define <16 x float> @stack_fold_vdpbf16ps_maskz(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %U) {
155 ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz:
157 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
160 ; CHECK-NEXT: #NO_APP
161 ; CHECK-NEXT: kmovw (%rdi), %k1
162 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
164 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
165 %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
166 %3 = load i16, i16* %U
167 %4 = bitcast i16 %3 to <16 x i1>
168 %5 = select <16 x i1> %4, <16 x float> %2, <16 x float> zeroinitializer
174 define <16 x i16> @stack_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) {
175 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_ymm:
177 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
180 ; CHECK-NEXT: #NO_APP
181 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
183 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
184 %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
187 declare <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>)
189 define <16 x i16> @stack_fold_cvtne2ps2bf16_mask_ymm(<8 x float> %a0, <8 x float> %a1, <16 x i16>* %passthru, i16 %U) {
190 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_ymm:
192 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
195 ; CHECK-NEXT: #NO_APP
196 ; CHECK-NEXT: kmovd %esi, %k1
197 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
198 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
199 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
201 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
202 %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
203 %3 = bitcast i16 %U to <16 x i1>
204 ; load needed to keep the operation from being scheduled above the asm block
205 %4 = load <16 x i16>, <16 x i16>* %passthru
206 %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
210 define <16 x i16> @stack_fold_cvtne2ps2bf16_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i16 %U) {
211 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_ymm:
213 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
216 ; CHECK-NEXT: #NO_APP
217 ; CHECK-NEXT: kmovd %edi, %k1
218 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
220 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
221 %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
222 %3 = bitcast i16 %U to <16 x i1>
223 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
227 define <8 x i16> @stack_fold_cvtneps2bf16_ymm(<8 x float> %a0) {
228 ; CHECK-LABEL: stack_fold_cvtneps2bf16_ymm:
230 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
233 ; CHECK-NEXT: #NO_APP
234 ; CHECK-NEXT: vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
235 ; CHECK-NEXT: vzeroupper
237 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
238 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
241 declare <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>)
243 define <8 x i16> @stack_fold_cvtneps2bf16_mask_ymm(<8 x float> %a0, <8 x i16>* %passthru, i8 %U) {
244 ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_ymm:
246 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
249 ; CHECK-NEXT: #NO_APP
250 ; CHECK-NEXT: kmovd %esi, %k1
251 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
252 ; CHECK-NEXT: vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 32-byte Folded Reload
253 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
254 ; CHECK-NEXT: vzeroupper
256 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
257 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
258 %3 = bitcast i8 %U to <8 x i1>
259 ; load needed to keep the operation from being scheduled above the asm block
260 %4 = load <8 x i16>, <8 x i16>* %passthru
261 %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
265 define <8 x i16> @stack_fold_cvtneps2bf16_maskz_ymm(<8 x float> %a0, i8 %U) {
266 ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_ymm:
268 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
271 ; CHECK-NEXT: #NO_APP
272 ; CHECK-NEXT: kmovd %edi, %k1
273 ; CHECK-NEXT: vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 32-byte Folded Reload
274 ; CHECK-NEXT: vzeroupper
276 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
277 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
278 %3 = bitcast i8 %U to <8 x i1>
279 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
283 define <8 x float> @stack_fold_vdpbf16ps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2) {
284 ; CHECK-LABEL: stack_fold_vdpbf16ps_ymm:
286 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
289 ; CHECK-NEXT: #NO_APP
290 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
292 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
293 %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
296 declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <8 x i32>, <8 x i32>)
298 define <8 x float> @stack_fold_vdpbf16ps_mask_ymm(<8 x float>* %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x float>* %passthru, i8 %U) {
299 ; CHECK-LABEL: stack_fold_vdpbf16ps_mask_ymm:
301 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
304 ; CHECK-NEXT: #NO_APP
305 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
306 ; CHECK-NEXT: kmovd %edx, %k1
307 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
308 ; CHECK-NEXT: vmovaps %ymm2, %ymm0
310 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
311 ; load needed to keep the operation from being scheduled above the asm block
312 %2 = load <8 x float>, <8 x float>* %a0
313 %3 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %2, <8 x i32> %a1, <8 x i32> %a2)
314 %4 = bitcast i8 %U to <8 x i1>
315 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %2
319 define <8 x float> @stack_fold_vdpbf16ps_maskz_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2, i8* %U) {
320 ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_ymm:
322 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
325 ; CHECK-NEXT: #NO_APP
326 ; CHECK-NEXT: movzbl (%rdi), %eax
327 ; CHECK-NEXT: kmovd %eax, %k1
328 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
330 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
331 %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
333 %4 = bitcast i8 %3 to <8 x i1>
334 %5 = select <8 x i1> %4, <8 x float> %2, <8 x float> zeroinitializer
341 define <8 x i16> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) {
342 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm:
344 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
347 ; CHECK-NEXT: #NO_APP
348 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
350 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
351 %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
354 declare <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>)
356 define <8 x i16> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, <8 x i16>* %passthru, i8 %U) {
357 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_xmm:
359 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
362 ; CHECK-NEXT: #NO_APP
363 ; CHECK-NEXT: kmovd %esi, %k1
364 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
365 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
366 ; CHECK-NEXT: vmovdqa %xmm2, %xmm0
368 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
369 %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
370 %3 = bitcast i8 %U to <8 x i1>
371 ; load needed to keep the operation from being scheduled above the asm block
372 %4 = load <8 x i16>, <8 x i16>* %passthru
373 %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
377 define <8 x i16> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) {
378 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_xmm:
380 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
383 ; CHECK-NEXT: #NO_APP
384 ; CHECK-NEXT: kmovd %edi, %k1
385 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
387 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
388 %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
389 %3 = bitcast i8 %U to <8 x i1>
390 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
394 define <8 x i16> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) {
395 ; CHECK-LABEL: stack_fold_cvtneps2bf16_xmm:
397 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
400 ; CHECK-NEXT: #NO_APP
401 ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
403 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
404 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
407 declare <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x i16>, <4 x i1>)
409 define <8 x i16> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, <8 x i16>* %passthru, i8 %U) {
410 ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_xmm:
412 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
415 ; CHECK-NEXT: #NO_APP
416 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
417 ; CHECK-NEXT: kmovd %esi, %k1
418 ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
419 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
421 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
422 %2 = load <8 x i16>, <8 x i16>* %passthru
423 %3 = bitcast i8 %U to <8 x i1>
424 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
425 %5 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> %2, <4 x i1> %4)
429 define <8 x i16> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) {
430 ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_xmm:
432 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
435 ; CHECK-NEXT: #NO_APP
436 ; CHECK-NEXT: kmovd %edi, %k1
437 ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
439 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
440 %2 = bitcast i8 %U to <8 x i1>
441 %3 = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
442 %4 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> zeroinitializer, <4 x i1> %3)
446 define <4 x float> @stack_fold_vdpbf16ps_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2) {
447 ; CHECK-LABEL: stack_fold_vdpbf16ps_xmm:
449 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
452 ; CHECK-NEXT: #NO_APP
453 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
455 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
456 %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
459 declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <4 x i32>, <4 x i32>)
461 define <4 x float> @stack_fold_vdpbf16ps_mask_xmm(<4 x float>* %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x float>* %passthru, i8 %U) {
462 ; CHECK-LABEL: stack_fold_vdpbf16ps_mask_xmm:
464 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
467 ; CHECK-NEXT: #NO_APP
468 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
469 ; CHECK-NEXT: kmovd %edx, %k1
470 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
471 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
473 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
474 ; load needed to keep the operation from being scheduled above the asm block
475 %2 = load <4 x float>, <4 x float>* %a0
476 %3 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %2, <4 x i32> %a1, <4 x i32> %a2)
477 %4 = bitcast i8 %U to <8 x i1>
478 %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
479 %6 = select <4 x i1> %5, <4 x float> %3, <4 x float> %2
483 define <4 x float> @stack_fold_vdpbf16ps_maskz_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2, i8* %U) {
484 ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_xmm:
486 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
489 ; CHECK-NEXT: #NO_APP
490 ; CHECK-NEXT: movzbl (%rdi), %eax
491 ; CHECK-NEXT: kmovd %eax, %k1
492 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
494 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
495 %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
497 %4 = bitcast i8 %3 to <8 x i1>
498 %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
499 %6 = select <4 x i1> %5, <4 x float> %2, <4 x float> zeroinitializer