1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
; Intrinsic declaration: takes two <4 x float> operands and yields one
; <8 x bfloat> value. Used by the three 128-bit cvtne2ps2bf16 tests below.
5 declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1
; Unmasked 128-bit cvtne2ps2bf16: feeds %A and %B straight into the intrinsic
; and bitcasts the <8 x bfloat> result to <2 x i64>. Both RUN configurations
; share the same assertions: a single unmasked vcvtne2ps2bf16 on xmm registers
; followed by the return.
7 define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
8 ; CHECK-LABEL: test_mm_cvtne2ps2bf16_128:
9 ; CHECK: # %bb.0: # %entry
10 ; CHECK-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1]
11 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
13 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
14 %1 = bitcast <8 x bfloat> %0 to <2 x i64>
; Zero-masked 128-bit cvtne2ps2bf16: the i8 mask %U is reinterpreted as
; <8 x i1> and used to select, per bfloat lane, between the intrinsic result
; and zero. The expected code folds that select into the instruction as
; {%k1} {z}. The i686 run first loads the mask byte from the stack; the x86_64
; run moves it from %edi.
18 define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 {
19 ; X86-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
20 ; X86: # %bb.0: # %entry
21 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
22 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
23 ; X86-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1]
24 ; X86-NEXT: retl # encoding: [0xc3]
26 ; X64-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
27 ; X64: # %bb.0: # %entry
28 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
29 ; X64-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1]
30 ; X64-NEXT: retq # encoding: [0xc3]
32 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
33 %1 = bitcast i8 %U to <8 x i1>
; Lanes whose mask bit is clear take the zeroinitializer operand.
34 %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer
35 %3 = bitcast <8 x bfloat> %2 to <2 x i64>
; Merge-masked 128-bit cvtne2ps2bf16: %C (bitcast to <8 x bfloat>) is the
; pass-through value, and the i8 mask %U picks per lane between the intrinsic
; result and %C. The expected code is a single vcvtne2ps2bf16 with {%k1} that
; writes into xmm0, the register that holds %C, with no separate blend.
39 define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
40 ; X86-LABEL: test_mm_mask_cvtne2ps2bf16_128:
41 ; X86: # %bb.0: # %entry
42 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
43 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
44 ; X86-NEXT: vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2]
45 ; X86-NEXT: retl # encoding: [0xc3]
47 ; X64-LABEL: test_mm_mask_cvtne2ps2bf16_128:
48 ; X64: # %bb.0: # %entry
49 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
50 ; X64-NEXT: vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2]
51 ; X64-NEXT: retq # encoding: [0xc3]
53 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
54 %1 = bitcast <2 x i64> %C to <8 x bfloat>
55 %2 = bitcast i8 %U to <8 x i1>
; Lanes whose mask bit is clear keep the corresponding lane of %C.
56 %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1
57 %4 = bitcast <8 x bfloat> %3 to <2 x i64>
; Intrinsic declaration: takes two <8 x float> operands and yields one
; <16 x bfloat> value. Used by the three 256-bit cvtne2ps2bf16 tests below.
61 declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3
; Unmasked 256-bit cvtne2ps2bf16: calls the intrinsic on %A and %B and
; bitcasts the <16 x bfloat> result to <4 x i64>. Both RUN configurations
; expect one unmasked vcvtne2ps2bf16 on ymm registers.
63 define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
64 ; CHECK-LABEL: test_mm256_cvtne2ps2bf16_256:
65 ; CHECK: # %bb.0: # %entry
66 ; CHECK-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x72,0xc1]
67 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
69 %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
70 %1 = bitcast <16 x bfloat> %0 to <4 x i64>
; Zero-masked 256-bit cvtne2ps2bf16: the i16 mask %U becomes <16 x i1> and
; selects per bfloat lane between the intrinsic result and zero. The expected
; code is a single vcvtne2ps2bf16 with {%k1} {z}. The i686 run loads the mask
; directly from the stack with kmovw; the x86_64 run uses kmovd from %edi.
74 define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 {
75 ; X86-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
76 ; X86: # %bb.0: # %entry
77 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
78 ; X86-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1]
79 ; X86-NEXT: retl # encoding: [0xc3]
81 ; X64-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
82 ; X64: # %bb.0: # %entry
83 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
84 ; X64-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1]
85 ; X64-NEXT: retq # encoding: [0xc3]
87 %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
88 %1 = bitcast i16 %U to <16 x i1>
; Lanes whose mask bit is clear take the zeroinitializer operand.
89 %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer
90 %3 = bitcast <16 x bfloat> %2 to <4 x i64>
; Merge-masked 256-bit cvtne2ps2bf16: %C (bitcast to <16 x bfloat>) is the
; pass-through, and the i16 mask %U selects per lane between the intrinsic
; result and %C. The expected code is one vcvtne2ps2bf16 with {%k1} writing
; into ymm0, the register that holds %C.
94 define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
95 ; X86-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
96 ; X86: # %bb.0: # %entry
97 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
98 ; X86-NEXT: vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2]
99 ; X86-NEXT: retl # encoding: [0xc3]
101 ; X64-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
102 ; X64: # %bb.0: # %entry
103 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
104 ; X64-NEXT: vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2]
105 ; X64-NEXT: retq # encoding: [0xc3]
107 %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
108 %1 = bitcast <4 x i64> %C to <16 x bfloat>
109 %2 = bitcast i16 %U to <16 x i1>
; Lanes whose mask bit is clear keep the corresponding lane of %C.
110 %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1
111 %4 = bitcast <16 x bfloat> %3 to <4 x i64>
; Intrinsic declaration: takes one <8 x float> operand and yields an
; <8 x bfloat> value. It has no mask or pass-through operand, so the masked
; tests below express masking with a separate select.
115 declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3
; Unmasked cvtneps2bf16 with a 256-bit source: calls the intrinsic on %A and
; bitcasts the <8 x bfloat> result to <2 x i64>. The expected code converts
; from a ymm source into an xmm destination and emits vzeroupper before the
; return.
117 define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 {
118 ; CHECK-LABEL: test_mm256_cvtneps2bf16_256:
119 ; CHECK: # %bb.0: # %entry
120 ; CHECK-NEXT: vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0]
121 ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
122 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
124 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
125 %1 = bitcast <8 x bfloat> %0 to <2 x i64>
; Zero-masked cvtneps2bf16 with a 256-bit source: the i8 mask %U becomes
; <8 x i1> and selects per bfloat lane between the intrinsic result and zero.
; The expected code folds the select into vcvtneps2bf16 as {%k1} {z}, followed
; by vzeroupper.
129 define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
130 ; X86-LABEL: test_mm256_maskz_cvtneps2bf16_256:
131 ; X86: # %bb.0: # %entry
132 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
133 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
134 ; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0]
135 ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
136 ; X86-NEXT: retl # encoding: [0xc3]
138 ; X64-LABEL: test_mm256_maskz_cvtneps2bf16_256:
139 ; X64: # %bb.0: # %entry
140 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
141 ; X64-NEXT: vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0]
142 ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
143 ; X64-NEXT: retq # encoding: [0xc3]
145 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
146 %1 = bitcast i8 %U to <8 x i1>
; Lanes whose mask bit is clear take the zeroinitializer operand.
147 %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer
148 %3 = bitcast <8 x bfloat> %2 to <2 x i64>
; Merge-masked cvtneps2bf16 with a 256-bit source: %C (bitcast to
; <8 x bfloat>) is the pass-through, and the i8 mask %U selects per lane
; between the intrinsic result and %C. The expected code is one vcvtneps2bf16
; with {%k1} reading ymm1 and writing xmm0 (which holds %C), then vzeroupper.
152 define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 {
153 ; X86-LABEL: test_mm256_mask_cvtneps2bf16_256:
154 ; X86: # %bb.0: # %entry
155 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
156 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
157 ; X86-NEXT: vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1]
158 ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
159 ; X86-NEXT: retl # encoding: [0xc3]
161 ; X64-LABEL: test_mm256_mask_cvtneps2bf16_256:
162 ; X64: # %bb.0: # %entry
163 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
164 ; X64-NEXT: vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1]
165 ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
166 ; X64-NEXT: retq # encoding: [0xc3]
168 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
169 %1 = bitcast <2 x i64> %C to <8 x bfloat>
170 %2 = bitcast i8 %U to <8 x i1>
; Lanes whose mask bit is clear keep the corresponding lane of %C.
171 %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1
172 %4 = bitcast <8 x bfloat> %3 to <2 x i64>
; Intrinsic declaration for the 128-bit form. Unlike the 256-bit declaration
; it carries its own operands for masking: a <4 x float> source, an
; <8 x bfloat> pass-through, and a <4 x i1> mask.
176 declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3
; Unmasked 128-bit cvtneps2bf16: calls the masked intrinsic with an undef
; pass-through and an all-true <4 x i1> mask, then bitcasts the result to
; <2 x i64>. Both RUN configurations expect a plain vcvtneps2bf16 with no mask
; register.
178 define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 {
179 ; CHECK-LABEL: test_mm128_cvtneps2bf16_128:
180 ; CHECK: # %bb.0: # %entry
181 ; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
182 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
184 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
185 %1 = bitcast <8 x bfloat> %0 to <2 x i64>
; Zero-masked 128-bit cvtneps2bf16: the i8 mask %U is bitcast to <8 x i1>,
; its low four elements are extracted as the <4 x i1> mask operand, and
; zeroinitializer is passed as the pass-through. The expected code is a single
; vcvtneps2bf16 with {%k1} {z}.
189 define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
190 ; X86-LABEL: test_mm128_maskz_cvtneps2bf16_128:
191 ; X86: # %bb.0: # %entry
192 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
193 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
194 ; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
195 ; X86-NEXT: retl # encoding: [0xc3]
197 ; X64-LABEL: test_mm128_maskz_cvtneps2bf16_128:
198 ; X64: # %bb.0: # %entry
199 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
200 ; X64-NEXT: vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
201 ; X64-NEXT: retq # encoding: [0xc3]
203 %0 = bitcast i8 %U to <8 x i1>
; Keep only elements 0..3 of the mask; the intrinsic takes a <4 x i1>.
204 %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
205 %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4
206 %3 = bitcast <8 x bfloat> %2 to <2 x i64>
; Merge-masked 128-bit cvtneps2bf16: the low four elements of the i8 mask %U
; form the <4 x i1> mask operand, and %C (bitcast to <8 x bfloat>) is passed
; as the intrinsic's pass-through. The expected code is a single vcvtneps2bf16
; with {%k1} reading xmm1 and writing xmm0, the register that holds %C.
210 define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
211 ; X86-LABEL: test_mm128_mask_cvtneps2bf16_128:
212 ; X86: # %bb.0: # %entry
213 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
214 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
215 ; X86-NEXT: vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
216 ; X86-NEXT: retl # encoding: [0xc3]
218 ; X64-LABEL: test_mm128_mask_cvtneps2bf16_128:
219 ; X64: # %bb.0: # %entry
220 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
221 ; X64-NEXT: vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
222 ; X64-NEXT: retq # encoding: [0xc3]
224 %0 = bitcast i8 %U to <8 x i1>
; Keep only elements 0..3 of the mask; the intrinsic takes a <4 x i1>.
225 %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
226 %2 = bitcast <2 x i64> %C to <8 x bfloat>
227 %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4
228 %4 = bitcast <8 x bfloat> %3 to <2 x i64>
232 ; Make sure we don't fold a select into the 128 bit form of cvtneps2bf16. It
233 ; always writes zeros to bits 127:64 regardless of mask.
; Here the intrinsic is called unmasked (all-true <4 x i1> mask, undef
; pass-through) and the masking is a separate 8-lane select between its result
; and %C. The expected code keeps the two steps apart: an unmasked
; vcvtneps2bf16 into xmm1, then a masked vmovdqu16 that blends xmm1 into xmm0
; under %k1.
234 define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
235 ; X86-LABEL: test_mm128_cvtneps2bf16_128_select:
236 ; X86: # %bb.0: # %entry
237 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
238 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
239 ; X86-NEXT: vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
240 ; X86-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
241 ; X86-NEXT: retl # encoding: [0xc3]
243 ; X64-LABEL: test_mm128_cvtneps2bf16_128_select:
244 ; X64: # %bb.0: # %entry
245 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
246 ; X64-NEXT: vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
247 ; X64-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
248 ; X64-NEXT: retq # encoding: [0xc3]
250 %0 = bitcast i8 %U to <8 x i1>
251 %1 = bitcast <2 x i64> %C to <8 x bfloat>
252 %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
; All eight mask bits take part in this select, not just the low four.
253 %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1
254 %4 = bitcast <8 x bfloat> %3 to <2 x i64>
; Intrinsic declaration: takes an <8 x float> operand plus two <16 x bfloat>
; operands and yields an <8 x float>. Used by the three 256-bit dpbf16ps tests
; below.
258 declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3
; Unmasked 256-bit dpbf16ps: passes %E, %A and %B straight to the intrinsic.
; Both RUN configurations expect one unmasked vdpbf16ps on ymm registers with
; ymm0 (holding %E) as the destination.
260 define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 {
261 ; CHECK-LABEL: test_mm256_dpbf16ps_256:
262 ; CHECK: # %bb.0: # %entry
263 ; CHECK-NEXT: vdpbf16ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x52,0xc2]
264 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
266 %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
; Zero-masked 256-bit dpbf16ps: the i8 mask %U becomes <8 x i1> and selects
; per float lane between the intrinsic result and zero. The expected code
; folds the select into vdpbf16ps as {%k1} {z}.
270 define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 {
271 ; X86-LABEL: test_mm256_maskz_dpbf16ps_256:
272 ; X86: # %bb.0: # %entry
273 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
274 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
275 ; X86-NEXT: vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
276 ; X86-NEXT: retl # encoding: [0xc3]
278 ; X64-LABEL: test_mm256_maskz_dpbf16ps_256:
279 ; X64: # %bb.0: # %entry
280 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
281 ; X64-NEXT: vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
282 ; X64-NEXT: retq # encoding: [0xc3]
284 %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
285 %1 = bitcast i8 %U to <8 x i1>
; Lanes whose mask bit is clear take the zeroinitializer operand.
286 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
; Merge-masked 256-bit dpbf16ps: the i8 mask %U selects per float lane between
; the intrinsic result and the original %E. The expected code is one vdpbf16ps
; with {%k1} whose destination ymm0 also holds %E, so no separate blend is
; emitted.
289 define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 {
290 ; X86-LABEL: test_mm256_mask_dpbf16ps_256:
291 ; X86: # %bb.0: # %entry
292 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
293 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
294 ; X86-NEXT: vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
295 ; X86-NEXT: retl # encoding: [0xc3]
297 ; X64-LABEL: test_mm256_mask_dpbf16ps_256:
298 ; X64: # %bb.0: # %entry
299 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
300 ; X64-NEXT: vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
301 ; X64-NEXT: retq # encoding: [0xc3]
303 %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4
304 %1 = bitcast i8 %U to <8 x i1>
; Lanes whose mask bit is clear keep the corresponding lane of %E.
305 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E
; Intrinsic declaration: takes a <4 x float> operand plus two <8 x bfloat>
; operands and yields a <4 x float>. Used by the three 128-bit dpbf16ps tests
; below.
309 declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3
; Unmasked 128-bit dpbf16ps: passes %E, %A and %B straight to the intrinsic.
; Both RUN configurations expect one unmasked vdpbf16ps on xmm registers with
; xmm0 (holding %E) as the destination.
311 define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 {
312 ; CHECK-LABEL: test_mm128_dpbf16ps_128:
313 ; CHECK: # %bb.0: # %entry
314 ; CHECK-NEXT: vdpbf16ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x52,0xc2]
315 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
317 %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
; Zero-masked 128-bit dpbf16ps: the mask is an i4 here (one bit per float
; lane), bitcast to <4 x i1>, and selects between the intrinsic result and
; zero. The expected code folds the select into vdpbf16ps as {%k1} {z}.
321 define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 {
322 ; X86-LABEL: test_mm128_maskz_dpbf16ps_128:
323 ; X86: # %bb.0: # %entry
324 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
325 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
326 ; X86-NEXT: vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
327 ; X86-NEXT: retl # encoding: [0xc3]
329 ; X64-LABEL: test_mm128_maskz_dpbf16ps_128:
330 ; X64: # %bb.0: # %entry
331 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
332 ; X64-NEXT: vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
333 ; X64-NEXT: retq # encoding: [0xc3]
335 %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
336 %1 = bitcast i4 %U to <4 x i1>
; Lanes whose mask bit is clear take the zeroinitializer operand.
337 %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer
; Merge-masked 128-bit dpbf16ps: the i4 mask %U (bitcast to <4 x i1>) selects
; per float lane between the intrinsic result and the original %E. The
; expected code is one vdpbf16ps with {%k1} whose destination xmm0 also holds
; %E.
340 define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 {
341 ; X86-LABEL: test_mm128_mask_dpbf16ps_128:
342 ; X86: # %bb.0: # %entry
343 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
344 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
345 ; X86-NEXT: vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
346 ; X86-NEXT: retl # encoding: [0xc3]
348 ; X64-LABEL: test_mm128_mask_dpbf16ps_128:
349 ; X64: # %bb.0: # %entry
350 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
351 ; X64-NEXT: vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
352 ; X64-NEXT: retq # encoding: [0xc3]
354 %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4
355 %1 = bitcast i4 %U to <4 x i1>
; Lanes whose mask bit is clear keep the corresponding lane of %E.
356 %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E
; Converts a poison <4 x float> with the 128-bit masked intrinsic (all-true
; mask, zero pass-through), reinterprets the result as <8 x i16>, and splats
; element 0 across 16 lanes. Both RUN configurations expect the conversion to
; stay a separate vcvtneps2bf16, followed by a register-to-register
; vpbroadcastw.
; NOTE(review): the function name suggests the point is that the broadcast
; must not be folded into the conversion - confirm against the original commit.
360 define <16 x i16> @test_no_vbroadcast1() {
361 ; CHECK-LABEL: test_no_vbroadcast1:
362 ; CHECK: # %bb.0: # %entry
363 ; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
364 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0]
365 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
367 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
368 %1 = bitcast <8 x bfloat> %0 to <8 x i16>
; An all-zero shuffle mask replicates element 0 into every result lane.
369 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> zeroinitializer
373 ;; FIXME: This should generate the same output as above, but let's fix the crash first.
; Same conversion as test_no_vbroadcast1, but the splat is performed directly
; on the <8 x bfloat> value (no bitcast to i16) and the function is declared
; with a <16 x bfloat> return type. The assertions record the current output
; on both targets: a 32-byte-aligned stack frame is set up, the converted xmm0
; is stored to the stack with vmovaps, and vpbroadcastw reloads it from
; memory.
374 define <16 x bfloat> @test_no_vbroadcast2() nounwind {
375 ; X86-LABEL: test_no_vbroadcast2:
376 ; X86: # %bb.0: # %entry
377 ; X86-NEXT: pushl %ebp # encoding: [0x55]
378 ; X86-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
379 ; X86-NEXT: andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
380 ; X86-NEXT: subl $64, %esp # encoding: [0x83,0xec,0x40]
381 ; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
382 ; X86-NEXT: vmovaps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
383 ; X86-NEXT: vpbroadcastw (%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
384 ; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec]
385 ; X86-NEXT: popl %ebp # encoding: [0x5d]
386 ; X86-NEXT: retl # encoding: [0xc3]
388 ; X64-LABEL: test_no_vbroadcast2:
389 ; X64: # %bb.0: # %entry
390 ; X64-NEXT: pushq %rbp # encoding: [0x55]
391 ; X64-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
392 ; X64-NEXT: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0]
393 ; X64-NEXT: subq $64, %rsp # encoding: [0x48,0x83,0xec,0x40]
394 ; X64-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
395 ; X64-NEXT: vmovaps %xmm0, (%rsp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x04,0x24]
396 ; X64-NEXT: vpbroadcastw (%rsp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x04,0x24]
397 ; X64-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
398 ; X64-NEXT: popq %rbp # encoding: [0x5d]
399 ; X64-NEXT: retq # encoding: [0xc3]
401 %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
; An all-zero shuffle mask replicates element 0 into every result lane.
402 %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer