; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC
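; Scalar bfloat add through memory: each operand is extended to f32 by a
; 16-bit left shift of its raw bits, added as f32, then rounded back to
; bfloat (vcvtneps2bf16 when available, or the __truncsfbf2 libcall on
; plain SSE2).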
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzwl (%edx), %edx
; X86-NEXT: shll $16, %edx
; X86-NEXT: vmovd %edx, %xmm0
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: shll $16, %ecx
; X86-NEXT: vmovd %ecx, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rdx, %rbx
; SSE2-NEXT: movzwl (%rsi), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rbx)
; SSE2-NEXT: popq %rbx
; F16-NEXT: movzwl (%rsi), %eax
; F16-NEXT: shll $16, %eax
; F16-NEXT: vmovd %eax, %xmm0
; F16-NEXT: movzwl (%rdi), %eax
; F16-NEXT: shll $16, %eax
; F16-NEXT: vmovd %eax, %xmm1
; F16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT: vpextrw $0, %xmm0, (%rdx)
; AVXNC-NEXT: movzwl (%rsi), %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: movzwl (%rdi), %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vpextrw $0, %xmm0, (%rdx)
  %a = load bfloat, ptr %pa
  %b = load bfloat, ptr %pb
  %add = fadd bfloat %a, %b
  store bfloat %add, ptr %pc
  ret void
}
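; Same scalar add, but with the operands and result passed in XMM registers
; rather than through memory.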
define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: vmovw %eax, %xmm0
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: pextrw $0, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: popq %rax
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %xmm1, %ecx
; FP16-NEXT: shll $16, %ecx
; FP16-NEXT: vmovd %ecx, %xmm0
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %eax, %xmm0
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: vpextrw $0, %xmm1, %ecx
; AVXNC-NEXT: shll $16, %ecx
; AVXNC-NEXT: vmovd %ecx, %xmm0
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
  %add = fadd bfloat %a, %b
  ret bfloat %add
}
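; double -> bfloat -> add -> double round trip through memory; the
; double-to-bfloat truncations always go through the __truncdfbf2 libcall.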
define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add_double:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %edi
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: shll $16, %edi
; X86-NEXT: vmovd %edi, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esi)
; X86-NEXT: addl $16, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; SSE2-LABEL: add_double:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rdx, %rbx
; SSE2-NEXT: movq %rsi, %r14
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movd %ebp, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
; SSE2-NEXT: movsd %xmm0, (%rbx)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; FP16-LABEL: add_double:
; FP16-NEXT: pushq %rbp
; FP16-NEXT: pushq %r14
; FP16-NEXT: pushq %rbx
; FP16-NEXT: movq %rdx, %rbx
; FP16-NEXT: movq %rsi, %r14
; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %ebp
; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: shll $16, %ebp
; FP16-NEXT: vmovd %ebp, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT: vmovsd %xmm0, (%rbx)
; FP16-NEXT: popq %rbx
; FP16-NEXT: popq %r14
; FP16-NEXT: popq %rbp
; AVXNC-LABEL: add_double:
; AVXNC-NEXT: pushq %rbp
; AVXNC-NEXT: pushq %r14
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: movq %rdx, %rbx
; AVXNC-NEXT: movq %rsi, %r14
; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: shll $16, %ebp
; AVXNC-NEXT: vmovd %ebp, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT: vmovsd %xmm0, (%rbx)
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: popq %r14
; AVXNC-NEXT: popq %rbp
  %la = load double, ptr %pa
  %a = fptrunc double %la to bfloat
  %lb = load double, ptr %pb
  %b = fptrunc double %lb to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  store double %dadd, ptr %pc
  ret void
}
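; Register variant of the double round trip above.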
define double @add_double2(double %da, double %db) nounwind {
; X86-LABEL: add_double2:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $24, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %esi
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: shll $16, %esi
; X86-NEXT: vmovd %esi, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: fldl {{[0-9]+}}(%esp)
; X86-NEXT: addl $24, %esp
; X86-NEXT: popl %esi
; SSE2-LABEL: add_double2:
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $16, %rsp
; SSE2-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
; SSE2-NEXT: addq $16, %rsp
; SSE2-NEXT: popq %rbx
; FP16-LABEL: add_double2:
; FP16-NEXT: pushq %rbx
; FP16-NEXT: subq $16, %rsp
; FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %ebx
; FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; FP16-NEXT: # xmm0 = mem[0],zero
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: shll $16, %ebx
; FP16-NEXT: vmovd %ebx, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT: addq $16, %rsp
; FP16-NEXT: popq %rbx
; AVXNC-LABEL: add_double2:
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: subq $16, %rsp
; AVXNC-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[0],zero
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: shll $16, %ebx
; AVXNC-NEXT: vmovd %ebx, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT: addq $16, %rsp
; AVXNC-NEXT: popq %rbx
  %a = fptrunc double %da to bfloat
  %b = fptrunc double %db to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  ret double %dadd
}
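; Adding a constant: the f32 value 1.0 is folded into a constant-pool
; memory operand of the vaddss/addss.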
define void @add_constant(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: add_constant:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: shll $16, %ecx
; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; SSE2-LABEL: add_constant:
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rsi, %rbx
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rbx)
; SSE2-NEXT: popq %rbx
; F16-LABEL: add_constant:
; F16-NEXT: movzwl (%rdi), %eax
; F16-NEXT: shll $16, %eax
; F16-NEXT: vmovd %eax, %xmm0
; F16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVXNC-LABEL: add_constant:
; AVXNC-NEXT: movzwl (%rdi), %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vpextrw $0, %xmm0, (%rsi)
  %a = load bfloat, ptr %pa
  %add = fadd bfloat %a, 1.0
  store bfloat %add, ptr %pc
  ret void
}
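; Register variant of the constant add.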
define bfloat @add_constant2(bfloat %a) nounwind {
; X86-LABEL: add_constant2:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: vmovw %eax, %xmm0
; SSE2-LABEL: add_constant2:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: popq %rax
; FP16-LABEL: add_constant2:
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %eax, %xmm0
; AVXNC-LABEL: add_constant2:
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
  %add = fadd bfloat %a, 1.0
  ret bfloat %add
}
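; Storing a bfloat constant lowers to a plain 16-bit immediate store
; (0x3F80 is 1.0 in bfloat).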
define void @store_constant(ptr %pc) nounwind {
; X86-LABEL: store_constant:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movw $16256, (%eax) # imm = 0x3F80
; CHECK-LABEL: store_constant:
; CHECK-NEXT: movw $16256, (%rdi) # imm = 0x3F80
  store bfloat 1.0, ptr %pc
  ret void
}
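; An fpext immediately followed by an fptrunc of the same value folds away;
; only the 16-bit payload is copied.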
define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: fold_ext_trunc:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movw %cx, (%eax)
; CHECK-LABEL: fold_ext_trunc:
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: movw %ax, (%rsi)
  %a = load bfloat, ptr %pa
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  store bfloat %trunc, ptr %pc
  ret void
}
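; Same fold with a register argument; no conversion code should remain.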
define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
; X86-LABEL: fold_ext_trunc2:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; CHECK-LABEL: fold_ext_trunc2:
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  ret bfloat %trunc
}
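; Vector bfloat add: with AVX512BF16/AVXNECONVERT the vector is widened to
; <8 x float> (zero-extend plus vpslld $16), added, and narrowed again with
; vcvtneps2bf16; plain SSE2 scalarizes into eight __truncsfbf2 libcalls.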
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT: vpslld $16, %ymm1, %ymm1
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $56, %rsp
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %xmm1, %rdx
; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $48, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $32, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: movq %xmm0, %r15
; SSE2-NEXT: movq %r15, %rbx
; SSE2-NEXT: shrq $48, %rbx
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT: movq %xmm1, %r14
; SSE2-NEXT: movq %r14, %rbp
; SSE2-NEXT: shrq $48, %rbp
; SSE2-NEXT: movq %r15, %r12
; SSE2-NEXT: shrq $32, %r12
; SSE2-NEXT: movq %r14, %r13
; SSE2-NEXT: shrq $32, %r13
; SSE2-NEXT: movl %r14d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movl %r15d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: andl $-65536, %r14d # imm = 0xFFFF0000
; SSE2-NEXT: movd %r14d, %xmm1
; SSE2-NEXT: andl $-65536, %r15d # imm = 0xFFFF0000
; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %r15d
; SSE2-NEXT: shll $16, %r15d
; SSE2-NEXT: addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; SSE2-NEXT: shll $16, %r13d
; SSE2-NEXT: movd %r13d, %xmm1
; SSE2-NEXT: shll $16, %r12d
; SSE2-NEXT: movd %r12d, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movd %ebp, %xmm1
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: orl %r14d, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r15, %rbx
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; SSE2-NEXT: movl %r15d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT: movl %r14d, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebp
; SSE2-NEXT: movq %r15, %rax
; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq %r14, %rax
; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %r14d
; SSE2-NEXT: shll $16, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebp
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movq %rbx, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $56, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %r15
; SSE2-NEXT: popq %rbp
; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; F16-NEXT: vpslld $16, %ymm1, %ymm1
; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT: vpslld $16, %ymm0, %ymm0
; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXNC-NEXT: vpslld $16, %ymm1, %ymm1
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm0
; AVXNC-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
  %add = fadd <8 x bfloat> %a, %b
  ret <8 x bfloat> %add
}
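; pr62997: building a <2 x bfloat> from two scalar inserts should be a
; simple word interleave, not a round trip through float.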
define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
; X86-LABEL: pr62997:
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-LABEL: pr62997:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BF16-LABEL: pr62997:
; BF16-NEXT: vpextrw $0, %xmm0, %eax
; BF16-NEXT: vpextrw $0, %xmm1, %ecx
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
; FP16-LABEL: pr62997:
; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
  %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
  ret <2 x bfloat> %2
}
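; pr63017: returning a zero <32 x bfloat> should just zero the result
; registers.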
define <32 x bfloat> @pr63017() {
; X86-LABEL: pr63017:
; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
; SSE2-LABEL: pr63017:
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: xorps %xmm3, %xmm3
; F16-LABEL: pr63017:
; F16-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVXNC-LABEL: pr63017:
; AVXNC-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVXNC-NEXT: vxorps %xmm1, %xmm1, %xmm1
  ret <32 x bfloat> zeroinitializer
}
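; pr63017_2: masked load of <32 x bfloat> with a -1.0 passthrough value.
; AVX512BF16 uses a masked vmovdqu16; SSE2 expands into conditional loads
; plus a long chain of __truncsfbf2 libcalls.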
define <32 x bfloat> @pr63017_2() nounwind {
; X86-LABEL: pr63017_2:
; X86-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; SSE2-LABEL: pr63017_2:
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: testb %al, %al
; SSE2-NEXT: jne .LBB12_1
; SSE2-NEXT: # %bb.2: # %cond.load
; SSE2-NEXT: movzwl (%rax), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: jmp .LBB12_3
; SSE2-NEXT: .LBB12_1:
; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT: .LBB12_3:
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $88, %rsp
; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebx, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: addq $88, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; FP16-LABEL: pr63017_2:
; FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
; AVXNC-LABEL: pr63017_2:
; AVXNC-NEXT: vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT: xorl %eax, %eax
; AVXNC-NEXT: testb %al, %al
; AVXNC-NEXT: jne .LBB12_2
; AVXNC-NEXT: # %bb.1: # %cond.load
; AVXNC-NEXT: vmovups (%rax), %ymm0
; AVXNC-NEXT: .LBB12_2:
; AVXNC-NEXT: vmovaps %ymm0, %ymm1
  %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
  ret <32 x bfloat> %1
}
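; pr62997_3: inserting a scalar bfloat into element 1 of a <32 x bfloat>
; vector.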
define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; X86-LABEL: pr62997_3:
; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; SSE2-LABEL: pr62997_3:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SSE2-NEXT: andq %rax, %rcx
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: pextrw $0, %xmm4, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; FP16-LABEL: pr62997_3:
; FP16-NEXT: vmovw %xmm1, %eax
; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
; FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVXNC-LABEL: pr62997_3:
; AVXNC-NEXT: vpextrw $0, %xmm2, %eax
; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2
; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
  %3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
  ret <32 x bfloat> %3
}

declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
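; pr64460: fpext of bfloat vectors should lower to bit manipulation
; (interleave with zero / 16-bit shift) rather than scalar conversions.
; First case: <4 x bfloat> -> <4 x float>.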
define <4 x float> @pr64460_1(<4 x bfloat> %a) {
; X86-LABEL: pr64460_1:
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-LABEL: pr64460_1:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; AVX-LABEL: pr64460_1:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
  %b = fpext <4 x bfloat> %a to <4 x float>
  ret <4 x float> %b
}
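; <8 x bfloat> -> <8 x float> extension.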
define <8 x float> @pr64460_2(<8 x bfloat> %a) {
; X86-LABEL: pr64460_2:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; SSE2-LABEL: pr64460_2:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; AVX-LABEL: pr64460_2:
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $16, %ymm0, %ymm0
  %b = fpext <8 x bfloat> %a to <8 x float>
  ret <8 x float> %b
}
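; <16 x bfloat> -> <16 x float> extension.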
define <16 x float> @pr64460_3(<16 x bfloat> %a) {
; X86-LABEL: pr64460_3:
; X86-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X86-NEXT: vpslld $16, %zmm0, %zmm0
; SSE2-LABEL: pr64460_3:
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm4, %xmm1
; F16-LABEL: pr64460_3:
; F16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; F16-NEXT: vpslld $16, %zmm0, %zmm0
; AVXNC-LABEL: pr64460_3:
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm1, %ymm2
; AVXNC-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT: vmovdqa %ymm2, %ymm0
  %b = fpext <16 x bfloat> %a to <16 x float>
  ret <16 x float> %b
}
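; <8 x bfloat> -> <8 x double> extension: the bits are shifted into f32
; lanes first, then vcvtps2pd/cvtps2pd widens to double.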
define <8 x double> @pr64460_4(<8 x bfloat> %a) {
; X86-LABEL: pr64460_4:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vcvtps2pd %ymm0, %zmm0
; SSE2-LABEL: pr64460_4:
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: cvtps2pd %xmm1, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: cvtps2pd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm3
; SSE2-NEXT: movaps %xmm4, %xmm0
; F16-LABEL: pr64460_4:
; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT: vpslld $16, %ymm0, %ymm0
; F16-NEXT: vcvtps2pd %ymm0, %zmm0
; AVXNC-LABEL: pr64460_4:
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT: vcvtps2pd %xmm1, %ymm0
; AVXNC-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVXNC-NEXT: vcvtps2pd %xmm1, %ymm1
  %b = fpext <8 x bfloat> %a to <8 x double>
  ret <8 x double> %b
}
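; fptrunc <4 x float> -> <4 x bfloat>: a single vcvtneps2bf16 where
; available; SSE2 calls __truncsfbf2 once per element and reassembles.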
define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
; X86-LABEL: fptrunc_v4f32:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; SSE2-LABEL: fptrunc_v4f32:
; SSE2-NEXT: subq $72, %rsp
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $72, %rsp
; F16-LABEL: fptrunc_v4f32:
; F16-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; AVXNC-LABEL: fptrunc_v4f32:
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
  %b = fptrunc <4 x float> %a to <4 x bfloat>
  ret <4 x bfloat> %b
}
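; fptrunc <8 x float> -> <8 x bfloat>.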
define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
; X86-LABEL: fptrunc_v8f32:
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; SSE2-LABEL: fptrunc_v8f32:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $32, %rsp
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %rbx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $32, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; F16-LABEL: fptrunc_v8f32:
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; AVXNC-LABEL: fptrunc_v8f32:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
  %b = fptrunc <8 x float> %a to <8 x bfloat>
  ret <8 x bfloat> %b
}
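; fptrunc <16 x float> -> <16 x bfloat>: a single zmm vcvtneps2bf16 with
; AVX512BF16, or two ymm conversions concatenated with AVXNECONVERT.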
define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; X86-LABEL: fptrunc_v16f32:
; X86-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; SSE2-LABEL: fptrunc_v16f32:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $64, %rsp
; SSE2-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: shlq $32, %r14
; SSE2-NEXT: orq %r15, %r14
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: shlq $32, %r15
; SSE2-NEXT: orq %r12, %r15
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r12, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %r15, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movq %r14, %xmm2
; SSE2-NEXT: movq %rbx, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: addq $64, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %r15
; SSE2-NEXT: popq %rbp
; F16-LABEL: fptrunc_v16f32:
; F16-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
  %b = fptrunc <16 x float> %a to <16 x bfloat>
  ret <16 x bfloat> %b
}
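; fptrunc <8 x double> -> <8 x bfloat>: there is no direct instruction, so
; each element goes through the __truncdfbf2 libcall and the results are
; reassembled with unpack shuffles.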
define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; X86-LABEL: fptrunc_v8f64:
; X86: # %bb.0:
; X86-NEXT: subl $204, %esp
; X86-NEXT: vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; X86-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT: addl $204, %esp
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v8f64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $64, %rsp
; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %rbx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $64, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; FP16-LABEL: fptrunc_v8f64:
; FP16: # %bb.0:
; FP16-NEXT: subq $184, %rsp
; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; FP16-NEXT: # xmm0 = mem[1,0]
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; FP16-NEXT: addq $184, %rsp
; FP16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f64:
; AVXNC: # %bb.0:
; AVXNC-NEXT: pushq %rbp
; AVXNC-NEXT: pushq %r15
; AVXNC-NEXT: pushq %r14
; AVXNC-NEXT: pushq %r13
; AVXNC-NEXT: pushq %r12
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: subq $168, %rsp
; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: vmovd %ebx, %xmm0
; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVXNC-NEXT: addq $168, %rsp
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: popq %r12
; AVXNC-NEXT: popq %r13
; AVXNC-NEXT: popq %r14
; AVXNC-NEXT: popq %r15
; AVXNC-NEXT: popq %rbp
; AVXNC-NEXT: retq
  %b = fptrunc <8 x double> %a to <8 x bfloat>
  ret <8 x bfloat> %b
}

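; Broadcasting a 128-bit bf16 subvector to 512 bits: the AVX512 targets select
; a single vbroadcastf32x4, and AVXNECONVERT a vbroadcastf128, without
; scalarizing the bf16 elements.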
define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
; X86-LABEL: test_v8bf16_v32bf16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; SSE2-LABEL: test_v8bf16_v32bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: retq
;
; F16-LABEL: test_v8bf16_v32bf16:
; F16: # %bb.0:
; F16-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; F16-NEXT: retq
;
; AVXNC-LABEL: test_v8bf16_v32bf16:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVXNC-NEXT: vmovaps %ymm0, %ymm1
; AVXNC-NEXT: retq
  %2 = load <8 x bfloat>, ptr %0, align 16
  %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x bfloat> %3
}

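; Concatenating two v8bf16 values is pure register shuffling: vinsertf128 with
; AVX, and a no-op on SSE2, where the two halves are already in %xmm0/%xmm1.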
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_v8bf16:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}

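; Extracting the upper 128-bit lane of a v32bf16 is a single vextractf128 with
; AVX; SSE2 rebuilds the subvector from scalar word extracts through GPRs.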
define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
; X86-LABEL: extract_v32bf16_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: extract_v32bf16_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pextrw $2, %xmm1, %eax
; SSE2-NEXT: pextrw $3, %xmm1, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: shlq $32, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: pextrw $4, %xmm1, %eax
; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pextrw $6, %xmm1, %eax
; SSE2-NEXT: pextrw $7, %xmm1, %esi
; SSE2-NEXT: shll $16, %esi
; SSE2-NEXT: orl %eax, %esi
; SSE2-NEXT: shlq $32, %rsi
; SSE2-NEXT: orq %rcx, %rsi
; SSE2-NEXT: movq %rsi, %xmm1
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; AVX-LABEL: extract_v32bf16_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x bfloat> %a
}

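; Concatenation with zeroinitializer only needs the upper bits cleared: the
; VEX-encoded self-move vmovaps implicitly zeroes the upper ymm lanes, and
; SSE2 just zeroes the second result register.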
define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_zero_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %xmm0, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_zero_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_zero_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}

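; The low 64 bits of %x are duplicated (vmovddup/movlhps) before %y is
; inserted as the upper 128 bits.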
define <16 x bfloat> @concat_dup_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_dup_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_dup_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_dup_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}

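; float -> bfloat -> float round trip: with native bf16 converts this is
; vcvtneps2bf16 plus a 16-bit left shift of the result; SSE2 has to call the
; __truncsfbf2 libcall instead.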
define float @trunc_ext(float %a) nounwind {
; X86-LABEL: trunc_ext:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vmovd %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
;
; SSE2-LABEL: trunc_ext:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
; FP16-LABEL: trunc_ext:
; FP16: # %bb.0:
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: trunc_ext:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: retq
  %b = fptrunc float %a to bfloat
  %c = fpext bfloat %b to float
  ret float %c
}

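; fpext of an odd-sized <7 x bfloat> load: the vector is widened, each word is
; zero-extended and shifted left by 16 to form floats, and the seven results
; are stored piecewise.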
define void @PR92471(ptr %0, ptr %1) nounwind {
; X86-LABEL: PR92471:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, 4(%ecx), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, 8(%ecx), %xmm0, %xmm0
; X86-NEXT: vpinsrw $6, 12(%ecx), %xmm0, %xmm0
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpextrd $2, %xmm1, 24(%eax)
; X86-NEXT: vpextrd $1, %xmm1, 20(%eax)
; X86-NEXT: vmovd %xmm1, 16(%eax)
; X86-NEXT: vmovdqu %xmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: PR92471:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pinsrw $2, 12(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movdqu %xmm2, (%rsi)
; SSE2-NEXT: movq %xmm3, 16(%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rsi)
; SSE2-NEXT: retq
;
; AVX-LABEL: PR92471:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $16, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpextrd $2, %xmm1, 24(%rsi)
; AVX-NEXT: vmovq %xmm1, 16(%rsi)
; AVX-NEXT: vmovdqu %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %3 = load <7 x bfloat>, ptr %0, align 2
  %4 = fpext <7 x bfloat> %3 to <7 x float>
  store <7 x float> %4, ptr %1, align 4
  ret void
}

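; x86_fp80 -> bfloat has no hardware path; it is lowered to the __truncxfbf2
; libcall on both 32-bit and 64-bit targets.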
define bfloat @PR108936(x86_fp80 %0) nounwind {
; X86-LABEL: PR108936:
; X86: # %bb.0:
; X86-NEXT: subl $12, %esp
; X86-NEXT: fldt {{[0-9]+}}(%esp)
; X86-NEXT: fstpt (%esp)
; X86-NEXT: calll __truncxfbf2
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
;
; CHECK-LABEL: PR108936:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fstpt (%rsp)
; CHECK-NEXT: callq __truncxfbf2@PLT
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: retq
  %2 = fptrunc x86_fp80 %0 to bfloat
  ret bfloat %2
}

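; fp128 -> bfloat likewise goes through a libcall (__trunctfbf2); on 32-bit
; the i16 result comes back in %ax and is moved into %xmm0.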
define bfloat @PR115710(fp128 %0) nounwind {
; X86-LABEL: PR115710:
; X86: # %bb.0:
; X86-NEXT: subl $28, %esp
; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovups %xmm0, (%esp)
; X86-NEXT: calll __trunctfbf2
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: addl $28, %esp
; X86-NEXT: retl
;
; CHECK-LABEL: PR115710:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: callq __trunctfbf2@PLT
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
  %2 = fptrunc fp128 %0 to bfloat
  ret bfloat %2
}