; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC
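
; Scalar bfloat add through memory: each operand is extended into the high
; half of an f32 (shll $16 + movd), added as f32, and truncated back to
; bfloat (vcvtneps2bf16, or the __truncsfbf2 libcall on plain SSE2).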
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzwl (%edx), %edx
; X86-NEXT:    shll $16, %edx
; X86-NEXT:    vmovd %edx, %xmm0
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdx, %rbx
; SSE2-NEXT:    movzwl (%rsi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzwl (%rdi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; F16-NEXT:    movzwl (%rsi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm0
; F16-NEXT:    movzwl (%rdi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm1
; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; F16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT:    vpextrw $0, %xmm0, (%rdx)
; AVXNC-NEXT:    movzwl (%rsi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    movzwl (%rdi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vpextrw $0, %xmm0, (%rdx)
  %a = load bfloat, ptr %pa
  %b = load bfloat, ptr %pb
  %add = fadd bfloat %a, %b
  store bfloat %add, ptr %pc
  ret void
}
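
; The same scalar add with bfloat argument and return values; the value moves
; between GPRs and vector registers via vmovw (AVX512FP16) or pextrw/vpinsrw.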
define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    vmovw %eax, %xmm0
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    pextrw $0, %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    popq %rax
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %xmm1, %ecx
; FP16-NEXT:    shll $16, %ecx
; FP16-NEXT:    vmovd %ecx, %xmm0
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %eax, %xmm0
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    vpextrw $0, %xmm1, %ecx
; AVXNC-NEXT:    shll $16, %ecx
; AVXNC-NEXT:    vmovd %ecx, %xmm0
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
  %add = fadd bfloat %a, %b
  ret bfloat %add
}
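
; bfloat produced by truncating doubles: both fptruncs go through the
; __truncdfbf2 libcall, the add happens in f32, and the result is widened
; back to double with cvtss2sd.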
define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add_double:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %edi
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    shll $16, %edi
; X86-NEXT:    vmovd %edi, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esi)
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; SSE2-LABEL: add_double:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdx, %rbx
; SSE2-NEXT:    movq %rsi, %r14
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    movsd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %rbp
; FP16-LABEL: add_double:
; FP16-NEXT:    pushq %rbp
; FP16-NEXT:    pushq %r14
; FP16-NEXT:    pushq %rbx
; FP16-NEXT:    movq %rdx, %rbx
; FP16-NEXT:    movq %rsi, %r14
; FP16-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %ebp
; FP16-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    shll $16, %ebp
; FP16-NEXT:    vmovd %ebp, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT:    vmovsd %xmm0, (%rbx)
; FP16-NEXT:    popq %rbx
; FP16-NEXT:    popq %r14
; FP16-NEXT:    popq %rbp
; AVXNC-LABEL: add_double:
; AVXNC-NEXT:    pushq %rbp
; AVXNC-NEXT:    pushq %r14
; AVXNC-NEXT:    pushq %rbx
; AVXNC-NEXT:    movq %rdx, %rbx
; AVXNC-NEXT:    movq %rsi, %r14
; AVXNC-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    shll $16, %ebp
; AVXNC-NEXT:    vmovd %ebp, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    vmovsd %xmm0, (%rbx)
; AVXNC-NEXT:    popq %rbx
; AVXNC-NEXT:    popq %r14
; AVXNC-NEXT:    popq %rbp
  %la = load double, ptr %pa
  %a = fptrunc double %la to bfloat
  %lb = load double, ptr %pb
  %b = fptrunc double %lb to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  store double %dadd, ptr %pc
  ret void
}
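
; As add_double, but with double arguments and a double return value.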
define double @add_double2(double %da, double %db) nounwind {
; X86-LABEL: add_double2:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $24, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %esi
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    shll $16, %esi
; X86-NEXT:    vmovd %esi, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fldl {{[0-9]+}}(%esp)
; X86-NEXT:    addl $24, %esp
; X86-NEXT:    popl %esi
; SSE2-LABEL: add_double2:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $16, %rsp
; SSE2-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    addq $16, %rsp
; SSE2-NEXT:    popq %rbx
; FP16-LABEL: add_double2:
; FP16-NEXT:    pushq %rbx
; FP16-NEXT:    subq $16, %rsp
; FP16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %ebx
; FP16-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; FP16-NEXT:    # xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    shll $16, %ebx
; FP16-NEXT:    vmovd %ebx, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT:    addq $16, %rsp
; FP16-NEXT:    popq %rbx
; AVXNC-LABEL: add_double2:
; AVXNC-NEXT:    pushq %rbx
; AVXNC-NEXT:    subq $16, %rsp
; AVXNC-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; AVXNC-NEXT:    # xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    shll $16, %ebx
; AVXNC-NEXT:    vmovd %ebx, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    addq $16, %rsp
; AVXNC-NEXT:    popq %rbx
  %a = fptrunc double %da to bfloat
  %b = fptrunc double %db to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  ret double %dadd
}
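
; Adding the constant 1.0: the extended constant becomes an f32 constant-pool
; operand folded directly into the addss/vaddss.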
define void @add_constant(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: add_constant:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; SSE2-LABEL: add_constant:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movzwl (%rdi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; F16-LABEL: add_constant:
; F16-NEXT:    movzwl (%rdi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm0
; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVXNC-LABEL: add_constant:
; AVXNC-NEXT:    movzwl (%rdi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vpextrw $0, %xmm0, (%rsi)
  %a = load bfloat, ptr %pa
  %add = fadd bfloat %a, 1.0
  store bfloat %add, ptr %pc
  ret void
}
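
; The same constant add with a bfloat argument and return value.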
define bfloat @add_constant2(bfloat %a) nounwind {
; X86-LABEL: add_constant2:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    vmovw %eax, %xmm0
; SSE2-LABEL: add_constant2:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    popq %rax
; FP16-LABEL: add_constant2:
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %eax, %xmm0
; AVXNC-LABEL: add_constant2:
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
  %add = fadd bfloat %a, 1.0
  ret bfloat %add
}
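
; Storing a bfloat constant lowers to a single 16-bit immediate store
; (1.0 == 0x3F80 in bfloat).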
define void @store_constant(ptr %pc) nounwind {
; X86-LABEL: store_constant:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movw $16256, (%eax) # imm = 0x3F80
; CHECK-LABEL: store_constant:
; CHECK-NEXT:    movw $16256, (%rdi) # imm = 0x3F80
  store bfloat 1.0, ptr %pc
  ret void
}
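
; fpext followed by fptrunc is a no-op, so only a 16-bit load and store remain.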
define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: fold_ext_trunc:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    movw %cx, (%eax)
; CHECK-LABEL: fold_ext_trunc:
; CHECK-NEXT:    movzwl (%rdi), %eax
; CHECK-NEXT:    movw %ax, (%rsi)
  %a = load bfloat, ptr %pa
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  store bfloat %trunc, ptr %pc
  ret void
}
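
; The same ext/trunc pair on an argument folds away entirely.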
define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
; X86-LABEL: fold_ext_trunc2:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; CHECK-LABEL: fold_ext_trunc2:
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  ret bfloat %trunc
}
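
; Vector bfloat add: with AVX512BF16/AVXNECONVERT the <8 x bfloat> is widened
; to <8 x float>, added, and narrowed with vcvtneps2bf16; plain SSE2 fully
; scalarizes through __truncsfbf2.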
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT:    vpslld $16, %ymm1, %ymm1
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT:    vzeroupper
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r13
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    shrq $48, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %xmm1, %rdx
; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $48, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    shrq $32, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $32, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    movq %xmm0, %r15
; SSE2-NEXT:    movq %r15, %rbx
; SSE2-NEXT:    shrq $48, %rbx
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    movq %xmm1, %r14
; SSE2-NEXT:    movq %r14, %rbp
; SSE2-NEXT:    shrq $48, %rbp
; SSE2-NEXT:    movq %r15, %r12
; SSE2-NEXT:    shrq $32, %r12
; SSE2-NEXT:    movq %r14, %r13
; SSE2-NEXT:    shrq $32, %r13
; SSE2-NEXT:    movl %r14d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movl %r15d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    andl $-65536, %r14d # imm = 0xFFFF0000
; SSE2-NEXT:    movd %r14d, %xmm1
; SSE2-NEXT:    andl $-65536, %r15d # imm = 0xFFFF0000
; SSE2-NEXT:    movd %r15d, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
; SSE2-NEXT:    shll $16, %r15d
; SSE2-NEXT:    addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; SSE2-NEXT:    shll $16, %r13d
; SSE2-NEXT:    movd %r13d, %xmm1
; SSE2-NEXT:    shll $16, %r12d
; SSE2-NEXT:    movd %r12d, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm1
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    orl %r14d, %ebx
; SSE2-NEXT:    shlq $32, %rbx
; SSE2-NEXT:    orq %r15, %rbx
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; SSE2-NEXT:    movl %r15d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT:    movl %r14d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebp
; SSE2-NEXT:    movq %r15, %rax
; SSE2-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq %r14, %rax
; SSE2-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %r14d
; SSE2-NEXT:    shll $16, %r14d
; SSE2-NEXT:    orl %ebp, %r14d
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebp
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    orl %ebp, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movq %rbx, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r13
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; F16-NEXT:    vpslld $16, %ymm1, %ymm1
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT:    vpslld $16, %ymm0, %ymm0
; F16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT:    vzeroupper
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm1, %ymm1
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm0
; AVXNC-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT:    vzeroupper
  %add = fadd <8 x bfloat> %a, %b
  ret <8 x bfloat> %add
}
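
; PR62997: building a <2 x bfloat> from two scalar arguments.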
define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
; X86-LABEL: pr62997:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-LABEL: pr62997:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BF16-LABEL: pr62997:
; BF16-NEXT:    vpextrw $0, %xmm0, %eax
; BF16-NEXT:    vpextrw $0, %xmm1, %ecx
; BF16-NEXT:    vmovd %eax, %xmm0
; BF16-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
; FP16-LABEL: pr62997:
; FP16-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
  %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
  ret <2 x bfloat> %2
}
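
; PR63017: a zero <32 x bfloat> should lower to plain register zeroing.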
define <32 x bfloat> @pr63017() {
; X86-LABEL: pr63017:
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; SSE2-LABEL: pr63017:
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    xorps %xmm3, %xmm3
; F16-LABEL: pr63017:
; F16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVXNC-LABEL: pr63017:
; AVXNC-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    vxorps %xmm1, %xmm1, %xmm1
  ret <32 x bfloat> zeroinitializer
}
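
; PR63017, part 2: a masked load of <32 x bfloat> with a broadcast constant
; passthru. AVX512BF16+FP16 uses a single masked vmovdqu16; targets without
; masked 16-bit moves expand into per-element conditional loads.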
define <32 x bfloat> @pr63017_2() nounwind {
; X86-LABEL: pr63017_2:
; X86-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; SSE2-LABEL: pr63017_2:
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $200, %rsp
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_1
; SSE2-NEXT:  # %bb.2: # %cond.load
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movdqa %xmm0, %xmm15
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movdqa %xmm0, %xmm13
; SSE2-NEXT:    movdqa %xmm0, %xmm14
; SSE2-NEXT:    movdqa %xmm0, %xmm11
; SSE2-NEXT:    movdqa %xmm0, %xmm12
; SSE2-NEXT:    movdqa %xmm0, %xmm9
; SSE2-NEXT:    movdqa %xmm0, %xmm10
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    movdqa %xmm0, %xmm8
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    jmp .LBB12_3
; SSE2-NEXT:  .LBB12_1:
; SSE2-NEXT:    movd {{.*#+}} xmm2 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movdqa %xmm2, %xmm15
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movdqa %xmm2, %xmm13
; SSE2-NEXT:    movdqa %xmm2, %xmm14
; SSE2-NEXT:    movdqa %xmm2, %xmm11
; SSE2-NEXT:    movdqa %xmm2, %xmm12
; SSE2-NEXT:    movdqa %xmm2, %xmm9
; SSE2-NEXT:    movdqa %xmm2, %xmm10
; SSE2-NEXT:    movdqa %xmm2, %xmm7
; SSE2-NEXT:    movdqa %xmm2, %xmm8
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:  .LBB12_3: # %else
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_5
; SSE2-NEXT:  # %bb.4: # %cond.load1
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:  .LBB12_5: # %else2
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_7
; SSE2-NEXT:  # %bb.6: # %cond.load4
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_7: # %else5
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_9
; SSE2-NEXT:  # %bb.8: # %cond.load7
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_9: # %else8
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_11
; SSE2-NEXT:  # %bb.10: # %cond.load10
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_11: # %else11
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_13
; SSE2-NEXT:  # %bb.12: # %cond.load13
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_13: # %else14
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_15
; SSE2-NEXT:  # %bb.14: # %cond.load16
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_15: # %else17
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_17
; SSE2-NEXT:  # %bb.16: # %cond.load19
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_17: # %else20
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_19
; SSE2-NEXT:  # %bb.18: # %cond.load22
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_19: # %else23
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_21
; SSE2-NEXT:  # %bb.20: # %cond.load25
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_21: # %else26
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_23
; SSE2-NEXT:  # %bb.22: # %cond.load28
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_23: # %else29
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_25
; SSE2-NEXT:  # %bb.24: # %cond.load31
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_25: # %else32
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_27
; SSE2-NEXT:  # %bb.26: # %cond.load34
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_27: # %else35
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_29
; SSE2-NEXT:  # %bb.28: # %cond.load37
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_29: # %else38
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_31
; SSE2-NEXT:  # %bb.30: # %cond.load40
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_31: # %else41
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_33
; SSE2-NEXT:  # %bb.32: # %cond.load43
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_33: # %else44
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_35
; SSE2-NEXT:  # %bb.34: # %cond.load46
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm15
; SSE2-NEXT:  .LBB12_35: # %else47
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_37
; SSE2-NEXT:  # %bb.36: # %cond.load49
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:  .LBB12_37: # %else50
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_39
; SSE2-NEXT:  # %bb.38: # %cond.load52
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm13
; SSE2-NEXT:  .LBB12_39: # %else53
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_41
; SSE2-NEXT:  # %bb.40: # %cond.load55
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm14
; SSE2-NEXT:  .LBB12_41: # %else56
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_43
; SSE2-NEXT:  # %bb.42: # %cond.load58
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm11
; SSE2-NEXT:  .LBB12_43: # %else59
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_45
; SSE2-NEXT:  # %bb.44: # %cond.load61
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm12
; SSE2-NEXT:  .LBB12_45: # %else62
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_47
; SSE2-NEXT:  # %bb.46: # %cond.load64
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm9
; SSE2-NEXT:  .LBB12_47: # %else65
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_49
; SSE2-NEXT:  # %bb.48: # %cond.load67
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm10
; SSE2-NEXT:  .LBB12_49: # %else68
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_51
; SSE2-NEXT:  # %bb.50: # %cond.load70
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm7
; SSE2-NEXT:  .LBB12_51: # %else71
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_53
; SSE2-NEXT:  # %bb.52: # %cond.load73
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm8
; SSE2-NEXT:  .LBB12_53: # %else74
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_55
; SSE2-NEXT:  # %bb.54: # %cond.load76
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm5
; SSE2-NEXT:  .LBB12_55: # %else77
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_57
; SSE2-NEXT:  # %bb.56: # %cond.load79
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm6
; SSE2-NEXT:  .LBB12_57: # %else80
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_59
; SSE2-NEXT:  # %bb.58: # %cond.load82
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:  .LBB12_59: # %else83
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_61
; SSE2-NEXT:  # %bb.60: # %cond.load85
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm4
; SSE2-NEXT:  .LBB12_61: # %else86
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_63
; SSE2-NEXT:  # %bb.62: # %cond.load88
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:  .LBB12_63: # %else89
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    jne .LBB12_64
; SSE2-NEXT:  # %bb.65: # %cond.load91
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    jmp .LBB12_66
; SSE2-NEXT:  .LBB12_64:
; SSE2-NEXT:    movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:  .LBB12_66: # %else92
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT:    addq $200, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; FP16-LABEL: pr63017_2:
; FP16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; FP16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
; AVXNC-LABEL: pr63017_2:
; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    vmovdqa %ymm0, %ymm1
; AVXNC-NEXT:    jne .LBB12_2
; AVXNC-NEXT:  # %bb.1: # %cond.load
; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT:    vpinsrw $0, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_2: # %else
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_4
; AVXNC-NEXT:  # %bb.3: # %cond.load1
; AVXNC-NEXT:    vpinsrw $1, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_4: # %else2
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_6
; AVXNC-NEXT:  # %bb.5: # %cond.load4
; AVXNC-NEXT:    vpinsrw $2, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_6: # %else5
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_8
; AVXNC-NEXT:  # %bb.7: # %cond.load7
; AVXNC-NEXT:    vpinsrw $3, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_8: # %else8
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_10
; AVXNC-NEXT:  # %bb.9: # %cond.load10
; AVXNC-NEXT:    vpinsrw $4, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_10: # %else11
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_12
; AVXNC-NEXT:  # %bb.11: # %cond.load13
; AVXNC-NEXT:    vpinsrw $5, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_12: # %else14
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_14
; AVXNC-NEXT:  # %bb.13: # %cond.load16
; AVXNC-NEXT:    vpinsrw $6, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_14: # %else17
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_16
; AVXNC-NEXT:  # %bb.15: # %cond.load19
; AVXNC-NEXT:    vpinsrw $7, (%rax), %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:  .LBB12_16: # %else20
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_18
; AVXNC-NEXT:  # %bb.17: # %cond.load22
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_18: # %else23
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_20
; AVXNC-NEXT:  # %bb.19: # %cond.load25
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_20: # %else26
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_22
; AVXNC-NEXT:  # %bb.21: # %cond.load28
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5,6,7,8,9],ymm2[10],ymm0[11,12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_22: # %else29
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_24
; AVXNC-NEXT:  # %bb.23: # %cond.load31
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_24: # %else32
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_26
; AVXNC-NEXT:  # %bb.25: # %cond.load34
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_26: # %else35
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_28
; AVXNC-NEXT:  # %bb.27: # %cond.load37
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7,8,9,10,11,12],ymm2[13],ymm0[14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_28: # %else38
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_30
; AVXNC-NEXT:  # %bb.29: # %cond.load40
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_30: # %else41
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_32
; AVXNC-NEXT:  # %bb.31: # %cond.load43
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_32: # %else44
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_34
; AVXNC-NEXT:  # %bb.33: # %cond.load46
; AVXNC-NEXT:    vpinsrw $0, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_34: # %else47
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_36
; AVXNC-NEXT:  # %bb.35: # %cond.load49
; AVXNC-NEXT:    vpinsrw $1, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_36: # %else50
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_38
; AVXNC-NEXT:  # %bb.37: # %cond.load52
; AVXNC-NEXT:    vpinsrw $2, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_38: # %else53
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_40
; AVXNC-NEXT:  # %bb.39: # %cond.load55
; AVXNC-NEXT:    vpinsrw $3, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_40: # %else56
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_42
; AVXNC-NEXT:  # %bb.41: # %cond.load58
; AVXNC-NEXT:    vpinsrw $4, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_42: # %else59
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_44
; AVXNC-NEXT:  # %bb.43: # %cond.load61
; AVXNC-NEXT:    vpinsrw $5, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_44: # %else62
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_46
; AVXNC-NEXT:  # %bb.45: # %cond.load64
; AVXNC-NEXT:    vpinsrw $6, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_46: # %else65
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_48
; AVXNC-NEXT:  # %bb.47: # %cond.load67
; AVXNC-NEXT:    vpinsrw $7, (%rax), %xmm1, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVXNC-NEXT:  .LBB12_48: # %else68
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_50
; AVXNC-NEXT:  # %bb.49: # %cond.load70
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_50: # %else71
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_52
; AVXNC-NEXT:  # %bb.51: # %cond.load73
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_52: # %else74
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_54
; AVXNC-NEXT:  # %bb.53: # %cond.load76
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_54: # %else77
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_56
; AVXNC-NEXT:  # %bb.55: # %cond.load79
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVXNC-NEXT:  .LBB12_56: # %else80
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_58
; AVXNC-NEXT:  # %bb.57: # %cond.load82
; AVXNC-NEXT:    vpbroadcastw (%rax), %ymm2
; AVXNC-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1551 ; AVXNC-NEXT: .LBB12_58: # %else83
1552 ; AVXNC-NEXT: xorl %eax, %eax
1553 ; AVXNC-NEXT: testb %al, %al
1554 ; AVXNC-NEXT: jne .LBB12_60
1555 ; AVXNC-NEXT: # %bb.59: # %cond.load85
1556 ; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
1557 ; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7,8,9,10,11,12],ymm2[13],ymm1[14,15]
1558 ; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1559 ; AVXNC-NEXT: .LBB12_60: # %else86
1560 ; AVXNC-NEXT: xorl %eax, %eax
1561 ; AVXNC-NEXT: testb %al, %al
1562 ; AVXNC-NEXT: jne .LBB12_62
1563 ; AVXNC-NEXT: # %bb.61: # %cond.load88
1564 ; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
1565 ; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
1566 ; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1567 ; AVXNC-NEXT: .LBB12_62: # %else89
1568 ; AVXNC-NEXT: xorl %eax, %eax
1569 ; AVXNC-NEXT: testb %al, %al
1570 ; AVXNC-NEXT: jne .LBB12_64
1571 ; AVXNC-NEXT: # %bb.63: # %cond.load91
1572 ; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2
1573 ; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15]
1574 ; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1575 ; AVXNC-NEXT: .LBB12_64: # %else92
1577 %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
1578 ret <32 x bfloat> %1
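
; Inserting a scalar bfloat into lane 1 of a <32 x bfloat> only has to touch
; the low 128 bits of the vector.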
define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; X86-LABEL: pr62997_3:
; X86: # %bb.0:
; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr62997_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SSE2-NEXT: andq %rax, %rcx
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: pextrw $0, %xmm4, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: movq %rdx, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; FP16-LABEL: pr62997_3:
; FP16: # %bb.0:
; FP16-NEXT: vmovw %xmm1, %eax
; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
; FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: pr62997_3:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpextrw $0, %xmm2, %eax
; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2
; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT: retq
%3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
ret <32 x bfloat> %3
}

declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
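
; fpext from bfloat is a pure bit-pattern operation: each 16-bit value lands in
; the high half of a zeroed 32-bit lane, via an unpack with zero or a
; zero-extend plus a 16-bit left shift.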
define <4 x float> @pr64460_1(<4 x bfloat> %a) {
; X86-LABEL: pr64460_1:
; X86: # %bb.0:
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_1:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pr64460_1:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: retq
%b = fpext <4 x bfloat> %a to <4 x float>
ret <4 x float> %b
}
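
; The same extension pattern, widened to eight lanes.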
define <8 x float> @pr64460_2(<8 x bfloat> %a) {
; X86-LABEL: pr64460_2:
; X86: # %bb.0:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pr64460_2:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $16, %ymm0, %ymm0
; AVX-NEXT: retq
%b = fpext <8 x bfloat> %a to <8 x float>
ret <8 x float> %b
}
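
; Sixteen lanes: 512-bit targets use a single zmm zero-extend and shift, while
; AVXNC splits the input into two ymm halves.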
define <16 x float> @pr64460_3(<16 x bfloat> %a) {
; X86-LABEL: pr64460_3:
; X86: # %bb.0:
; X86-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X86-NEXT: vpslld $16, %zmm0, %zmm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_3:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
; F16-LABEL: pr64460_3:
; F16: # %bb.0:
; F16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; F16-NEXT: vpslld $16, %zmm0, %zmm0
; F16-NEXT: retq
;
; AVXNC-LABEL: pr64460_3:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm1, %ymm2
; AVXNC-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT: vmovdqa %ymm2, %ymm0
; AVXNC-NEXT: retq
%b = fpext <16 x bfloat> %a to <16 x float>
ret <16 x float> %b
}
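
; bfloat -> double goes through float: shift into float lanes, then cvtps2pd.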
define <8 x double> @pr64460_4(<8 x bfloat> %a) {
; X86-LABEL: pr64460_4:
; X86: # %bb.0:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vcvtps2pd %ymm0, %zmm0
; X86-NEXT: retl
;
; SSE2-LABEL: pr64460_4:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: cvtps2pd %xmm1, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: cvtps2pd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: cvtps2pd %xmm0, %xmm3
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: retq
;
; F16-LABEL: pr64460_4:
; F16: # %bb.0:
; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT: vpslld $16, %ymm0, %ymm0
; F16-NEXT: vcvtps2pd %ymm0, %zmm0
; F16-NEXT: retq
;
; AVXNC-LABEL: pr64460_4:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT: vcvtps2pd %xmm1, %ymm0
; AVXNC-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVXNC-NEXT: vcvtps2pd %xmm1, %ymm1
; AVXNC-NEXT: retq
%b = fpext <8 x bfloat> %a to <8 x double>
ret <8 x double> %b
}
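
; Truncation to bfloat: targets with native converts use vcvtneps2bf16, while
; SSE2 must call the __truncsfbf2 libcall once per element.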
define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
; X86-LABEL: fptrunc_v4f32:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $72, %rsp
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $72, %rsp
; SSE2-NEXT: retq
;
; F16-LABEL: fptrunc_v4f32:
; F16: # %bb.0:
; F16-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; F16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v4f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%b = fptrunc <4 x float> %a to <4 x bfloat>
ret <4 x bfloat> %b
}
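
; Eight floats still fit a single vcvtneps2bf16 of a ymm source; the SSE2
; fallback assembles the result from eight libcall returns.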
define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
; X86-LABEL: fptrunc_v8f32:
; X86: # %bb.0:
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $32, %rsp
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %rbx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $32, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; F16-LABEL: fptrunc_v8f32:
; F16: # %bb.0:
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; F16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%b = fptrunc <8 x float> %a to <8 x bfloat>
ret <8 x bfloat> %b
}
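
; Sixteen floats: AVX512BF16 converts zmm -> ymm directly; AVXNC converts the
; two ymm halves and reassembles them with vinsertf128.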
define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; X86-LABEL: fptrunc_v16f32:
; X86: # %bb.0:
; X86-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $64, %rsp
; SSE2-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: shlq $32, %r14
; SSE2-NEXT: orq %r15, %r14
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r15d
; SSE2-NEXT: orl %ebp, %r15d
; SSE2-NEXT: shlq $32, %r15
; SSE2-NEXT: orq %r12, %r15
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r12d
; SSE2-NEXT: orl %ebp, %r12d
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r12, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %r15, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movq %r14, %xmm2
; SSE2-NEXT: movq %rbx, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: addq $64, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %r15
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; F16-LABEL: fptrunc_v16f32:
; F16: # %bb.0:
; F16-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; F16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT: retq
%b = fptrunc <16 x float> %a to <16 x bfloat>
ret <16 x bfloat> %b
}
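
; double -> bfloat has no vector instruction on any of these targets, so every
; element goes through the __truncdfbf2 libcall.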
define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; X86-LABEL: fptrunc_v8f64:
; X86: # %bb.0:
; X86-NEXT: subl $204, %esp
; X86-NEXT: vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovhps %xmm0, (%esp)
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT: vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; X86-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT: addl $204, %esp
; X86-NEXT: retl
;
; SSE2-LABEL: fptrunc_v8f64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $64, %rsp
; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebx
; SSE2-NEXT: shll $16, %ebx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebx, %r14d
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %ebx
; SSE2-NEXT: orl %ebp, %ebx
; SSE2-NEXT: shlq $32, %rbx
; SSE2-NEXT: orq %r14, %rbx
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %r14d
; SSE2-NEXT: orl %ebp, %r14d
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %ebp
; SSE2-NEXT: shll $16, %ebp
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __truncdfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: orl %ebp, %eax
; SSE2-NEXT: shlq $32, %rax
; SSE2-NEXT: orq %r14, %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movq %rbx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: addq $64, %rsp
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r14
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; FP16-LABEL: fptrunc_v8f64:
; FP16: # %bb.0:
; FP16-NEXT: subq $184, %rsp
; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; FP16-NEXT: # xmm0 = mem[1,0]
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT: callq __truncdfbf2@PLT
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; FP16-NEXT: addq $184, %rsp
; FP16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f64:
; AVXNC: # %bb.0:
; AVXNC-NEXT: pushq %rbp
; AVXNC-NEXT: pushq %r15
; AVXNC-NEXT: pushq %r14
; AVXNC-NEXT: pushq %r13
; AVXNC-NEXT: pushq %r12
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: subq $168, %rsp
; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d
; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: vmovd %ebx, %xmm0
; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVXNC-NEXT: addq $168, %rsp
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: popq %r12
; AVXNC-NEXT: popq %r13
; AVXNC-NEXT: popq %r14
; AVXNC-NEXT: popq %r15
; AVXNC-NEXT: popq %rbp
; AVXNC-NEXT: retq
%b = fptrunc <8 x double> %a to <8 x bfloat>
ret <8 x bfloat> %b
}
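
; Broadcasting the same 128-bit load into all four quarters of a <32 x bfloat>.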
define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
; X86-LABEL: test_v8bf16_v32bf16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; SSE2-LABEL: test_v8bf16_v32bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: retq
;
; F16-LABEL: test_v8bf16_v32bf16:
; F16: # %bb.0:
; F16-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; F16-NEXT: retq
;
; AVXNC-LABEL: test_v8bf16_v32bf16:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVXNC-NEXT: vmovaps %ymm0, %ymm1
; AVXNC-NEXT: retq
%2 = load <8 x bfloat>, ptr %0, align 16
%3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <32 x bfloat> %3
}
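
; Concatenating two xmm values is a plain vinsertf128; SSE2 already has the
; two halves in place in xmm0/xmm1.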
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_v8bf16:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x bfloat> %a
}
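
; Extracting elements 8-15 is a vextractf128 of the low ymm; SSE2 instead
; repacks xmm1 word by word through GPRs.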
define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
; X86-LABEL: extract_v32bf16_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: extract_v32bf16_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pextrw $2, %xmm1, %eax
; SSE2-NEXT: pextrw $3, %xmm1, %edx
; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %eax, %edx
; SSE2-NEXT: shlq $32, %rdx
; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: pextrw $4, %xmm1, %eax
; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: pextrw $6, %xmm1, %eax
; SSE2-NEXT: pextrw $7, %xmm1, %esi
; SSE2-NEXT: shll $16, %esi
; SSE2-NEXT: orl %eax, %esi
; SSE2-NEXT: shlq $32, %rsi
; SSE2-NEXT: orq %rcx, %rsi
; SSE2-NEXT: movq %rsi, %xmm1
; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; AVX-LABEL: extract_v32bf16_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x bfloat> %a
}
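
; Concatenation with zero only needs the upper half cleared: a VEX-encoded
; vmovaps %xmm0, %xmm0 implicitly zeroes bits 128 and up of the ymm register.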
define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_zero_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %xmm0, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_zero_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_zero_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <8 x bfloat> %x, <8 x bfloat> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x bfloat> %a
}
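
; The duplicated low half folds into a single movddup/movlhps before the
; concat.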
define <16 x bfloat> @concat_dup_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_dup_v8bf16:
; X86: # %bb.0:
; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; SSE2-LABEL: concat_dup_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; AVX-LABEL: concat_dup_v8bf16:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x bfloat> %a
}
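
; fptrunc followed by fpext is not a no-op (it rounds to bfloat precision), so
; the round trip must stay: convert down, then shift the 16-bit result back up.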
define float @trunc_ext(float %a) nounwind {
; X86-LABEL: trunc_ext:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vmovd %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
;
; SSE2-LABEL: trunc_ext:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
; FP16-LABEL: trunc_ext:
; FP16: # %bb.0:
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: trunc_ext:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vmovd %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: retq
%b = fptrunc float %a to bfloat
%c = fpext bfloat %b to float
ret float %c
}
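
; Regression test for PR92471: fpext of an odd-sized <7 x bfloat> load must
; load exactly fourteen bytes and not read past the seventh element.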
define void @PR92471(ptr %0, ptr %1) nounwind {
; X86-LABEL: PR92471:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, 4(%ecx), %xmm0, %xmm0
; X86-NEXT: vpinsrd $2, 8(%ecx), %xmm0, %xmm0
; X86-NEXT: vpinsrw $6, 12(%ecx), %xmm0, %xmm0
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpextrd $2, %xmm1, 24(%eax)
; X86-NEXT: vpextrd $1, %xmm1, 20(%eax)
; X86-NEXT: vmovd %xmm1, 16(%eax)
; X86-NEXT: vmovdqu %xmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: PR92471:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pinsrw $2, 12(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movdqu %xmm2, (%rsi)
; SSE2-NEXT: movq %xmm3, 16(%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT: movd %xmm0, 24(%rsi)
; SSE2-NEXT: retq
;
; AVX-LABEL: PR92471:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $16, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpextrd $2, %xmm1, 24(%rsi)
; AVX-NEXT: vmovq %xmm1, 16(%rsi)
; AVX-NEXT: vmovdqu %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%3 = load <7 x bfloat>, ptr %0, align 2
%4 = fpext <7 x bfloat> %3 to <7 x float>
store <7 x float> %4, ptr %1, align 4
ret void
}