1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
12 define float @cvt_i16_to_f32(i16 %a0) nounwind {
13 ; ALL-LABEL: cvt_i16_to_f32:
15 ; ALL-NEXT: movswl %di, %eax
16 ; ALL-NEXT: vmovd %eax, %xmm0
17 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
19 %1 = bitcast i16 %a0 to half
20 %2 = fpext half %1 to float
24 define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
25 ; ALL-LABEL: cvt_4i16_to_4f32:
27 ; ALL-NEXT: vmovq %xmm0, %rax
28 ; ALL-NEXT: movq %rax, %rcx
29 ; ALL-NEXT: movq %rax, %rdx
30 ; ALL-NEXT: movswl %ax, %esi
31 ; ALL-NEXT: # kill: def $eax killed $eax killed $rax
32 ; ALL-NEXT: shrl $16, %eax
33 ; ALL-NEXT: shrq $32, %rcx
34 ; ALL-NEXT: shrq $48, %rdx
35 ; ALL-NEXT: movswl %dx, %edx
36 ; ALL-NEXT: vmovd %edx, %xmm0
37 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
38 ; ALL-NEXT: movswl %cx, %ecx
39 ; ALL-NEXT: vmovd %ecx, %xmm1
40 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
42 ; ALL-NEXT: vmovd %eax, %xmm2
43 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
44 ; ALL-NEXT: vmovd %esi, %xmm3
45 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
46 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
47 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
48 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
50 %1 = bitcast <4 x i16> %a0 to <4 x half>
51 %2 = fpext <4 x half> %1 to <4 x float>
55 define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
56 ; ALL-LABEL: cvt_8i16_to_4f32:
58 ; ALL-NEXT: vmovq %xmm0, %rax
59 ; ALL-NEXT: movq %rax, %rcx
60 ; ALL-NEXT: movq %rax, %rdx
61 ; ALL-NEXT: movswl %ax, %esi
62 ; ALL-NEXT: # kill: def $eax killed $eax killed $rax
63 ; ALL-NEXT: shrl $16, %eax
64 ; ALL-NEXT: shrq $32, %rcx
65 ; ALL-NEXT: shrq $48, %rdx
66 ; ALL-NEXT: movswl %dx, %edx
67 ; ALL-NEXT: vmovd %edx, %xmm0
68 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
69 ; ALL-NEXT: movswl %cx, %ecx
70 ; ALL-NEXT: vmovd %ecx, %xmm1
71 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
73 ; ALL-NEXT: vmovd %eax, %xmm2
74 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
75 ; ALL-NEXT: vmovd %esi, %xmm3
76 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
77 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
78 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
79 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
81 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
82 %2 = bitcast <4 x i16> %1 to <4 x half>
83 %3 = fpext <4 x half> %2 to <4 x float>
87 define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
88 ; ALL-LABEL: cvt_8i16_to_8f32:
90 ; ALL-NEXT: vpextrq $1, %xmm0, %rdx
91 ; ALL-NEXT: movq %rdx, %r8
92 ; ALL-NEXT: movq %rdx, %r10
93 ; ALL-NEXT: movswl %dx, %r9d
94 ; ALL-NEXT: # kill: def $edx killed $edx killed $rdx
95 ; ALL-NEXT: shrl $16, %edx
96 ; ALL-NEXT: shrq $32, %r8
97 ; ALL-NEXT: shrq $48, %r10
98 ; ALL-NEXT: vmovq %xmm0, %rdi
99 ; ALL-NEXT: movq %rdi, %rax
100 ; ALL-NEXT: movq %rdi, %rsi
101 ; ALL-NEXT: movswl %di, %ecx
102 ; ALL-NEXT: # kill: def $edi killed $edi killed $rdi
103 ; ALL-NEXT: shrl $16, %edi
104 ; ALL-NEXT: shrq $32, %rax
105 ; ALL-NEXT: shrq $48, %rsi
106 ; ALL-NEXT: movswl %si, %esi
107 ; ALL-NEXT: vmovd %esi, %xmm0
108 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
110 ; ALL-NEXT: vmovd %eax, %xmm1
111 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
112 ; ALL-NEXT: movswl %di, %eax
113 ; ALL-NEXT: vmovd %eax, %xmm2
114 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
115 ; ALL-NEXT: vmovd %ecx, %xmm3
116 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
117 ; ALL-NEXT: movswl %r10w, %eax
118 ; ALL-NEXT: vmovd %eax, %xmm4
119 ; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
120 ; ALL-NEXT: movswl %r8w, %eax
121 ; ALL-NEXT: vmovd %eax, %xmm5
122 ; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
123 ; ALL-NEXT: movswl %dx, %eax
124 ; ALL-NEXT: vmovd %eax, %xmm6
125 ; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
126 ; ALL-NEXT: vmovd %r9d, %xmm7
127 ; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
128 ; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
129 ; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
130 ; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
131 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
132 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
133 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
134 ; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
136 %1 = bitcast <8 x i16> %a0 to <8 x half>
137 %2 = fpext <8 x half> %1 to <8 x float>
141 define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
142 ; AVX1-LABEL: cvt_16i16_to_16f32:
144 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
145 ; AVX1-NEXT: vmovq %xmm4, %rax
146 ; AVX1-NEXT: movq %rax, %rcx
147 ; AVX1-NEXT: shrq $48, %rcx
148 ; AVX1-NEXT: movswl %cx, %ecx
149 ; AVX1-NEXT: vmovd %ecx, %xmm8
150 ; AVX1-NEXT: movq %rax, %rcx
151 ; AVX1-NEXT: shrq $32, %rcx
152 ; AVX1-NEXT: movswl %cx, %ecx
153 ; AVX1-NEXT: vmovd %ecx, %xmm9
154 ; AVX1-NEXT: movswl %ax, %ecx
155 ; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
156 ; AVX1-NEXT: shrl $16, %eax
158 ; AVX1-NEXT: vmovd %eax, %xmm10
159 ; AVX1-NEXT: vpextrq $1, %xmm4, %rax
160 ; AVX1-NEXT: vmovd %ecx, %xmm11
161 ; AVX1-NEXT: movq %rax, %rcx
162 ; AVX1-NEXT: shrq $48, %rcx
163 ; AVX1-NEXT: movswl %cx, %ecx
164 ; AVX1-NEXT: vmovd %ecx, %xmm12
165 ; AVX1-NEXT: movq %rax, %rcx
166 ; AVX1-NEXT: shrq $32, %rcx
167 ; AVX1-NEXT: movswl %cx, %ecx
168 ; AVX1-NEXT: vmovd %ecx, %xmm13
169 ; AVX1-NEXT: movswl %ax, %ecx
170 ; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
171 ; AVX1-NEXT: shrl $16, %eax
173 ; AVX1-NEXT: vmovd %eax, %xmm14
174 ; AVX1-NEXT: vmovq %xmm0, %rax
175 ; AVX1-NEXT: vmovd %ecx, %xmm15
176 ; AVX1-NEXT: movq %rax, %rcx
177 ; AVX1-NEXT: shrq $48, %rcx
178 ; AVX1-NEXT: movswl %cx, %ecx
179 ; AVX1-NEXT: vmovd %ecx, %xmm2
180 ; AVX1-NEXT: movq %rax, %rcx
181 ; AVX1-NEXT: shrq $32, %rcx
182 ; AVX1-NEXT: movswl %cx, %ecx
183 ; AVX1-NEXT: vmovd %ecx, %xmm3
184 ; AVX1-NEXT: movswl %ax, %ecx
185 ; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
186 ; AVX1-NEXT: shrl $16, %eax
188 ; AVX1-NEXT: vmovd %eax, %xmm4
189 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
190 ; AVX1-NEXT: vmovd %ecx, %xmm0
191 ; AVX1-NEXT: movq %rax, %rcx
192 ; AVX1-NEXT: shrq $48, %rcx
193 ; AVX1-NEXT: movswl %cx, %ecx
194 ; AVX1-NEXT: vmovd %ecx, %xmm5
195 ; AVX1-NEXT: movq %rax, %rcx
196 ; AVX1-NEXT: shrq $32, %rcx
197 ; AVX1-NEXT: movswl %cx, %ecx
198 ; AVX1-NEXT: vmovd %ecx, %xmm6
199 ; AVX1-NEXT: movl %eax, %ecx
200 ; AVX1-NEXT: shrl $16, %ecx
201 ; AVX1-NEXT: movswl %cx, %ecx
202 ; AVX1-NEXT: vmovd %ecx, %xmm7
204 ; AVX1-NEXT: vmovd %eax, %xmm1
205 ; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8
206 ; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9
207 ; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10
208 ; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11
209 ; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12
210 ; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13
211 ; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14
212 ; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15
213 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
214 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
215 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
216 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
217 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
218 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
219 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
220 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
221 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
222 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
223 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
224 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
225 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
226 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
227 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
228 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
229 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
230 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
231 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
232 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
233 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
234 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
237 ; AVX2-LABEL: cvt_16i16_to_16f32:
239 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
240 ; AVX2-NEXT: vmovq %xmm4, %rax
241 ; AVX2-NEXT: movq %rax, %rcx
242 ; AVX2-NEXT: shrq $48, %rcx
243 ; AVX2-NEXT: movswl %cx, %ecx
244 ; AVX2-NEXT: vmovd %ecx, %xmm8
245 ; AVX2-NEXT: movq %rax, %rcx
246 ; AVX2-NEXT: shrq $32, %rcx
247 ; AVX2-NEXT: movswl %cx, %ecx
248 ; AVX2-NEXT: vmovd %ecx, %xmm9
249 ; AVX2-NEXT: movswl %ax, %ecx
250 ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
251 ; AVX2-NEXT: shrl $16, %eax
253 ; AVX2-NEXT: vmovd %eax, %xmm10
254 ; AVX2-NEXT: vpextrq $1, %xmm4, %rax
255 ; AVX2-NEXT: vmovd %ecx, %xmm11
256 ; AVX2-NEXT: movq %rax, %rcx
257 ; AVX2-NEXT: shrq $48, %rcx
258 ; AVX2-NEXT: movswl %cx, %ecx
259 ; AVX2-NEXT: vmovd %ecx, %xmm12
260 ; AVX2-NEXT: movq %rax, %rcx
261 ; AVX2-NEXT: shrq $32, %rcx
262 ; AVX2-NEXT: movswl %cx, %ecx
263 ; AVX2-NEXT: vmovd %ecx, %xmm13
264 ; AVX2-NEXT: movswl %ax, %ecx
265 ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
266 ; AVX2-NEXT: shrl $16, %eax
268 ; AVX2-NEXT: vmovd %eax, %xmm14
269 ; AVX2-NEXT: vmovq %xmm0, %rax
270 ; AVX2-NEXT: vmovd %ecx, %xmm15
271 ; AVX2-NEXT: movq %rax, %rcx
272 ; AVX2-NEXT: shrq $48, %rcx
273 ; AVX2-NEXT: movswl %cx, %ecx
274 ; AVX2-NEXT: vmovd %ecx, %xmm2
275 ; AVX2-NEXT: movq %rax, %rcx
276 ; AVX2-NEXT: shrq $32, %rcx
277 ; AVX2-NEXT: movswl %cx, %ecx
278 ; AVX2-NEXT: vmovd %ecx, %xmm3
279 ; AVX2-NEXT: movswl %ax, %ecx
280 ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
281 ; AVX2-NEXT: shrl $16, %eax
283 ; AVX2-NEXT: vmovd %eax, %xmm4
284 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
285 ; AVX2-NEXT: vmovd %ecx, %xmm0
286 ; AVX2-NEXT: movq %rax, %rcx
287 ; AVX2-NEXT: shrq $48, %rcx
288 ; AVX2-NEXT: movswl %cx, %ecx
289 ; AVX2-NEXT: vmovd %ecx, %xmm5
290 ; AVX2-NEXT: movq %rax, %rcx
291 ; AVX2-NEXT: shrq $32, %rcx
292 ; AVX2-NEXT: movswl %cx, %ecx
293 ; AVX2-NEXT: vmovd %ecx, %xmm6
294 ; AVX2-NEXT: movl %eax, %ecx
295 ; AVX2-NEXT: shrl $16, %ecx
296 ; AVX2-NEXT: movswl %cx, %ecx
297 ; AVX2-NEXT: vmovd %ecx, %xmm7
299 ; AVX2-NEXT: vmovd %eax, %xmm1
300 ; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8
301 ; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9
302 ; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10
303 ; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11
304 ; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12
305 ; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13
306 ; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14
307 ; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15
308 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
309 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
310 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
311 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
312 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
313 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
314 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
315 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
316 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
317 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
318 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
319 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
320 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
321 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
322 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
323 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
324 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
325 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
326 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
327 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
328 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
329 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
332 ; AVX512F-LABEL: cvt_16i16_to_16f32:
334 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
335 ; AVX512F-NEXT: vmovq %xmm0, %rax
336 ; AVX512F-NEXT: movq %rax, %rcx
337 ; AVX512F-NEXT: shrq $48, %rcx
338 ; AVX512F-NEXT: movswl %cx, %ecx
339 ; AVX512F-NEXT: vmovd %ecx, %xmm8
340 ; AVX512F-NEXT: movq %rax, %rcx
341 ; AVX512F-NEXT: shrq $32, %rcx
342 ; AVX512F-NEXT: movswl %cx, %ecx
343 ; AVX512F-NEXT: vmovd %ecx, %xmm9
344 ; AVX512F-NEXT: movswl %ax, %ecx
345 ; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
346 ; AVX512F-NEXT: shrl $16, %eax
348 ; AVX512F-NEXT: vmovd %eax, %xmm11
349 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
350 ; AVX512F-NEXT: vmovd %ecx, %xmm12
351 ; AVX512F-NEXT: movq %rax, %rcx
352 ; AVX512F-NEXT: shrq $48, %rcx
353 ; AVX512F-NEXT: movswl %cx, %ecx
354 ; AVX512F-NEXT: vmovd %ecx, %xmm13
355 ; AVX512F-NEXT: movq %rax, %rcx
356 ; AVX512F-NEXT: shrq $32, %rcx
357 ; AVX512F-NEXT: movswl %cx, %ecx
358 ; AVX512F-NEXT: vmovd %ecx, %xmm14
359 ; AVX512F-NEXT: movswl %ax, %ecx
360 ; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
361 ; AVX512F-NEXT: shrl $16, %eax
363 ; AVX512F-NEXT: vmovd %eax, %xmm15
364 ; AVX512F-NEXT: vmovq %xmm10, %rax
365 ; AVX512F-NEXT: vmovd %ecx, %xmm2
366 ; AVX512F-NEXT: movq %rax, %rcx
367 ; AVX512F-NEXT: shrq $48, %rcx
368 ; AVX512F-NEXT: movswl %cx, %ecx
369 ; AVX512F-NEXT: vmovd %ecx, %xmm3
370 ; AVX512F-NEXT: movq %rax, %rcx
371 ; AVX512F-NEXT: shrq $32, %rcx
372 ; AVX512F-NEXT: movswl %cx, %ecx
373 ; AVX512F-NEXT: vmovd %ecx, %xmm1
374 ; AVX512F-NEXT: movswl %ax, %ecx
375 ; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
376 ; AVX512F-NEXT: shrl $16, %eax
378 ; AVX512F-NEXT: vmovd %eax, %xmm4
379 ; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
380 ; AVX512F-NEXT: vmovd %ecx, %xmm10
381 ; AVX512F-NEXT: movq %rax, %rcx
382 ; AVX512F-NEXT: shrq $48, %rcx
383 ; AVX512F-NEXT: movswl %cx, %ecx
384 ; AVX512F-NEXT: vmovd %ecx, %xmm5
385 ; AVX512F-NEXT: movq %rax, %rcx
386 ; AVX512F-NEXT: shrq $32, %rcx
387 ; AVX512F-NEXT: movswl %cx, %ecx
388 ; AVX512F-NEXT: vmovd %ecx, %xmm6
389 ; AVX512F-NEXT: movl %eax, %ecx
390 ; AVX512F-NEXT: shrl $16, %ecx
391 ; AVX512F-NEXT: movswl %cx, %ecx
392 ; AVX512F-NEXT: vmovd %ecx, %xmm7
394 ; AVX512F-NEXT: vmovd %eax, %xmm0
395 ; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
396 ; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
397 ; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
398 ; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
399 ; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
400 ; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
401 ; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
402 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
403 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
404 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
405 ; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
406 ; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
407 ; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
408 ; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
409 ; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
410 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
411 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
412 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
413 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
414 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
415 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
416 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
417 ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
418 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
419 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
420 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
421 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
422 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
423 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
424 ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
425 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
428 ; AVX512VL-LABEL: cvt_16i16_to_16f32:
430 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10
431 ; AVX512VL-NEXT: vmovq %xmm0, %rax
432 ; AVX512VL-NEXT: movq %rax, %rcx
433 ; AVX512VL-NEXT: shrq $48, %rcx
434 ; AVX512VL-NEXT: movswl %cx, %ecx
435 ; AVX512VL-NEXT: vmovd %ecx, %xmm8
436 ; AVX512VL-NEXT: movq %rax, %rcx
437 ; AVX512VL-NEXT: shrq $32, %rcx
438 ; AVX512VL-NEXT: movswl %cx, %ecx
439 ; AVX512VL-NEXT: vmovd %ecx, %xmm9
440 ; AVX512VL-NEXT: movswl %ax, %ecx
441 ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
442 ; AVX512VL-NEXT: shrl $16, %eax
443 ; AVX512VL-NEXT: cwtl
444 ; AVX512VL-NEXT: vmovd %eax, %xmm11
445 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
446 ; AVX512VL-NEXT: vmovd %ecx, %xmm12
447 ; AVX512VL-NEXT: movq %rax, %rcx
448 ; AVX512VL-NEXT: shrq $48, %rcx
449 ; AVX512VL-NEXT: movswl %cx, %ecx
450 ; AVX512VL-NEXT: vmovd %ecx, %xmm13
451 ; AVX512VL-NEXT: movq %rax, %rcx
452 ; AVX512VL-NEXT: shrq $32, %rcx
453 ; AVX512VL-NEXT: movswl %cx, %ecx
454 ; AVX512VL-NEXT: vmovd %ecx, %xmm14
455 ; AVX512VL-NEXT: movswl %ax, %ecx
456 ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
457 ; AVX512VL-NEXT: shrl $16, %eax
458 ; AVX512VL-NEXT: cwtl
459 ; AVX512VL-NEXT: vmovd %eax, %xmm15
460 ; AVX512VL-NEXT: vmovq %xmm10, %rax
461 ; AVX512VL-NEXT: vmovd %ecx, %xmm16
462 ; AVX512VL-NEXT: movq %rax, %rcx
463 ; AVX512VL-NEXT: shrq $48, %rcx
464 ; AVX512VL-NEXT: movswl %cx, %ecx
465 ; AVX512VL-NEXT: vmovd %ecx, %xmm17
466 ; AVX512VL-NEXT: movq %rax, %rcx
467 ; AVX512VL-NEXT: shrq $32, %rcx
468 ; AVX512VL-NEXT: movswl %cx, %ecx
469 ; AVX512VL-NEXT: vmovd %ecx, %xmm18
470 ; AVX512VL-NEXT: movswl %ax, %ecx
471 ; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
472 ; AVX512VL-NEXT: shrl $16, %eax
473 ; AVX512VL-NEXT: cwtl
474 ; AVX512VL-NEXT: vmovd %eax, %xmm19
475 ; AVX512VL-NEXT: vpextrq $1, %xmm10, %rax
476 ; AVX512VL-NEXT: vmovd %ecx, %xmm10
477 ; AVX512VL-NEXT: movq %rax, %rcx
478 ; AVX512VL-NEXT: shrq $48, %rcx
479 ; AVX512VL-NEXT: movswl %cx, %ecx
480 ; AVX512VL-NEXT: vmovd %ecx, %xmm20
481 ; AVX512VL-NEXT: movq %rax, %rcx
482 ; AVX512VL-NEXT: shrq $32, %rcx
483 ; AVX512VL-NEXT: movswl %cx, %ecx
484 ; AVX512VL-NEXT: vmovd %ecx, %xmm21
485 ; AVX512VL-NEXT: movl %eax, %ecx
486 ; AVX512VL-NEXT: shrl $16, %ecx
487 ; AVX512VL-NEXT: movswl %cx, %ecx
488 ; AVX512VL-NEXT: vmovd %ecx, %xmm22
489 ; AVX512VL-NEXT: cwtl
490 ; AVX512VL-NEXT: vmovd %eax, %xmm2
491 ; AVX512VL-NEXT: vcvtph2ps %xmm8, %xmm8
492 ; AVX512VL-NEXT: vcvtph2ps %xmm9, %xmm9
493 ; AVX512VL-NEXT: vcvtph2ps %xmm11, %xmm11
494 ; AVX512VL-NEXT: vcvtph2ps %xmm12, %xmm12
495 ; AVX512VL-NEXT: vcvtph2ps %xmm13, %xmm13
496 ; AVX512VL-NEXT: vcvtph2ps %xmm14, %xmm14
497 ; AVX512VL-NEXT: vcvtph2ps %xmm15, %xmm15
498 ; AVX512VL-NEXT: vcvtph2ps %xmm16, %xmm16
499 ; AVX512VL-NEXT: vcvtph2ps %xmm17, %xmm4
500 ; AVX512VL-NEXT: vcvtph2ps %xmm18, %xmm0
501 ; AVX512VL-NEXT: vcvtph2ps %xmm19, %xmm5
502 ; AVX512VL-NEXT: vcvtph2ps %xmm10, %xmm7
503 ; AVX512VL-NEXT: vcvtph2ps %xmm20, %xmm3
504 ; AVX512VL-NEXT: vcvtph2ps %xmm21, %xmm6
505 ; AVX512VL-NEXT: vcvtph2ps %xmm22, %xmm1
506 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
507 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
508 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
509 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
510 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
511 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
512 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
513 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
514 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
515 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
516 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
517 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
518 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
519 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
520 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
521 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
522 ; AVX512VL-NEXT: retq
523 %1 = bitcast <16 x i16> %a0 to <16 x half>
524 %2 = fpext <16 x half> %1 to <16 x float>
529 ; Half to Float (Load)
532 define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
533 ; ALL-LABEL: load_cvt_i16_to_f32:
535 ; ALL-NEXT: movswl (%rdi), %eax
536 ; ALL-NEXT: vmovd %eax, %xmm0
537 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
539 %1 = load i16, i16* %a0
540 %2 = bitcast i16 %1 to half
541 %3 = fpext half %2 to float
545 define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
546 ; ALL-LABEL: load_cvt_4i16_to_4f32:
548 ; ALL-NEXT: movswl 6(%rdi), %eax
549 ; ALL-NEXT: vmovd %eax, %xmm0
550 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
551 ; ALL-NEXT: movswl 4(%rdi), %eax
552 ; ALL-NEXT: vmovd %eax, %xmm1
553 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
554 ; ALL-NEXT: movswl (%rdi), %eax
555 ; ALL-NEXT: vmovd %eax, %xmm2
556 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
557 ; ALL-NEXT: movswl 2(%rdi), %eax
558 ; ALL-NEXT: vmovd %eax, %xmm3
559 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
560 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
561 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
562 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
564 %1 = load <4 x i16>, <4 x i16>* %a0
565 %2 = bitcast <4 x i16> %1 to <4 x half>
566 %3 = fpext <4 x half> %2 to <4 x float>
570 define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
571 ; ALL-LABEL: load_cvt_8i16_to_4f32:
573 ; ALL-NEXT: movq (%rdi), %rax
574 ; ALL-NEXT: movq %rax, %rcx
575 ; ALL-NEXT: movq %rax, %rdx
576 ; ALL-NEXT: movswl %ax, %esi
577 ; ALL-NEXT: # kill: def $eax killed $eax killed $rax
578 ; ALL-NEXT: shrl $16, %eax
579 ; ALL-NEXT: shrq $32, %rcx
580 ; ALL-NEXT: shrq $48, %rdx
581 ; ALL-NEXT: movswl %dx, %edx
582 ; ALL-NEXT: vmovd %edx, %xmm0
583 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
584 ; ALL-NEXT: movswl %cx, %ecx
585 ; ALL-NEXT: vmovd %ecx, %xmm1
586 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
588 ; ALL-NEXT: vmovd %eax, %xmm2
589 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
590 ; ALL-NEXT: vmovd %esi, %xmm3
591 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
592 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
593 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
594 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
596 %1 = load <8 x i16>, <8 x i16>* %a0
597 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
598 %3 = bitcast <4 x i16> %2 to <4 x half>
599 %4 = fpext <4 x half> %3 to <4 x float>
603 define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
604 ; ALL-LABEL: load_cvt_8i16_to_8f32:
606 ; ALL-NEXT: movswl 6(%rdi), %eax
607 ; ALL-NEXT: vmovd %eax, %xmm0
608 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
609 ; ALL-NEXT: movswl 4(%rdi), %eax
610 ; ALL-NEXT: vmovd %eax, %xmm1
611 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
612 ; ALL-NEXT: movswl (%rdi), %eax
613 ; ALL-NEXT: vmovd %eax, %xmm2
614 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
615 ; ALL-NEXT: movswl 2(%rdi), %eax
616 ; ALL-NEXT: vmovd %eax, %xmm3
617 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
618 ; ALL-NEXT: movswl 14(%rdi), %eax
619 ; ALL-NEXT: vmovd %eax, %xmm4
620 ; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
621 ; ALL-NEXT: movswl 12(%rdi), %eax
622 ; ALL-NEXT: vmovd %eax, %xmm5
623 ; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
624 ; ALL-NEXT: movswl 8(%rdi), %eax
625 ; ALL-NEXT: vmovd %eax, %xmm6
626 ; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
627 ; ALL-NEXT: movswl 10(%rdi), %eax
628 ; ALL-NEXT: vmovd %eax, %xmm7
629 ; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
630 ; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
631 ; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
632 ; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
633 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
634 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
635 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
636 ; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
638 %1 = load <8 x i16>, <8 x i16>* %a0
639 %2 = bitcast <8 x i16> %1 to <8 x half>
640 %3 = fpext <8 x half> %2 to <8 x float>
644 define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
645 ; AVX1-LABEL: load_cvt_16i16_to_16f32:
647 ; AVX1-NEXT: movswl 22(%rdi), %eax
648 ; AVX1-NEXT: vmovd %eax, %xmm0
649 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
650 ; AVX1-NEXT: movswl 20(%rdi), %eax
651 ; AVX1-NEXT: vmovd %eax, %xmm0
652 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9
653 ; AVX1-NEXT: movswl 16(%rdi), %eax
654 ; AVX1-NEXT: vmovd %eax, %xmm0
655 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10
656 ; AVX1-NEXT: movswl 18(%rdi), %eax
657 ; AVX1-NEXT: vmovd %eax, %xmm0
658 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11
659 ; AVX1-NEXT: movswl 30(%rdi), %eax
660 ; AVX1-NEXT: vmovd %eax, %xmm0
661 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12
662 ; AVX1-NEXT: movswl 28(%rdi), %eax
663 ; AVX1-NEXT: vmovd %eax, %xmm0
664 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13
665 ; AVX1-NEXT: movswl 24(%rdi), %eax
666 ; AVX1-NEXT: vmovd %eax, %xmm0
667 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14
668 ; AVX1-NEXT: movswl 26(%rdi), %eax
669 ; AVX1-NEXT: vmovd %eax, %xmm0
670 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15
671 ; AVX1-NEXT: movswl 6(%rdi), %eax
672 ; AVX1-NEXT: vmovd %eax, %xmm0
673 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
674 ; AVX1-NEXT: movswl 4(%rdi), %eax
675 ; AVX1-NEXT: vmovd %eax, %xmm2
676 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
677 ; AVX1-NEXT: movswl (%rdi), %eax
678 ; AVX1-NEXT: vmovd %eax, %xmm3
679 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
680 ; AVX1-NEXT: movswl 2(%rdi), %eax
681 ; AVX1-NEXT: vmovd %eax, %xmm4
682 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
683 ; AVX1-NEXT: movswl 14(%rdi), %eax
684 ; AVX1-NEXT: vmovd %eax, %xmm5
685 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
686 ; AVX1-NEXT: movswl 12(%rdi), %eax
687 ; AVX1-NEXT: vmovd %eax, %xmm6
688 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
689 ; AVX1-NEXT: movswl 8(%rdi), %eax
690 ; AVX1-NEXT: vmovd %eax, %xmm7
691 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
692 ; AVX1-NEXT: movswl 10(%rdi), %eax
693 ; AVX1-NEXT: vmovd %eax, %xmm1
694 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
695 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
696 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
697 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
698 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
699 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
700 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
701 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
702 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
703 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
704 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
705 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
706 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
707 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
708 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
711 ; AVX2-LABEL: load_cvt_16i16_to_16f32:
713 ; AVX2-NEXT: movswl 22(%rdi), %eax
714 ; AVX2-NEXT: vmovd %eax, %xmm0
715 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
716 ; AVX2-NEXT: movswl 20(%rdi), %eax
717 ; AVX2-NEXT: vmovd %eax, %xmm0
718 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9
719 ; AVX2-NEXT: movswl 16(%rdi), %eax
720 ; AVX2-NEXT: vmovd %eax, %xmm0
721 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10
722 ; AVX2-NEXT: movswl 18(%rdi), %eax
723 ; AVX2-NEXT: vmovd %eax, %xmm0
724 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11
725 ; AVX2-NEXT: movswl 30(%rdi), %eax
726 ; AVX2-NEXT: vmovd %eax, %xmm0
727 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12
728 ; AVX2-NEXT: movswl 28(%rdi), %eax
729 ; AVX2-NEXT: vmovd %eax, %xmm0
730 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13
731 ; AVX2-NEXT: movswl 24(%rdi), %eax
732 ; AVX2-NEXT: vmovd %eax, %xmm0
733 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14
734 ; AVX2-NEXT: movswl 26(%rdi), %eax
735 ; AVX2-NEXT: vmovd %eax, %xmm0
736 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15
737 ; AVX2-NEXT: movswl 6(%rdi), %eax
738 ; AVX2-NEXT: vmovd %eax, %xmm0
739 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
740 ; AVX2-NEXT: movswl 4(%rdi), %eax
741 ; AVX2-NEXT: vmovd %eax, %xmm2
742 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
743 ; AVX2-NEXT: movswl (%rdi), %eax
744 ; AVX2-NEXT: vmovd %eax, %xmm3
745 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
746 ; AVX2-NEXT: movswl 2(%rdi), %eax
747 ; AVX2-NEXT: vmovd %eax, %xmm4
748 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
749 ; AVX2-NEXT: movswl 14(%rdi), %eax
750 ; AVX2-NEXT: vmovd %eax, %xmm5
751 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
752 ; AVX2-NEXT: movswl 12(%rdi), %eax
753 ; AVX2-NEXT: vmovd %eax, %xmm6
754 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
755 ; AVX2-NEXT: movswl 8(%rdi), %eax
756 ; AVX2-NEXT: vmovd %eax, %xmm7
757 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
758 ; AVX2-NEXT: movswl 10(%rdi), %eax
759 ; AVX2-NEXT: vmovd %eax, %xmm1
760 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
761 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
762 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
763 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
764 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
765 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
766 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
767 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
768 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
769 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
770 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
771 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
772 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
773 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
774 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
777 ; AVX512F-LABEL: load_cvt_16i16_to_16f32:
779 ; AVX512F-NEXT: movswl 6(%rdi), %eax
780 ; AVX512F-NEXT: vmovd %eax, %xmm0
781 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
782 ; AVX512F-NEXT: movswl 4(%rdi), %eax
783 ; AVX512F-NEXT: vmovd %eax, %xmm0
784 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
785 ; AVX512F-NEXT: movswl (%rdi), %eax
786 ; AVX512F-NEXT: vmovd %eax, %xmm0
787 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
788 ; AVX512F-NEXT: movswl 2(%rdi), %eax
789 ; AVX512F-NEXT: vmovd %eax, %xmm0
790 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
791 ; AVX512F-NEXT: movswl 14(%rdi), %eax
792 ; AVX512F-NEXT: vmovd %eax, %xmm0
793 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
794 ; AVX512F-NEXT: movswl 12(%rdi), %eax
795 ; AVX512F-NEXT: vmovd %eax, %xmm0
796 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
797 ; AVX512F-NEXT: movswl 8(%rdi), %eax
798 ; AVX512F-NEXT: vmovd %eax, %xmm0
799 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
800 ; AVX512F-NEXT: movswl 10(%rdi), %eax
801 ; AVX512F-NEXT: vmovd %eax, %xmm0
802 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
803 ; AVX512F-NEXT: movswl 22(%rdi), %eax
804 ; AVX512F-NEXT: vmovd %eax, %xmm0
805 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
806 ; AVX512F-NEXT: movswl 20(%rdi), %eax
807 ; AVX512F-NEXT: vmovd %eax, %xmm1
808 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
809 ; AVX512F-NEXT: movswl 16(%rdi), %eax
810 ; AVX512F-NEXT: vmovd %eax, %xmm2
811 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
812 ; AVX512F-NEXT: movswl 18(%rdi), %eax
813 ; AVX512F-NEXT: vmovd %eax, %xmm3
814 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
815 ; AVX512F-NEXT: movswl 30(%rdi), %eax
816 ; AVX512F-NEXT: vmovd %eax, %xmm4
817 ; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
818 ; AVX512F-NEXT: movswl 28(%rdi), %eax
819 ; AVX512F-NEXT: vmovd %eax, %xmm5
820 ; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
821 ; AVX512F-NEXT: movswl 24(%rdi), %eax
822 ; AVX512F-NEXT: vmovd %eax, %xmm6
823 ; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
824 ; AVX512F-NEXT: movswl 26(%rdi), %eax
825 ; AVX512F-NEXT: vmovd %eax, %xmm7
826 ; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
827 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
828 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
829 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
830 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
831 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
832 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
833 ; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
834 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
835 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
836 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
837 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
838 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
839 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
840 ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
841 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
844 ; AVX512VL-LABEL: load_cvt_16i16_to_16f32:
846 ; AVX512VL-NEXT: movswl 6(%rdi), %eax
847 ; AVX512VL-NEXT: vmovd %eax, %xmm0
848 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm8
849 ; AVX512VL-NEXT: movswl 4(%rdi), %eax
850 ; AVX512VL-NEXT: vmovd %eax, %xmm1
851 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm9
852 ; AVX512VL-NEXT: movswl (%rdi), %eax
853 ; AVX512VL-NEXT: vmovd %eax, %xmm2
854 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm10
855 ; AVX512VL-NEXT: movswl 2(%rdi), %eax
856 ; AVX512VL-NEXT: vmovd %eax, %xmm3
857 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm11
858 ; AVX512VL-NEXT: movswl 14(%rdi), %eax
859 ; AVX512VL-NEXT: vmovd %eax, %xmm4
860 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm12
861 ; AVX512VL-NEXT: movswl 12(%rdi), %eax
862 ; AVX512VL-NEXT: vmovd %eax, %xmm5
863 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm13
864 ; AVX512VL-NEXT: movswl 8(%rdi), %eax
865 ; AVX512VL-NEXT: vmovd %eax, %xmm6
866 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm14
867 ; AVX512VL-NEXT: movswl 10(%rdi), %eax
868 ; AVX512VL-NEXT: vmovd %eax, %xmm7
869 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm15
870 ; AVX512VL-NEXT: movswl 22(%rdi), %eax
871 ; AVX512VL-NEXT: vmovd %eax, %xmm0
872 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
873 ; AVX512VL-NEXT: movswl 20(%rdi), %eax
874 ; AVX512VL-NEXT: vmovd %eax, %xmm1
875 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
876 ; AVX512VL-NEXT: movswl 16(%rdi), %eax
877 ; AVX512VL-NEXT: vmovd %eax, %xmm2
878 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
879 ; AVX512VL-NEXT: movswl 18(%rdi), %eax
880 ; AVX512VL-NEXT: vmovd %eax, %xmm3
881 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
882 ; AVX512VL-NEXT: movswl 30(%rdi), %eax
883 ; AVX512VL-NEXT: vmovd %eax, %xmm4
884 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
885 ; AVX512VL-NEXT: movswl 28(%rdi), %eax
886 ; AVX512VL-NEXT: vmovd %eax, %xmm5
887 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
888 ; AVX512VL-NEXT: movswl 24(%rdi), %eax
889 ; AVX512VL-NEXT: vmovd %eax, %xmm6
890 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
891 ; AVX512VL-NEXT: movswl 26(%rdi), %eax
892 ; AVX512VL-NEXT: vmovd %eax, %xmm7
893 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
894 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
895 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
896 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
897 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
898 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
899 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
900 ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
901 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
902 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
903 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
904 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
905 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
906 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
907 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
908 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
909 ; AVX512VL-NEXT: retq
910 %1 = load <16 x i16>, <16 x i16>* %a0
911 %2 = bitcast <16 x i16> %1 to <16 x half>
912 %3 = fpext <16 x half> %2 to <16 x float>
920 define double @cvt_i16_to_f64(i16 %a0) nounwind {
921 ; ALL-LABEL: cvt_i16_to_f64:
923 ; ALL-NEXT: movswl %di, %eax
924 ; ALL-NEXT: vmovd %eax, %xmm0
925 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
926 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
928 %1 = bitcast i16 %a0 to half
929 %2 = fpext half %1 to double
933 define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
934 ; ALL-LABEL: cvt_2i16_to_2f64:
936 ; ALL-NEXT: vmovd %xmm0, %eax
937 ; ALL-NEXT: movswl %ax, %ecx
938 ; ALL-NEXT: shrl $16, %eax
940 ; ALL-NEXT: vmovd %eax, %xmm0
941 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
942 ; ALL-NEXT: vmovd %ecx, %xmm1
943 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
944 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
945 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
946 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
948 %1 = bitcast <2 x i16> %a0 to <2 x half>
949 %2 = fpext <2 x half> %1 to <2 x double>
953 define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
954 ; ALL-LABEL: cvt_4i16_to_4f64:
956 ; ALL-NEXT: vmovq %xmm0, %rax
957 ; ALL-NEXT: movq %rax, %rcx
958 ; ALL-NEXT: movl %eax, %edx
959 ; ALL-NEXT: movswl %ax, %esi
960 ; ALL-NEXT: shrq $48, %rax
961 ; ALL-NEXT: shrq $32, %rcx
962 ; ALL-NEXT: shrl $16, %edx
963 ; ALL-NEXT: movswl %dx, %edx
964 ; ALL-NEXT: vmovd %edx, %xmm0
965 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
966 ; ALL-NEXT: vmovd %esi, %xmm1
967 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
968 ; ALL-NEXT: movswl %cx, %ecx
969 ; ALL-NEXT: vmovd %ecx, %xmm2
970 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
972 ; ALL-NEXT: vmovd %eax, %xmm3
973 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
974 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
975 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
976 ; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
977 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
978 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
979 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
980 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
982 %1 = bitcast <4 x i16> %a0 to <4 x half>
983 %2 = fpext <4 x half> %1 to <4 x double>
987 define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
988 ; ALL-LABEL: cvt_8i16_to_2f64:
990 ; ALL-NEXT: vmovd %xmm0, %eax
991 ; ALL-NEXT: movswl %ax, %ecx
992 ; ALL-NEXT: shrl $16, %eax
994 ; ALL-NEXT: vmovd %eax, %xmm0
995 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
996 ; ALL-NEXT: vmovd %ecx, %xmm1
997 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
998 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
999 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1000 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1002 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1003 %2 = bitcast <2 x i16> %1 to <2 x half>
1004 %3 = fpext <2 x half> %2 to <2 x double>
1008 define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
1009 ; ALL-LABEL: cvt_8i16_to_4f64:
1011 ; ALL-NEXT: vmovq %xmm0, %rax
1012 ; ALL-NEXT: movq %rax, %rcx
1013 ; ALL-NEXT: movl %eax, %edx
1014 ; ALL-NEXT: movswl %ax, %esi
1015 ; ALL-NEXT: shrq $48, %rax
1016 ; ALL-NEXT: shrq $32, %rcx
1017 ; ALL-NEXT: shrl $16, %edx
1018 ; ALL-NEXT: movswl %dx, %edx
1019 ; ALL-NEXT: vmovd %edx, %xmm0
1020 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1021 ; ALL-NEXT: vmovd %esi, %xmm1
1022 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1023 ; ALL-NEXT: movswl %cx, %ecx
1024 ; ALL-NEXT: vmovd %ecx, %xmm2
1025 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
1027 ; ALL-NEXT: vmovd %eax, %xmm3
1028 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
1029 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1030 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1031 ; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1032 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1033 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1034 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1035 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1037 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1038 %2 = bitcast <4 x i16> %1 to <4 x half>
1039 %3 = fpext <4 x half> %2 to <4 x double>
1043 define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
1044 ; AVX1-LABEL: cvt_8i16_to_8f64:
1046 ; AVX1-NEXT: vmovq %xmm0, %rdx
1047 ; AVX1-NEXT: movq %rdx, %r9
1048 ; AVX1-NEXT: movl %edx, %r10d
1049 ; AVX1-NEXT: movswl %dx, %r8d
1050 ; AVX1-NEXT: shrq $48, %rdx
1051 ; AVX1-NEXT: shrq $32, %r9
1052 ; AVX1-NEXT: shrl $16, %r10d
1053 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
1054 ; AVX1-NEXT: movq %rdi, %rsi
1055 ; AVX1-NEXT: movl %edi, %eax
1056 ; AVX1-NEXT: movswl %di, %ecx
1057 ; AVX1-NEXT: shrq $48, %rdi
1058 ; AVX1-NEXT: shrq $32, %rsi
1059 ; AVX1-NEXT: shrl $16, %eax
1061 ; AVX1-NEXT: vmovd %eax, %xmm0
1062 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
1063 ; AVX1-NEXT: vmovd %ecx, %xmm0
1064 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
1065 ; AVX1-NEXT: movswl %si, %eax
1066 ; AVX1-NEXT: vmovd %eax, %xmm0
1067 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
1068 ; AVX1-NEXT: movswl %di, %eax
1069 ; AVX1-NEXT: vmovd %eax, %xmm0
1070 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
1071 ; AVX1-NEXT: movswl %r10w, %eax
1072 ; AVX1-NEXT: vmovd %eax, %xmm0
1073 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
1074 ; AVX1-NEXT: vmovd %r8d, %xmm5
1075 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
1076 ; AVX1-NEXT: movswl %r9w, %eax
1077 ; AVX1-NEXT: vmovd %eax, %xmm6
1078 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
1079 ; AVX1-NEXT: movswl %dx, %eax
1080 ; AVX1-NEXT: vmovd %eax, %xmm7
1081 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
1082 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1083 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1084 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1085 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1086 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1087 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1088 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1089 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1090 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1091 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1092 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1093 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1094 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1095 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1098 ; AVX2-LABEL: cvt_8i16_to_8f64:
1100 ; AVX2-NEXT: vmovq %xmm0, %rdx
1101 ; AVX2-NEXT: movq %rdx, %r9
1102 ; AVX2-NEXT: movl %edx, %r10d
1103 ; AVX2-NEXT: movswl %dx, %r8d
1104 ; AVX2-NEXT: shrq $48, %rdx
1105 ; AVX2-NEXT: shrq $32, %r9
1106 ; AVX2-NEXT: shrl $16, %r10d
1107 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
1108 ; AVX2-NEXT: movq %rdi, %rsi
1109 ; AVX2-NEXT: movl %edi, %eax
1110 ; AVX2-NEXT: movswl %di, %ecx
1111 ; AVX2-NEXT: shrq $48, %rdi
1112 ; AVX2-NEXT: shrq $32, %rsi
1113 ; AVX2-NEXT: shrl $16, %eax
1115 ; AVX2-NEXT: vmovd %eax, %xmm0
1116 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
1117 ; AVX2-NEXT: vmovd %ecx, %xmm0
1118 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
1119 ; AVX2-NEXT: movswl %si, %eax
1120 ; AVX2-NEXT: vmovd %eax, %xmm0
1121 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
1122 ; AVX2-NEXT: movswl %di, %eax
1123 ; AVX2-NEXT: vmovd %eax, %xmm0
1124 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
1125 ; AVX2-NEXT: movswl %r10w, %eax
1126 ; AVX2-NEXT: vmovd %eax, %xmm0
1127 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
1128 ; AVX2-NEXT: vmovd %r8d, %xmm5
1129 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
1130 ; AVX2-NEXT: movswl %r9w, %eax
1131 ; AVX2-NEXT: vmovd %eax, %xmm6
1132 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
1133 ; AVX2-NEXT: movswl %dx, %eax
1134 ; AVX2-NEXT: vmovd %eax, %xmm7
1135 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
1136 ; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1137 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1138 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1139 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1140 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1141 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
1142 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1143 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1144 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1145 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1146 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1147 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1148 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1149 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1152 ; AVX512-LABEL: cvt_8i16_to_8f64:
1154 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
1155 ; AVX512-NEXT: movq %rdx, %r9
1156 ; AVX512-NEXT: movl %edx, %r10d
1157 ; AVX512-NEXT: movswl %dx, %r8d
1158 ; AVX512-NEXT: shrq $48, %rdx
1159 ; AVX512-NEXT: shrq $32, %r9
1160 ; AVX512-NEXT: shrl $16, %r10d
1161 ; AVX512-NEXT: vmovq %xmm0, %rdi
1162 ; AVX512-NEXT: movq %rdi, %rsi
1163 ; AVX512-NEXT: movl %edi, %eax
1164 ; AVX512-NEXT: movswl %di, %ecx
1165 ; AVX512-NEXT: shrq $48, %rdi
1166 ; AVX512-NEXT: shrq $32, %rsi
1167 ; AVX512-NEXT: shrl $16, %eax
1169 ; AVX512-NEXT: vmovd %eax, %xmm0
1170 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
1171 ; AVX512-NEXT: vmovd %ecx, %xmm1
1172 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
1173 ; AVX512-NEXT: movswl %si, %eax
1174 ; AVX512-NEXT: vmovd %eax, %xmm2
1175 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1176 ; AVX512-NEXT: movswl %di, %eax
1177 ; AVX512-NEXT: vmovd %eax, %xmm3
1178 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1179 ; AVX512-NEXT: movswl %r10w, %eax
1180 ; AVX512-NEXT: vmovd %eax, %xmm4
1181 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1182 ; AVX512-NEXT: vmovd %r8d, %xmm5
1183 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1184 ; AVX512-NEXT: movswl %r9w, %eax
1185 ; AVX512-NEXT: vmovd %eax, %xmm6
1186 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
1187 ; AVX512-NEXT: movswl %dx, %eax
1188 ; AVX512-NEXT: vmovd %eax, %xmm7
1189 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
1190 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1191 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1192 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1193 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1194 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1195 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
1196 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1197 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1198 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1199 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1200 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1201 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1202 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1203 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1204 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1206 %1 = bitcast <8 x i16> %a0 to <8 x half>
1207 %2 = fpext <8 x half> %1 to <8 x double>
1212 ; Half to Double (Load)
1215 define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
1216 ; ALL-LABEL: load_cvt_i16_to_f64:
1218 ; ALL-NEXT: movswl (%rdi), %eax
1219 ; ALL-NEXT: vmovd %eax, %xmm0
1220 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1221 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1223 %1 = load i16, i16* %a0
1224 %2 = bitcast i16 %1 to half
1225 %3 = fpext half %2 to double
1229 define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
1230 ; ALL-LABEL: load_cvt_2i16_to_2f64:
1232 ; ALL-NEXT: movswl (%rdi), %eax
1233 ; ALL-NEXT: vmovd %eax, %xmm0
1234 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1235 ; ALL-NEXT: movswl 2(%rdi), %eax
1236 ; ALL-NEXT: vmovd %eax, %xmm1
1237 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1238 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1239 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1240 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1242 %1 = load <2 x i16>, <2 x i16>* %a0
1243 %2 = bitcast <2 x i16> %1 to <2 x half>
1244 %3 = fpext <2 x half> %2 to <2 x double>
1248 define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
1249 ; ALL-LABEL: load_cvt_4i16_to_4f64:
1251 ; ALL-NEXT: movswl (%rdi), %eax
1252 ; ALL-NEXT: vmovd %eax, %xmm0
1253 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1254 ; ALL-NEXT: movswl 2(%rdi), %eax
1255 ; ALL-NEXT: vmovd %eax, %xmm1
1256 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1257 ; ALL-NEXT: movswl 4(%rdi), %eax
1258 ; ALL-NEXT: vmovd %eax, %xmm2
1259 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
1260 ; ALL-NEXT: movswl 6(%rdi), %eax
1261 ; ALL-NEXT: vmovd %eax, %xmm3
1262 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
1263 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1264 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1265 ; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1266 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1267 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1268 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1269 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1271 %1 = load <4 x i16>, <4 x i16>* %a0
1272 %2 = bitcast <4 x i16> %1 to <4 x half>
1273 %3 = fpext <4 x half> %2 to <4 x double>
1277 define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
1278 ; ALL-LABEL: load_cvt_8i16_to_4f64:
1280 ; ALL-NEXT: movq (%rdi), %rax
1281 ; ALL-NEXT: movq %rax, %rcx
1282 ; ALL-NEXT: movl %eax, %edx
1283 ; ALL-NEXT: movswl %ax, %esi
1284 ; ALL-NEXT: shrq $48, %rax
1285 ; ALL-NEXT: shrq $32, %rcx
1286 ; ALL-NEXT: shrl $16, %edx
1287 ; ALL-NEXT: movswl %dx, %edx
1288 ; ALL-NEXT: vmovd %edx, %xmm0
1289 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1290 ; ALL-NEXT: vmovd %esi, %xmm1
1291 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1292 ; ALL-NEXT: movswl %cx, %ecx
1293 ; ALL-NEXT: vmovd %ecx, %xmm2
1294 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
1296 ; ALL-NEXT: vmovd %eax, %xmm3
1297 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
1298 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1299 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1300 ; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1301 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1302 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1303 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1304 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1306 %1 = load <8 x i16>, <8 x i16>* %a0
1307 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1308 %3 = bitcast <4 x i16> %2 to <4 x half>
1309 %4 = fpext <4 x half> %3 to <4 x double>
1313 define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
1314 ; AVX1-LABEL: load_cvt_8i16_to_8f64:
1316 ; AVX1-NEXT: movswl 8(%rdi), %eax
1317 ; AVX1-NEXT: vmovd %eax, %xmm0
1318 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
1319 ; AVX1-NEXT: movswl 10(%rdi), %eax
1320 ; AVX1-NEXT: vmovd %eax, %xmm0
1321 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
1322 ; AVX1-NEXT: movswl 12(%rdi), %eax
1323 ; AVX1-NEXT: vmovd %eax, %xmm0
1324 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
1325 ; AVX1-NEXT: movswl 14(%rdi), %eax
1326 ; AVX1-NEXT: vmovd %eax, %xmm0
1327 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
1328 ; AVX1-NEXT: movswl (%rdi), %eax
1329 ; AVX1-NEXT: vmovd %eax, %xmm0
1330 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
1331 ; AVX1-NEXT: movswl 2(%rdi), %eax
1332 ; AVX1-NEXT: vmovd %eax, %xmm5
1333 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
1334 ; AVX1-NEXT: movswl 4(%rdi), %eax
1335 ; AVX1-NEXT: vmovd %eax, %xmm6
1336 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
1337 ; AVX1-NEXT: movswl 6(%rdi), %eax
1338 ; AVX1-NEXT: vmovd %eax, %xmm7
1339 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
1340 ; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1341 ; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1342 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1343 ; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1344 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1345 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1346 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1347 ; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1348 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1349 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1350 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1351 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1352 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1353 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1356 ; AVX2-LABEL: load_cvt_8i16_to_8f64:
1358 ; AVX2-NEXT: movswl 8(%rdi), %eax
1359 ; AVX2-NEXT: vmovd %eax, %xmm0
1360 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
1361 ; AVX2-NEXT: movswl 10(%rdi), %eax
1362 ; AVX2-NEXT: vmovd %eax, %xmm0
1363 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
1364 ; AVX2-NEXT: movswl 12(%rdi), %eax
1365 ; AVX2-NEXT: vmovd %eax, %xmm0
1366 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
1367 ; AVX2-NEXT: movswl 14(%rdi), %eax
1368 ; AVX2-NEXT: vmovd %eax, %xmm0
1369 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
1370 ; AVX2-NEXT: movswl (%rdi), %eax
1371 ; AVX2-NEXT: vmovd %eax, %xmm0
1372 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
1373 ; AVX2-NEXT: movswl 2(%rdi), %eax
1374 ; AVX2-NEXT: vmovd %eax, %xmm5
1375 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
1376 ; AVX2-NEXT: movswl 4(%rdi), %eax
1377 ; AVX2-NEXT: vmovd %eax, %xmm6
1378 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
1379 ; AVX2-NEXT: movswl 6(%rdi), %eax
1380 ; AVX2-NEXT: vmovd %eax, %xmm7
1381 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
1382 ; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1383 ; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1384 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1385 ; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1386 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1387 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1388 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
1389 ; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1390 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1391 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
1392 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1393 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1394 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1395 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1398 ; AVX512-LABEL: load_cvt_8i16_to_8f64:
1400 ; AVX512-NEXT: movswl (%rdi), %eax
1401 ; AVX512-NEXT: vmovd %eax, %xmm0
1402 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
1403 ; AVX512-NEXT: movswl 2(%rdi), %eax
1404 ; AVX512-NEXT: vmovd %eax, %xmm1
1405 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
1406 ; AVX512-NEXT: movswl 4(%rdi), %eax
1407 ; AVX512-NEXT: vmovd %eax, %xmm2
1408 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1409 ; AVX512-NEXT: movswl 6(%rdi), %eax
1410 ; AVX512-NEXT: vmovd %eax, %xmm3
1411 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1412 ; AVX512-NEXT: movswl 8(%rdi), %eax
1413 ; AVX512-NEXT: vmovd %eax, %xmm4
1414 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1415 ; AVX512-NEXT: movswl 10(%rdi), %eax
1416 ; AVX512-NEXT: vmovd %eax, %xmm5
1417 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1418 ; AVX512-NEXT: movswl 12(%rdi), %eax
1419 ; AVX512-NEXT: vmovd %eax, %xmm6
1420 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
1421 ; AVX512-NEXT: movswl 14(%rdi), %eax
1422 ; AVX512-NEXT: vmovd %eax, %xmm7
1423 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
1424 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1425 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1426 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1427 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1428 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1429 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
1430 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1431 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1432 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1433 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1434 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1435 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1436 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1437 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1438 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1440 %1 = load <8 x i16>, <8 x i16>* %a0
1441 %2 = bitcast <8 x i16> %1 to <8 x half>
1442 %3 = fpext <8 x half> %2 to <8 x double>
1450 define i16 @cvt_f32_to_i16(float %a0) nounwind {
1451 ; ALL-LABEL: cvt_f32_to_i16:
1453 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1454 ; ALL-NEXT: vmovd %xmm0, %eax
1455 ; ALL-NEXT: # kill: def $ax killed $ax killed $eax
1457 %1 = fptrunc float %a0 to half
1458 %2 = bitcast half %1 to i16
1462 define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
1463 ; ALL-LABEL: cvt_4f32_to_4i16:
1465 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1466 ; ALL-NEXT: vmovd %xmm1, %eax
1467 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1468 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1469 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1470 ; ALL-NEXT: vmovd %xmm1, %eax
1471 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1472 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1473 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1474 ; ALL-NEXT: vmovd %xmm1, %eax
1475 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1476 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1477 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1478 ; ALL-NEXT: vmovd %xmm0, %eax
1479 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1480 ; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
1482 %1 = fptrunc <4 x float> %a0 to <4 x half>
1483 %2 = bitcast <4 x half> %1 to <4 x i16>
1484 ret <4 x i16> %2
1485 }
1487 define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
1488 ; ALL-LABEL: cvt_4f32_to_8i16_undef:
1490 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1491 ; ALL-NEXT: vmovd %xmm1, %eax
1492 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1493 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1494 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1495 ; ALL-NEXT: vmovd %xmm1, %eax
1496 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1497 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1498 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1499 ; ALL-NEXT: vmovd %xmm1, %eax
1500 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1501 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1502 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1503 ; ALL-NEXT: vmovd %xmm0, %eax
1504 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1505 ; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
1507 %1 = fptrunc <4 x float> %a0 to <4 x half>
1508 %2 = bitcast <4 x half> %1 to <4 x i16>
1509 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1510 ret <8 x i16> %3
1511 }
1513 define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
1514 ; ALL-LABEL: cvt_4f32_to_8i16_zero:
1516 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1517 ; ALL-NEXT: vmovd %xmm1, %eax
1518 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1519 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1520 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1521 ; ALL-NEXT: vmovd %xmm1, %eax
1522 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1523 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1524 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1525 ; ALL-NEXT: vmovd %xmm1, %eax
1526 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1527 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1528 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1529 ; ALL-NEXT: vmovd %xmm0, %eax
1530 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1531 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1533 %1 = fptrunc <4 x float> %a0 to <4 x half>
1534 %2 = bitcast <4 x half> %1 to <4 x i16>
1535 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1536 ret <8 x i16> %3
1537 }
1539 define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
1540 ; ALL-LABEL: cvt_8f32_to_8i16:
1542 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1543 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1544 ; ALL-NEXT: vmovd %xmm1, %eax
1545 ; ALL-NEXT: shll $16, %eax
1546 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1547 ; ALL-NEXT: vmovd %xmm1, %ecx
1548 ; ALL-NEXT: movzwl %cx, %ecx
1549 ; ALL-NEXT: orl %eax, %ecx
1550 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1551 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1552 ; ALL-NEXT: vmovd %xmm1, %edx
1553 ; ALL-NEXT: shll $16, %edx
1554 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1555 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1556 ; ALL-NEXT: vmovd %xmm1, %eax
1557 ; ALL-NEXT: movzwl %ax, %eax
1558 ; ALL-NEXT: orl %edx, %eax
1559 ; ALL-NEXT: shlq $32, %rax
1560 ; ALL-NEXT: orq %rcx, %rax
1561 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
1562 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1563 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1564 ; ALL-NEXT: vmovd %xmm1, %ecx
1565 ; ALL-NEXT: shll $16, %ecx
1566 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1567 ; ALL-NEXT: vmovd %xmm1, %edx
1568 ; ALL-NEXT: movzwl %dx, %edx
1569 ; ALL-NEXT: orl %ecx, %edx
1570 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1571 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1572 ; ALL-NEXT: vmovd %xmm1, %ecx
1573 ; ALL-NEXT: shll $16, %ecx
1574 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1575 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1576 ; ALL-NEXT: vmovd %xmm0, %esi
1577 ; ALL-NEXT: movzwl %si, %esi
1578 ; ALL-NEXT: orl %ecx, %esi
1579 ; ALL-NEXT: shlq $32, %rsi
1580 ; ALL-NEXT: orq %rdx, %rsi
1581 ; ALL-NEXT: vmovq %rsi, %xmm0
1582 ; ALL-NEXT: vmovq %rax, %xmm1
1583 ; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1584 ; ALL-NEXT: vzeroupper
1586 %1 = fptrunc <8 x float> %a0 to <8 x half>
1587 %2 = bitcast <8 x half> %1 to <8 x i16>
1588 ret <8 x i16> %2
1589 }
1591 define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
1592 ; AVX1-LABEL: cvt_16f32_to_16i16:
1594 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1595 ; AVX1-NEXT: vmovd %xmm2, %eax
1596 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1597 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1598 ; AVX1-NEXT: vmovd %eax, %xmm3
1599 ; AVX1-NEXT: vmovd %xmm2, %eax
1600 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1601 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1602 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1603 ; AVX1-NEXT: vmovd %xmm2, %eax
1604 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1605 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1606 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1607 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1608 ; AVX1-NEXT: vmovd %xmm1, %eax
1609 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1610 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1611 ; AVX1-NEXT: vmovd %xmm1, %eax
1612 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1613 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1614 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1615 ; AVX1-NEXT: vmovd %xmm1, %eax
1616 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1617 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1618 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1619 ; AVX1-NEXT: vmovd %xmm1, %eax
1620 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1621 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1622 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1623 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1624 ; AVX1-NEXT: vmovd %xmm2, %eax
1625 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1626 ; AVX1-NEXT: vmovd %xmm1, %eax
1627 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1628 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1629 ; AVX1-NEXT: vmovd %eax, %xmm3
1630 ; AVX1-NEXT: vmovd %xmm1, %eax
1631 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1632 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1633 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1634 ; AVX1-NEXT: vmovd %xmm1, %eax
1635 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1636 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1637 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1638 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1639 ; AVX1-NEXT: vmovd %xmm0, %eax
1640 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1641 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1642 ; AVX1-NEXT: vmovd %xmm0, %eax
1643 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1644 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1645 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1646 ; AVX1-NEXT: vmovd %xmm0, %eax
1647 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1648 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1649 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1650 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1651 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1652 ; AVX1-NEXT: vmovd %xmm1, %eax
1653 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1654 ; AVX1-NEXT: vmovd %xmm0, %eax
1655 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1656 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1659 ; AVX2-LABEL: cvt_16f32_to_16i16:
1661 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1662 ; AVX2-NEXT: vmovd %xmm2, %eax
1663 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1664 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1665 ; AVX2-NEXT: vmovd %eax, %xmm3
1666 ; AVX2-NEXT: vmovd %xmm2, %eax
1667 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1668 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1669 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1670 ; AVX2-NEXT: vmovd %xmm2, %eax
1671 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
1672 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1673 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1674 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1675 ; AVX2-NEXT: vmovd %xmm1, %eax
1676 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1677 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1678 ; AVX2-NEXT: vmovd %xmm1, %eax
1679 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1680 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1681 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1682 ; AVX2-NEXT: vmovd %xmm1, %eax
1683 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1684 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1685 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1686 ; AVX2-NEXT: vmovd %xmm1, %eax
1687 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1688 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1689 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1690 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1691 ; AVX2-NEXT: vmovd %xmm2, %eax
1692 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1693 ; AVX2-NEXT: vmovd %xmm1, %eax
1694 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1695 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1696 ; AVX2-NEXT: vmovd %eax, %xmm3
1697 ; AVX2-NEXT: vmovd %xmm1, %eax
1698 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1699 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1700 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1701 ; AVX2-NEXT: vmovd %xmm1, %eax
1702 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
1703 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1704 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1705 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1706 ; AVX2-NEXT: vmovd %xmm0, %eax
1707 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1708 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1709 ; AVX2-NEXT: vmovd %xmm0, %eax
1710 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1711 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1712 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1713 ; AVX2-NEXT: vmovd %xmm0, %eax
1714 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1715 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1716 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1717 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1718 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1719 ; AVX2-NEXT: vmovd %xmm1, %eax
1720 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1721 ; AVX2-NEXT: vmovd %xmm0, %eax
1722 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1723 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1726 ; AVX512-LABEL: cvt_16f32_to_16i16:
1728 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
1729 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1730 ; AVX512-NEXT: vmovd %xmm2, %eax
1731 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1732 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1733 ; AVX512-NEXT: vmovd %eax, %xmm3
1734 ; AVX512-NEXT: vmovd %xmm2, %eax
1735 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1736 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1737 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1738 ; AVX512-NEXT: vmovd %xmm2, %eax
1739 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
1740 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1741 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1742 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1743 ; AVX512-NEXT: vmovd %xmm1, %eax
1744 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1745 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1746 ; AVX512-NEXT: vmovd %xmm1, %eax
1747 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1748 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1749 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1750 ; AVX512-NEXT: vmovd %xmm1, %eax
1751 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1752 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1753 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1754 ; AVX512-NEXT: vmovd %xmm1, %eax
1755 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1756 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1757 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1758 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1759 ; AVX512-NEXT: vmovd %xmm2, %eax
1760 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1761 ; AVX512-NEXT: vmovd %xmm1, %eax
1762 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1763 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1764 ; AVX512-NEXT: vmovd %eax, %xmm3
1765 ; AVX512-NEXT: vmovd %xmm1, %eax
1766 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1767 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1768 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1769 ; AVX512-NEXT: vmovd %xmm1, %eax
1770 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
1771 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1772 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1773 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1774 ; AVX512-NEXT: vmovd %xmm0, %eax
1775 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1776 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1777 ; AVX512-NEXT: vmovd %xmm0, %eax
1778 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1779 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1780 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1781 ; AVX512-NEXT: vmovd %xmm0, %eax
1782 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
1783 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1784 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1785 ; AVX512-NEXT: vmovd %xmm0, %eax
1786 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1787 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1788 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1789 ; AVX512-NEXT: vmovd %xmm0, %eax
1790 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1791 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1793 %1 = fptrunc <16 x float> %a0 to <16 x half>
1794 %2 = bitcast <16 x half> %1 to <16 x i16>
1795 ret <16 x i16> %2
1796 }
1799 ; Float to Half (Store)
1802 define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
1803 ; ALL-LABEL: store_cvt_f32_to_i16:
1805 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1806 ; ALL-NEXT: vmovd %xmm0, %eax
1807 ; ALL-NEXT: movw %ax, (%rdi)
1809 %1 = fptrunc float %a0 to half
1810 %2 = bitcast half %1 to i16
1811 store i16 %2, i16* %a1
1812 ret void
1813 }
1815 define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
1816 ; ALL-LABEL: store_cvt_4f32_to_4i16:
1818 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1819 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1820 ; ALL-NEXT: vmovd %xmm1, %eax
1821 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1822 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1823 ; ALL-NEXT: vmovd %xmm1, %ecx
1824 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1825 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1826 ; ALL-NEXT: vmovd %xmm1, %edx
1827 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1828 ; ALL-NEXT: vmovd %xmm0, %esi
1829 ; ALL-NEXT: movw %si, (%rdi)
1830 ; ALL-NEXT: movw %dx, 6(%rdi)
1831 ; ALL-NEXT: movw %cx, 4(%rdi)
1832 ; ALL-NEXT: movw %ax, 2(%rdi)
1834 %1 = fptrunc <4 x float> %a0 to <4 x half>
1835 %2 = bitcast <4 x half> %1 to <4 x i16>
1836 store <4 x i16> %2, <4 x i16>* %a1
1837 ret void
1838 }
1840 define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
1841 ; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
1843 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1844 ; ALL-NEXT: vmovd %xmm1, %eax
1845 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1846 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1847 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1848 ; ALL-NEXT: vmovd %xmm1, %eax
1849 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1850 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1851 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1852 ; ALL-NEXT: vmovd %xmm1, %eax
1853 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1854 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1855 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1856 ; ALL-NEXT: vmovd %xmm0, %eax
1857 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1858 ; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
1859 ; ALL-NEXT: vmovaps %xmm0, (%rdi)
1861 %1 = fptrunc <4 x float> %a0 to <4 x half>
1862 %2 = bitcast <4 x half> %1 to <4 x i16>
1863 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1864 store <8 x i16> %3, <8 x i16>* %a1
1865 ret void
1866 }
1868 define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
1869 ; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
1871 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1872 ; ALL-NEXT: vmovd %xmm1, %eax
1873 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1874 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1875 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1876 ; ALL-NEXT: vmovd %xmm1, %eax
1877 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1878 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1879 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1880 ; ALL-NEXT: vmovd %xmm1, %eax
1881 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1882 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1883 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1884 ; ALL-NEXT: vmovd %xmm0, %eax
1885 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1886 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1887 ; ALL-NEXT: vmovaps %xmm0, (%rdi)
1889 %1 = fptrunc <4 x float> %a0 to <4 x half>
1890 %2 = bitcast <4 x half> %1 to <4 x i16>
1891 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1892 store <8 x i16> %3, <8 x i16>* %a1
1893 ret void
1894 }
1896 define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
1897 ; ALL-LABEL: store_cvt_8f32_to_8i16:
1899 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1900 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1901 ; ALL-NEXT: vmovd %xmm1, %r8d
1902 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1903 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1904 ; ALL-NEXT: vmovd %xmm1, %r9d
1905 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1906 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1907 ; ALL-NEXT: vmovd %xmm1, %r10d
1908 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
1909 ; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1910 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1911 ; ALL-NEXT: vmovd %xmm2, %r11d
1912 ; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1913 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1914 ; ALL-NEXT: vmovd %xmm2, %eax
1915 ; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
1916 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1917 ; ALL-NEXT: vmovd %xmm2, %ecx
1918 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1919 ; ALL-NEXT: vmovd %xmm0, %edx
1920 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1921 ; ALL-NEXT: vmovd %xmm0, %esi
1922 ; ALL-NEXT: movw %si, 8(%rdi)
1923 ; ALL-NEXT: movw %dx, (%rdi)
1924 ; ALL-NEXT: movw %cx, 14(%rdi)
1925 ; ALL-NEXT: movw %ax, 12(%rdi)
1926 ; ALL-NEXT: movw %r11w, 10(%rdi)
1927 ; ALL-NEXT: movw %r10w, 6(%rdi)
1928 ; ALL-NEXT: movw %r9w, 4(%rdi)
1929 ; ALL-NEXT: movw %r8w, 2(%rdi)
1930 ; ALL-NEXT: vzeroupper
1932 %1 = fptrunc <8 x float> %a0 to <8 x half>
1933 %2 = bitcast <8 x half> %1 to <8 x i16>
1934 store <8 x i16> %2, <8 x i16>* %a1
1935 ret void
1936 }
1938 define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind {
1939 ; AVX1-LABEL: store_cvt_16f32_to_16i16:
1941 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1942 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1943 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4
1944 ; AVX1-NEXT: vmovd %xmm4, %eax
1945 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4
1946 ; AVX1-NEXT: movw %ax, 24(%rdi)
1947 ; AVX1-NEXT: vmovd %xmm4, %eax
1948 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4
1949 ; AVX1-NEXT: movw %ax, 16(%rdi)
1950 ; AVX1-NEXT: vmovd %xmm4, %eax
1951 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4
1952 ; AVX1-NEXT: movw %ax, 8(%rdi)
1953 ; AVX1-NEXT: vmovd %xmm4, %eax
1954 ; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
1955 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
1956 ; AVX1-NEXT: movw %ax, (%rdi)
1957 ; AVX1-NEXT: vmovd %xmm4, %eax
1958 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
1959 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
1960 ; AVX1-NEXT: movw %ax, 30(%rdi)
1961 ; AVX1-NEXT: vmovd %xmm4, %eax
1962 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
1963 ; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
1964 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
1965 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
1966 ; AVX1-NEXT: movw %ax, 28(%rdi)
1967 ; AVX1-NEXT: vmovd %xmm3, %eax
1968 ; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
1969 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
1970 ; AVX1-NEXT: movw %ax, 26(%rdi)
1971 ; AVX1-NEXT: vmovd %xmm3, %eax
1972 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
1973 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
1974 ; AVX1-NEXT: movw %ax, 22(%rdi)
1975 ; AVX1-NEXT: vmovd %xmm3, %eax
1976 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
1977 ; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
1978 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1979 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1980 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
1981 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1982 ; AVX1-NEXT: movw %ax, 20(%rdi)
1983 ; AVX1-NEXT: vmovd %xmm1, %eax
1984 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
1985 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1986 ; AVX1-NEXT: movw %ax, 18(%rdi)
1987 ; AVX1-NEXT: vmovd %xmm1, %eax
1988 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1989 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1990 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
1991 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1992 ; AVX1-NEXT: movw %ax, 14(%rdi)
1993 ; AVX1-NEXT: vmovd %xmm2, %eax
1994 ; AVX1-NEXT: movw %ax, 12(%rdi)
1995 ; AVX1-NEXT: vmovd %xmm1, %eax
1996 ; AVX1-NEXT: movw %ax, 10(%rdi)
1997 ; AVX1-NEXT: vmovd %xmm0, %eax
1998 ; AVX1-NEXT: movw %ax, 6(%rdi)
1999 ; AVX1-NEXT: vmovd %xmm3, %eax
2000 ; AVX1-NEXT: movw %ax, 4(%rdi)
2001 ; AVX1-NEXT: vmovd %xmm4, %eax
2002 ; AVX1-NEXT: movw %ax, 2(%rdi)
2003 ; AVX1-NEXT: vzeroupper
2006 ; AVX2-LABEL: store_cvt_16f32_to_16i16:
2008 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
2009 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
2010 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4
2011 ; AVX2-NEXT: vmovd %xmm4, %eax
2012 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4
2013 ; AVX2-NEXT: movw %ax, 24(%rdi)
2014 ; AVX2-NEXT: vmovd %xmm4, %eax
2015 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4
2016 ; AVX2-NEXT: movw %ax, 16(%rdi)
2017 ; AVX2-NEXT: vmovd %xmm4, %eax
2018 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4
2019 ; AVX2-NEXT: movw %ax, 8(%rdi)
2020 ; AVX2-NEXT: vmovd %xmm4, %eax
2021 ; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2022 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2023 ; AVX2-NEXT: movw %ax, (%rdi)
2024 ; AVX2-NEXT: vmovd %xmm4, %eax
2025 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2026 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2027 ; AVX2-NEXT: movw %ax, 30(%rdi)
2028 ; AVX2-NEXT: vmovd %xmm4, %eax
2029 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2030 ; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2031 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2032 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2033 ; AVX2-NEXT: movw %ax, 28(%rdi)
2034 ; AVX2-NEXT: vmovd %xmm3, %eax
2035 ; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
2036 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2037 ; AVX2-NEXT: movw %ax, 26(%rdi)
2038 ; AVX2-NEXT: vmovd %xmm3, %eax
2039 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
2040 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2041 ; AVX2-NEXT: movw %ax, 22(%rdi)
2042 ; AVX2-NEXT: vmovd %xmm3, %eax
2043 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2044 ; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2045 ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2046 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2047 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
2048 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2049 ; AVX2-NEXT: movw %ax, 20(%rdi)
2050 ; AVX2-NEXT: vmovd %xmm1, %eax
2051 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
2052 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2053 ; AVX2-NEXT: movw %ax, 18(%rdi)
2054 ; AVX2-NEXT: vmovd %xmm1, %eax
2055 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2056 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2057 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
2058 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2059 ; AVX2-NEXT: movw %ax, 14(%rdi)
2060 ; AVX2-NEXT: vmovd %xmm2, %eax
2061 ; AVX2-NEXT: movw %ax, 12(%rdi)
2062 ; AVX2-NEXT: vmovd %xmm1, %eax
2063 ; AVX2-NEXT: movw %ax, 10(%rdi)
2064 ; AVX2-NEXT: vmovd %xmm0, %eax
2065 ; AVX2-NEXT: movw %ax, 6(%rdi)
2066 ; AVX2-NEXT: vmovd %xmm3, %eax
2067 ; AVX2-NEXT: movw %ax, 4(%rdi)
2068 ; AVX2-NEXT: vmovd %xmm4, %eax
2069 ; AVX2-NEXT: movw %ax, 2(%rdi)
2070 ; AVX2-NEXT: vzeroupper
2073 ; AVX512-LABEL: store_cvt_16f32_to_16i16:
2075 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
2076 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
2077 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
2078 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
2079 ; AVX512-NEXT: vmovd %xmm4, %eax
2080 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
2081 ; AVX512-NEXT: movw %ax, 24(%rdi)
2082 ; AVX512-NEXT: vmovd %xmm4, %eax
2083 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
2084 ; AVX512-NEXT: movw %ax, 16(%rdi)
2085 ; AVX512-NEXT: vmovd %xmm4, %eax
2086 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
2087 ; AVX512-NEXT: movw %ax, 8(%rdi)
2088 ; AVX512-NEXT: vmovd %xmm4, %eax
2089 ; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
2090 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2091 ; AVX512-NEXT: movw %ax, (%rdi)
2092 ; AVX512-NEXT: vmovd %xmm4, %eax
2093 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
2094 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2095 ; AVX512-NEXT: movw %ax, 30(%rdi)
2096 ; AVX512-NEXT: vmovd %xmm4, %eax
2097 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2098 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
2099 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
2100 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2101 ; AVX512-NEXT: movw %ax, 28(%rdi)
2102 ; AVX512-NEXT: vmovd %xmm3, %eax
2103 ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
2104 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2105 ; AVX512-NEXT: movw %ax, 26(%rdi)
2106 ; AVX512-NEXT: vmovd %xmm3, %eax
2107 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
2108 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2109 ; AVX512-NEXT: movw %ax, 22(%rdi)
2110 ; AVX512-NEXT: vmovd %xmm3, %eax
2111 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
2112 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
2113 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2114 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2115 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
2116 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2117 ; AVX512-NEXT: movw %ax, 20(%rdi)
2118 ; AVX512-NEXT: vmovd %xmm2, %eax
2119 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2120 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2121 ; AVX512-NEXT: movw %ax, 18(%rdi)
2122 ; AVX512-NEXT: vmovd %xmm2, %eax
2123 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2124 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2125 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
2126 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2127 ; AVX512-NEXT: movw %ax, 14(%rdi)
2128 ; AVX512-NEXT: vmovd %xmm1, %eax
2129 ; AVX512-NEXT: movw %ax, 12(%rdi)
2130 ; AVX512-NEXT: vmovd %xmm2, %eax
2131 ; AVX512-NEXT: movw %ax, 10(%rdi)
2132 ; AVX512-NEXT: vmovd %xmm0, %eax
2133 ; AVX512-NEXT: movw %ax, 6(%rdi)
2134 ; AVX512-NEXT: vmovd %xmm3, %eax
2135 ; AVX512-NEXT: movw %ax, 4(%rdi)
2136 ; AVX512-NEXT: vmovd %xmm4, %eax
2137 ; AVX512-NEXT: movw %ax, 2(%rdi)
2138 ; AVX512-NEXT: vzeroupper
2140 %1 = fptrunc <16 x float> %a0 to <16 x half>
2141 %2 = bitcast <16 x half> %1 to <16 x i16>
2142 store <16 x i16> %2, <16 x i16>* %a1
2143 ret void
2144 }
2147 ; Double to Half
2150 define i16 @cvt_f64_to_i16(double %a0) nounwind {
2151 ; ALL-LABEL: cvt_f64_to_i16:
2153 ; ALL-NEXT: jmp __truncdfhf2 # TAILCALL
2154 %1 = fptrunc double %a0 to half
2155 %2 = bitcast half %1 to i16
2156 ret i16 %2
2157 }
2159 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
2160 ; ALL-LABEL: cvt_2f64_to_2i16:
2162 ; ALL-NEXT: subq $40, %rsp
2163 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2164 ; ALL-NEXT: callq __truncdfhf2
2165 ; ALL-NEXT: movw %ax, (%rsp)
2166 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2167 ; ALL-NEXT: # xmm0 = mem[1,0]
2168 ; ALL-NEXT: callq __truncdfhf2
2169 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2170 ; ALL-NEXT: vmovaps (%rsp), %xmm0
2171 ; ALL-NEXT: addq $40, %rsp
2173 %1 = fptrunc <2 x double> %a0 to <2 x half>
2174 %2 = bitcast <2 x half> %1 to <2 x i16>
2175 ret <2 x i16> %2
2176 }
2178 define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
2179 ; ALL-LABEL: cvt_4f64_to_4i16:
2181 ; ALL-NEXT: subq $88, %rsp
2182 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2183 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2184 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2185 ; ALL-NEXT: vzeroupper
2186 ; ALL-NEXT: callq __truncdfhf2
2187 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2188 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2189 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2190 ; ALL-NEXT: vzeroupper
2191 ; ALL-NEXT: callq __truncdfhf2
2192 ; ALL-NEXT: movw %ax, (%rsp)
2193 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2194 ; ALL-NEXT: # xmm0 = mem[1,0]
2195 ; ALL-NEXT: callq __truncdfhf2
2196 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2197 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2198 ; ALL-NEXT: # xmm0 = mem[1,0]
2199 ; ALL-NEXT: callq __truncdfhf2
2200 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2201 ; ALL-NEXT: vmovaps (%rsp), %xmm0
2202 ; ALL-NEXT: addq $88, %rsp
2204 %1 = fptrunc <4 x double> %a0 to <4 x half>
2205 %2 = bitcast <4 x half> %1 to <4 x i16>
2206 ret <4 x i16> %2
2207 }
2209 define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
2210 ; ALL-LABEL: cvt_4f64_to_8i16_undef:
2212 ; ALL-NEXT: subq $88, %rsp
2213 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2214 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2215 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2216 ; ALL-NEXT: vzeroupper
2217 ; ALL-NEXT: callq __truncdfhf2
2218 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2219 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2220 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2221 ; ALL-NEXT: vzeroupper
2222 ; ALL-NEXT: callq __truncdfhf2
2223 ; ALL-NEXT: movw %ax, (%rsp)
2224 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2225 ; ALL-NEXT: # xmm0 = mem[1,0]
2226 ; ALL-NEXT: callq __truncdfhf2
2227 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2228 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2229 ; ALL-NEXT: # xmm0 = mem[1,0]
2230 ; ALL-NEXT: callq __truncdfhf2
2231 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2232 ; ALL-NEXT: vmovaps (%rsp), %xmm0
2233 ; ALL-NEXT: addq $88, %rsp
2235 %1 = fptrunc <4 x double> %a0 to <4 x half>
2236 %2 = bitcast <4 x half> %1 to <4 x i16>
2237 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2238 ret <8 x i16> %3
2239 }
2241 define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
2242 ; ALL-LABEL: cvt_4f64_to_8i16_zero:
2244 ; ALL-NEXT: subq $88, %rsp
2245 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2246 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2247 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2248 ; ALL-NEXT: vzeroupper
2249 ; ALL-NEXT: callq __truncdfhf2
2250 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2251 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2252 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2253 ; ALL-NEXT: vzeroupper
2254 ; ALL-NEXT: callq __truncdfhf2
2255 ; ALL-NEXT: movw %ax, (%rsp)
2256 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2257 ; ALL-NEXT: # xmm0 = mem[1,0]
2258 ; ALL-NEXT: callq __truncdfhf2
2259 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2260 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2261 ; ALL-NEXT: # xmm0 = mem[1,0]
2262 ; ALL-NEXT: callq __truncdfhf2
2263 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2264 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2265 ; ALL-NEXT: addq $88, %rsp
2267 %1 = fptrunc <4 x double> %a0 to <4 x half>
2268 %2 = bitcast <4 x half> %1 to <4 x i16>
2269 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2270 ret <8 x i16> %3
2271 }
2273 define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
2274 ; AVX1-LABEL: cvt_8f64_to_8i16:
2276 ; AVX1-NEXT: pushq %r15
2277 ; AVX1-NEXT: pushq %r14
2278 ; AVX1-NEXT: pushq %rbx
2279 ; AVX1-NEXT: subq $64, %rsp
2280 ; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
2281 ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2282 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2283 ; AVX1-NEXT: vzeroupper
2284 ; AVX1-NEXT: callq __truncdfhf2
2285 ; AVX1-NEXT: movl %eax, %ebx
2286 ; AVX1-NEXT: shll $16, %ebx
2287 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2288 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2289 ; AVX1-NEXT: vzeroupper
2290 ; AVX1-NEXT: callq __truncdfhf2
2291 ; AVX1-NEXT: movzwl %ax, %r15d
2292 ; AVX1-NEXT: orl %ebx, %r15d
2293 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2294 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2295 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2296 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2297 ; AVX1-NEXT: vzeroupper
2298 ; AVX1-NEXT: callq __truncdfhf2
2299 ; AVX1-NEXT: movl %eax, %ebx
2300 ; AVX1-NEXT: shll $16, %ebx
2301 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2302 ; AVX1-NEXT: callq __truncdfhf2
2303 ; AVX1-NEXT: movzwl %ax, %r14d
2304 ; AVX1-NEXT: orl %ebx, %r14d
2305 ; AVX1-NEXT: shlq $32, %r14
2306 ; AVX1-NEXT: orq %r15, %r14
2307 ; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
2308 ; AVX1-NEXT: # xmm0 = mem[1,0]
2309 ; AVX1-NEXT: callq __truncdfhf2
2310 ; AVX1-NEXT: movl %eax, %ebx
2311 ; AVX1-NEXT: shll $16, %ebx
2312 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2313 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2314 ; AVX1-NEXT: vzeroupper
2315 ; AVX1-NEXT: callq __truncdfhf2
2316 ; AVX1-NEXT: movzwl %ax, %r15d
2317 ; AVX1-NEXT: orl %ebx, %r15d
2318 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2319 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2320 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2321 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2322 ; AVX1-NEXT: vzeroupper
2323 ; AVX1-NEXT: callq __truncdfhf2
2324 ; AVX1-NEXT: movl %eax, %ebx
2325 ; AVX1-NEXT: shll $16, %ebx
2326 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2327 ; AVX1-NEXT: callq __truncdfhf2
2328 ; AVX1-NEXT: movzwl %ax, %eax
2329 ; AVX1-NEXT: orl %ebx, %eax
2330 ; AVX1-NEXT: shlq $32, %rax
2331 ; AVX1-NEXT: orq %r15, %rax
2332 ; AVX1-NEXT: vmovq %rax, %xmm0
2333 ; AVX1-NEXT: vmovq %r14, %xmm1
2334 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2335 ; AVX1-NEXT: addq $64, %rsp
2336 ; AVX1-NEXT: popq %rbx
2337 ; AVX1-NEXT: popq %r14
2338 ; AVX1-NEXT: popq %r15
2341 ; AVX2-LABEL: cvt_8f64_to_8i16:
2343 ; AVX2-NEXT: pushq %r15
2344 ; AVX2-NEXT: pushq %r14
2345 ; AVX2-NEXT: pushq %rbx
2346 ; AVX2-NEXT: subq $64, %rsp
2347 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
2348 ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2349 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2350 ; AVX2-NEXT: vzeroupper
2351 ; AVX2-NEXT: callq __truncdfhf2
2352 ; AVX2-NEXT: movl %eax, %ebx
2353 ; AVX2-NEXT: shll $16, %ebx
2354 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2355 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2356 ; AVX2-NEXT: vzeroupper
2357 ; AVX2-NEXT: callq __truncdfhf2
2358 ; AVX2-NEXT: movzwl %ax, %r15d
2359 ; AVX2-NEXT: orl %ebx, %r15d
2360 ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2361 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2362 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2363 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2364 ; AVX2-NEXT: vzeroupper
2365 ; AVX2-NEXT: callq __truncdfhf2
2366 ; AVX2-NEXT: movl %eax, %ebx
2367 ; AVX2-NEXT: shll $16, %ebx
2368 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2369 ; AVX2-NEXT: callq __truncdfhf2
2370 ; AVX2-NEXT: movzwl %ax, %r14d
2371 ; AVX2-NEXT: orl %ebx, %r14d
2372 ; AVX2-NEXT: shlq $32, %r14
2373 ; AVX2-NEXT: orq %r15, %r14
2374 ; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
2375 ; AVX2-NEXT: # xmm0 = mem[1,0]
2376 ; AVX2-NEXT: callq __truncdfhf2
2377 ; AVX2-NEXT: movl %eax, %ebx
2378 ; AVX2-NEXT: shll $16, %ebx
2379 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2380 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2381 ; AVX2-NEXT: vzeroupper
2382 ; AVX2-NEXT: callq __truncdfhf2
2383 ; AVX2-NEXT: movzwl %ax, %r15d
2384 ; AVX2-NEXT: orl %ebx, %r15d
2385 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2386 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2387 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2388 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2389 ; AVX2-NEXT: vzeroupper
2390 ; AVX2-NEXT: callq __truncdfhf2
2391 ; AVX2-NEXT: movl %eax, %ebx
2392 ; AVX2-NEXT: shll $16, %ebx
2393 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2394 ; AVX2-NEXT: callq __truncdfhf2
2395 ; AVX2-NEXT: movzwl %ax, %eax
2396 ; AVX2-NEXT: orl %ebx, %eax
2397 ; AVX2-NEXT: shlq $32, %rax
2398 ; AVX2-NEXT: orq %r15, %rax
2399 ; AVX2-NEXT: vmovq %rax, %xmm0
2400 ; AVX2-NEXT: vmovq %r14, %xmm1
2401 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2402 ; AVX2-NEXT: addq $64, %rsp
2403 ; AVX2-NEXT: popq %rbx
2404 ; AVX2-NEXT: popq %r14
2405 ; AVX2-NEXT: popq %r15
2408 ; AVX512-LABEL: cvt_8f64_to_8i16:
2410 ; AVX512-NEXT: pushq %r15
2411 ; AVX512-NEXT: pushq %r14
2412 ; AVX512-NEXT: pushq %rbx
2413 ; AVX512-NEXT: subq $96, %rsp
2414 ; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
2415 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2416 ; AVX512-NEXT: vzeroupper
2417 ; AVX512-NEXT: callq __truncdfhf2
2418 ; AVX512-NEXT: movl %eax, %ebx
2419 ; AVX512-NEXT: shll $16, %ebx
2420 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
2421 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2422 ; AVX512-NEXT: vzeroupper
2423 ; AVX512-NEXT: callq __truncdfhf2
2424 ; AVX512-NEXT: movzwl %ax, %r15d
2425 ; AVX512-NEXT: orl %ebx, %r15d
2426 ; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
2427 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2428 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2429 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2430 ; AVX512-NEXT: vzeroupper
2431 ; AVX512-NEXT: callq __truncdfhf2
2432 ; AVX512-NEXT: movl %eax, %ebx
2433 ; AVX512-NEXT: shll $16, %ebx
2434 ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2435 ; AVX512-NEXT: callq __truncdfhf2
2436 ; AVX512-NEXT: movzwl %ax, %r14d
2437 ; AVX512-NEXT: orl %ebx, %r14d
2438 ; AVX512-NEXT: shlq $32, %r14
2439 ; AVX512-NEXT: orq %r15, %r14
2440 ; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
2441 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2442 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2443 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2444 ; AVX512-NEXT: vzeroupper
2445 ; AVX512-NEXT: callq __truncdfhf2
2446 ; AVX512-NEXT: movl %eax, %ebx
2447 ; AVX512-NEXT: shll $16, %ebx
2448 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2449 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2450 ; AVX512-NEXT: vzeroupper
2451 ; AVX512-NEXT: callq __truncdfhf2
2452 ; AVX512-NEXT: movzwl %ax, %r15d
2453 ; AVX512-NEXT: orl %ebx, %r15d
2454 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2455 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2456 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2457 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2458 ; AVX512-NEXT: vzeroupper
2459 ; AVX512-NEXT: callq __truncdfhf2
2460 ; AVX512-NEXT: movl %eax, %ebx
2461 ; AVX512-NEXT: shll $16, %ebx
2462 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2463 ; AVX512-NEXT: callq __truncdfhf2
2464 ; AVX512-NEXT: movzwl %ax, %eax
2465 ; AVX512-NEXT: orl %ebx, %eax
2466 ; AVX512-NEXT: shlq $32, %rax
2467 ; AVX512-NEXT: orq %r15, %rax
2468 ; AVX512-NEXT: vmovq %rax, %xmm0
2469 ; AVX512-NEXT: vmovq %r14, %xmm1
2470 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2471 ; AVX512-NEXT: addq $96, %rsp
2472 ; AVX512-NEXT: popq %rbx
2473 ; AVX512-NEXT: popq %r14
2474 ; AVX512-NEXT: popq %r15
2476 %1 = fptrunc <8 x double> %a0 to <8 x half>
2477 %2 = bitcast <8 x half> %1 to <8 x i16>
2478 ret <8 x i16> %2
2479 }
2482 ; Double to Half (Store)
2485 define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
2486 ; ALL-LABEL: store_cvt_f64_to_i16:
2488 ; ALL-NEXT: pushq %rbx
2489 ; ALL-NEXT: movq %rdi, %rbx
2490 ; ALL-NEXT: callq __truncdfhf2
2491 ; ALL-NEXT: movw %ax, (%rbx)
2492 ; ALL-NEXT: popq %rbx
2494 %1 = fptrunc double %a0 to half
2495 %2 = bitcast half %1 to i16
2496 store i16 %2, i16* %a1
2497 ret void
2498 }
2500 define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
2501 ; ALL-LABEL: store_cvt_2f64_to_2i16:
2503 ; ALL-NEXT: pushq %rbp
2504 ; ALL-NEXT: pushq %rbx
2505 ; ALL-NEXT: subq $24, %rsp
2506 ; ALL-NEXT: movq %rdi, %rbx
2507 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2508 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2509 ; ALL-NEXT: callq __truncdfhf2
2510 ; ALL-NEXT: movl %eax, %ebp
2511 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2512 ; ALL-NEXT: callq __truncdfhf2
2513 ; ALL-NEXT: movw %ax, (%rbx)
2514 ; ALL-NEXT: movw %bp, 2(%rbx)
2515 ; ALL-NEXT: addq $24, %rsp
2516 ; ALL-NEXT: popq %rbx
2517 ; ALL-NEXT: popq %rbp
2519 %1 = fptrunc <2 x double> %a0 to <2 x half>
2520 %2 = bitcast <2 x half> %1 to <2 x i16>
2521 store <2 x i16> %2, <2 x i16>* %a1
2522 ret void
2523 }
2525 define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
2526 ; AVX1-LABEL: store_cvt_4f64_to_4i16:
2528 ; AVX1-NEXT: pushq %rbp
2529 ; AVX1-NEXT: pushq %r15
2530 ; AVX1-NEXT: pushq %r14
2531 ; AVX1-NEXT: pushq %rbx
2532 ; AVX1-NEXT: subq $88, %rsp
2533 ; AVX1-NEXT: movq %rdi, %rbx
2534 ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2535 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2536 ; AVX1-NEXT: vzeroupper
2537 ; AVX1-NEXT: callq __truncdfhf2
2538 ; AVX1-NEXT: movl %eax, %r14d
2539 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2540 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2541 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2542 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2543 ; AVX1-NEXT: vzeroupper
2544 ; AVX1-NEXT: callq __truncdfhf2
2545 ; AVX1-NEXT: movl %eax, %r15d
2546 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2547 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2548 ; AVX1-NEXT: vzeroupper
2549 ; AVX1-NEXT: callq __truncdfhf2
2550 ; AVX1-NEXT: movl %eax, %ebp
2551 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2552 ; AVX1-NEXT: callq __truncdfhf2
2553 ; AVX1-NEXT: movw %ax, 4(%rbx)
2554 ; AVX1-NEXT: movw %bp, (%rbx)
2555 ; AVX1-NEXT: movw %r15w, 6(%rbx)
2556 ; AVX1-NEXT: movw %r14w, 2(%rbx)
2557 ; AVX1-NEXT: addq $88, %rsp
2558 ; AVX1-NEXT: popq %rbx
2559 ; AVX1-NEXT: popq %r14
2560 ; AVX1-NEXT: popq %r15
2561 ; AVX1-NEXT: popq %rbp
2564 ; AVX2-LABEL: store_cvt_4f64_to_4i16:
2566 ; AVX2-NEXT: pushq %rbp
2567 ; AVX2-NEXT: pushq %r15
2568 ; AVX2-NEXT: pushq %r14
2569 ; AVX2-NEXT: pushq %rbx
2570 ; AVX2-NEXT: subq $88, %rsp
2571 ; AVX2-NEXT: movq %rdi, %rbx
2572 ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2573 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2574 ; AVX2-NEXT: vzeroupper
2575 ; AVX2-NEXT: callq __truncdfhf2
2576 ; AVX2-NEXT: movl %eax, %r14d
2577 ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2578 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2579 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2580 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2581 ; AVX2-NEXT: vzeroupper
2582 ; AVX2-NEXT: callq __truncdfhf2
2583 ; AVX2-NEXT: movl %eax, %r15d
2584 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2585 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2586 ; AVX2-NEXT: vzeroupper
2587 ; AVX2-NEXT: callq __truncdfhf2
2588 ; AVX2-NEXT: movl %eax, %ebp
2589 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2590 ; AVX2-NEXT: callq __truncdfhf2
2591 ; AVX2-NEXT: movw %ax, 4(%rbx)
2592 ; AVX2-NEXT: movw %bp, (%rbx)
2593 ; AVX2-NEXT: movw %r15w, 6(%rbx)
2594 ; AVX2-NEXT: movw %r14w, 2(%rbx)
2595 ; AVX2-NEXT: addq $88, %rsp
2596 ; AVX2-NEXT: popq %rbx
2597 ; AVX2-NEXT: popq %r14
2598 ; AVX2-NEXT: popq %r15
2599 ; AVX2-NEXT: popq %rbp
2602 ; AVX512-LABEL: store_cvt_4f64_to_4i16:
2604 ; AVX512-NEXT: pushq %rbp
2605 ; AVX512-NEXT: pushq %r15
2606 ; AVX512-NEXT: pushq %r14
2607 ; AVX512-NEXT: pushq %rbx
2608 ; AVX512-NEXT: subq $88, %rsp
2609 ; AVX512-NEXT: movq %rdi, %rbx
2610 ; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2611 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2612 ; AVX512-NEXT: vzeroupper
2613 ; AVX512-NEXT: callq __truncdfhf2
2614 ; AVX512-NEXT: movl %eax, %r14d
2615 ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2616 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2617 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2618 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2619 ; AVX512-NEXT: vzeroupper
2620 ; AVX512-NEXT: callq __truncdfhf2
2621 ; AVX512-NEXT: movl %eax, %r15d
2622 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2623 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2624 ; AVX512-NEXT: vzeroupper
2625 ; AVX512-NEXT: callq __truncdfhf2
2626 ; AVX512-NEXT: movl %eax, %ebp
2627 ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2628 ; AVX512-NEXT: callq __truncdfhf2
2629 ; AVX512-NEXT: movw %ax, 4(%rbx)
2630 ; AVX512-NEXT: movw %bp, (%rbx)
2631 ; AVX512-NEXT: movw %r15w, 6(%rbx)
2632 ; AVX512-NEXT: movw %r14w, 2(%rbx)
2633 ; AVX512-NEXT: addq $88, %rsp
2634 ; AVX512-NEXT: popq %rbx
2635 ; AVX512-NEXT: popq %r14
2636 ; AVX512-NEXT: popq %r15
2637 ; AVX512-NEXT: popq %rbp
2639 %1 = fptrunc <4 x double> %a0 to <4 x half>
2640 %2 = bitcast <4 x half> %1 to <4 x i16>
2641 store <4 x i16> %2, <4 x i16>* %a1
2642 ret void
2643 }
2645 define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
2646 ; ALL-LABEL: store_cvt_4f64_to_8i16_undef:
2648 ; ALL-NEXT: pushq %rbx
2649 ; ALL-NEXT: subq $80, %rsp
2650 ; ALL-NEXT: movq %rdi, %rbx
2651 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2652 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2653 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2654 ; ALL-NEXT: vzeroupper
2655 ; ALL-NEXT: callq __truncdfhf2
2656 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2657 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2658 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2659 ; ALL-NEXT: vzeroupper
2660 ; ALL-NEXT: callq __truncdfhf2
2661 ; ALL-NEXT: movw %ax, (%rsp)
2662 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2663 ; ALL-NEXT: # xmm0 = mem[1,0]
2664 ; ALL-NEXT: callq __truncdfhf2
2665 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2666 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2667 ; ALL-NEXT: # xmm0 = mem[1,0]
2668 ; ALL-NEXT: callq __truncdfhf2
2669 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2670 ; ALL-NEXT: vmovaps (%rsp), %xmm0
2671 ; ALL-NEXT: vmovaps %xmm0, (%rbx)
2672 ; ALL-NEXT: addq $80, %rsp
2673 ; ALL-NEXT: popq %rbx
2675 %1 = fptrunc <4 x double> %a0 to <4 x half>
2676 %2 = bitcast <4 x half> %1 to <4 x i16>
2677 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2678 store <8 x i16> %3, <8 x i16>* %a1
2679 ret void
2680 }
2682 define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
2683 ; ALL-LABEL: store_cvt_4f64_to_8i16_zero:
2685 ; ALL-NEXT: pushq %rbx
2686 ; ALL-NEXT: subq $80, %rsp
2687 ; ALL-NEXT: movq %rdi, %rbx
2688 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2689 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2690 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2691 ; ALL-NEXT: vzeroupper
2692 ; ALL-NEXT: callq __truncdfhf2
2693 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2694 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2695 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2696 ; ALL-NEXT: vzeroupper
2697 ; ALL-NEXT: callq __truncdfhf2
2698 ; ALL-NEXT: movw %ax, (%rsp)
2699 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2700 ; ALL-NEXT: # xmm0 = mem[1,0]
2701 ; ALL-NEXT: callq __truncdfhf2
2702 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2703 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2704 ; ALL-NEXT: # xmm0 = mem[1,0]
2705 ; ALL-NEXT: callq __truncdfhf2
2706 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2707 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2708 ; ALL-NEXT: vmovaps %xmm0, (%rbx)
2709 ; ALL-NEXT: addq $80, %rsp
2710 ; ALL-NEXT: popq %rbx
2712 %1 = fptrunc <4 x double> %a0 to <4 x half>
2713 %2 = bitcast <4 x half> %1 to <4 x i16>
2714 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2715 store <8 x i16> %3, <8 x i16>* %a1
2716 ret void
2717 }
2719 define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
2720 ; AVX1-LABEL: store_cvt_8f64_to_8i16:
2722 ; AVX1-NEXT: pushq %rbp
2723 ; AVX1-NEXT: pushq %r15
2724 ; AVX1-NEXT: pushq %r14
2725 ; AVX1-NEXT: pushq %r13
2726 ; AVX1-NEXT: pushq %r12
2727 ; AVX1-NEXT: pushq %rbx
2728 ; AVX1-NEXT: subq $136, %rsp
2729 ; AVX1-NEXT: movq %rdi, %rbx
2730 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2731 ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2732 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2733 ; AVX1-NEXT: vzeroupper
2734 ; AVX1-NEXT: callq __truncdfhf2
2735 ; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2736 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2737 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2738 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2739 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2740 ; AVX1-NEXT: vzeroupper
2741 ; AVX1-NEXT: callq __truncdfhf2
2742 ; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2743 ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2744 ; AVX1-NEXT: # xmm0 = mem[1,0]
2745 ; AVX1-NEXT: callq __truncdfhf2
2746 ; AVX1-NEXT: movl %eax, %r12d
2747 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2748 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2749 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2750 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2751 ; AVX1-NEXT: vzeroupper
2752 ; AVX1-NEXT: callq __truncdfhf2
2753 ; AVX1-NEXT: movl %eax, %r13d
2754 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2755 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2756 ; AVX1-NEXT: vzeroupper
2757 ; AVX1-NEXT: callq __truncdfhf2
2758 ; AVX1-NEXT: movl %eax, %ebp
2759 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2760 ; AVX1-NEXT: callq __truncdfhf2
2761 ; AVX1-NEXT: movl %eax, %r14d
2762 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2763 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2764 ; AVX1-NEXT: vzeroupper
2765 ; AVX1-NEXT: callq __truncdfhf2
2766 ; AVX1-NEXT: movl %eax, %r15d
2767 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2768 ; AVX1-NEXT: callq __truncdfhf2
2769 ; AVX1-NEXT: movw %ax, 12(%rbx)
2770 ; AVX1-NEXT: movw %r15w, 8(%rbx)
2771 ; AVX1-NEXT: movw %r14w, 4(%rbx)
2772 ; AVX1-NEXT: movw %bp, (%rbx)
2773 ; AVX1-NEXT: movw %r13w, 14(%rbx)
2774 ; AVX1-NEXT: movw %r12w, 10(%rbx)
2775 ; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
2776 ; AVX1-NEXT: movw %ax, 6(%rbx)
2777 ; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
2778 ; AVX1-NEXT: movw %ax, 2(%rbx)
2779 ; AVX1-NEXT: addq $136, %rsp
2780 ; AVX1-NEXT: popq %rbx
2781 ; AVX1-NEXT: popq %r12
2782 ; AVX1-NEXT: popq %r13
2783 ; AVX1-NEXT: popq %r14
2784 ; AVX1-NEXT: popq %r15
2785 ; AVX1-NEXT: popq %rbp
2788 ; AVX2-LABEL: store_cvt_8f64_to_8i16:
2790 ; AVX2-NEXT: pushq %rbp
2791 ; AVX2-NEXT: pushq %r15
2792 ; AVX2-NEXT: pushq %r14
2793 ; AVX2-NEXT: pushq %r13
2794 ; AVX2-NEXT: pushq %r12
2795 ; AVX2-NEXT: pushq %rbx
2796 ; AVX2-NEXT: subq $136, %rsp
2797 ; AVX2-NEXT: movq %rdi, %rbx
2798 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2799 ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2800 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2801 ; AVX2-NEXT: vzeroupper
2802 ; AVX2-NEXT: callq __truncdfhf2
2803 ; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2804 ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2805 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2806 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2807 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2808 ; AVX2-NEXT: vzeroupper
2809 ; AVX2-NEXT: callq __truncdfhf2
2810 ; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2811 ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2812 ; AVX2-NEXT: # xmm0 = mem[1,0]
2813 ; AVX2-NEXT: callq __truncdfhf2
2814 ; AVX2-NEXT: movl %eax, %r12d
2815 ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2816 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2817 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2818 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2819 ; AVX2-NEXT: vzeroupper
2820 ; AVX2-NEXT: callq __truncdfhf2
2821 ; AVX2-NEXT: movl %eax, %r13d
2822 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2823 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2824 ; AVX2-NEXT: vzeroupper
2825 ; AVX2-NEXT: callq __truncdfhf2
2826 ; AVX2-NEXT: movl %eax, %ebp
2827 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2828 ; AVX2-NEXT: callq __truncdfhf2
2829 ; AVX2-NEXT: movl %eax, %r14d
2830 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2831 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2832 ; AVX2-NEXT: vzeroupper
2833 ; AVX2-NEXT: callq __truncdfhf2
2834 ; AVX2-NEXT: movl %eax, %r15d
2835 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2836 ; AVX2-NEXT: callq __truncdfhf2
2837 ; AVX2-NEXT: movw %ax, 12(%rbx)
2838 ; AVX2-NEXT: movw %r15w, 8(%rbx)
2839 ; AVX2-NEXT: movw %r14w, 4(%rbx)
2840 ; AVX2-NEXT: movw %bp, (%rbx)
2841 ; AVX2-NEXT: movw %r13w, 14(%rbx)
2842 ; AVX2-NEXT: movw %r12w, 10(%rbx)
2843 ; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
2844 ; AVX2-NEXT: movw %ax, 6(%rbx)
2845 ; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
2846 ; AVX2-NEXT: movw %ax, 2(%rbx)
2847 ; AVX2-NEXT: addq $136, %rsp
2848 ; AVX2-NEXT: popq %rbx
2849 ; AVX2-NEXT: popq %r12
2850 ; AVX2-NEXT: popq %r13
2851 ; AVX2-NEXT: popq %r14
2852 ; AVX2-NEXT: popq %r15
2853 ; AVX2-NEXT: popq %rbp
2856 ; AVX512-LABEL: store_cvt_8f64_to_8i16:
2858 ; AVX512-NEXT: pushq %rbp
2859 ; AVX512-NEXT: pushq %r15
2860 ; AVX512-NEXT: pushq %r14
2861 ; AVX512-NEXT: pushq %r13
2862 ; AVX512-NEXT: pushq %r12
2863 ; AVX512-NEXT: pushq %rbx
2864 ; AVX512-NEXT: subq $200, %rsp
2865 ; AVX512-NEXT: movq %rdi, %rbx
2866 ; AVX512-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2867 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2868 ; AVX512-NEXT: vzeroupper
2869 ; AVX512-NEXT: callq __truncdfhf2
2870 ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2871 ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2872 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2873 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2874 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2875 ; AVX512-NEXT: vzeroupper
2876 ; AVX512-NEXT: callq __truncdfhf2
2877 ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2878 ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2879 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2880 ; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2881 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2882 ; AVX512-NEXT: vzeroupper
2883 ; AVX512-NEXT: callq __truncdfhf2
2884 ; AVX512-NEXT: movl %eax, %r12d
2885 ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2886 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2887 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2888 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2889 ; AVX512-NEXT: vzeroupper
2890 ; AVX512-NEXT: callq __truncdfhf2
2891 ; AVX512-NEXT: movl %eax, %r13d
2892 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2893 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2894 ; AVX512-NEXT: vzeroupper
2895 ; AVX512-NEXT: callq __truncdfhf2
2896 ; AVX512-NEXT: movl %eax, %ebp
2897 ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2898 ; AVX512-NEXT: callq __truncdfhf2
2899 ; AVX512-NEXT: movl %eax, %r14d
2900 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2901 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2902 ; AVX512-NEXT: vzeroupper
2903 ; AVX512-NEXT: callq __truncdfhf2
2904 ; AVX512-NEXT: movl %eax, %r15d
2905 ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2906 ; AVX512-NEXT: callq __truncdfhf2
2907 ; AVX512-NEXT: movw %ax, 12(%rbx)
2908 ; AVX512-NEXT: movw %r15w, 8(%rbx)
2909 ; AVX512-NEXT: movw %r14w, 4(%rbx)
2910 ; AVX512-NEXT: movw %bp, (%rbx)
2911 ; AVX512-NEXT: movw %r13w, 14(%rbx)
2912 ; AVX512-NEXT: movw %r12w, 10(%rbx)
2913 ; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
2914 ; AVX512-NEXT: movw %ax, 6(%rbx)
2915 ; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
2916 ; AVX512-NEXT: movw %ax, 2(%rbx)
2917 ; AVX512-NEXT: addq $200, %rsp
2918 ; AVX512-NEXT: popq %rbx
2919 ; AVX512-NEXT: popq %r12
2920 ; AVX512-NEXT: popq %r13
2921 ; AVX512-NEXT: popq %r14
2922 ; AVX512-NEXT: popq %r15
2923 ; AVX512-NEXT: popq %rbp
2925 %1 = fptrunc <8 x double> %a0 to <8 x half>
2926 %2 = bitcast <8 x half> %1 to <8 x i16>
2927 store <8 x i16> %2, <8 x i16>* %a1
2928 ret void
2929 }