1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512FP16
; Splat of a scalar float load, extended to <2 x double>.
; Expected output below: the baseline run splats with shufps and then converts,
; the +avx/+avx2 runs broadcast straight from memory with vbroadcastss, and the
; AVX512 runs fold the load into vcvtps2pd as a {1to2} broadcast operand.
; The insertelement base is poison (same as the shufflevector operand); only
; lane 0 is read by the all-zero shuffle mask, so the other lanes never matter.
8 define <2 x double> @prefer_f32_v2f64(ptr %p) nounwind {
9 ; SSE-LABEL: prefer_f32_v2f64:
10 ; SSE: # %bb.0: # %entry
11 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
12 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
13 ; SSE-NEXT: cvtps2pd %xmm0, %xmm0
16 ; AVX-LABEL: prefer_f32_v2f64:
17 ; AVX: # %bb.0: # %entry
18 ; AVX-NEXT: vbroadcastss (%rdi), %xmm0
19 ; AVX-NEXT: vcvtps2pd %xmm0, %xmm0
22 ; AVX512-LABEL: prefer_f32_v2f64:
23 ; AVX512: # %bb.0: # %entry
24 ; AVX512-NEXT: vcvtps2pd (%rdi){1to2}, %xmm0
27 %0 = load float, ptr %p, align 4
28 %vecinit.i = insertelement <2 x float> poison, float %0, i64 0
29 %vecinit3.i = shufflevector <2 x float> %vecinit.i, <2 x float> poison, <2 x i32> zeroinitializer
30 %conv.i = fpext <2 x float> %vecinit3.i to <2 x double>
31 ret <2 x double> %conv.i
; Splat of a scalar float load, extended to <4 x double>.
; Expected output below: the baseline run converts one 128-bit half and copies
; it into xmm1, the +avx/+avx2 runs use vbroadcastss from memory followed by a
; 256-bit vcvtps2pd, and the AVX512 runs fold the load into vcvtps2pd as a
; {1to4} broadcast operand.
; The insertelement base is poison (same as the shufflevector operand); only
; lane 0 is read by the all-zero shuffle mask, so the other lanes never matter.
34 define <4 x double> @prefer_f32_v4f64(ptr %p) nounwind {
35 ; SSE-LABEL: prefer_f32_v4f64:
36 ; SSE: # %bb.0: # %entry
37 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
38 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
39 ; SSE-NEXT: cvtps2pd %xmm0, %xmm0
40 ; SSE-NEXT: movaps %xmm0, %xmm1
43 ; AVX-LABEL: prefer_f32_v4f64:
44 ; AVX: # %bb.0: # %entry
45 ; AVX-NEXT: vbroadcastss (%rdi), %xmm0
46 ; AVX-NEXT: vcvtps2pd %xmm0, %ymm0
49 ; AVX512-LABEL: prefer_f32_v4f64:
50 ; AVX512: # %bb.0: # %entry
51 ; AVX512-NEXT: vcvtps2pd (%rdi){1to4}, %ymm0
54 %0 = load float, ptr %p, align 4
55 %vecinit.i = insertelement <4 x float> poison, float %0, i64 0
56 %vecinit3.i = shufflevector <4 x float> %vecinit.i, <4 x float> poison, <4 x i32> zeroinitializer
57 %conv.i = fpext <4 x float> %vecinit3.i to <4 x double>
58 ret <4 x double> %conv.i
; Splat of a scalar half load, extended to <4 x float>.
; Expected output below: the baseline, +avx and +avx2 runs extend the scalar
; through the __extendhfsf2 libcall and splat the float result afterwards; the
; +avx512f run broadcasts the 16-bit value with vpbroadcastw and converts with
; vcvtph2ps; the +avx512fp16 run folds the load into vcvtph2psx as a {1to4}
; broadcast operand.
; The insertelement base is poison (same as the shufflevector operand); only
; lane 0 is read by the all-zero shuffle mask, so the other lanes never matter.
61 define <4 x float> @prefer_f16_v4f32(ptr %p) nounwind {
62 ; SSE-LABEL: prefer_f16_v4f32:
63 ; SSE: # %bb.0: # %entry
64 ; SSE-NEXT: pushq %rax
65 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
66 ; SSE-NEXT: callq __extendhfsf2@PLT
67 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
71 ; AVX1-LABEL: prefer_f16_v4f32:
72 ; AVX1: # %bb.0: # %entry
73 ; AVX1-NEXT: pushq %rax
74 ; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
75 ; AVX1-NEXT: callq __extendhfsf2@PLT
76 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
77 ; AVX1-NEXT: popq %rax
80 ; AVX2-LABEL: prefer_f16_v4f32:
81 ; AVX2: # %bb.0: # %entry
82 ; AVX2-NEXT: pushq %rax
83 ; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
84 ; AVX2-NEXT: callq __extendhfsf2@PLT
85 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
86 ; AVX2-NEXT: popq %rax
89 ; AVX512F-LABEL: prefer_f16_v4f32:
90 ; AVX512F: # %bb.0: # %entry
91 ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
92 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
95 ; AVX512FP16-LABEL: prefer_f16_v4f32:
96 ; AVX512FP16: # %bb.0: # %entry
97 ; AVX512FP16-NEXT: vcvtph2psx (%rdi){1to4}, %xmm0
98 ; AVX512FP16-NEXT: retq
100 %0 = load half, ptr %p, align 4
101 %vecinit.i = insertelement <4 x half> poison, half %0, i64 0
102 %vecinit3.i = shufflevector <4 x half> %vecinit.i, <4 x half> poison, <4 x i32> zeroinitializer
103 %conv.i = fpext <4 x half> %vecinit3.i to <4 x float>
104 ret <4 x float> %conv.i
; Splat of a scalar half load, extended to <8 x float>.
; Expected output below: the baseline, +avx and +avx2 runs extend the scalar
; through the __extendhfsf2 libcall and then splat the float result across the
; full 256-bit return value; the +avx512f run broadcasts the 16-bit value with
; vpbroadcastw and converts with a 256-bit vcvtph2ps; the +avx512fp16 run folds
; the load into vcvtph2psx as a {1to8} broadcast operand.
; The insertelement base is poison (same as the shufflevector operand); only
; lane 0 is read by the all-zero shuffle mask, so the other lanes never matter.
107 define <8 x float> @prefer_f16_v8f32(ptr %p) nounwind {
108 ; SSE-LABEL: prefer_f16_v8f32:
109 ; SSE: # %bb.0: # %entry
110 ; SSE-NEXT: pushq %rax
111 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
112 ; SSE-NEXT: callq __extendhfsf2@PLT
113 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
114 ; SSE-NEXT: movaps %xmm0, %xmm1
115 ; SSE-NEXT: popq %rax
118 ; AVX1-LABEL: prefer_f16_v8f32:
119 ; AVX1: # %bb.0: # %entry
120 ; AVX1-NEXT: pushq %rax
121 ; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
122 ; AVX1-NEXT: callq __extendhfsf2@PLT
123 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
124 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
125 ; AVX1-NEXT: popq %rax
128 ; AVX2-LABEL: prefer_f16_v8f32:
129 ; AVX2: # %bb.0: # %entry
130 ; AVX2-NEXT: pushq %rax
131 ; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
132 ; AVX2-NEXT: callq __extendhfsf2@PLT
133 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
134 ; AVX2-NEXT: popq %rax
137 ; AVX512F-LABEL: prefer_f16_v8f32:
138 ; AVX512F: # %bb.0: # %entry
139 ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
140 ; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0
143 ; AVX512FP16-LABEL: prefer_f16_v8f32:
144 ; AVX512FP16: # %bb.0: # %entry
145 ; AVX512FP16-NEXT: vcvtph2psx (%rdi){1to8}, %ymm0
146 ; AVX512FP16-NEXT: retq
148 %0 = load half, ptr %p, align 4
149 %vecinit.i = insertelement <8 x half> poison, half %0, i64 0
150 %vecinit3.i = shufflevector <8 x half> %vecinit.i, <8 x half> poison, <8 x i32> zeroinitializer
151 %conv.i = fpext <8 x half> %vecinit3.i to <8 x float>
152 ret <8 x float> %conv.i
; Splat of a scalar half load, extended to <2 x double>.
; Expected output below: the baseline, +avx and +avx2 runs extend the scalar
; through the __extendhfsf2 libcall, widen it with a scalar float-to-double
; convert and splat the double afterwards; the +avx512f run broadcasts the
; 16-bit value, blends the upper lanes with zero, then converts half to float
; to double; the +avx512fp16 run folds the load into vcvtph2pd as a {1to2}
; broadcast operand.
; The insertelement base is poison (same as the shufflevector operand); only
; lane 0 is read by the all-zero shuffle mask, so the other lanes never matter.
155 define <2 x double> @prefer_f16_v2f64(ptr %p) nounwind {
156 ; SSE-LABEL: prefer_f16_v2f64:
157 ; SSE: # %bb.0: # %entry
158 ; SSE-NEXT: pushq %rax
159 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
160 ; SSE-NEXT: callq __extendhfsf2@PLT
161 ; SSE-NEXT: cvtss2sd %xmm0, %xmm0
162 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
163 ; SSE-NEXT: popq %rax
166 ; AVX-LABEL: prefer_f16_v2f64:
167 ; AVX: # %bb.0: # %entry
168 ; AVX-NEXT: pushq %rax
169 ; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
170 ; AVX-NEXT: callq __extendhfsf2@PLT
171 ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
172 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
173 ; AVX-NEXT: popq %rax
176 ; AVX512F-LABEL: prefer_f16_v2f64:
177 ; AVX512F: # %bb.0: # %entry
178 ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
179 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
180 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
181 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
182 ; AVX512F-NEXT: vcvtps2pd %xmm0, %xmm0
185 ; AVX512FP16-LABEL: prefer_f16_v2f64:
186 ; AVX512FP16: # %bb.0: # %entry
187 ; AVX512FP16-NEXT: vcvtph2pd (%rdi){1to2}, %xmm0
188 ; AVX512FP16-NEXT: retq
190 %0 = load half, ptr %p, align 4
191 %vecinit.i = insertelement <2 x half> poison, half %0, i64 0
192 %vecinit3.i = shufflevector <2 x half> %vecinit.i, <2 x half> poison, <2 x i32> zeroinitializer
193 %conv.i = fpext <2 x half> %vecinit3.i to <2 x double>
194 ret <2 x double> %conv.i
; Splat of a scalar half load, extended to <4 x double>.
; Expected output below: the baseline, +avx and +avx2 runs extend the scalar
; through the __extendhfsf2 libcall, widen it with a scalar float-to-double
; convert and splat the double across the full 256-bit return value; the
; +avx512f run broadcasts the 16-bit value with vpbroadcastw, then converts
; half to float to double; the +avx512fp16 run folds the load into vcvtph2pd
; as a {1to4} broadcast operand.
; The insertelement base is poison (same as the shufflevector operand); only
; lane 0 is read by the all-zero shuffle mask, so the other lanes never matter.
197 define <4 x double> @prefer_f16_v4f64(ptr %p) nounwind {
198 ; SSE-LABEL: prefer_f16_v4f64:
199 ; SSE: # %bb.0: # %entry
200 ; SSE-NEXT: pushq %rax
201 ; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
202 ; SSE-NEXT: callq __extendhfsf2@PLT
203 ; SSE-NEXT: cvtss2sd %xmm0, %xmm0
204 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
205 ; SSE-NEXT: movaps %xmm0, %xmm1
206 ; SSE-NEXT: popq %rax
209 ; AVX1-LABEL: prefer_f16_v4f64:
210 ; AVX1: # %bb.0: # %entry
211 ; AVX1-NEXT: pushq %rax
212 ; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
213 ; AVX1-NEXT: callq __extendhfsf2@PLT
214 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
215 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
216 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
217 ; AVX1-NEXT: popq %rax
220 ; AVX2-LABEL: prefer_f16_v4f64:
221 ; AVX2: # %bb.0: # %entry
222 ; AVX2-NEXT: pushq %rax
223 ; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
224 ; AVX2-NEXT: callq __extendhfsf2@PLT
225 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
226 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
227 ; AVX2-NEXT: popq %rax
230 ; AVX512F-LABEL: prefer_f16_v4f64:
231 ; AVX512F: # %bb.0: # %entry
232 ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0
233 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
234 ; AVX512F-NEXT: vcvtps2pd %xmm0, %ymm0
237 ; AVX512FP16-LABEL: prefer_f16_v4f64:
238 ; AVX512FP16: # %bb.0: # %entry
239 ; AVX512FP16-NEXT: vcvtph2pd (%rdi){1to4}, %ymm0
240 ; AVX512FP16-NEXT: retq
242 %0 = load half, ptr %p, align 4
243 %vecinit.i = insertelement <4 x half> poison, half %0, i64 0
244 %vecinit3.i = shufflevector <4 x half> %vecinit.i, <4 x half> poison, <4 x i32> zeroinitializer
245 %conv.i = fpext <4 x half> %vecinit3.i to <4 x double>
246 ret <4 x double> %conv.i