; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
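
; Insert a scalar loaded from memory into an undef vector.
; First group: 128-bit vectors, insert into element 0.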
define <16 x i8> @load8_ins_elt0_v16i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_elt0_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_elt0_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_elt0_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <16 x i8> undef, i8 %x, i32 0
  ret <16 x i8> %ins
}

define <8 x i16> @load16_ins_elt0_v8i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_elt0_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_elt0_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_elt0_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <8 x i16> undef, i16 %x, i32 0
  ret <8 x i16> %ins
}

define <4 x i32> @load32_ins_elt0_v4i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
  ret <4 x i32> %ins
}

define <2 x i64> @load64_ins_elt0_v2i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
  ret <2 x i64> %ins
}

define <4 x float> @load32_ins_elt0_v4f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <4 x float> undef, float %x, i32 0
  ret <4 x float> %ins
}

define <2 x double> @load64_ins_elt0_v2f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <2 x double> undef, double %x, i32 0
  ret <2 x double> %ins
}

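; 128-bit vectors, insert into a constant nonzero element.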
define <16 x i8> @load8_ins_eltc_v16i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_eltc_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pslld $24, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_eltc_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_eltc_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <16 x i8> undef, i8 %x, i32 3
  ret <16 x i8> %ins
}

define <8 x i16> @load16_ins_eltc_v8i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_eltc_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_eltc_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_eltc_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <8 x i16> undef, i16 %x, i32 5
  ret <8 x i16> %ins
}

define <4 x i32> @load32_ins_eltc_v4i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <4 x i32> undef, i32 %x, i32 2
  ret <4 x i32> %ins
}

define <2 x i64> @load64_ins_eltc_v2i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <2 x i64> undef, i64 %x, i32 1
  ret <2 x i64> %ins
}

define <4 x float> @load32_ins_eltc_v4f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <4 x float> undef, float %x, i32 3
  ret <4 x float> %ins
}

define <2 x double> @load64_ins_eltc_v2f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <2 x double> undef, double %x, i32 1
  ret <2 x double> %ins
}

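; 256-bit vectors, insert into element 0.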
define <32 x i8> @load8_ins_elt0_v32i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_elt0_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_elt0_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_elt0_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <32 x i8> undef, i8 %x, i32 0
  ret <32 x i8> %ins
}

define <16 x i16> @load16_ins_elt0_v16i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_elt0_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_elt0_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_elt0_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <16 x i16> undef, i16 %x, i32 0
  ret <16 x i16> %ins
}

define <8 x i32> @load32_ins_elt0_v8i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v8i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> undef, i32 %x, i32 0
  ret <8 x i32> %ins
}

define <4 x i64> @load64_ins_elt0_v4i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v4i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <4 x i64> undef, i64 %x, i32 0
  ret <4 x i64> %ins
}

define <8 x float> @load32_ins_elt0_v8f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_elt0_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_elt0_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <8 x float> undef, float %x, i32 0
  ret <8 x float> %ins
}

define <4 x double> @load64_ins_elt0_v4f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_elt0_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_elt0_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <4 x double> undef, double %x, i32 0
  ret <4 x double> %ins
}

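; 256-bit vectors, insert into a constant nonzero element.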
define <32 x i8> @load8_ins_eltc_v32i8(i8* %p) nounwind {
; SSE-LABEL: load8_ins_eltc_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    psllq $40, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load8_ins_eltc_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpsllq $40, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load8_ins_eltc_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i8, i8* %p
  %ins = insertelement <32 x i8> undef, i8 %x, i32 21
  ret <32 x i8> %ins
}

define <16 x i16> @load16_ins_eltc_v16i16(i16* %p) nounwind {
; SSE-LABEL: load16_ins_eltc_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm1
; SSE-NEXT:    psllq $48, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load16_ins_eltc_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load16_ins_eltc_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT:    retq
  %x = load i16, i16* %p
  %ins = insertelement <16 x i16> undef, i16 %x, i32 11
  ret <16 x i16> %ins
}

define <8 x i32> @load32_ins_eltc_v8i32(i32* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v8i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> undef, i32 %x, i32 7
  ret <8 x i32> %ins
}

define <4 x i64> @load64_ins_eltc_v4i64(i64* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v4i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load i64, i64* %p
  %ins = insertelement <4 x i64> undef, i64 %x, i32 3
  ret <4 x i64> %ins
}

define <8 x float> @load32_ins_eltc_v8f32(float* %p) nounwind {
; SSE-LABEL: load32_ins_eltc_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: load32_ins_eltc_v8f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load float, float* %p
  %ins = insertelement <8 x float> undef, float %x, i32 5
  ret <8 x float> %ins
}

define <4 x double> @load64_ins_eltc_v4f64(double* %p) nounwind {
; SSE-LABEL: load64_ins_eltc_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load64_ins_eltc_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX-NEXT:    retq
  %x = load double, double* %p
  %ins = insertelement <4 x double> undef, double %x, i32 3
  ret <4 x double> %ins
}