; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2

; Check constant loads of every 128-bit and 256-bit vector type
; for size optimization using splat ops available with AVX and AVX2.
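; Splatting from a scalar constant shrinks the constant pool: for example, a
; <4 x double> splat of 1.0 needs only an 8-byte constant when materialized
; with vbroadcastsd, versus a full 32-byte constant for a plain vmovapd load.
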
; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
; CHECK-LABEL: splat_v2f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; CHECK-NEXT:    # xmm1 = mem[0,0]
; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
  ret <2 x double> %add
}

define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
; CHECK-LABEL: splat_v4f64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %add
}

define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
; CHECK-LABEL: splat_v4f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
  ret <4 x float> %add
}

define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
; CHECK-LABEL: splat_v8f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %add
}

; AVX can't do integer splats, so fake it: use vmovddup to splat the 64-bit value.
; For AVX2 we select vpbroadcastq, as the checks below show, even though vmovddup
; would be one byte smaller.
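; (Byte accounting: vmovddup is in the legacy 0F opcode map and can use a
; 2-byte VEX prefix, while vpbroadcastq is in the 0F38 map and requires a
; 3-byte VEX prefix, so the vmovddup form encodes one byte shorter.)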
define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
; AVX-LABEL: splat_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [2,2]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %add = add <2 x i64> %x, <i64 2, i64 2>
  ret <2 x i64> %add
}

; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors
; and then fake it: use vmovddup to splat the 64-bit value.
define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
; AVX-LABEL: splat_v4i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [2,2]
; AVX-NEXT:    # xmm2 = mem[0,0]
; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2]
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2>
  ret <4 x i64> %add
}

; AVX can't do integer splats, so fake it: use vbroadcastss to splat the 32-bit value.
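; This trades speed for size: vbroadcastss is a float-domain instruction, so
; feeding its result to an integer op like vpaddd may pay a domain-crossing
; penalty on some microarchitectures, which is acceptable under optsize/minsize.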
define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
; AVX-LABEL: splat_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2,2,2,2]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %add = add <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %add
}

; AVX can't do integer splats, so fake it: use vbroadcastss to splat the 32-bit value.
define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
; AVX-LABEL: splat_v8i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,2,2,2]
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %add = add <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %add
}

; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc.?
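; With no cheap splat available, the AVX codegen below simply folds the full
; 16-byte constant-pool load into the vpaddw.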
define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
; AVX-LABEL: splat_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %add = add <8 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %add
}

; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc.?
define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
; AVX-LABEL: splat_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %add = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <16 x i16> %add
}

; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc.?
define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
; AVX-LABEL: splat_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %add = add <16 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <16 x i8> %add
}

; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc.?
define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
; AVX-LABEL: splat_v32i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX2-LABEL: splat_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %add = add <32 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <32 x i8> %add
}

; PR23259: Verify that ISel doesn't crash with a 'fatal error in backend'
; due to a missing AVX pattern to select a v2i64 X86ISD::BROADCAST of a
; loadi64 with multiple uses.
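; The pushq $1 / popq %rax sequence in the checks below is a minsize trick:
; it materializes the constant 1 in 3 bytes (6A 01 / 58) instead of the
; 5-byte movl $1, %eax.
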
@A = common global <3 x i64> zeroinitializer, align 32

define <8 x i64> @pr23259() #1 {
; AVX-LABEL: pr23259:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    pushq $1
; AVX-NEXT:    .cfi_adjust_cfa_offset 8
; AVX-NEXT:    popq %rax
; AVX-NEXT:    .cfi_adjust_cfa_offset -8
; AVX-NEXT:    vmovq %rax, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; AVX-NEXT:    retq
;
; AVX2-LABEL: pr23259:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa {{.*}}(%rip), %ymm0
; AVX2-NEXT:    pushq $1
; AVX2-NEXT:    .cfi_adjust_cfa_offset 8
; AVX2-NEXT:    popq %rax
; AVX2-NEXT:    .cfi_adjust_cfa_offset -8
; AVX2-NEXT:    vmovq %rax, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,1,1]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; AVX2-NEXT:    retq
entry:
  %0 = load <4 x i64>, <4 x i64>* bitcast (<3 x i64>* @A to <4 x i64>*), align 32
  %1 = shufflevector <4 x i64> %0, <4 x i64> undef, <3 x i32> <i32 undef, i32 undef, i32 2>
  %shuffle = shufflevector <3 x i64> <i64 1, i64 undef, i64 undef>, <3 x i64> %1, <8 x i32> <i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i64> %shuffle
}

attributes #0 = { optsize }
attributes #1 = { minsize }
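
; The functions above mix optsize (#0, as with -Os) and minsize (#1, as with
; -Oz) so that splat selection is exercised under both size attributes.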