1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
; Splat built lane by lane: %A is inserted into every lane (0-7) of an
; <8 x i8> that starts as zeroinitializer, so all eight lanes end up equal to %A.
3 define <8 x i8> @v_dup8(i8 %A) nounwind {
6 %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
7 %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
8 %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
9 %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
10 %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
11 %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
12 %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
13 %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
; Splat built lane by lane: %A is inserted into every lane (0-3) of a
; <4 x i16> that starts as zeroinitializer.
17 define <4 x i16> @v_dup16(i16 %A) nounwind {
18 ;CHECK-LABEL: v_dup16:
20 %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
21 %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
22 %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
23 %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
; Splat built lane by lane: %A is inserted into both lanes of a <2 x i32>
; that starts as zeroinitializer.
27 define <2 x i32> @v_dup32(i32 %A) nounwind {
28 ;CHECK-LABEL: v_dup32:
30 %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
31 %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
; Floating-point variant: %A is inserted into both lanes of a <2 x float>
; that starts as zeroinitializer.
35 define <2 x float> @v_dupfloat(float %A) nounwind {
36 ;CHECK-LABEL: v_dupfloat:
38 %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
39 %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
; 16-lane variant: %A is inserted into every lane (0-15) of a <16 x i8>
; that starts as zeroinitializer, so all sixteen lanes end up equal to %A.
43 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
44 ;CHECK-LABEL: v_dupQ8:
46 %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
47 %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
48 %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
49 %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
50 %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
51 %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
52 %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
53 %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
54 %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
55 %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
56 %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
57 %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
58 %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
59 %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
60 %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
61 %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
; %A is inserted into every lane (0-7) of an <8 x i16> that starts as
; zeroinitializer.
65 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
66 ;CHECK-LABEL: v_dupQ16:
68 %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
69 %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
70 %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
71 %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
72 %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
73 %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
74 %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
75 %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
; %A is inserted into every lane (0-3) of a <4 x i32> that starts as
; zeroinitializer.
79 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
80 ;CHECK-LABEL: v_dupQ32:
82 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
83 %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
84 %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
85 %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
; Floating-point variant: %A is inserted into every lane (0-3) of a
; <4 x float> that starts as zeroinitializer.
89 define <4 x float> @v_dupQfloat(float %A) nounwind {
90 ;CHECK-LABEL: v_dupQfloat:
92 %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
93 %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
94 %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
95 %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
99 ; Check to make sure it works with shuffles, too.
; Splat expressed as a shuffle: %A goes into lane 0 of an undef <8 x i8>, then
; an all-zero shuffle mask selects that lane for each of the 8 result lanes.
101 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
102 ;CHECK-LABEL: v_shuffledup8:
104 %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
105 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
; Splat expressed as a shuffle: %A goes into lane 0 of an undef <4 x i16>, then
; an all-zero shuffle mask selects that lane for each of the 4 result lanes.
109 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
110 ;CHECK-LABEL: v_shuffledup16:
112 %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
113 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
; Splat expressed as a shuffle: %A goes into lane 0 of an undef <2 x i32>, then
; an all-zero shuffle mask selects that lane for both result lanes.
117 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
118 ;CHECK-LABEL: v_shuffledup32:
120 %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
121 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
; Floating-point shuffle splat: %A goes into lane 0 of an undef <2 x float>,
; then an all-zero shuffle mask selects that lane for both result lanes.
125 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
126 ;CHECK-LABEL: v_shuffledupfloat:
128 %tmp1 = insertelement <2 x float> undef, float %A, i32 0
129 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
130 ret <2 x float> %tmp2
; Splat expressed as a shuffle: %A goes into lane 0 of an undef <16 x i8>, then
; an all-zero shuffle mask selects that lane for each of the 16 result lanes.
133 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
134 ;CHECK-LABEL: v_shuffledupQ8:
136 %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
137 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
; Splat expressed as a shuffle: %A goes into lane 0 of an undef <8 x i16>, then
; an all-zero shuffle mask selects that lane for each of the 8 result lanes.
141 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
142 ;CHECK-LABEL: v_shuffledupQ16:
144 %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
145 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
; Splat expressed as a shuffle: %A goes into lane 0 of an undef <4 x i32>, then
; an all-zero shuffle mask selects that lane for each of the 4 result lanes.
149 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
150 ;CHECK-LABEL: v_shuffledupQ32:
152 %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
153 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; Floating-point shuffle splat: %A goes into lane 0 of an undef <4 x float>,
; then an all-zero shuffle mask selects that lane for each of the 4 result lanes.
157 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
158 ;CHECK-LABEL: v_shuffledupQfloat:
160 %tmp1 = insertelement <4 x float> undef, float %A, i32 0
161 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
162 ret <4 x float> %tmp2
; Lane broadcast: loads an <8 x i8> from %A and copies its lane 1 into all
; 8 result lanes (every shuffle-mask entry is 1).
165 define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
166 ;CHECK-LABEL: vduplane8:
168 %tmp1 = load <8 x i8>, <8 x i8>* %A
169 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Lane broadcast: loads a <4 x i16> from %A and copies its lane 1 into all
; 4 result lanes (every shuffle-mask entry is 1).
173 define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
174 ;CHECK-LABEL: vduplane16:
176 %tmp1 = load <4 x i16>, <4 x i16>* %A
177 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Lane broadcast: loads a <2 x i32> from %A and copies its lane 1 into both
; result lanes (every shuffle-mask entry is 1).
181 define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
182 ;CHECK-LABEL: vduplane32:
184 %tmp1 = load <2 x i32>, <2 x i32>* %A
185 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
; Floating-point lane broadcast: loads a <2 x float> from %A and copies its
; lane 1 into both result lanes (every shuffle-mask entry is 1).
189 define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
190 ;CHECK-LABEL: vduplanefloat:
192 %tmp1 = load <2 x float>, <2 x float>* %A
193 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
194 ret <2 x float> %tmp2
; Widening lane broadcast: the source is an 8-lane <8 x i8> loaded from %A, but
; the shuffle produces a 16-lane <16 x i8> whose every lane is source lane 1.
197 define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
198 ;CHECK-LABEL: vduplaneQ8:
200 %tmp1 = load <8 x i8>, <8 x i8>* %A
201 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane broadcast: the source is a 4-lane <4 x i16> loaded from %A, but
; the shuffle produces an 8-lane <8 x i16> whose every lane is source lane 1.
205 define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
206 ;CHECK-LABEL: vduplaneQ16:
208 %tmp1 = load <4 x i16>, <4 x i16>* %A
209 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane broadcast: the source is a 2-lane <2 x i32> loaded from %A, but
; the shuffle produces a 4-lane <4 x i32> whose every lane is source lane 1.
213 define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
214 ;CHECK-LABEL: vduplaneQ32:
216 %tmp1 = load <2 x i32>, <2 x i32>* %A
217 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Widening floating-point lane broadcast: the source is a 2-lane <2 x float>
; loaded from %A; the result is a <4 x float> whose every lane is source lane 1.
221 define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
222 ;CHECK-LABEL: vduplaneQfloat:
224 %tmp1 = load <2 x float>, <2 x float>* %A
225 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
226 ret <4 x float> %tmp2
; Broadcasts lane 1 of the <2 x i64> argument into both result lanes
; (shuffle mask <1, 1>).
229 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
233 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; Broadcasts lane 0 of the <2 x i64> argument into both result lanes
; (shuffle mask <0, 0>).
237 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
241 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
; Broadcasts lane 1 of the <2 x double> argument into both result lanes
; (shuffle mask <1, 1>).
245 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
249 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; Broadcasts lane 0 of the <2 x double> argument into both result lanes
; (shuffle mask <0, 0>).
253 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
257 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
; Not a splat: builds a <2 x i32> from two different scalars, %a in lane 0 and
; %b in lane 1. The CHECK-NEXT lines expect %a to be moved in with fmov and %b
; to be inserted into lane 1 with mov.s.
261 define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
263 ; CHECK-NEXT: fmov s0, w0
264 ; CHECK-NEXT: mov.s v0[1], w1
266 %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
267 %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
268 ret <2 x i32> %vecinit1
; Builds the <4 x i32> { %a, %b, %b, %a } from two scalars. The CHECK-NEXT
; lines expect an fmov for lane 0 followed by one mov.s lane insert for each of
; lanes 1-3 (w1, w1, w0), matching the element order built below.
271 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
273 ; CHECK-NEXT: fmov s0, w0
274 ; CHECK-NEXT: mov.s v0[1], w1
275 ; CHECK-NEXT: mov.s v0[2], w1
276 ; CHECK-NEXT: mov.s v0[3], w0
278 %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
279 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
280 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
281 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
282 ret <4 x i32> %vecinit3
; 64-bit element variant: builds a <2 x i64> with %a in lane 0 and %b in
; lane 1. The CHECK-NEXT lines expect fmov d0, x0 followed by a mov.d insert
; of x1 into lane 1.
285 define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
287 ; CHECK-NEXT: fmov d0, x0
288 ; CHECK-NEXT: mov.d v0[1], x1
290 %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
291 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
292 ret <2 x i64> %vecinit1
295 ; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
296 ; the single value needed was of the same type as the vector. This is false if
297 ; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
298 ; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
299 ; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
301 ; *However*, it is a dup vD.4h, vN.h[2*idx].
; Takes lane 3 of the <4 x i32> input, truncates it to i16 and inserts it into
; lane 3 of an otherwise-undef <4 x i16>. The CHECK line expects a dup.4h from
; h-lane 6 of v0, i.e. 2 * 3 for the i32 lane index extracted below.
302 define <4 x i16> @test_build_illegal(<4 x i32> %in) {
303 ; CHECK-LABEL: test_build_illegal:
304 ; CHECK: dup.4h v0, v0[6]
305 %val = extractelement <4 x i32> %in, i32 3
306 %smallval = trunc i32 %val to i16
307 %vec = insertelement <4x i16> undef, i16 %smallval, i32 3
312 ; We used to inherit an already extract_subvectored v4i16 from
313 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
314 ; the formation of an indexed-by-7 MLS.
; Computes %a - (splat(%v[7]) * %b) on <4 x i16>, where the splat takes lane 7
; of the 8-lane %v. The CHECK line expects this to become a single mls.4h that
; indexes lane 7 of v2 directly.
315 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
316 ; CHECK-LABEL: test_high_splat:
317 ; CHECK: mls.4h v0, v1, v2[7]
319 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
320 %mul = mul <4 x i16> %shuffle, %b
321 %sub = sub <4 x i16> %a, %mul
325 ; Also test the DUP path in the PerfectShuffle generator.
; Two-input shuffle with mask <0, 0, 4, 5>: lane 0 of %a twice, then lanes 0
; and 1 of %b. Expected code is a dup.4h of v0 lane 0 followed by an ext.8b
; that combines it with v1.
327 ; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
328 ; CHECK-NEXT: dup.4h v0, v0[0]
329 ; CHECK-NEXT: ext.8b v0, v0, v1, #4
330 define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
331 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; Half-precision variant of the <0, 0, 4, 5> shuffle: lane 0 of %a twice, then
; lanes 0 and 1 of %b. Expected code is a dup.4h of v0 lane 0 followed by an
; ext.8b that combines it with v1.
335 ; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
336 ; CHECK-NEXT: dup.4h v0, v0[0]
337 ; CHECK-NEXT: ext.8b v0, v0, v1, #4
339 define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
340 %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; <4 x i32> variant of the <0, 0, 4, 5> shuffle: lane 0 of %a twice, then lanes
; 0 and 1 of %b. Expected code is a dup.4s of v0 lane 0 followed by an ext.16b
; that combines it with v1.
344 ; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
345 ; CHECK-NEXT: dup.4s v0, v0[0]
346 ; CHECK-NEXT: ext.16b v0, v0, v1, #8
348 define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
349 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; <4 x float> variant of the <0, 0, 4, 5> shuffle: lane 0 of %a twice, then
; lanes 0 and 1 of %b. Expected code is a dup.4s of v0 lane 0 followed by an
; ext.16b that combines it with v1.
353 ; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
354 ; CHECK-NEXT: dup.4s v0, v0[0]
355 ; CHECK-NEXT: ext.16b v0, v0, v1, #8
357 define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
358 %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>