1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; Splat of an i8 scalar into <8 x i8>: %A is inserted into every lane (0-7)
; of a zero vector. The expected output is a single dup.8b from w0.
4 define <8 x i8> @v_dup8(i8 %A) nounwind {
7 ; CHECK-NEXT: dup.8b v0, w0
9 %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
10 %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
11 %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
12 %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
13 %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
14 %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
15 %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
16 %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
; Splat of an i16 scalar into <4 x i16>: %A is inserted into lanes 0-3 of a
; zero vector. The expected output is a single dup.4h from w0.
20 define <4 x i16> @v_dup16(i16 %A) nounwind {
21 ; CHECK-LABEL: v_dup16:
23 ; CHECK-NEXT: dup.4h v0, w0
25 %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
26 %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
27 %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
28 %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
; Splat of an i32 scalar into <2 x i32>: %A is inserted into both lanes of a
; zero vector. The expected output is a single dup.2s from w0.
32 define <2 x i32> @v_dup32(i32 %A) nounwind {
33 ; CHECK-LABEL: v_dup32:
35 ; CHECK-NEXT: dup.2s v0, w0
37 %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
38 %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
; Splat of a float scalar into <2 x float> via insertelement into both lanes.
; Unlike the integer variants, the expected output duplicates from a vector
; lane (dup.2s v0, v0[0]) rather than from a general-purpose register.
42 define <2 x float> @v_dupfloat(float %A) nounwind {
43 ; CHECK-LABEL: v_dupfloat:
45 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
46 ; CHECK-NEXT: dup.2s v0, v0[0]
48 %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
49 %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
; 128-bit variant of the i8 splat: %A is inserted into all 16 lanes (0-15) of
; a zero <16 x i8>. The expected output is a single dup.16b from w0.
53 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
54 ; CHECK-LABEL: v_dupQ8:
56 ; CHECK-NEXT: dup.16b v0, w0
58 %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
59 %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
60 %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
61 %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
62 %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
63 %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
64 %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
65 %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
66 %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
67 %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
68 %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
69 %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
70 %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
71 %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
72 %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
73 %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
; 128-bit variant of the i16 splat: %A is inserted into lanes 0-7 of a zero
; <8 x i16>. The expected output is a single dup.8h from w0.
77 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
78 ; CHECK-LABEL: v_dupQ16:
80 ; CHECK-NEXT: dup.8h v0, w0
82 %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
83 %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
84 %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
85 %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
86 %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
87 %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
88 %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
89 %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
; 128-bit variant of the i32 splat: %A is inserted into lanes 0-3 of a zero
; <4 x i32>. The expected output is a single dup.4s from w0.
93 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
94 ; CHECK-LABEL: v_dupQ32:
96 ; CHECK-NEXT: dup.4s v0, w0
98 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
99 %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
100 %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
101 %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
; 128-bit variant of the float splat: %A is inserted into lanes 0-3 of a zero
; <4 x float> and the result is returned. The expected output duplicates from
; a vector lane (dup.4s v0, v0[0]).
105 define <4 x float> @v_dupQfloat(float %A) nounwind {
106 ; CHECK-LABEL: v_dupQfloat:
108 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
109 ; CHECK-NEXT: dup.4s v0, v0[0]
111 %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
112 %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
113 %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
114 %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
115 ret <4 x float> %tmp4
118 ; Check to make sure it works with shuffles, too.
; Splat expressed as insertelement into lane 0 followed by a shufflevector
; with an all-zero mask (every result lane reads lane 0). The expected output
; is the same single dup.8b from w0 as the insertelement-chain form.
120 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
121 ; CHECK-LABEL: v_shuffledup8:
123 ; CHECK-NEXT: dup.8b v0, w0
125 %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
126 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
; Shuffle-based splat of an i16: lane 0 is set to %A, then an all-zero
; shuffle mask broadcasts it. The expected output is a single dup.4h from w0.
130 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
131 ; CHECK-LABEL: v_shuffledup16:
133 ; CHECK-NEXT: dup.4h v0, w0
135 %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
136 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
; Shuffle-based splat of an i32: lane 0 is set to %A, then an all-zero
; shuffle mask broadcasts it. The expected output is a single dup.2s from w0.
140 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
141 ; CHECK-LABEL: v_shuffledup32:
143 ; CHECK-NEXT: dup.2s v0, w0
145 %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
146 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
; Shuffle-based splat of a float: lane 0 is set to %A, an all-zero shuffle
; mask broadcasts it, and the shuffled vector is returned. The expected
; output duplicates from a vector lane (dup.2s v0, v0[0]).
150 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
151 ; CHECK-LABEL: v_shuffledupfloat:
153 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
154 ; CHECK-NEXT: dup.2s v0, v0[0]
156 %tmp1 = insertelement <2 x float> undef, float %A, i32 0
157 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
158 ret <2 x float> %tmp2
; 128-bit shuffle-based splat of an i8: lane 0 is set to %A, then an all-zero
; 16-element mask broadcasts it. The expected output is a single dup.16b
; from w0.
161 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
162 ; CHECK-LABEL: v_shuffledupQ8:
164 ; CHECK-NEXT: dup.16b v0, w0
166 %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
167 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
; 128-bit shuffle-based splat of an i16: lane 0 is set to %A, then an
; all-zero 8-element mask broadcasts it. The expected output is a single
; dup.8h from w0.
171 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
172 ; CHECK-LABEL: v_shuffledupQ16:
174 ; CHECK-NEXT: dup.8h v0, w0
176 %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
177 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
; 128-bit shuffle-based splat of an i32: lane 0 is set to %A, then an
; all-zero 4-element mask broadcasts it. The expected output is a single
; dup.4s from w0.
181 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
182 ; CHECK-LABEL: v_shuffledupQ32:
184 ; CHECK-NEXT: dup.4s v0, w0
186 %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
187 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; 128-bit shuffle-based splat of a float: lane 0 is set to %A, an all-zero
; mask broadcasts it, and the shuffled vector is returned. The expected
; output duplicates from a vector lane (dup.4s v0, v0[0]).
191 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
192 ; CHECK-LABEL: v_shuffledupQfloat:
194 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
195 ; CHECK-NEXT: dup.4s v0, v0[0]
197 %tmp1 = insertelement <4 x float> undef, float %A, i32 0
198 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
199 ret <4 x float> %tmp2
; Lane splat from memory: loads an <8 x i8> through %A and broadcasts lane 1
; with an all-ones shuffle mask. The expected output is a 64-bit load
; (ldr d0) followed by dup.8b v0, v0[1].
202 define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
203 ; CHECK-LABEL: vduplane8:
205 ; CHECK-NEXT: ldr d0, [x0]
206 ; CHECK-NEXT: dup.8b v0, v0[1]
208 %tmp1 = load <8 x i8>, <8 x i8>* %A
209 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Lane splat from memory: loads a <4 x i16> through %A and broadcasts lane 1.
; The expected output is ldr d0 followed by dup.4h v0, v0[1].
213 define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
214 ; CHECK-LABEL: vduplane16:
216 ; CHECK-NEXT: ldr d0, [x0]
217 ; CHECK-NEXT: dup.4h v0, v0[1]
219 %tmp1 = load <4 x i16>, <4 x i16>* %A
220 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Lane splat from memory: loads a <2 x i32> through %A and broadcasts lane 1.
; The expected output is ldr d0 followed by dup.2s v0, v0[1].
224 define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
225 ; CHECK-LABEL: vduplane32:
227 ; CHECK-NEXT: ldr d0, [x0]
228 ; CHECK-NEXT: dup.2s v0, v0[1]
230 %tmp1 = load <2 x i32>, <2 x i32>* %A
231 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
; Lane splat from memory: loads a <2 x float> through %A, broadcasts lane 1,
; and returns the result. The expected output is ldr d0 followed by
; dup.2s v0, v0[1].
235 define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
236 ; CHECK-LABEL: vduplanefloat:
238 ; CHECK-NEXT: ldr d0, [x0]
239 ; CHECK-NEXT: dup.2s v0, v0[1]
241 %tmp1 = load <2 x float>, <2 x float>* %A
242 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
243 ret <2 x float> %tmp2
; Widening lane splat: loads a 64-bit <8 x i8> through %A and broadcasts lane
; 1 with a 16-element mask, so the shuffle result is twice as wide as its
; source. The expected output is ldr d0 followed by dup.16b v0, v0[1].
246 define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
247 ; CHECK-LABEL: vduplaneQ8:
249 ; CHECK-NEXT: ldr d0, [x0]
250 ; CHECK-NEXT: dup.16b v0, v0[1]
252 %tmp1 = load <8 x i8>, <8 x i8>* %A
253 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane splat: loads a 64-bit <4 x i16> through %A and broadcasts
; lane 1 with an 8-element mask. The expected output is ldr d0 followed by
; dup.8h v0, v0[1].
257 define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
258 ; CHECK-LABEL: vduplaneQ16:
260 ; CHECK-NEXT: ldr d0, [x0]
261 ; CHECK-NEXT: dup.8h v0, v0[1]
263 %tmp1 = load <4 x i16>, <4 x i16>* %A
264 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane splat: loads a 64-bit <2 x i32> through %A and broadcasts
; lane 1 with a 4-element mask. The expected output is ldr d0 followed by
; dup.4s v0, v0[1].
268 define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
269 ; CHECK-LABEL: vduplaneQ32:
271 ; CHECK-NEXT: ldr d0, [x0]
272 ; CHECK-NEXT: dup.4s v0, v0[1]
274 %tmp1 = load <2 x i32>, <2 x i32>* %A
275 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Widening lane splat: loads a 64-bit <2 x float> through %A, broadcasts lane
; 1 with a 4-element mask, and returns the result. The expected output is
; ldr d0 followed by dup.4s v0, v0[1].
279 define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
280 ; CHECK-LABEL: vduplaneQfloat:
282 ; CHECK-NEXT: ldr d0, [x0]
283 ; CHECK-NEXT: dup.4s v0, v0[1]
285 %tmp1 = load <2 x float>, <2 x float>* %A
286 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
287 ret <4 x float> %tmp2
; 64-bit element lane splat: broadcasts lane 1 of a <2 x i64> argument
; (mask <1, 1>). The expected output is dup.2d v0, v0[1].
290 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
292 ; CHECK: // %bb.0: // %entry
293 ; CHECK-NEXT: dup.2d v0, v0[1]
296 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; 64-bit element lane splat: broadcasts lane 0 of a <2 x i64> argument
; (mask <0, 0>). The expected output is dup.2d v0, v0[0].
300 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
302 ; CHECK: // %bb.0: // %entry
303 ; CHECK-NEXT: dup.2d v0, v0[0]
306 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
; Double-precision lane splat: broadcasts lane 1 of a <2 x double> argument
; (mask <1, 1>). The expected output is dup.2d v0, v0[1].
310 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
312 ; CHECK: // %bb.0: // %entry
313 ; CHECK-NEXT: dup.2d v0, v0[1]
316 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; Double-precision lane splat: broadcasts lane 0 of a <2 x double> argument
; (mask <0, 0>). The expected output is dup.2d v0, v0[0].
320 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
322 ; CHECK: // %bb.0: // %entry
323 ; CHECK-NEXT: dup.2d v0, v0[0]
326 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
; Non-splat build of <2 x i32> from two different scalars (%a in lane 0, %b
; in lane 1). The expected output uses no dup: an fmov of w0 into s0 followed
; by a lane insert of w1 into v0[1].
330 define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
333 ; CHECK-NEXT: fmov s0, w0
334 ; CHECK-NEXT: mov.s v0[1], w1
335 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
337 %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
338 %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
339 ret <2 x i32> %vecinit1
; Non-splat build of <4 x i32> with lane pattern {%a, %b, %b, %a}. The
; expected output is an fmov of w0 into s0 followed by three individual lane
; inserts (w1, w1, w0), with no dup.
342 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
345 ; CHECK-NEXT: fmov s0, w0
346 ; CHECK-NEXT: mov.s v0[1], w1
347 ; CHECK-NEXT: mov.s v0[2], w1
348 ; CHECK-NEXT: mov.s v0[3], w0
350 %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
351 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
352 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
353 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
354 ret <4 x i32> %vecinit3
; Non-splat build of <2 x i64> from two different scalars (%a in lane 0, %b
; in lane 1). The expected output is an fmov of x0 into d0 followed by a lane
; insert of x1 into v0[1].
357 define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
360 ; CHECK-NEXT: fmov d0, x0
361 ; CHECK-NEXT: mov.d v0[1], x1
363 %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
364 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
365 ret <2 x i64> %vecinit1
368 ; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
369 ; the single value needed was of the same type as the vector. This is false if
370 ; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
371 ; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
372 ; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
374 ; *However*, it is a dup vD.4h, vN.h[2*idx].
; Extracts lane 3 of a <4 x i32>, truncates it to i16, and inserts it into
; lane 3 of an otherwise-undef <4 x i16>. The expected output is a single
; dup.4h from halfword lane 6 of the source (2 * 3, i.e. the low half of
; 32-bit lane 3).
375 define <4 x i16> @test_build_illegal(<4 x i32> %in) {
376 ; CHECK-LABEL: test_build_illegal:
378 ; CHECK-NEXT: dup.4h v0, v0[6]
380 %val = extractelement <4 x i32> %in, i32 3
381 %smallval = trunc i32 %val to i16
382 %vec = insertelement <4x i16> undef, i16 %smallval, i32 3
387 ; We used to inherit an already extract_subvectored v4i16 from
388 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
389 ; the formation of an indexed-by-7 MLS.
; Computes %a - (splat(%v[7]) * %b) on <4 x i16>: the shuffle broadcasts lane
; 7 of the 128-bit %v into a 64-bit vector. The expected output is a single
; indexed multiply-subtract, mls.4h v0, v1, v2[7], with no separate dup.
; NOTE(review): attribute group #0 is defined outside the lines visible here.
390 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
391 ; CHECK-LABEL: test_high_splat:
392 ; CHECK: // %bb.0: // %entry
393 ; CHECK-NEXT: mls.4h v0, v1, v2[7]
396 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
397 %mul = mul <4 x i16> %shuffle, %b
398 %sub = sub <4 x i16> %a, %mul
402 ; Also test the DUP path in the PerfectShuffle generator.
; Two-input shuffle with mask <0, 0, 4, 5>: result lanes 0-1 both take %a[0],
; lanes 2-3 take %b[0] and %b[1] (mask indices 4 and 5 select from the second
; operand). The expected output is dup.4h of v0[0] followed by an ext.8b with
; v1 at byte offset 4.
404 define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
405 ; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
407 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
408 ; CHECK-NEXT: dup.4h v0, v0[0]
409 ; CHECK-NEXT: ext.8b v0, v0, v1, #4
411 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; Half-precision version of the <0, 0, 4, 5> two-input shuffle: lanes 0-1
; take %a[0], lanes 2-3 take %b[0] and %b[1]. The expected output matches the
; <4 x i16> case: dup.4h of v0[0] followed by ext.8b with v1 at byte
; offset 4.
415 define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
416 ; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
418 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
419 ; CHECK-NEXT: dup.4h v0, v0[0]
420 ; CHECK-NEXT: ext.8b v0, v0, v1, #4
422 %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; 128-bit version of the <0, 0, 4, 5> two-input shuffle on <4 x i32>: lanes
; 0-1 take %a[0], lanes 2-3 take %b[0] and %b[1]. The expected output is
; dup.4s of v0[0] followed by ext.16b with v1 at byte offset 8.
426 define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
427 ; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
429 ; CHECK-NEXT: dup.4s v0, v0[0]
430 ; CHECK-NEXT: ext.16b v0, v0, v1, #8
432 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; Single-precision version of the <0, 0, 4, 5> two-input shuffle on
; <4 x float>. The expected output matches the <4 x i32> case: dup.4s of
; v0[0] followed by ext.16b with v1 at byte offset 8.
436 define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
437 ; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
439 ; CHECK-NEXT: dup.4s v0, v0[0]
440 ; CHECK-NEXT: ext.16b v0, v0, v1, #8
442 %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; Splat hidden behind two chained shuffles. %shuf = <x1, x2, x0, x0>; %dup
; then reads only lanes 2 and 3 of %shuf (mask <3, 2, 2, 3>), both of which
; hold x0, so %dup is a broadcast of lane 0 of %x. The expected output forms
; it directly as dup.4s v1, v0[0], builds %shuf with two ext instructions,
; and stores %shuf through %p1 and %dup through %p2.
446 define void @disguised_dup(<4 x float> %x, <4 x float>* %p1, <4 x float>* %p2) {
447 ; CHECK-LABEL: disguised_dup:
449 ; CHECK-NEXT: dup.4s v1, v0[0]
450 ; CHECK-NEXT: ext.16b v0, v0, v0, #12
451 ; CHECK-NEXT: ext.16b v0, v0, v1, #8
452 ; CHECK-NEXT: str q0, [x0]
453 ; CHECK-NEXT: str q1, [x1]
455 %shuf = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
456 %dup = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
457 store <4 x float> %shuf, <4 x float>* %p1, align 8
458 store <4 x float> %dup, <4 x float>* %p2, align 8