1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; Splat of a scalar i8 built as an insertelement chain: %A is written into
; every lane (0-7) of a zeroinitializer <8 x i8>, so no zero lane survives.
; The check expects this to be recognised as a single GPR-to-vector dup.8b.
5 define <8 x i8> @v_dup8(i8 %A) nounwind {
8 ; CHECK-NEXT: dup.8b v0, w0
10 %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11 %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
12 %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
13 %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
14 %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
15 %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
16 %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
17 %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
; Splat of a scalar i16: %A is inserted into all four lanes of a
; zeroinitializer <4 x i16>. Expected codegen is a single dup.4h from w0.
21 define <4 x i16> @v_dup16(i16 %A) nounwind {
22 ; CHECK-LABEL: v_dup16:
24 ; CHECK-NEXT: dup.4h v0, w0
26 %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
27 %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
28 %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
29 %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
; Splat of a scalar i32: %A is inserted into both lanes of a
; zeroinitializer <2 x i32>. Expected codegen is a single dup.2s from w0.
33 define <2 x i32> @v_dup32(i32 %A) nounwind {
34 ; CHECK-LABEL: v_dup32:
36 ; CHECK-NEXT: dup.2s v0, w0
38 %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
39 %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
; Splat of a scalar float into both lanes of a <2 x float>. The checks expect
; a lane-0 dup of the incoming vector register (dup.2s v0, v0[0]) after the
; $s0 -> $q0 kill annotation, rather than a dup from a general-purpose
; register as in the integer variants.
43 define <2 x float> @v_dupfloat(float %A) nounwind {
44 ; CHECK-LABEL: v_dupfloat:
46 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
47 ; CHECK-NEXT: dup.2s v0, v0[0]
49 %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
50 %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
; 128-bit variant of the i8 splat: %A is inserted into all sixteen lanes
; (0-15) of a zeroinitializer <16 x i8>. Expected codegen is one dup.16b.
54 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
55 ; CHECK-LABEL: v_dupQ8:
57 ; CHECK-NEXT: dup.16b v0, w0
59 %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
60 %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
61 %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
62 %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
63 %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
64 %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
65 %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
66 %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
67 %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
68 %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
69 %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
70 %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
71 %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
72 %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
73 %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
74 %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
; 128-bit variant of the i16 splat: %A is inserted into all eight lanes of a
; zeroinitializer <8 x i16>. Expected codegen is one dup.8h from w0.
78 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
79 ; CHECK-LABEL: v_dupQ16:
81 ; CHECK-NEXT: dup.8h v0, w0
83 %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
84 %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
85 %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
86 %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
87 %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
88 %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
89 %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
90 %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
; 128-bit variant of the i32 splat: %A is inserted into all four lanes of a
; zeroinitializer <4 x i32>. Expected codegen is one dup.4s from w0.
94 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
95 ; CHECK-LABEL: v_dupQ32:
97 ; CHECK-NEXT: dup.4s v0, w0
99 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
100 %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
101 %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
102 %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
; 128-bit variant of the float splat: %A is inserted into all four lanes of a
; zeroinitializer <4 x float> and the fully populated vector is returned.
; The checks expect a lane-0 dup of the incoming vector register
; (dup.4s v0, v0[0]).
106 define <4 x float> @v_dupQfloat(float %A) nounwind {
107 ; CHECK-LABEL: v_dupQfloat:
109 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
110 ; CHECK-NEXT: dup.4s v0, v0[0]
112 %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
113 %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
114 %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
115 %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
116 ret <4 x float> %tmp4
119 ; Check to make sure it works with shuffles, too.
; Splat expressed as insert-into-lane-0 of undef followed by a shufflevector
; with an all-zero mask (every result lane reads lane 0). Expected codegen is
; the same single dup.8b as the insertelement-chain form.
121 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
122 ; CHECK-LABEL: v_shuffledup8:
124 ; CHECK-NEXT: dup.8b v0, w0
126 %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
127 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
; i16 splat via insertelement into lane 0 of undef plus an all-zero shuffle
; mask. Expected codegen is a single dup.4h from w0.
131 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
132 ; CHECK-LABEL: v_shuffledup16:
134 ; CHECK-NEXT: dup.4h v0, w0
136 %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
137 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
; i32 splat via insertelement into lane 0 of undef plus an all-zero shuffle
; mask. Expected codegen is a single dup.2s from w0.
141 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
142 ; CHECK-LABEL: v_shuffledup32:
144 ; CHECK-NEXT: dup.2s v0, w0
146 %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
147 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
; float splat via insertelement into lane 0 of undef plus an all-zero shuffle
; mask; the shuffled vector is returned. The checks expect a lane-0 dup of the
; incoming vector register (dup.2s v0, v0[0]).
151 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
152 ; CHECK-LABEL: v_shuffledupfloat:
154 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
155 ; CHECK-NEXT: dup.2s v0, v0[0]
157 %tmp1 = insertelement <2 x float> undef, float %A, i32 0
158 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
159 ret <2 x float> %tmp2
; 128-bit i8 splat via insertelement into lane 0 of undef plus an all-zero
; shuffle mask. Expected codegen is a single dup.16b from w0.
162 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
163 ; CHECK-LABEL: v_shuffledupQ8:
165 ; CHECK-NEXT: dup.16b v0, w0
167 %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
168 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
; 128-bit i16 splat via insertelement into lane 0 of undef plus an all-zero
; shuffle mask. Expected codegen is a single dup.8h from w0.
172 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
173 ; CHECK-LABEL: v_shuffledupQ16:
175 ; CHECK-NEXT: dup.8h v0, w0
177 %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
178 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
; 128-bit i32 splat via insertelement into lane 0 of undef plus an all-zero
; shuffle mask. Expected codegen is a single dup.4s from w0.
182 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
183 ; CHECK-LABEL: v_shuffledupQ32:
185 ; CHECK-NEXT: dup.4s v0, w0
187 %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
188 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; 128-bit float splat via insertelement into lane 0 of undef plus an all-zero
; shuffle mask; the shuffled vector is returned. The checks expect a lane-0
; dup of the incoming vector register (dup.4s v0, v0[0]).
192 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
193 ; CHECK-LABEL: v_shuffledupQfloat:
195 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
196 ; CHECK-NEXT: dup.4s v0, v0[0]
198 %tmp1 = insertelement <4 x float> undef, float %A, i32 0
199 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
200 ret <4 x float> %tmp2
; Broadcast of an existing vector lane: every result lane selects lane 1 of
; the <8 x i8> argument. The checks expect a lane-indexed dup.8b v0, v0[1].
203 define <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
204 ; CHECK-LABEL: vduplane8:
206 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
207 ; CHECK-NEXT: dup.8b v0, v0[1]
209 %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Broadcast of lane 1 of the <4 x i16> argument to all four result lanes.
; The checks expect a lane-indexed dup.4h v0, v0[1].
213 define <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
214 ; CHECK-LABEL: vduplane16:
216 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
217 ; CHECK-NEXT: dup.4h v0, v0[1]
219 %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Broadcast of lane 1 of the <2 x i32> argument to both result lanes.
; The checks expect a lane-indexed dup.2s v0, v0[1].
223 define <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
224 ; CHECK-LABEL: vduplane32:
226 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
227 ; CHECK-NEXT: dup.2s v0, v0[1]
229 %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
; Broadcast of lane 1 of the <2 x float> argument to both result lanes; the
; shuffled vector is returned. The checks expect dup.2s v0, v0[1].
233 define <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
234 ; CHECK-LABEL: vduplanefloat:
236 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
237 ; CHECK-NEXT: dup.2s v0, v0[1]
239 %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
240 ret <2 x float> %tmp2
; Widening lane broadcast: lane 1 of a 64-bit <8 x i8> argument is splatted
; into a 128-bit <16 x i8> result (the shuffle mask has 16 entries, all 1).
; The checks expect dup.16b v0, v0[1].
243 define <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
244 ; CHECK-LABEL: vduplaneQ8:
246 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
247 ; CHECK-NEXT: dup.16b v0, v0[1]
249 %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane broadcast: lane 1 of a <4 x i16> argument is splatted into an
; <8 x i16> result. The checks expect dup.8h v0, v0[1].
253 define <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
254 ; CHECK-LABEL: vduplaneQ16:
256 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
257 ; CHECK-NEXT: dup.8h v0, v0[1]
259 %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane broadcast: lane 1 of a <2 x i32> argument is splatted into a
; <4 x i32> result. The checks expect dup.4s v0, v0[1].
263 define <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
264 ; CHECK-LABEL: vduplaneQ32:
266 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
267 ; CHECK-NEXT: dup.4s v0, v0[1]
269 %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Widening lane broadcast: lane 1 of a <2 x float> argument is splatted into
; a <4 x float> result, which is returned. The checks expect
; dup.4s v0, v0[1].
273 define <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
274 ; CHECK-LABEL: vduplaneQfloat:
276 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
277 ; CHECK-NEXT: dup.4s v0, v0[1]
279 %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
280 ret <4 x float> %tmp2
; 64-bit element lane broadcast: both result lanes select lane 1 of the
; <2 x i64> argument. The checks expect dup.2d v0, v0[1].
283 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
285 ; CHECK: // %bb.0: // %entry
286 ; CHECK-NEXT: dup.2d v0, v0[1]
289 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; 64-bit element lane broadcast: both result lanes select lane 0 of the
; <2 x i64> argument. The checks expect dup.2d v0, v0[0].
293 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
295 ; CHECK: // %bb.0: // %entry
296 ; CHECK-NEXT: dup.2d v0, v0[0]
299 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
; Double-precision counterpart of the lane-1 broadcast: both result lanes
; select lane 1 of the <2 x double> argument (the parameter keeps the
; "int64x1_t" name even though its type is <2 x double>).
; The checks expect dup.2d v0, v0[1].
303 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
305 ; CHECK: // %bb.0: // %entry
306 ; CHECK-NEXT: dup.2d v0, v0[1]
309 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; Double-precision counterpart of the lane-0 broadcast: both result lanes
; select lane 0 of the <2 x double> argument.
; The checks expect dup.2d v0, v0[0].
313 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
315 ; CHECK: // %bb.0: // %entry
316 ; CHECK-NEXT: dup.2d v0, v0[0]
319 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
; Non-splat build vector: lane 0 is %a and lane 1 is %b, so the two lanes may
; differ. The checks expect no dup here: lane 0 is set with fmov and lane 1
; with a mov.s lane insert.
323 define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
326 ; CHECK-NEXT: fmov s0, w0
327 ; CHECK-NEXT: mov.s v0[1], w1
328 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
330 %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
331 %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
332 ret <2 x i32> %vecinit1
; Non-splat build vector with the lane pattern <%a, %b, %b, %a>. The checks
; expect an fmov for lane 0 followed by three mov.s lane inserts (w1, w1, w0)
; and no dup instruction.
335 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
338 ; CHECK-NEXT: fmov s0, w0
339 ; CHECK-NEXT: mov.s v0[1], w1
340 ; CHECK-NEXT: mov.s v0[2], w1
341 ; CHECK-NEXT: mov.s v0[3], w0
343 %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
344 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
345 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
346 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
347 ret <4 x i32> %vecinit3
; Non-splat <2 x i64> build vector: lane 0 is %a and lane 1 is %b. The checks
; expect fmov d0, x0 for lane 0 and a mov.d lane insert from x1 for lane 1.
350 define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
353 ; CHECK-NEXT: fmov d0, x0
354 ; CHECK-NEXT: mov.d v0[1], x1
356 %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
357 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
358 ret <2 x i64> %vecinit1
361 ; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
362 ; the single value needed was of the same type as the vector. This is false if
363 ; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
364 ; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
365 ; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
367 ; *However*, it is a dup vD.4h, vN.h[2*idx].
; Takes lane 3 of a <4 x i32>, truncates it to i16 and inserts it into lane 3
; of an otherwise-undef <4 x i16>. The SelectionDAG checks expect a single
; dup.4h from h-lane 6 of the input (i32 lane 3 viewed as 16-bit lanes); the
; GlobalISel checks expect a sequence of mov.h lane inserts instead.
368 define <4 x i16> @test_build_illegal(<4 x i32> %in) {
369 ; CHECK-SD-LABEL: test_build_illegal:
370 ; CHECK-SD: // %bb.0:
371 ; CHECK-SD-NEXT: dup.4h v0, v0[6]
374 ; CHECK-GI-LABEL: test_build_illegal:
375 ; CHECK-GI: // %bb.0:
376 ; CHECK-GI-NEXT: mov.h v1[1], v0[0]
377 ; CHECK-GI-NEXT: mov s0, v0[3]
378 ; CHECK-GI-NEXT: mov.h v1[2], v0[0]
379 ; CHECK-GI-NEXT: mov.h v1[3], v0[0]
380 ; CHECK-GI-NEXT: fmov d0, d1
382 %val = extractelement <4 x i32> %in, i32 3
383 %smallval = trunc i32 %val to i16
384 %vec = insertelement <4x i16> undef, i16 %smallval, i32 3
389 ; We used to inherit an already extract_subvectored v4i16 from
390 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
391 ; the formation of an indexed-by-7 MLS.
; Computes %a - (splat(%v[7]) * %b) on <4 x i16>, where the splat takes lane 7
; of an <8 x i16> source. The SelectionDAG checks expect the splat to fold
; into an indexed multiply-subtract (mls.4h ..., v2[7]); the GlobalISel
; checks expect a separate dup.8h followed by a non-indexed mls.4h.
392 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
393 ; CHECK-SD-LABEL: test_high_splat:
394 ; CHECK-SD: // %bb.0: // %entry
395 ; CHECK-SD-NEXT: mls.4h v0, v1, v2[7]
398 ; CHECK-GI-LABEL: test_high_splat:
399 ; CHECK-GI: // %bb.0: // %entry
400 ; CHECK-GI-NEXT: dup.8h v2, v2[7]
401 ; CHECK-GI-NEXT: mls.4h v0, v2, v1
404 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
405 %mul = mul <4 x i16> %shuffle, %b
406 %sub = sub <4 x i16> %a, %mul
410 ; Also test the DUP path in the PerfectShuffle generator.
; Two-input shuffle with mask <0, 0, 4, 5>: the low half of the result
; duplicates lane 0 of %a and the high half is lanes 0-1 of %b.
; The SelectionDAG checks expect trn1.4h plus a 32-bit lane move from v1;
; the GlobalISel checks expect a tbl.16b driven by a constant-pool mask.
412 define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
413 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
414 ; CHECK-SD: // %bb.0:
415 ; CHECK-SD-NEXT: trn1.4h v0, v0, v0
416 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
417 ; CHECK-SD-NEXT: mov.s v0[1], v1[0]
418 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
421 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
422 ; CHECK-GI: // %bb.0:
423 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
424 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
425 ; CHECK-GI-NEXT: adrp x8, .LCPI33_0
426 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
427 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
428 ; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
429 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
431 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; Half-precision counterpart of the <0, 0, 4, 5> shuffle: lane 0 of %a is
; duplicated into the low half and lanes 0-1 of %b form the high half.
; The SelectionDAG checks expect trn1.4h plus a 32-bit lane move from v1;
; the GlobalISel checks expect a tbl.16b driven by a constant-pool mask.
435 define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
436 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
437 ; CHECK-SD: // %bb.0:
438 ; CHECK-SD-NEXT: trn1.4h v0, v0, v0
439 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
440 ; CHECK-SD-NEXT: mov.s v0[1], v1[0]
441 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
444 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
445 ; CHECK-GI: // %bb.0:
446 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
447 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
448 ; CHECK-GI-NEXT: adrp x8, .LCPI34_0
449 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
450 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0]
451 ; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
452 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
454 %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; 128-bit i32 version of the <0, 0, 4, 5> shuffle: lane 0 of %a is duplicated
; into the low half and lanes 0-1 of %b form the high half.
; The SelectionDAG checks expect trn1.4s plus a 64-bit lane move from v1;
; the GlobalISel checks expect a two-register tbl.16b with a constant-pool
; mask.
458 define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
459 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i32:
460 ; CHECK-SD: // %bb.0:
461 ; CHECK-SD-NEXT: trn1.4s v0, v0, v0
462 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
465 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
466 ; CHECK-GI: // %bb.0:
467 ; CHECK-GI-NEXT: adrp x8, .LCPI35_0
468 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
469 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0]
470 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
471 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
473 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; 128-bit float version of the <0, 0, 4, 5> shuffle: lane 0 of %a is
; duplicated into the low half and lanes 0-1 of %b form the high half.
; The SelectionDAG checks expect trn1.4s plus a 64-bit lane move from v1;
; the GlobalISel checks expect a two-register tbl.16b with a constant-pool
; mask.
477 define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
478 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f32:
479 ; CHECK-SD: // %bb.0:
480 ; CHECK-SD-NEXT: trn1.4s v0, v0, v0
481 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
484 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
485 ; CHECK-GI: // %bb.0:
486 ; CHECK-GI-NEXT: adrp x8, .LCPI36_0
487 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
488 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
489 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
490 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
492 %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; A splat hidden behind two chained shuffles. %shuf = <x[1], x[2], x[0], x[0]>
; and %dup selects lanes <3, 2, 2, 3> of %shuf, all of which hold x[0], so
; %dup is a broadcast of lane 0 of %x. Both vectors are stored (to %p1 and
; %p2). The SelectionDAG checks expect %dup to become dup.4s v0, v0[0]; the
; GlobalISel checks expect two tbl.16b lookups instead.
496 define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
497 ; CHECK-SD-LABEL: disguised_dup:
498 ; CHECK-SD: // %bb.0:
499 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #4
500 ; CHECK-SD-NEXT: mov.s v1[2], v0[0]
501 ; CHECK-SD-NEXT: dup.4s v0, v0[0]
502 ; CHECK-SD-NEXT: str q1, [x0]
503 ; CHECK-SD-NEXT: str q0, [x1]
506 ; CHECK-GI-LABEL: disguised_dup:
507 ; CHECK-GI: // %bb.0:
508 ; CHECK-GI-NEXT: adrp x8, .LCPI37_1
509 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q0_q1
510 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_1]
511 ; CHECK-GI-NEXT: adrp x8, .LCPI37_0
512 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
513 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_0]
514 ; CHECK-GI-NEXT: tbl.16b v2, { v0, v1 }, v2
515 ; CHECK-GI-NEXT: str q0, [x0]
516 ; CHECK-GI-NEXT: str q2, [x1]
518 %shuf = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
519 %dup = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
520 store <4 x float> %shuf, ptr %p1, align 8
521 store <4 x float> %dup, ptr %p2, align 8
; Adds the splat constant 8421378 (0x808002) to a <2 x i32>. The SelectionDAG
; checks expect the constant to be built in w8 with mov/movk and broadcast
; with dup.2s; the GlobalISel checks expect a constant-pool load instead.
525 define <2 x i32> @dup_const2(<2 x i32> %A) nounwind {
526 ; CHECK-SD-LABEL: dup_const2:
527 ; CHECK-SD: // %bb.0:
528 ; CHECK-SD-NEXT: mov w8, #32770 // =0x8002
529 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
530 ; CHECK-SD-NEXT: dup.2s v1, w8
531 ; CHECK-SD-NEXT: add.2s v0, v0, v1
534 ; CHECK-GI-LABEL: dup_const2:
535 ; CHECK-GI: // %bb.0:
536 ; CHECK-GI-NEXT: adrp x8, .LCPI38_0
537 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI38_0]
538 ; CHECK-GI-NEXT: add.2s v0, v0, v1
540 %tmp2 = add <2 x i32> %A, <i32 8421378, i32 8421378>
; Adds the splat constant 8421377 (0x808001) to a <4 x i32> and then keeps
; only lanes 0-1 of the sum. The SelectionDAG checks expect the operation to
; be narrowed to a 64-bit dup.2s/add.2s; the GlobalISel checks expect a full
; add.4s with the constant loaded from the constant pool.
544 define <2 x i32> @dup_const4_ext(<4 x i32> %A) nounwind {
545 ; CHECK-SD-LABEL: dup_const4_ext:
546 ; CHECK-SD: // %bb.0:
547 ; CHECK-SD-NEXT: mov w8, #32769 // =0x8001
548 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
549 ; CHECK-SD-NEXT: dup.2s v1, w8
550 ; CHECK-SD-NEXT: add.2s v0, v0, v1
553 ; CHECK-GI-LABEL: dup_const4_ext:
554 ; CHECK-GI: // %bb.0:
555 ; CHECK-GI-NEXT: adrp x8, .LCPI39_0
556 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0]
557 ; CHECK-GI-NEXT: add.4s v0, v0, v1
558 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
560 %tmp1 = add <4 x i32> %A, <i32 8421377, i32 8421377, i32 8421377, i32 8421377>
561 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
; Uses the same splat constant 8421376 (0x808000) at two vector widths: it is
; added to the <2 x i32> %A and to the <4 x i32> %C. The narrow sum is
; concatenated with %B (mask <0, 1, 2, 3> over the two 2-lane inputs) and
; xor'ed with the wide sum. The SelectionDAG checks expect one dup.4s v3 to
; serve both add.2s and add.4s; the GlobalISel checks expect two separate
; constant-pool loads.
565 define <4 x i32> @dup_const24(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C) nounwind {
566 ; CHECK-SD-LABEL: dup_const24:
567 ; CHECK-SD: // %bb.0:
568 ; CHECK-SD-NEXT: mov w8, #32768 // =0x8000
569 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
570 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
571 ; CHECK-SD-NEXT: dup.4s v3, w8
572 ; CHECK-SD-NEXT: add.2s v0, v0, v3
573 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
574 ; CHECK-SD-NEXT: add.4s v1, v2, v3
575 ; CHECK-SD-NEXT: eor.16b v0, v1, v0
578 ; CHECK-GI-LABEL: dup_const24:
579 ; CHECK-GI: // %bb.0:
580 ; CHECK-GI-NEXT: adrp x8, .LCPI40_1
581 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
582 ; CHECK-GI-NEXT: ldr d3, [x8, :lo12:.LCPI40_1]
583 ; CHECK-GI-NEXT: adrp x8, .LCPI40_0
584 ; CHECK-GI-NEXT: add.2s v0, v0, v3
585 ; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI40_0]
586 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
587 ; CHECK-GI-NEXT: add.4s v1, v2, v3
588 ; CHECK-GI-NEXT: eor.16b v0, v1, v0
590 %tmp1 = add <2 x i32> %A, <i32 8421376, i32 8421376>
591 %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
592 %tmp3 = add <4 x i32> %C, <i32 8421376, i32 8421376, i32 8421376, i32 8421376>
593 %tmp5 = xor <4 x i32> %tmp3, %tmp4
; Reinterprets an i64 as <4 x i16> and splats lane 0 into an <8 x i16>.
; The SelectionDAG checks expect a direct dup.8h from w0; the GlobalISel
; checks expect the value to be moved to d0 first and then lane-dup'ed.
597 define <8 x i16> @bitcast_i64_v8i16(i64 %a) {
598 ; CHECK-SD-LABEL: bitcast_i64_v8i16:
599 ; CHECK-SD: // %bb.0:
600 ; CHECK-SD-NEXT: dup.8h v0, w0
603 ; CHECK-GI-LABEL: bitcast_i64_v8i16:
604 ; CHECK-GI: // %bb.0:
605 ; CHECK-GI-NEXT: fmov d0, x0
606 ; CHECK-GI-NEXT: dup.8h v0, v0[0]
608 %b = bitcast i64 %a to <4 x i16>
609 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
; Reinterprets an i64 as <4 x i16> and splats lane 1 (not lane 0) into an
; <8 x i16>. Both selectors share the checks: fmov to d0 followed by
; dup.8h v0, v0[1].
613 define <8 x i16> @bitcast_i64_v8i16_lane1(i64 %a) {
614 ; CHECK-LABEL: bitcast_i64_v8i16_lane1:
616 ; CHECK-NEXT: fmov d0, x0
617 ; CHECK-NEXT: dup.8h v0, v0[1]
619 %b = bitcast i64 %a to <4 x i16>
620 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Reinterprets a double as <4 x i16> and splats lane 0 into an <8 x i16>.
; The checks expect only the $d0 -> $q0 kill annotation and
; dup.8h v0, v0[0], with no fmov.
624 define <8 x i16> @bitcast_f64_v8i16(double %a) {
625 ; CHECK-LABEL: bitcast_f64_v8i16:
627 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
628 ; CHECK-NEXT: dup.8h v0, v0[0]
630 %b = bitcast double %a to <4 x i16>
631 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
; Reinterprets an i64 as <4 x half> and splats lane 0 into an <8 x half>.
; Both selectors share the checks: fmov to d0 followed by dup.8h v0, v0[0].
635 define <8 x half> @bitcast_i64_v8f16(i64 %a) {
636 ; CHECK-LABEL: bitcast_i64_v8f16:
638 ; CHECK-NEXT: fmov d0, x0
639 ; CHECK-NEXT: dup.8h v0, v0[0]
641 %b = bitcast i64 %a to <4 x half>
642 %r = shufflevector <4 x half> %b, <4 x half> poison, <8 x i32> zeroinitializer
; Reinterprets an i64 as <1 x i64> and splats it into a <2 x i64>. Note that
; despite the "v2f64" in the name, the result type is <2 x i64>.
; The SelectionDAG checks expect fmov to d0 plus dup.2d v0, v0[0]; the
; GlobalISel checks expect a direct dup.2d from x0.
646 define <2 x i64> @bitcast_i64_v2f64(i64 %a) {
647 ; CHECK-SD-LABEL: bitcast_i64_v2f64:
648 ; CHECK-SD: // %bb.0:
649 ; CHECK-SD-NEXT: fmov d0, x0
650 ; CHECK-SD-NEXT: dup.2d v0, v0[0]
653 ; CHECK-GI-LABEL: bitcast_i64_v2f64:
654 ; CHECK-GI: // %bb.0:
655 ; CHECK-GI-NEXT: dup.2d v0, x0
657 %b = bitcast i64 %a to <1 x i64>
658 %r = shufflevector <1 x i64> %b, <1 x i64> poison, <2 x i32> zeroinitializer
; Bitcasts <2 x double> to <2 x i64> (same element width) and splats lane 0.
; The checks expect a single dup.2d v0, v0[0].
662 define <2 x i64> @bitcast_v2f64_v2i64(<2 x double> %a) {
663 ; CHECK-LABEL: bitcast_v2f64_v2i64:
665 ; CHECK-NEXT: dup.2d v0, v0[0]
667 %b = bitcast <2 x double> %a to <2 x i64>
668 %r = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
; Bitcasts <8 x i16> to <2 x i64> (narrow to wide elements) and splats the
; resulting 64-bit lane 0. The checks expect a single dup.2d v0, v0[0].
672 define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a) {
673 ; CHECK-LABEL: bitcast_v8i16_v2i64:
675 ; CHECK-NEXT: dup.2d v0, v0[0]
677 %b = bitcast <8 x i16> %a to <2 x i64>
678 %r = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
; Bitcasts a 128-bit vector to <8 x i16> (wide to narrow elements) and splats
; the resulting 16-bit lane 0. Note that despite the "v2f64" in the name, the
; argument type is <2 x i64>. The checks expect a single dup.8h v0, v0[0].
682 define <8 x i16> @bitcast_v2f64_v8i16(<2 x i64> %a) {
683 ; CHECK-LABEL: bitcast_v2f64_v8i16:
685 ; CHECK-NEXT: dup.8h v0, v0[0]
687 %b = bitcast <2 x i64> %a to <8 x i16>
688 %r = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer