1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
5 define <8 x i8> @v_dup8(i8 %A) nounwind {
8 ; CHECK-NEXT: dup.8b v0, w0
10 %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11 %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
12 %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
13 %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
14 %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
15 %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
16 %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
17 %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
21 define <4 x i16> @v_dup16(i16 %A) nounwind {
22 ; CHECK-LABEL: v_dup16:
24 ; CHECK-NEXT: dup.4h v0, w0
26 %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
27 %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
28 %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
29 %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
33 define <2 x i32> @v_dup32(i32 %A) nounwind {
34 ; CHECK-LABEL: v_dup32:
36 ; CHECK-NEXT: dup.2s v0, w0
38 %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
39 %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
43 define <2 x float> @v_dupfloat(float %A) nounwind {
44 ; CHECK-LABEL: v_dupfloat:
46 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
47 ; CHECK-NEXT: dup.2s v0, v0[0]
49 %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
50 %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
54 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
55 ; CHECK-LABEL: v_dupQ8:
57 ; CHECK-NEXT: dup.16b v0, w0
59 %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
60 %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
61 %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
62 %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
63 %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
64 %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
65 %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
66 %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
67 %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
68 %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
69 %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
70 %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
71 %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
72 %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
73 %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
74 %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
78 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
79 ; CHECK-LABEL: v_dupQ16:
81 ; CHECK-NEXT: dup.8h v0, w0
83 %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
84 %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
85 %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
86 %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
87 %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
88 %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
89 %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
90 %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
94 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
95 ; CHECK-LABEL: v_dupQ32:
97 ; CHECK-NEXT: dup.4s v0, w0
99 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
100 %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
101 %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
102 %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
106 define <4 x float> @v_dupQfloat(float %A) nounwind {
107 ; CHECK-LABEL: v_dupQfloat:
109 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
110 ; CHECK-NEXT: dup.4s v0, v0[0]
112 %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
113 %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
114 %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
115 %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
116 ret <4 x float> %tmp4
119 ; Check that dup is also formed when the splat is expressed as a shufflevector.
121 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
122 ; CHECK-LABEL: v_shuffledup8:
124 ; CHECK-NEXT: dup.8b v0, w0
126 %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
127 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
131 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
132 ; CHECK-LABEL: v_shuffledup16:
134 ; CHECK-NEXT: dup.4h v0, w0
136 %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
137 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
141 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
142 ; CHECK-LABEL: v_shuffledup32:
144 ; CHECK-NEXT: dup.2s v0, w0
146 %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
147 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
151 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
152 ; CHECK-LABEL: v_shuffledupfloat:
154 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
155 ; CHECK-NEXT: dup.2s v0, v0[0]
157 %tmp1 = insertelement <2 x float> undef, float %A, i32 0
158 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
159 ret <2 x float> %tmp2
162 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
163 ; CHECK-LABEL: v_shuffledupQ8:
165 ; CHECK-NEXT: dup.16b v0, w0
167 %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
168 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
172 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
173 ; CHECK-LABEL: v_shuffledupQ16:
175 ; CHECK-NEXT: dup.8h v0, w0
177 %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
178 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
182 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
183 ; CHECK-LABEL: v_shuffledupQ32:
185 ; CHECK-NEXT: dup.4s v0, w0
187 %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
188 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
192 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
193 ; CHECK-LABEL: v_shuffledupQfloat:
195 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
196 ; CHECK-NEXT: dup.4s v0, v0[0]
198 %tmp1 = insertelement <4 x float> undef, float %A, i32 0
199 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
200 ret <4 x float> %tmp2
203 define <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
204 ; CHECK-LABEL: vduplane8:
206 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
207 ; CHECK-NEXT: dup.8b v0, v0[1]
209 %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
213 define <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
214 ; CHECK-LABEL: vduplane16:
216 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
217 ; CHECK-NEXT: dup.4h v0, v0[1]
219 %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
223 define <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
224 ; CHECK-LABEL: vduplane32:
226 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
227 ; CHECK-NEXT: dup.2s v0, v0[1]
229 %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
233 define <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
234 ; CHECK-LABEL: vduplanefloat:
236 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
237 ; CHECK-NEXT: dup.2s v0, v0[1]
239 %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
240 ret <2 x float> %tmp2
243 define <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
244 ; CHECK-LABEL: vduplaneQ8:
246 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
247 ; CHECK-NEXT: dup.16b v0, v0[1]
249 %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
253 define <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
254 ; CHECK-LABEL: vduplaneQ16:
256 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
257 ; CHECK-NEXT: dup.8h v0, v0[1]
259 %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
263 define <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
264 ; CHECK-LABEL: vduplaneQ32:
266 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
267 ; CHECK-NEXT: dup.4s v0, v0[1]
269 %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
273 define <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
274 ; CHECK-LABEL: vduplaneQfloat:
276 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
277 ; CHECK-NEXT: dup.4s v0, v0[1]
279 %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
280 ret <4 x float> %tmp2
283 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
285 ; CHECK: // %bb.0: // %entry
286 ; CHECK-NEXT: dup.2d v0, v0[1]
289 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
293 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
295 ; CHECK: // %bb.0: // %entry
296 ; CHECK-NEXT: dup.2d v0, v0[0]
299 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
303 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
305 ; CHECK: // %bb.0: // %entry
306 ; CHECK-NEXT: dup.2d v0, v0[1]
309 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
313 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
315 ; CHECK: // %bb.0: // %entry
316 ; CHECK-NEXT: dup.2d v0, v0[0]
319 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
323 define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
326 ; CHECK-NEXT: fmov s0, w0
327 ; CHECK-NEXT: mov.s v0[1], w1
328 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
330 %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
331 %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
332 ret <2 x i32> %vecinit1
335 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
338 ; CHECK-NEXT: fmov s0, w0
339 ; CHECK-NEXT: mov.s v0[1], w1
340 ; CHECK-NEXT: mov.s v0[2], w1
341 ; CHECK-NEXT: mov.s v0[3], w0
343 %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
344 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
345 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
346 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
347 ret <4 x i32> %vecinit3
350 define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
353 ; CHECK-NEXT: fmov d0, x0
354 ; CHECK-NEXT: mov.d v0[1], x1
356 %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
357 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
358 ret <2 x i64> %vecinit1
361 ; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
362 ; the single value needed was of the same type as the vector. This is false if
363 ; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
364 ; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
365 ; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
367 ; *However*, it is a dup vD.4h, vN.h[2*idx].
368 define <4 x i16> @test_build_illegal(<4 x i32> %in) {
369 ; CHECK-SD-LABEL: test_build_illegal:
370 ; CHECK-SD: // %bb.0:
371 ; CHECK-SD-NEXT: dup.4h v0, v0[6]
374 ; CHECK-GI-LABEL: test_build_illegal:
375 ; CHECK-GI: // %bb.0:
376 ; CHECK-GI-NEXT: mov s0, v0[3]
377 ; CHECK-GI-NEXT: mov.h v0[3], v0[0]
378 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
380 %val = extractelement <4 x i32> %in, i32 3
381 %smallval = trunc i32 %val to i16
382 %vec = insertelement <4x i16> undef, i16 %smallval, i32 3
387 ; We used to inherit an already extract_subvectored v4i16 from
388 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
389 ; the formation of an indexed-by-7 MLS.
390 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
391 ; CHECK-SD-LABEL: test_high_splat:
392 ; CHECK-SD: // %bb.0: // %entry
393 ; CHECK-SD-NEXT: mls.4h v0, v1, v2[7]
396 ; CHECK-GI-LABEL: test_high_splat:
397 ; CHECK-GI: // %bb.0: // %entry
398 ; CHECK-GI-NEXT: dup.8h v2, v2[7]
399 ; CHECK-GI-NEXT: mls.4h v0, v2, v1
402 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
403 %mul = mul <4 x i16> %shuffle, %b
404 %sub = sub <4 x i16> %a, %mul
408 ; Also test the DUP path in the PerfectShuffle generator.
410 define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
411 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
412 ; CHECK-SD: // %bb.0:
413 ; CHECK-SD-NEXT: trn1.4h v0, v0, v0
414 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
415 ; CHECK-SD-NEXT: mov.s v0[1], v1[0]
416 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
419 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
420 ; CHECK-GI: // %bb.0:
421 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
422 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
423 ; CHECK-GI-NEXT: adrp x8, .LCPI33_0
424 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
425 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
426 ; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
427 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
429 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
433 define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
434 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
435 ; CHECK-SD: // %bb.0:
436 ; CHECK-SD-NEXT: trn1.4h v0, v0, v0
437 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
438 ; CHECK-SD-NEXT: mov.s v0[1], v1[0]
439 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
442 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
443 ; CHECK-GI: // %bb.0:
444 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
445 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
446 ; CHECK-GI-NEXT: adrp x8, .LCPI34_0
447 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
448 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0]
449 ; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
450 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
452 %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
456 define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
457 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i32:
458 ; CHECK-SD: // %bb.0:
459 ; CHECK-SD-NEXT: trn1.4s v0, v0, v0
460 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
463 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
464 ; CHECK-GI: // %bb.0:
465 ; CHECK-GI-NEXT: adrp x8, .LCPI35_0
466 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
467 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0]
468 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
469 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
471 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
475 define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
476 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f32:
477 ; CHECK-SD: // %bb.0:
478 ; CHECK-SD-NEXT: trn1.4s v0, v0, v0
479 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
482 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
483 ; CHECK-GI: // %bb.0:
484 ; CHECK-GI-NEXT: adrp x8, .LCPI36_0
485 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
486 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
487 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
488 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
490 %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
494 define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
495 ; CHECK-SD-LABEL: disguised_dup:
496 ; CHECK-SD: // %bb.0:
497 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #4
498 ; CHECK-SD-NEXT: mov.s v1[2], v0[0]
499 ; CHECK-SD-NEXT: dup.4s v0, v0[0]
500 ; CHECK-SD-NEXT: str q1, [x0]
501 ; CHECK-SD-NEXT: str q0, [x1]
504 ; CHECK-GI-LABEL: disguised_dup:
505 ; CHECK-GI: // %bb.0:
506 ; CHECK-GI-NEXT: adrp x8, .LCPI37_1
507 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q0_q1
508 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_1]
509 ; CHECK-GI-NEXT: adrp x8, .LCPI37_0
510 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
511 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_0]
512 ; CHECK-GI-NEXT: tbl.16b v2, { v0, v1 }, v2
513 ; CHECK-GI-NEXT: str q0, [x0]
514 ; CHECK-GI-NEXT: str q2, [x1]
516 %shuf = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
517 %dup = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
518 store <4 x float> %shuf, ptr %p1, align 8
519 store <4 x float> %dup, ptr %p2, align 8
523 define <2 x i32> @dup_const2(<2 x i32> %A) nounwind {
524 ; CHECK-SD-LABEL: dup_const2:
525 ; CHECK-SD: // %bb.0:
526 ; CHECK-SD-NEXT: mov w8, #32770 // =0x8002
527 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
528 ; CHECK-SD-NEXT: dup.2s v1, w8
529 ; CHECK-SD-NEXT: add.2s v0, v0, v1
532 ; CHECK-GI-LABEL: dup_const2:
533 ; CHECK-GI: // %bb.0:
534 ; CHECK-GI-NEXT: adrp x8, .LCPI38_0
535 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI38_0]
536 ; CHECK-GI-NEXT: add.2s v0, v0, v1
538 %tmp2 = add <2 x i32> %A, <i32 8421378, i32 8421378>
542 define <2 x i32> @dup_const4_ext(<4 x i32> %A) nounwind {
543 ; CHECK-SD-LABEL: dup_const4_ext:
544 ; CHECK-SD: // %bb.0:
545 ; CHECK-SD-NEXT: mov w8, #32769 // =0x8001
546 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
547 ; CHECK-SD-NEXT: dup.2s v1, w8
548 ; CHECK-SD-NEXT: add.2s v0, v0, v1
551 ; CHECK-GI-LABEL: dup_const4_ext:
552 ; CHECK-GI: // %bb.0:
553 ; CHECK-GI-NEXT: adrp x8, .LCPI39_0
554 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0]
555 ; CHECK-GI-NEXT: add.4s v0, v0, v1
556 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
558 %tmp1 = add <4 x i32> %A, <i32 8421377, i32 8421377, i32 8421377, i32 8421377>
559 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
563 define <4 x i32> @dup_const24(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C) nounwind {
564 ; CHECK-SD-LABEL: dup_const24:
565 ; CHECK-SD: // %bb.0:
566 ; CHECK-SD-NEXT: mov w8, #32768 // =0x8000
567 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
568 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
569 ; CHECK-SD-NEXT: dup.4s v3, w8
570 ; CHECK-SD-NEXT: add.2s v0, v0, v3
571 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
572 ; CHECK-SD-NEXT: add.4s v1, v2, v3
573 ; CHECK-SD-NEXT: eor.16b v0, v1, v0
576 ; CHECK-GI-LABEL: dup_const24:
577 ; CHECK-GI: // %bb.0:
578 ; CHECK-GI-NEXT: adrp x8, .LCPI40_1
579 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
580 ; CHECK-GI-NEXT: ldr d3, [x8, :lo12:.LCPI40_1]
581 ; CHECK-GI-NEXT: adrp x8, .LCPI40_0
582 ; CHECK-GI-NEXT: add.2s v0, v0, v3
583 ; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI40_0]
584 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
585 ; CHECK-GI-NEXT: add.4s v1, v2, v3
586 ; CHECK-GI-NEXT: eor.16b v0, v1, v0
588 %tmp1 = add <2 x i32> %A, <i32 8421376, i32 8421376>
589 %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
590 %tmp3 = add <4 x i32> %C, <i32 8421376, i32 8421376, i32 8421376, i32 8421376>
591 %tmp5 = xor <4 x i32> %tmp3, %tmp4
595 define <8 x i16> @bitcast_i64_v8i16(i64 %a) {
596 ; CHECK-SD-LABEL: bitcast_i64_v8i16:
597 ; CHECK-SD: // %bb.0:
598 ; CHECK-SD-NEXT: dup.8h v0, w0
601 ; CHECK-GI-LABEL: bitcast_i64_v8i16:
602 ; CHECK-GI: // %bb.0:
603 ; CHECK-GI-NEXT: fmov d0, x0
604 ; CHECK-GI-NEXT: dup.8h v0, v0[0]
606 %b = bitcast i64 %a to <4 x i16>
607 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
611 define <8 x i16> @bitcast_i64_v8i16_lane1(i64 %a) {
612 ; CHECK-LABEL: bitcast_i64_v8i16_lane1:
614 ; CHECK-NEXT: fmov d0, x0
615 ; CHECK-NEXT: dup.8h v0, v0[1]
617 %b = bitcast i64 %a to <4 x i16>
618 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
622 define <8 x i16> @bitcast_f64_v8i16(double %a) {
623 ; CHECK-LABEL: bitcast_f64_v8i16:
625 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
626 ; CHECK-NEXT: dup.8h v0, v0[0]
628 %b = bitcast double %a to <4 x i16>
629 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
633 define <8 x half> @bitcast_i64_v8f16(i64 %a) {
634 ; CHECK-LABEL: bitcast_i64_v8f16:
636 ; CHECK-NEXT: fmov d0, x0
637 ; CHECK-NEXT: dup.8h v0, v0[0]
639 %b = bitcast i64 %a to <4 x half>
640 %r = shufflevector <4 x half> %b, <4 x half> poison, <8 x i32> zeroinitializer
644 define <2 x i64> @bitcast_i64_v2f64(i64 %a) {
645 ; CHECK-SD-LABEL: bitcast_i64_v2f64:
646 ; CHECK-SD: // %bb.0:
647 ; CHECK-SD-NEXT: fmov d0, x0
648 ; CHECK-SD-NEXT: dup.2d v0, v0[0]
651 ; CHECK-GI-LABEL: bitcast_i64_v2f64:
652 ; CHECK-GI: // %bb.0:
653 ; CHECK-GI-NEXT: dup.2d v0, x0
655 %b = bitcast i64 %a to <1 x i64>
656 %r = shufflevector <1 x i64> %b, <1 x i64> poison, <2 x i32> zeroinitializer
660 define <2 x i64> @bitcast_v2f64_v2i64(<2 x double> %a) {
661 ; CHECK-LABEL: bitcast_v2f64_v2i64:
663 ; CHECK-NEXT: dup.2d v0, v0[0]
665 %b = bitcast <2 x double> %a to <2 x i64>
666 %r = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
670 define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a) {
671 ; CHECK-LABEL: bitcast_v8i16_v2i64:
673 ; CHECK-NEXT: dup.2d v0, v0[0]
675 %b = bitcast <8 x i16> %a to <2 x i64>
676 %r = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
680 define <8 x i16> @bitcast_v2f64_v8i16(<2 x i64> %a) {
681 ; CHECK-LABEL: bitcast_v2f64_v8i16:
683 ; CHECK-NEXT: dup.8h v0, v0[0]
685 %b = bitcast <2 x i64> %a to <8 x i16>
686 %r = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer