1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; Splat built from an insertelement chain: %A is written into all 8 lanes of
; an <8 x i8> that starts as zeroinitializer. Expected: one dup.8b from w0.
5 define <8 x i8> @v_dup8(i8 %A) nounwind {
8 ; CHECK-NEXT: dup.8b v0, w0
10 %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11 %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
12 %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
13 %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
14 %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
15 %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
16 %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
17 %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
; Insertelement-chain splat of %A into all 4 lanes of a <4 x i16>.
; Expected: one dup.4h from w0.
21 define <4 x i16> @v_dup16(i16 %A) nounwind {
22 ; CHECK-LABEL: v_dup16:
24 ; CHECK-NEXT: dup.4h v0, w0
26 %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
27 %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
28 %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
29 %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
; Insertelement-chain splat of %A into both lanes of a <2 x i32>.
; Expected: one dup.2s from w0.
33 define <2 x i32> @v_dup32(i32 %A) nounwind {
34 ; CHECK-LABEL: v_dup32:
36 ; CHECK-NEXT: dup.2s v0, w0
38 %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
39 %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
; Insertelement-chain splat of a float scalar into both lanes of a <2 x float>.
; The scalar is already in s0, so the expected form is a lane dup
; (dup.2s v0, v0[0]) rather than a dup from a general-purpose register.
43 define <2 x float> @v_dupfloat(float %A) nounwind {
44 ; CHECK-LABEL: v_dupfloat:
46 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
47 ; CHECK-NEXT: dup.2s v0, v0[0]
49 %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
50 %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
; 128-bit variant: insertelement-chain splat of %A into all 16 lanes of a
; <16 x i8>. Expected: one dup.16b from w0.
54 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
55 ; CHECK-LABEL: v_dupQ8:
57 ; CHECK-NEXT: dup.16b v0, w0
59 %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
60 %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
61 %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
62 %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
63 %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
64 %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
65 %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
66 %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
67 %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
68 %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
69 %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
70 %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
71 %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
72 %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
73 %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
74 %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
; 128-bit variant: insertelement-chain splat of %A into all 8 lanes of an
; <8 x i16>. Expected: one dup.8h from w0.
78 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
79 ; CHECK-LABEL: v_dupQ16:
81 ; CHECK-NEXT: dup.8h v0, w0
83 %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
84 %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
85 %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
86 %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
87 %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
88 %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
89 %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
90 %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
; 128-bit variant: insertelement-chain splat of %A into all 4 lanes of a
; <4 x i32>. Expected: one dup.4s from w0.
94 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
95 ; CHECK-LABEL: v_dupQ32:
97 ; CHECK-NEXT: dup.4s v0, w0
99 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
100 %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
101 %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
102 %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
; Splat of the constant 10 into a <4 x i16> (insertelement into lane 0, then a
; zero-mask shufflevector). Expected: movi.4h for the vector value, plus a
; scalar mov + strh of 10 to [x1].
106 define <4 x i16> @v_dup16_const(i16 %y, ptr %p) {
107 ; CHECK-LABEL: v_dup16_const:
109 ; CHECK-NEXT: movi.4h v0, #10
110 ; CHECK-NEXT: mov w8, #10 // =0xa
111 ; CHECK-NEXT: strh w8, [x1]
113 %i = insertelement <4 x i16> undef, i16 10, i32 0
114 %lo = shufflevector <4 x i16> %i, <4 x i16> undef, <4 x i32> zeroinitializer
; 128-bit variant: insertelement-chain splat of a float scalar into all 4 lanes
; of a <4 x float>. Expected: a lane dup from s0 (dup.4s v0, v0[0]).
119 define <4 x float> @v_dupQfloat(float %A) nounwind {
120 ; CHECK-LABEL: v_dupQfloat:
122 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
123 ; CHECK-NEXT: dup.4s v0, v0[0]
125 %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
126 %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
127 %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
128 %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
129 ret <4 x float> %tmp4
132 ; Check to make sure it works with shuffles, too.
; Splat expressed as insertelement into lane 0 of undef followed by a
; zero-mask shufflevector (<8 x i8>). Expected: one dup.8b from w0.
134 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
135 ; CHECK-LABEL: v_shuffledup8:
137 ; CHECK-NEXT: dup.8b v0, w0
139 %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
140 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
; Insertelement into lane 0 + zero-mask shufflevector splat for <4 x i16>.
; Expected: one dup.4h from w0.
144 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
145 ; CHECK-LABEL: v_shuffledup16:
147 ; CHECK-NEXT: dup.4h v0, w0
149 %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
150 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
; Insertelement into lane 0 + zero-mask shufflevector splat for <2 x i32>.
; Expected: one dup.2s from w0.
154 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
155 ; CHECK-LABEL: v_shuffledup32:
157 ; CHECK-NEXT: dup.2s v0, w0
159 %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
160 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
; Insertelement into lane 0 + zero-mask shufflevector splat for <2 x float>.
; Expected: a lane dup from s0 (dup.2s v0, v0[0]).
164 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
165 ; CHECK-LABEL: v_shuffledupfloat:
167 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
168 ; CHECK-NEXT: dup.2s v0, v0[0]
170 %tmp1 = insertelement <2 x float> undef, float %A, i32 0
171 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
172 ret <2 x float> %tmp2
; 128-bit variant: insertelement into lane 0 + zero-mask shufflevector splat
; for <16 x i8>. Expected: one dup.16b from w0.
175 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
176 ; CHECK-LABEL: v_shuffledupQ8:
178 ; CHECK-NEXT: dup.16b v0, w0
180 %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
181 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
; 128-bit variant: insertelement into lane 0 + zero-mask shufflevector splat
; for <8 x i16>. Expected: one dup.8h from w0.
185 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
186 ; CHECK-LABEL: v_shuffledupQ16:
188 ; CHECK-NEXT: dup.8h v0, w0
190 %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
191 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
; 128-bit variant: insertelement into lane 0 + zero-mask shufflevector splat
; for <4 x i32>. Expected: one dup.4s from w0.
195 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
196 ; CHECK-LABEL: v_shuffledupQ32:
198 ; CHECK-NEXT: dup.4s v0, w0
200 %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
201 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; 128-bit variant: insertelement into lane 0 + zero-mask shufflevector splat
; for <4 x float>. Expected: a lane dup from s0 (dup.4s v0, v0[0]).
205 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
206 ; CHECK-LABEL: v_shuffledupQfloat:
208 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
209 ; CHECK-NEXT: dup.4s v0, v0[0]
211 %tmp1 = insertelement <4 x float> undef, float %A, i32 0
212 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
213 ret <4 x float> %tmp2
; Lane splat: every result lane selects element 1 of the <8 x i8> input.
; Expected: dup.8b v0, v0[1].
216 define <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
217 ; CHECK-LABEL: vduplane8:
219 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
220 ; CHECK-NEXT: dup.8b v0, v0[1]
222 %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Lane splat: every result lane selects element 1 of the <4 x i16> input.
; Expected: dup.4h v0, v0[1].
226 define <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
227 ; CHECK-LABEL: vduplane16:
229 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
230 ; CHECK-NEXT: dup.4h v0, v0[1]
232 %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Lane splat: both result lanes select element 1 of the <2 x i32> input.
; Expected: dup.2s v0, v0[1].
236 define <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
237 ; CHECK-LABEL: vduplane32:
239 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
240 ; CHECK-NEXT: dup.2s v0, v0[1]
242 %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
; Lane splat: both result lanes select element 1 of the <2 x float> input.
; Expected: dup.2s v0, v0[1].
246 define <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
247 ; CHECK-LABEL: vduplanefloat:
249 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
250 ; CHECK-NEXT: dup.2s v0, v0[1]
252 %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
253 ret <2 x float> %tmp2
; Widening lane splat: element 1 of a 64-bit <8 x i8> input is replicated into
; all 16 lanes of a 128-bit result. Expected: dup.16b v0, v0[1].
256 define <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
257 ; CHECK-LABEL: vduplaneQ8:
259 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
260 ; CHECK-NEXT: dup.16b v0, v0[1]
262 %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane splat: element 1 of a 64-bit <4 x i16> input is replicated into
; all 8 lanes of a 128-bit result. Expected: dup.8h v0, v0[1].
266 define <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
267 ; CHECK-LABEL: vduplaneQ16:
269 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
270 ; CHECK-NEXT: dup.8h v0, v0[1]
272 %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Widening lane splat: element 1 of a 64-bit <2 x i32> input is replicated into
; all 4 lanes of a 128-bit result. Expected: dup.4s v0, v0[1].
276 define <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
277 ; CHECK-LABEL: vduplaneQ32:
279 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
280 ; CHECK-NEXT: dup.4s v0, v0[1]
282 %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Widening lane splat: element 1 of a 64-bit <2 x float> input is replicated
; into all 4 lanes of a 128-bit result. Expected: dup.4s v0, v0[1].
286 define <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
287 ; CHECK-LABEL: vduplaneQfloat:
289 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
290 ; CHECK-NEXT: dup.4s v0, v0[1]
292 %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
293 ret <4 x float> %tmp2
; 64-bit element lane splat: both lanes select element 1 of a <2 x i64>.
; Expected: dup.2d v0, v0[1].
296 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
298 ; CHECK: // %bb.0: // %entry
299 ; CHECK-NEXT: dup.2d v0, v0[1]
302 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; 64-bit element lane splat: both lanes select element 0 of a <2 x i64>.
; Expected: dup.2d v0, v0[0].
306 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
308 ; CHECK: // %bb.0: // %entry
309 ; CHECK-NEXT: dup.2d v0, v0[0]
312 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
; Floating-point counterpart of the 64-bit lane splat: both lanes select
; element 1 of a <2 x double>. Expected: dup.2d v0, v0[1].
316 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
318 ; CHECK: // %bb.0: // %entry
319 ; CHECK-NEXT: dup.2d v0, v0[1]
322 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; Floating-point counterpart of the 64-bit lane splat: both lanes select
; element 0 of a <2 x double>. Expected: dup.2d v0, v0[0].
326 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
328 ; CHECK: // %bb.0: // %entry
329 ; CHECK-NEXT: dup.2d v0, v0[0]
332 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
; Not a splat: builds <2 x i32> {%a, %b} from two different scalars, so no dup
; is expected. The default (SD) run expects fmov for lane 0 plus one lane
; insert; the -global-isel (GI) run expects two lane inserts.
336 define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
338 ; CHECK-SD: // %bb.0:
339 ; CHECK-SD-NEXT: fmov s0, w0
340 ; CHECK-SD-NEXT: mov.s v0[1], w1
341 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
345 ; CHECK-GI: // %bb.0:
346 ; CHECK-GI-NEXT: mov.s v0[0], w0
347 ; CHECK-GI-NEXT: mov.s v0[1], w1
348 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
350 %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
351 %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
352 ret <2 x i32> %vecinit1
; Not a splat: builds <4 x i32> {%a, %b, %b, %a}. The default (SD) run expects
; fmov for lane 0 plus three lane inserts; the -global-isel (GI) run expects
; four lane inserts.
355 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
357 ; CHECK-SD: // %bb.0:
358 ; CHECK-SD-NEXT: fmov s0, w0
359 ; CHECK-SD-NEXT: mov.s v0[1], w1
360 ; CHECK-SD-NEXT: mov.s v0[2], w1
361 ; CHECK-SD-NEXT: mov.s v0[3], w0
365 ; CHECK-GI: // %bb.0:
366 ; CHECK-GI-NEXT: mov.s v0[0], w0
367 ; CHECK-GI-NEXT: mov.s v0[1], w1
368 ; CHECK-GI-NEXT: mov.s v0[2], w1
369 ; CHECK-GI-NEXT: mov.s v0[3], w0
371 %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
372 %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
373 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
374 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
375 ret <4 x i32> %vecinit3
; Not a splat: builds <2 x i64> {%a, %b}. The default (SD) run expects fmov for
; lane 0 plus one lane insert; the -global-isel (GI) run expects two lane
; inserts.
378 define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
380 ; CHECK-SD: // %bb.0:
381 ; CHECK-SD-NEXT: fmov d0, x0
382 ; CHECK-SD-NEXT: mov.d v0[1], x1
386 ; CHECK-GI: // %bb.0:
387 ; CHECK-GI-NEXT: mov.d v0[0], x0
388 ; CHECK-GI-NEXT: mov.d v0[1], x1
390 %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
391 %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
392 ret <2 x i64> %vecinit1
395 ; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
396 ; the single value needed was of the same type as the vector. This is false if
397 ; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
398 ; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
399 ; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
401 ; *However*, it is a dup vD.4h, vN.h[2*idx].
; Extracts lane 3 of a <4 x i32>, truncates it to i16 and inserts it into lane 3
; of an otherwise-undef <4 x i16>. The default (SD) run expects the halfword
; lane dup dup.4h v0, v0[6]; the -global-isel (GI) run expects a move through
; w8 and a single lane insert.
402 define <4 x i16> @test_build_illegal(<4 x i32> %in) {
403 ; CHECK-SD-LABEL: test_build_illegal:
404 ; CHECK-SD: // %bb.0:
405 ; CHECK-SD-NEXT: dup.4h v0, v0[6]
408 ; CHECK-GI-LABEL: test_build_illegal:
409 ; CHECK-GI: // %bb.0:
410 ; CHECK-GI-NEXT: mov.s w8, v0[3]
411 ; CHECK-GI-NEXT: mov.h v0[3], w8
412 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
414 %val = extractelement <4 x i32> %in, i32 3
415 %smallval = trunc i32 %val to i16
416 %vec = insertelement <4x i16> undef, i16 %smallval, i32 3
421 ; We used to inherit an already extract_subvectored v4i16 from
422 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
423 ; the formation of an indexed-by-7 MLS.
; Splats lane 7 of the <8 x i16> %v into a <4 x i16>, multiplies it by %b and
; subtracts the product from %a. The default (SD) run expects a single
; by-element mls.4h indexed with v2[7]; the -global-isel (GI) run expects a
; separate dup.8h followed by a vector mls.4h.
424 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
425 ; CHECK-SD-LABEL: test_high_splat:
426 ; CHECK-SD: // %bb.0: // %entry
427 ; CHECK-SD-NEXT: mls.4h v0, v1, v2[7]
430 ; CHECK-GI-LABEL: test_high_splat:
431 ; CHECK-GI: // %bb.0: // %entry
432 ; CHECK-GI-NEXT: dup.8h v2, v2[7]
433 ; CHECK-GI-NEXT: mls.4h v0, v2, v1
436 %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
437 %mul = mul <4 x i16> %shuffle, %b
438 %sub = sub <4 x i16> %a, %mul
442 ; Also test the DUP path in the PerfectShuffle generator.
; Two-input shuffle with mask <0, 0, 4, 5>: lanes 0-1 duplicate %a[0] and
; lanes 2-3 take the first two elements of %b (<4 x i16>). The default (SD) run
; expects trn1.4h plus a 32-bit lane move; the -global-isel (GI) run expects a
; tbl lookup with a constant-pool mask.
444 define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
445 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
446 ; CHECK-SD: // %bb.0:
447 ; CHECK-SD-NEXT: trn1.4h v0, v0, v0
448 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
449 ; CHECK-SD-NEXT: mov.s v0[1], v1[0]
450 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
453 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
454 ; CHECK-GI: // %bb.0:
455 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
456 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
457 ; CHECK-GI-NEXT: adrp x8, .LCPI34_0
458 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
459 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0]
460 ; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
461 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
463 %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; <4 x half> version of the mask <0, 0, 4, 5> shuffle: lanes 0-1 duplicate
; %a[0] and lanes 2-3 take the first two elements of %b. The default (SD) run
; expects trn1.4h plus a 32-bit lane move; the -global-isel (GI) run expects a
; tbl lookup with a constant-pool mask.
467 define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
468 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
469 ; CHECK-SD: // %bb.0:
470 ; CHECK-SD-NEXT: trn1.4h v0, v0, v0
471 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
472 ; CHECK-SD-NEXT: mov.s v0[1], v1[0]
473 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
476 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
477 ; CHECK-GI: // %bb.0:
478 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
479 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
480 ; CHECK-GI-NEXT: adrp x8, .LCPI35_0
481 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
482 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI35_0]
483 ; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
484 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
486 %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; <4 x i32> version of the mask <0, 0, 4, 5> shuffle: lanes 0-1 duplicate %a[0]
; and lanes 2-3 take the first two elements of %b. The default (SD) run expects
; trn1.4s plus a 64-bit lane move; the -global-isel (GI) run expects a
; two-register tbl lookup with a constant-pool mask.
490 define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
491 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i32:
492 ; CHECK-SD: // %bb.0:
493 ; CHECK-SD-NEXT: trn1.4s v0, v0, v0
494 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
497 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
498 ; CHECK-GI: // %bb.0:
499 ; CHECK-GI-NEXT: adrp x8, .LCPI36_0
500 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
501 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
502 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
503 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
505 %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; <4 x float> version of the mask <0, 0, 4, 5> shuffle: lanes 0-1 duplicate
; %a[0] and lanes 2-3 take the first two elements of %b. The default (SD) run
; expects trn1.4s plus a 64-bit lane move; the -global-isel (GI) run expects a
; two-register tbl lookup with a constant-pool mask.
509 define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
510 ; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f32:
511 ; CHECK-SD: // %bb.0:
512 ; CHECK-SD-NEXT: trn1.4s v0, v0, v0
513 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
516 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
517 ; CHECK-GI: // %bb.0:
518 ; CHECK-GI-NEXT: adrp x8, .LCPI37_0
519 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
520 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI37_0]
521 ; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
522 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
524 %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
; %shuf permutes %x with mask <1, 2, 0, 0>; %dup then reads only lanes 2 and 3
; of %shuf, both of which hold %x[0], so %dup is a splat of %x lane 0 in
; disguise. Both vectors are stored. The default (SD) run expects
; dup.4s v0, v0[0] for %dup; the -global-isel (GI) run expects two tbl lookups.
528 define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
529 ; CHECK-SD-LABEL: disguised_dup:
530 ; CHECK-SD: // %bb.0:
531 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #4
532 ; CHECK-SD-NEXT: mov.s v1[2], v0[0]
533 ; CHECK-SD-NEXT: dup.4s v0, v0[0]
534 ; CHECK-SD-NEXT: str q1, [x0]
535 ; CHECK-SD-NEXT: str q0, [x1]
538 ; CHECK-GI-LABEL: disguised_dup:
539 ; CHECK-GI: // %bb.0:
540 ; CHECK-GI-NEXT: adrp x8, .LCPI38_1
541 ; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q0_q1
542 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_1]
543 ; CHECK-GI-NEXT: adrp x8, .LCPI38_0
544 ; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
545 ; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_0]
546 ; CHECK-GI-NEXT: tbl.16b v2, { v0, v1 }, v2
547 ; CHECK-GI-NEXT: str q0, [x0]
548 ; CHECK-GI-NEXT: str q2, [x1]
550 %shuf = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
551 %dup = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
552 store <4 x float> %shuf, ptr %p1, align 8
553 store <4 x float> %dup, ptr %p2, align 8
; Adds the <2 x i32> splat constant 8421378 (0x808002) to %A. The default (SD)
; run expects the constant to be built in w8 with mov/movk and broadcast with
; dup.2s; the -global-isel (GI) run expects a constant-pool load instead.
557 define <2 x i32> @dup_const2(<2 x i32> %A) nounwind {
558 ; CHECK-SD-LABEL: dup_const2:
559 ; CHECK-SD: // %bb.0:
560 ; CHECK-SD-NEXT: mov w8, #32770 // =0x8002
561 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
562 ; CHECK-SD-NEXT: dup.2s v1, w8
563 ; CHECK-SD-NEXT: add.2s v0, v0, v1
566 ; CHECK-GI-LABEL: dup_const2:
567 ; CHECK-GI: // %bb.0:
568 ; CHECK-GI-NEXT: adrp x8, .LCPI39_0
569 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI39_0]
570 ; CHECK-GI-NEXT: add.2s v0, v0, v1
572 %tmp2 = add <2 x i32> %A, <i32 8421378, i32 8421378>
; Adds the <4 x i32> splat constant 8421377 to %A and then keeps only the low
; two lanes (mask <0, 1>). The default (SD) run expects the add to be narrowed
; to add.2s with a dup.2s constant; the -global-isel (GI) run expects a full
; add.4s with a constant-pool load.
576 define <2 x i32> @dup_const4_ext(<4 x i32> %A) nounwind {
577 ; CHECK-SD-LABEL: dup_const4_ext:
578 ; CHECK-SD: // %bb.0:
579 ; CHECK-SD-NEXT: mov w8, #32769 // =0x8001
580 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
581 ; CHECK-SD-NEXT: dup.2s v1, w8
582 ; CHECK-SD-NEXT: add.2s v0, v0, v1
585 ; CHECK-GI-LABEL: dup_const4_ext:
586 ; CHECK-GI: // %bb.0:
587 ; CHECK-GI-NEXT: adrp x8, .LCPI40_0
588 ; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0]
589 ; CHECK-GI-NEXT: add.4s v0, v0, v1
590 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
592 %tmp1 = add <4 x i32> %A, <i32 8421377, i32 8421377, i32 8421377, i32 8421377>
593 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
; Uses the same splat constant 8421376 at two widths: a <2 x i32> add on %A
; and a <4 x i32> add on %C. %tmp4 concatenates %tmp1 with %B, and the two
; results are xor'ed. The default (SD) run expects one dup.4s (v3) shared by
; both adds; the -global-isel (GI) run expects two separate constant-pool loads.
597 define <4 x i32> @dup_const24(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C) nounwind {
598 ; CHECK-SD-LABEL: dup_const24:
599 ; CHECK-SD: // %bb.0:
600 ; CHECK-SD-NEXT: mov w8, #32768 // =0x8000
601 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
602 ; CHECK-SD-NEXT: movk w8, #128, lsl #16
603 ; CHECK-SD-NEXT: dup.4s v3, w8
604 ; CHECK-SD-NEXT: add.2s v0, v0, v3
605 ; CHECK-SD-NEXT: mov.d v0[1], v1[0]
606 ; CHECK-SD-NEXT: add.4s v1, v2, v3
607 ; CHECK-SD-NEXT: eor.16b v0, v1, v0
610 ; CHECK-GI-LABEL: dup_const24:
611 ; CHECK-GI: // %bb.0:
612 ; CHECK-GI-NEXT: adrp x8, .LCPI41_1
613 ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
614 ; CHECK-GI-NEXT: ldr d3, [x8, :lo12:.LCPI41_1]
615 ; CHECK-GI-NEXT: adrp x8, .LCPI41_0
616 ; CHECK-GI-NEXT: add.2s v0, v0, v3
617 ; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI41_0]
618 ; CHECK-GI-NEXT: mov.d v0[1], v1[0]
619 ; CHECK-GI-NEXT: add.4s v1, v2, v3
620 ; CHECK-GI-NEXT: eor.16b v0, v1, v0
622 %tmp1 = add <2 x i32> %A, <i32 8421376, i32 8421376>
623 %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
624 %tmp3 = add <4 x i32> %C, <i32 8421376, i32 8421376, i32 8421376, i32 8421376>
625 %tmp5 = xor <4 x i32> %tmp3, %tmp4
; Bitcasts an i64 to <4 x i16> and splats lane 0 into an <8 x i16>. The default
; (SD) run expects a direct dup.8h from w0; the -global-isel (GI) run expects
; fmov into d0 followed by a lane-0 dup.
629 define <8 x i16> @bitcast_i64_v8i16(i64 %a) {
630 ; CHECK-SD-LABEL: bitcast_i64_v8i16:
631 ; CHECK-SD: // %bb.0:
632 ; CHECK-SD-NEXT: dup.8h v0, w0
635 ; CHECK-GI-LABEL: bitcast_i64_v8i16:
636 ; CHECK-GI: // %bb.0:
637 ; CHECK-GI-NEXT: fmov d0, x0
638 ; CHECK-GI-NEXT: dup.8h v0, v0[0]
640 %b = bitcast i64 %a to <4 x i16>
641 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
; Bitcasts an i64 to <4 x i16> and splats lane 1 into an <8 x i16>. Both runs
; expect fmov into d0 followed by dup.8h v0, v0[1].
645 define <8 x i16> @bitcast_i64_v8i16_lane1(i64 %a) {
646 ; CHECK-LABEL: bitcast_i64_v8i16_lane1:
648 ; CHECK-NEXT: fmov d0, x0
649 ; CHECK-NEXT: dup.8h v0, v0[1]
651 %b = bitcast i64 %a to <4 x i16>
652 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; Bitcasts a double to <4 x i16> and splats lane 0 into an <8 x i16>. The value
; is already in d0, so both runs expect only dup.8h v0, v0[0].
656 define <8 x i16> @bitcast_f64_v8i16(double %a) {
657 ; CHECK-LABEL: bitcast_f64_v8i16:
659 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
660 ; CHECK-NEXT: dup.8h v0, v0[0]
662 %b = bitcast double %a to <4 x i16>
663 %r = shufflevector <4 x i16> %b, <4 x i16> poison, <8 x i32> zeroinitializer
; Bitcasts an i64 to <4 x half> and splats lane 0 into an <8 x half>. Both runs
; expect fmov into d0 followed by dup.8h v0, v0[0].
667 define <8 x half> @bitcast_i64_v8f16(i64 %a) {
668 ; CHECK-LABEL: bitcast_i64_v8f16:
670 ; CHECK-NEXT: fmov d0, x0
671 ; CHECK-NEXT: dup.8h v0, v0[0]
673 %b = bitcast i64 %a to <4 x half>
674 %r = shufflevector <4 x half> %b, <4 x half> poison, <8 x i32> zeroinitializer
; Bitcasts an i64 to <1 x i64> and splats it into a <2 x i64>. Note that,
; despite the "v2f64" in the name, the result type here is <2 x i64>. The
; default (SD) run expects fmov + lane dup; the -global-isel (GI) run expects a
; direct dup.2d from x0.
678 define <2 x i64> @bitcast_i64_v2f64(i64 %a) {
679 ; CHECK-SD-LABEL: bitcast_i64_v2f64:
680 ; CHECK-SD: // %bb.0:
681 ; CHECK-SD-NEXT: fmov d0, x0
682 ; CHECK-SD-NEXT: dup.2d v0, v0[0]
685 ; CHECK-GI-LABEL: bitcast_i64_v2f64:
686 ; CHECK-GI: // %bb.0:
687 ; CHECK-GI-NEXT: dup.2d v0, x0
689 %b = bitcast i64 %a to <1 x i64>
690 %r = shufflevector <1 x i64> %b, <1 x i64> poison, <2 x i32> zeroinitializer
; Bitcasts <2 x double> to <2 x i64> (same element width) and splats lane 0.
; Both runs expect only dup.2d v0, v0[0].
694 define <2 x i64> @bitcast_v2f64_v2i64(<2 x double> %a) {
695 ; CHECK-LABEL: bitcast_v2f64_v2i64:
697 ; CHECK-NEXT: dup.2d v0, v0[0]
699 %b = bitcast <2 x double> %a to <2 x i64>
700 %r = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
; Bitcasts <8 x i16> to <2 x i64> (narrow-to-wide elements) and splats lane 0
; of the result. Both runs expect only dup.2d v0, v0[0].
704 define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a) {
705 ; CHECK-LABEL: bitcast_v8i16_v2i64:
707 ; CHECK-NEXT: dup.2d v0, v0[0]
709 %b = bitcast <8 x i16> %a to <2 x i64>
710 %r = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
; Bitcasts a 128-bit vector to <8 x i16> (wide-to-narrow elements) and splats
; lane 0. Note that, despite the "v2f64" in the name, the parameter type here
; is <2 x i64>. Both runs expect only dup.8h v0, v0[0].
714 define <8 x i16> @bitcast_v2f64_v8i16(<2 x i64> %a) {
715 ; CHECK-LABEL: bitcast_v2f64_v8i16:
717 ; CHECK-NEXT: dup.8h v0, v0[0]
719 %b = bitcast <2 x i64> %a to <8 x i16>
720 %r = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer
; Returns the constant <4 x i16> splat of 9211 (0x23fb). The default (SD) run
; expects the value to be moved into w8 and broadcast with dup.4h; the
; -global-isel (GI) run expects a constant-pool load.
724 define <4 x i16> @dup_i16_v4i16_constant() {
725 ; CHECK-SD-LABEL: dup_i16_v4i16_constant:
726 ; CHECK-SD: // %bb.0:
727 ; CHECK-SD-NEXT: mov w8, #9211 // =0x23fb
728 ; CHECK-SD-NEXT: dup.4h v0, w8
731 ; CHECK-GI-LABEL: dup_i16_v4i16_constant:
732 ; CHECK-GI: // %bb.0:
733 ; CHECK-GI-NEXT: adrp x8, .LCPI50_0
734 ; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI50_0]
736 ret <4 x i16> <i16 9211, i16 9211, i16 9211, i16 9211>