1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s
; Inserts the scalar %A into every lane of an <8 x i8>; the checks expect the
; whole insertelement chain to collapse into a single 'vdup.8 d16, r0'.
4 define <8 x i8> @v_dup8(i8 %A) nounwind {
7 ; CHECK-NEXT: vdup.8 d16, r0
8 ; CHECK-NEXT: vmov r0, r1, d16
9 ; CHECK-NEXT: mov pc, lr
10 %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11 %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
12 %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
13 %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
14 %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
15 %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
16 %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
17 %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
; Inserts %A into all four lanes of a <4 x i16>; the checks expect a single
; 'vdup.16 d16, r0'.
21 define <4 x i16> @v_dup16(i16 %A) nounwind {
22 ; CHECK-LABEL: v_dup16:
24 ; CHECK-NEXT: vdup.16 d16, r0
25 ; CHECK-NEXT: vmov r0, r1, d16
26 ; CHECK-NEXT: mov pc, lr
27 %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
28 %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
29 %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
30 %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
; Inserts %A into both lanes of a <2 x i32>; the checks expect a single
; 'vdup.32 d16, r0'.
34 define <2 x i32> @v_dup32(i32 %A) nounwind {
35 ; CHECK-LABEL: v_dup32:
37 ; CHECK-NEXT: vdup.32 d16, r0
38 ; CHECK-NEXT: vmov r0, r1, d16
39 ; CHECK-NEXT: mov pc, lr
40 %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
41 %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
; Float variant of v_dup32: %A is inserted into both lanes of a <2 x float>.
; The checked output duplicates straight from r0 with 'vdup.32 d16, r0'.
45 define <2 x float> @v_dupfloat(float %A) nounwind {
46 ; CHECK-LABEL: v_dupfloat:
48 ; CHECK-NEXT: vdup.32 d16, r0
49 ; CHECK-NEXT: vmov r0, r1, d16
50 ; CHECK-NEXT: mov pc, lr
51 %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
52 %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
; 128-bit version of v_dup8: %A is inserted into all sixteen lanes of a
; <16 x i8>. The checks expect 'vdup.8 q8, r0', with the result then moved out
; through d16 and d17.
56 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
57 ; CHECK-LABEL: v_dupQ8:
59 ; CHECK-NEXT: vdup.8 q8, r0
60 ; CHECK-NEXT: vmov r0, r1, d16
61 ; CHECK-NEXT: vmov r2, r3, d17
62 ; CHECK-NEXT: mov pc, lr
63 %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
64 %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
65 %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
66 %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
67 %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
68 %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
69 %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
70 %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
71 %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
72 %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
73 %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
74 %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
75 %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
76 %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
77 %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
78 %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
; Inserts %A into all eight lanes of an <8 x i16>; the checks expect a single
; 'vdup.16 q8, r0'.
82 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
83 ; CHECK-LABEL: v_dupQ16:
85 ; CHECK-NEXT: vdup.16 q8, r0
86 ; CHECK-NEXT: vmov r0, r1, d16
87 ; CHECK-NEXT: vmov r2, r3, d17
88 ; CHECK-NEXT: mov pc, lr
89 %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
90 %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
91 %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
92 %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
93 %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
94 %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
95 %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
96 %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
; Inserts %A into all four lanes of a <4 x i32>; the checks expect a single
; 'vdup.32 q8, r0'.
100 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
101 ; CHECK-LABEL: v_dupQ32:
103 ; CHECK-NEXT: vdup.32 q8, r0
104 ; CHECK-NEXT: vmov r0, r1, d16
105 ; CHECK-NEXT: vmov r2, r3, d17
106 ; CHECK-NEXT: mov pc, lr
107 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
108 %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
109 %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
110 %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
; Float variant of v_dupQ32: %A is inserted into all four lanes of a
; <4 x float>. The checked output is 'vdup.32 q8, r0'.
114 define <4 x float> @v_dupQfloat(float %A) nounwind {
115 ; CHECK-LABEL: v_dupQfloat:
117 ; CHECK-NEXT: vdup.32 q8, r0
118 ; CHECK-NEXT: vmov r0, r1, d16
119 ; CHECK-NEXT: vmov r2, r3, d17
120 ; CHECK-NEXT: mov pc, lr
121 %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
122 %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
123 %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
124 %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
125 ret <4 x float> %tmp4
128 ; Check to make sure it works with shuffles, too.
; Splat written as insertelement into lane 0 followed by a shufflevector with
; an all-zero mask; the checks expect the same 'vdup.8 d16, r0' as the
; insertelement-chain form.
130 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
131 ; CHECK-LABEL: v_shuffledup8:
133 ; CHECK-NEXT: vdup.8 d16, r0
134 ; CHECK-NEXT: vmov r0, r1, d16
135 ; CHECK-NEXT: mov pc, lr
136 %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
137 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
; insertelement into lane 0 plus an all-zero shuffle mask on <4 x i16>; the
; checks expect 'vdup.16 d16, r0'.
141 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
142 ; CHECK-LABEL: v_shuffledup16:
144 ; CHECK-NEXT: vdup.16 d16, r0
145 ; CHECK-NEXT: vmov r0, r1, d16
146 ; CHECK-NEXT: mov pc, lr
147 %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
148 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
; insertelement into lane 0 plus an all-zero shuffle mask on <2 x i32>; the
; checks expect 'vdup.32 d16, r0'.
152 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
153 ; CHECK-LABEL: v_shuffledup32:
155 ; CHECK-NEXT: vdup.32 d16, r0
156 ; CHECK-NEXT: vmov r0, r1, d16
157 ; CHECK-NEXT: mov pc, lr
158 %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
159 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
; Float variant of v_shuffledup32: lane-0 insert plus all-zero shuffle mask on
; <2 x float>; the checks expect 'vdup.32 d16, r0'.
163 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
164 ; CHECK-LABEL: v_shuffledupfloat:
166 ; CHECK-NEXT: vdup.32 d16, r0
167 ; CHECK-NEXT: vmov r0, r1, d16
168 ; CHECK-NEXT: mov pc, lr
169 %tmp1 = insertelement <2 x float> undef, float %A, i32 0
170 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
171 ret <2 x float> %tmp2
; 128-bit shuffle-based splat: lane-0 insert plus all-zero mask on <16 x i8>;
; the checks expect 'vdup.8 q8, r0'.
174 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
175 ; CHECK-LABEL: v_shuffledupQ8:
177 ; CHECK-NEXT: vdup.8 q8, r0
178 ; CHECK-NEXT: vmov r0, r1, d16
179 ; CHECK-NEXT: vmov r2, r3, d17
180 ; CHECK-NEXT: mov pc, lr
181 %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
182 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
; Lane-0 insert plus all-zero shuffle mask on <8 x i16>; the checks expect
; 'vdup.16 q8, r0'.
186 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
187 ; CHECK-LABEL: v_shuffledupQ16:
189 ; CHECK-NEXT: vdup.16 q8, r0
190 ; CHECK-NEXT: vmov r0, r1, d16
191 ; CHECK-NEXT: vmov r2, r3, d17
192 ; CHECK-NEXT: mov pc, lr
193 %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
194 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
; Lane-0 insert plus all-zero shuffle mask on <4 x i32>; the checks expect
; 'vdup.32 q8, r0'.
198 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
199 ; CHECK-LABEL: v_shuffledupQ32:
201 ; CHECK-NEXT: vdup.32 q8, r0
202 ; CHECK-NEXT: vmov r0, r1, d16
203 ; CHECK-NEXT: vmov r2, r3, d17
204 ; CHECK-NEXT: mov pc, lr
205 %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
206 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
; Float variant of v_shuffledupQ32: lane-0 insert plus all-zero shuffle mask
; on <4 x float>; the checks expect 'vdup.32 q8, r0'.
210 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
211 ; CHECK-LABEL: v_shuffledupQfloat:
213 ; CHECK-NEXT: vdup.32 q8, r0
214 ; CHECK-NEXT: vmov r0, r1, d16
215 ; CHECK-NEXT: vmov r2, r3, d17
216 ; CHECK-NEXT: mov pc, lr
217 %tmp1 = insertelement <4 x float> undef, float %A, i32 0
218 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
219 ret <4 x float> %tmp2
; Loads an <8 x i8> and broadcasts its lane 1 with a shufflevector; the checks
; expect the lane form 'vdup.8 d16, d16[1]' after the vldr.
222 define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
223 ; CHECK-LABEL: vduplane8:
225 ; CHECK-NEXT: vldr d16, [r0]
226 ; CHECK-NEXT: vdup.8 d16, d16[1]
227 ; CHECK-NEXT: vmov r0, r1, d16
228 ; CHECK-NEXT: mov pc, lr
229 %tmp1 = load <8 x i8>, <8 x i8>* %A
230 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Loads a <4 x i16> and broadcasts lane 1; the checks expect
; 'vdup.16 d16, d16[1]'.
234 define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
235 ; CHECK-LABEL: vduplane16:
237 ; CHECK-NEXT: vldr d16, [r0]
238 ; CHECK-NEXT: vdup.16 d16, d16[1]
239 ; CHECK-NEXT: vmov r0, r1, d16
240 ; CHECK-NEXT: mov pc, lr
241 %tmp1 = load <4 x i16>, <4 x i16>* %A
242 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Loads a <2 x i32> and broadcasts lane 1; the checks expect
; 'vdup.32 d16, d16[1]'.
246 define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
247 ; CHECK-LABEL: vduplane32:
249 ; CHECK-NEXT: vldr d16, [r0]
250 ; CHECK-NEXT: vdup.32 d16, d16[1]
251 ; CHECK-NEXT: vmov r0, r1, d16
252 ; CHECK-NEXT: mov pc, lr
253 %tmp1 = load <2 x i32>, <2 x i32>* %A
254 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
; Float variant of vduplane32: loads a <2 x float> and broadcasts lane 1; the
; checks expect 'vdup.32 d16, d16[1]'.
258 define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
259 ; CHECK-LABEL: vduplanefloat:
261 ; CHECK-NEXT: vldr d16, [r0]
262 ; CHECK-NEXT: vdup.32 d16, d16[1]
263 ; CHECK-NEXT: vmov r0, r1, d16
264 ; CHECK-NEXT: mov pc, lr
265 %tmp1 = load <2 x float>, <2 x float>* %A
266 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
267 ret <2 x float> %tmp2
; Loads a 64-bit <8 x i8> and broadcasts its lane 1 into a 128-bit <16 x i8>
; result (the shuffle mask has sixteen entries); the checks expect
; 'vdup.8 q8, d16[1]'.
270 define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
271 ; CHECK-LABEL: vduplaneQ8:
273 ; CHECK-NEXT: vldr d16, [r0]
274 ; CHECK-NEXT: vdup.8 q8, d16[1]
275 ; CHECK-NEXT: vmov r0, r1, d16
276 ; CHECK-NEXT: vmov r2, r3, d17
277 ; CHECK-NEXT: mov pc, lr
278 %tmp1 = load <8 x i8>, <8 x i8>* %A
279 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Loads a <4 x i16> and broadcasts lane 1 into an <8 x i16> result; the checks
; expect 'vdup.16 q8, d16[1]'.
283 define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
284 ; CHECK-LABEL: vduplaneQ16:
286 ; CHECK-NEXT: vldr d16, [r0]
287 ; CHECK-NEXT: vdup.16 q8, d16[1]
288 ; CHECK-NEXT: vmov r0, r1, d16
289 ; CHECK-NEXT: vmov r2, r3, d17
290 ; CHECK-NEXT: mov pc, lr
291 %tmp1 = load <4 x i16>, <4 x i16>* %A
292 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
; Loads a <2 x i32> and broadcasts lane 1 into a <4 x i32> result; the checks
; expect 'vdup.32 q8, d16[1]'.
296 define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
297 ; CHECK-LABEL: vduplaneQ32:
299 ; CHECK-NEXT: vldr d16, [r0]
300 ; CHECK-NEXT: vdup.32 q8, d16[1]
301 ; CHECK-NEXT: vmov r0, r1, d16
302 ; CHECK-NEXT: vmov r2, r3, d17
303 ; CHECK-NEXT: mov pc, lr
304 %tmp1 = load <2 x i32>, <2 x i32>* %A
305 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
; Float variant of vduplaneQ32: loads a <2 x float> and broadcasts lane 1 into
; a <4 x float> result; the checks expect 'vdup.32 q8, d16[1]'.
309 define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
310 ; CHECK-LABEL: vduplaneQfloat:
312 ; CHECK-NEXT: vldr d16, [r0]
313 ; CHECK-NEXT: vdup.32 q8, d16[1]
314 ; CHECK-NEXT: vmov r0, r1, d16
315 ; CHECK-NEXT: vmov r2, r3, d17
316 ; CHECK-NEXT: mov pc, lr
317 %tmp1 = load <2 x float>, <2 x float>* %A
318 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
319 ret <4 x float> %tmp2
; Splats lane 1 of a <2 x i64> argument. The checked output contains no NEON
; instruction at all: it only copies r2/r3 into r0/r1.
322 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
324 ; CHECK: @ %bb.0: @ %entry
325 ; CHECK-NEXT: mov r0, r2
326 ; CHECK-NEXT: mov r1, r3
327 ; CHECK-NEXT: mov pc, lr
329 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; Splats lane 0 of a <2 x i64> argument. The checked output only copies r0/r1
; into r2/r3, with no NEON instruction.
333 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
335 ; CHECK: @ %bb.0: @ %entry
336 ; CHECK-NEXT: mov r2, r0
337 ; CHECK-NEXT: mov r3, r1
338 ; CHECK-NEXT: mov pc, lr
340 %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
; <2 x double> counterpart of @foo: splats lane 1, and the checked output is
; the same pair of GPR moves (r2/r3 into r0/r1).
344 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
346 ; CHECK: @ %bb.0: @ %entry
347 ; CHECK-NEXT: mov r0, r2
348 ; CHECK-NEXT: mov r1, r3
349 ; CHECK-NEXT: mov pc, lr
351 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; <2 x double> counterpart of @bar: splats lane 0, and the checked output is
; the same pair of GPR moves (r0/r1 into r2/r3).
355 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
357 ; CHECK: @ %bb.0: @ %entry
358 ; CHECK-NEXT: mov r2, r0
359 ; CHECK-NEXT: mov r3, r1
360 ; CHECK-NEXT: mov pc, lr
362 %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
; Splats the constant i8 -128 and stores it. The checks expect the constant to
; be materialised directly with 'vmov.i8 d16, #0x80' (no vdup from a GPR)
; before the vstr.
367 define void @redundantVdup(<8 x i8>* %ptr) nounwind {
368 ; CHECK-LABEL: redundantVdup:
370 ; CHECK-NEXT: vmov.i8 d16, #0x80
371 ; CHECK-NEXT: vstr d16, [r0]
372 ; CHECK-NEXT: mov pc, lr
373 %1 = insertelement <8 x i8> undef, i8 -128, i32 0
374 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
375 store <8 x i8> %2, <8 x i8>* %ptr, align 8
; Near-splat: lanes 0-2 hold %x and lane 3 holds %y. The checks expect a full
; 'vdup.32 q8, r0' followed by a single-lane fix-up 'vmov.32 d17[1], r1'.
379 define <4 x i32> @tdupi(i32 %x, i32 %y) {
380 ; CHECK-LABEL: tdupi:
382 ; CHECK-NEXT: vdup.32 q8, r0
383 ; CHECK-NEXT: vmov.32 d17[1], r1
384 ; CHECK-NEXT: vmov r0, r1, d16
385 ; CHECK-NEXT: vmov r2, r3, d17
386 ; CHECK-NEXT: mov pc, lr
387 %1 = insertelement <4 x i32> undef, i32 %x, i32 0
388 %2 = insertelement <4 x i32> %1, i32 %x, i32 1
389 %3 = insertelement <4 x i32> %2, i32 %x, i32 2
390 %4 = insertelement <4 x i32> %3, i32 %y, i32 3
; Float version of tdupi: lanes 0-2 hold %x and lane 3 holds %y. The checks
; expect 'vdup.32 q0, r0' followed by 'vmov s3, r1' to patch the last lane.
394 define <4 x float> @tdupf(float %x, float %y) {
395 ; CHECK-LABEL: tdupf:
397 ; CHECK-NEXT: vdup.32 q0, r0
398 ; CHECK-NEXT: vmov s3, r1
399 ; CHECK-NEXT: vmov r0, r1, d0
400 ; CHECK-NEXT: vmov r2, r3, d1
401 ; CHECK-NEXT: mov pc, lr
402 %1 = insertelement <4 x float> undef, float %x, i32 0
403 %2 = insertelement <4 x float> %1, float %x, i32 1
404 %3 = insertelement <4 x float> %2, float %x, i32 2
405 %4 = insertelement <4 x float> %3, float %y, i32 3
409 ; This test checks that when splatting an element from a vector into another,
410 ; the value isn't moved out to GPRs first.
; Lane 1 of %invec fills lanes 0-2 and the constant 255 goes into lane 3. The
; checks expect the lane form 'vdup.32 q8, d16[1]', with only the constant
; (mov r0, #255) inserted from a GPR via 'vmov.32 d17[1], r0'.
411 define <4 x i32> @tduplane(<4 x i32> %invec) {
412 ; CHECK-LABEL: tduplane:
414 ; CHECK-NEXT: vmov d16, r0, r1
415 ; CHECK-NEXT: mov r0, #255
416 ; CHECK-NEXT: vdup.32 q8, d16[1]
417 ; CHECK-NEXT: vmov.32 d17[1], r0
418 ; CHECK-NEXT: vmov r0, r1, d16
419 ; CHECK-NEXT: vmov r2, r3, d17
420 ; CHECK-NEXT: mov pc, lr
421 %in = extractelement <4 x i32> %invec, i32 1
422 %1 = insertelement <4 x i32> undef, i32 %in, i32 0
423 %2 = insertelement <4 x i32> %1, i32 %in, i32 1
424 %3 = insertelement <4 x i32> %2, i32 %in, i32 2
425 %4 = insertelement <4 x i32> %3, i32 255, i32 3
; Extracts lane 3 of a <4 x float> and inserts it into both lanes of a
; <2 x float>. The checks expect a lane vdup that reads the element as
; d17[1]: 'vdup.32 d16, d17[1]'.
429 define <2 x float> @check_f32(<4 x float> %v) nounwind {
430 ; CHECK-LABEL: check_f32:
432 ; CHECK-NEXT: vmov d17, r2, r3
433 ; CHECK-NEXT: vdup.32 d16, d17[1]
434 ; CHECK-NEXT: vmov r0, r1, d16
435 ; CHECK-NEXT: mov pc, lr
436 %x = extractelement <4 x float> %v, i32 3
437 %1 = insertelement <2 x float> undef, float %x, i32 0
438 %2 = insertelement <2 x float> %1, float %x, i32 1
; Integer counterpart of check_f32: lane 3 of a <4 x i32> is inserted into
; both lanes of a <2 x i32>; the checks expect 'vdup.32 d16, d17[1]'.
442 define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
443 ; CHECK-LABEL: check_i32:
445 ; CHECK-NEXT: vmov d17, r2, r3
446 ; CHECK-NEXT: vdup.32 d16, d17[1]
447 ; CHECK-NEXT: vmov r0, r1, d16
448 ; CHECK-NEXT: mov pc, lr
449 %x = extractelement <4 x i32> %v, i32 3
450 %1 = insertelement <2 x i32> undef, i32 %x, i32 0
451 %2 = insertelement <2 x i32> %1, i32 %x, i32 1
; Extracts lane 3 of an <8 x i16> and inserts it into lanes 0 and 1 of a
; <4 x i16> built from undef. The checks expect 'vdup.16 d16, d16[3]'.
455 define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
456 ; CHECK-LABEL: check_i16:
458 ; CHECK-NEXT: vmov d16, r0, r1
459 ; CHECK-NEXT: vdup.16 d16, d16[3]
460 ; CHECK-NEXT: vmov r0, r1, d16
461 ; CHECK-NEXT: mov pc, lr
462 %x = extractelement <8 x i16> %v, i32 3
463 %1 = insertelement <4 x i16> undef, i16 %x, i32 0
464 %2 = insertelement <4 x i16> %1, i16 %x, i32 1
; Extracts lane 3 of a <16 x i8> and inserts it into lanes 0 and 1 of an
; <8 x i8> built from undef. The checks expect 'vdup.8 d16, d16[3]'.
468 define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
469 ; CHECK-LABEL: check_i8:
471 ; CHECK-NEXT: vmov d16, r0, r1
472 ; CHECK-NEXT: vdup.8 d16, d16[3]
473 ; CHECK-NEXT: vmov r0, r1, d16
474 ; CHECK-NEXT: mov pc, lr
475 %x = extractelement <16 x i8> %v, i32 3
476 %1 = insertelement <8 x i8> undef, i8 %x, i32 0
477 %2 = insertelement <8 x i8> %1, i8 %x, i32 1
481 ; Check that an SPR splat produces a vdup.
; Converts the i16 %q to float, splats it across a <2 x float> and subtracts
; %p. In the checked output the converted value sits in s0 and is broadcast
; with 'vdup.32 d17, d0[0]' before the vsub.f32.
483 define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
484 ; CHECK-LABEL: check_spr_splat2:
486 ; CHECK-NEXT: lsl r2, r2, #16
487 ; CHECK-NEXT: vmov d16, r0, r1
488 ; CHECK-NEXT: asr r2, r2, #16
489 ; CHECK-NEXT: vmov s0, r2
490 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
491 ; CHECK-NEXT: vdup.32 d17, d0[0]
492 ; CHECK-NEXT: vsub.f32 d16, d17, d16
493 ; CHECK-NEXT: vmov r0, r1, d16
494 ; CHECK-NEXT: mov pc, lr
495 %conv = sitofp i16 %q to float
496 %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
497 %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
498 %sub = fsub <2 x float> %splat.splat, %p
; 128-bit version of check_spr_splat2: %q is converted to float, splatted
; across a <4 x float> and %p is subtracted. In the checked output %q is
; loaded from the stack with ldrsh and the converted value is broadcast with
; 'vdup.32 q9, d0[0]'.
502 define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
503 ; CHECK-LABEL: check_spr_splat4:
505 ; CHECK-NEXT: ldrsh r12, [sp]
506 ; CHECK-NEXT: vmov d17, r2, r3
507 ; CHECK-NEXT: vmov d16, r0, r1
508 ; CHECK-NEXT: vmov s0, r12
509 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
510 ; CHECK-NEXT: vdup.32 q9, d0[0]
511 ; CHECK-NEXT: vsub.f32 q8, q9, q8
512 ; CHECK-NEXT: vmov r0, r1, d16
513 ; CHECK-NEXT: vmov r2, r3, d17
514 ; CHECK-NEXT: mov pc, lr
515 %conv = sitofp i16 %q to float
516 %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
517 %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
518 %sub = fsub <4 x float> %splat.splat, %p
521 ; Same codegen as above test; scalar is splatted using vdup.32 from d0[0], so shuffle index is irrelevant.
; Variant of check_spr_splat4 where the converted scalar is inserted at lane 1
; and the shuffle mask selects lane 1 everywhere. The checked instruction
; sequence still broadcasts with 'vdup.32 q9, d0[0]'.
522 define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
523 ; CHECK-LABEL: check_spr_splat4_lane1:
525 ; CHECK-NEXT: ldrsh r12, [sp]
526 ; CHECK-NEXT: vmov d17, r2, r3
527 ; CHECK-NEXT: vmov d16, r0, r1
528 ; CHECK-NEXT: vmov s0, r12
529 ; CHECK-NEXT: vcvt.f32.s32 s0, s0
530 ; CHECK-NEXT: vdup.32 q9, d0[0]
531 ; CHECK-NEXT: vsub.f32 q8, q9, q8
532 ; CHECK-NEXT: vmov r0, r1, d16
533 ; CHECK-NEXT: vmov r2, r3, d17
534 ; CHECK-NEXT: mov pc, lr
535 %conv = sitofp i16 %q to float
536 %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
537 %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
538 %sub = fsub <4 x float> %splat.splat, %p
542 ; Also make sure we don't barf on variable-index extractelts, where we almost
543 ; could have generated a vdup.
545 define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
546 ; CHECK-LABEL: check_i8_varidx:
548 ; CHECK-NEXT: .save {r11}
549 ; CHECK-NEXT: push {r11}
550 ; CHECK-NEXT: .setfp r11, sp
551 ; CHECK-NEXT: mov r11, sp
552 ; CHECK-NEXT: .pad #28
553 ; CHECK-NEXT: sub sp, sp, #28
554 ; CHECK-NEXT: bic sp, sp, #15
555 ; CHECK-NEXT: ldr r12, [r11, #4]
556 ; CHECK-NEXT: vmov d17, r2, r3
557 ; CHECK-NEXT: vmov d16, r0, r1
558 ; CHECK-NEXT: mov r1, sp
559 ; CHECK-NEXT: and r0, r12, #15
560 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1:128], r0
561 ; CHECK-NEXT: vld1.8 {d16[]}, [r1]
562 ; CHECK-NEXT: vmov r0, r1, d16
563 ; CHECK-NEXT: mov sp, r11
564 ; CHECK-NEXT: pop {r11}
565 ; CHECK-NEXT: mov pc, lr
566 %x = extractelement <16 x i8> %v, i32 %idx
567 %1 = insertelement <8 x i8> undef, i8 %x, i32 0
568 %2 = insertelement <8 x i8> %1, i8 %x, i32 1