; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
; Unpredicated VIDUP/VDDUP: vector increment/decrement with immediate step,
; start value in a GPR. Each test checks a single instruction plus return.

define arm_aapcs_vfpcc <16 x i8> @test_vidupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u8 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %a, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u16 q0, r0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %a, i32 1)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u32 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %a, i32 4)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u8 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %a, i32 2)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u16 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %a, i32 4)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u32 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %a, i32 2)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}
; Unpredicated VIWDUP/VDWDUP: wrapping increment/decrement, wrap limit in a
; second GPR (%b -> r1).

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u8 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %a, i32 %b, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u16 q0, r0, r1, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %a, i32 %b, i32 2)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u32 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %a, i32 %b, i32 8)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u8 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %a, i32 %b, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u16 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %a, i32 %b, i32 8)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u32 q0, r0, r1, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %a, i32 %b, i32 1)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}
; Writeback forms: the scalar start value is loaded from memory and the
; updated scalar (second struct element) is stored back.

define arm_aapcs_vfpcc <16 x i8> @test_vidupq_wb_u8(ptr nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u8 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %0, i32 8)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_wb_u16(ptr nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u16 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %0, i32 1)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_wb_u32(ptr nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u32 q0, r2, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %0, i32 4)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_wb_u8(ptr nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u8 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %0, i32 2)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_wb_u16(ptr nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u16 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %0, i32 8)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_wb_u32(ptr nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u32 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %0, i32 2)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}
; Writeback forms of the wrapping variants: start value loaded from memory,
; wrap limit in %b (r1), updated scalar stored back.

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_wb_u8(ptr nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u8 q0, r2, r1, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %0, i32 %b, i32 4)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_wb_u16(ptr nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u16 q0, r2, r1, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %0, i32 %b, i32 4)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_wb_u8(ptr nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u8 q0, r2, r1, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %0, i32 %b, i32 1)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_wb_u16(ptr nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u16 q0, r2, r1, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %0, i32 %b, i32 1)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_wb_u32(ptr nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u32 q0, r2, r1, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %0, i32 %b, i32 8)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_wb_u32(ptr nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u32 q0, r2, r1, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %0, i32 %b, i32 2)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, ptr %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}
; Predicated VIDUP/VDDUP: predicate moved to p0 via vmsr, instruction issued
; inside a VPT block (vpst + vidupt/vddupt), merging with %inactive.

define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u8 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u16 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 8, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u32 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 2, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u8 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u16 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 2, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u32 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 8, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}
; Predicated wrapping variants: predicate (r2) -> p0, vpst + viwdupt/vdwdupt.

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u8 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u16 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 8, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u8 q0, r0, r1, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 1, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u16 q0, r0, r1, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 2, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}
; Predicated writeback VIDUP/VDDUP: load start value, predicate -> p0,
; vpst + vidupt/vddupt, store the updated scalar back.

define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_wb_u8(<16 x i8> %inactive, ptr nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u8 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 8, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_wb_u16(<8 x i16> %inactive, ptr nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u16 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 2, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_wb_u32(<4 x i32> %inactive, ptr nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u32 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 8, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_wb_u8(<16 x i8> %inactive, ptr nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u8 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 1, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_wb_u16(<8 x i16> %inactive, ptr nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u16 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 1, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_wb_u32(<4 x i32> %inactive, ptr nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u32 q0, r2, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}
; Predicated writeback wrapping variants: r0/r1/r2 are all live (pointer,
; wrap limit, predicate), so the scalar goes through r12 (ldr.w/str.w).

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_wb_u8(<16 x i8> %inactive, ptr nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u8 q0, r12, r1, #8
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 8, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_wb_u16(<8 x i16> %inactive, ptr nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u16 q0, r12, r1, #8
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 8, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_wb_u32(<4 x i32> %inactive, ptr nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_wb_u8(<16 x i8> %inactive, ptr nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u8 q0, r12, r1, #1
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 1, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_wb_u16(<8 x i16> %inactive, ptr nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u16 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 4, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_wb_u32(<4 x i32> %inactive, ptr nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, ptr %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, ptr %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)

declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)