1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
; Unpredicated vector multiply on <16 x i8>: a plain IR `mul` of %b and %a.
; The expected assembly is a single vmul.i8 on q0 and q1.
4 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
5 ; CHECK-LABEL: test_vmulq_u8:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vmul.i8 q0, q1, q0
10 %0 = mul <16 x i8> %b, %a
; Unpredicated vector multiply on <8 x i16>: a plain IR `mul` of %b and %a.
; The expected assembly is a single vmul.i16 on q0 and q1.
14 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
15 ; CHECK-LABEL: test_vmulq_s16:
16 ; CHECK: @ %bb.0: @ %entry
17 ; CHECK-NEXT: vmul.i16 q0, q1, q0
20 %0 = mul <8 x i16> %b, %a
; Unpredicated vector multiply on <4 x i32>: a plain IR `mul` of %b and %a.
; The expected assembly is a single vmul.i32 on q0 and q1.
24 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
25 ; CHECK-LABEL: test_vmulq_u32:
26 ; CHECK: @ %bb.0: @ %entry
27 ; CHECK-NEXT: vmul.i32 q0, q1, q0
30 %0 = mul <4 x i32> %b, %a
; Unpredicated floating-point multiply on <4 x float>: an IR `fmul` of %a and %b.
; The expected assembly is a single vmul.f32 on q0 and q1.
34 define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
35 ; CHECK-LABEL: test_vmulq_f32:
36 ; CHECK: @ %bb.0: @ %entry
37 ; CHECK-NEXT: vmul.f32 q0, q0, q1
40 %0 = fmul <4 x float> %a, %b
; Predicated multiply on <16 x i8> with an explicit %inactive vector.
; %p is zero-extended to i32, turned into a <16 x i1> mask by
; llvm.arm.mve.pred.i2v, and passed to llvm.arm.mve.mul.predicated together
; with %a, %b and %inactive (fourth operand).
; The expected assembly loads the predicate from r0 (vmsr p0, r0) and then
; issues vmult.i8 with destination q0 and sources q1, q2.
44 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
45 ; CHECK-LABEL: test_vmulq_m_s8:
46 ; CHECK: @ %bb.0: @ %entry
47 ; CHECK-NEXT: vmsr p0, r0
49 ; CHECK-NEXT: vmult.i8 q0, q1, q2
52 %0 = zext i16 %p to i32
53 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
54 %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
58 declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2
60 declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2
; Predicated multiply on <8 x i16> with an explicit %inactive vector.
; %p is zero-extended to i32, turned into a <8 x i1> mask by
; llvm.arm.mve.pred.i2v, and passed to llvm.arm.mve.mul.predicated together
; with %a, %b and %inactive (fourth operand).
; The expected assembly loads the predicate from r0 (vmsr p0, r0) and then
; issues vmult.i16 with destination q0 and sources q1, q2.
62 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
63 ; CHECK-LABEL: test_vmulq_m_u16:
64 ; CHECK: @ %bb.0: @ %entry
65 ; CHECK-NEXT: vmsr p0, r0
67 ; CHECK-NEXT: vmult.i16 q0, q1, q2
70 %0 = zext i16 %p to i32
71 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
72 %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
76 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2
78 declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2
; Predicated multiply on <4 x i32> with an explicit %inactive vector.
; %p is zero-extended to i32, turned into a <4 x i1> mask by
; llvm.arm.mve.pred.i2v, and passed to llvm.arm.mve.mul.predicated together
; with %a, %b and %inactive (fourth operand).
; The expected assembly loads the predicate from r0 (vmsr p0, r0) and then
; issues vmult.i32 with destination q0 and sources q1, q2.
80 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
81 ; CHECK-LABEL: test_vmulq_m_s32:
82 ; CHECK: @ %bb.0: @ %entry
83 ; CHECK-NEXT: vmsr p0, r0
85 ; CHECK-NEXT: vmult.i32 q0, q1, q2
88 %0 = zext i16 %p to i32
89 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
90 %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
94 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2
96 declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2
; Predicated multiply on <8 x half> with an explicit %inactive vector.
; %p is zero-extended to i32, turned into a <8 x i1> mask by
; llvm.arm.mve.pred.i2v, and passed to llvm.arm.mve.mul.predicated together
; with %a, %b and %inactive (fourth operand).
; The expected assembly loads the predicate from r0 (vmsr p0, r0) and then
; issues vmult.f16 with destination q0 and sources q1, q2.
98 define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
99 ; CHECK-LABEL: test_vmulq_m_f16:
100 ; CHECK: @ %bb.0: @ %entry
101 ; CHECK-NEXT: vmsr p0, r0
103 ; CHECK-NEXT: vmult.f16 q0, q1, q2
106 %0 = zext i16 %p to i32
107 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
108 %2 = tail call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
112 declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2
; Predicated multiply on <16 x i8> without an inactive-value argument: the
; fourth operand of llvm.arm.mve.mul.predicated is undef.
; %p is zero-extended to i32 and converted to a <16 x i1> mask first.
; The expected assembly loads the predicate from r0 and issues vmult.i8 with
; q0 as both destination and first source.
114 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
115 ; CHECK-LABEL: test_vmulq_x_u8:
116 ; CHECK: @ %bb.0: @ %entry
117 ; CHECK-NEXT: vmsr p0, r0
119 ; CHECK-NEXT: vmult.i8 q0, q0, q1
122 %0 = zext i16 %p to i32
123 %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
124 %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
; Predicated multiply on <8 x i16> without an inactive-value argument: the
; fourth operand of llvm.arm.mve.mul.predicated is undef.
; %p is zero-extended to i32 and converted to a <8 x i1> mask first.
; The expected assembly loads the predicate from r0 and issues vmult.i16 with
; q0 as both destination and first source.
128 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
129 ; CHECK-LABEL: test_vmulq_x_s16:
130 ; CHECK: @ %bb.0: @ %entry
131 ; CHECK-NEXT: vmsr p0, r0
133 ; CHECK-NEXT: vmult.i16 q0, q0, q1
136 %0 = zext i16 %p to i32
137 %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
138 %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
; Predicated multiply on <4 x i32> without an inactive-value argument: the
; fourth operand of llvm.arm.mve.mul.predicated is undef.
; %p is zero-extended to i32 and converted to a <4 x i1> mask first.
; The expected assembly loads the predicate from r0 and issues vmult.i32 with
; q0 as both destination and first source.
142 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
143 ; CHECK-LABEL: test_vmulq_x_u32:
144 ; CHECK: @ %bb.0: @ %entry
145 ; CHECK-NEXT: vmsr p0, r0
147 ; CHECK-NEXT: vmult.i32 q0, q0, q1
150 %0 = zext i16 %p to i32
151 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
152 %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
; Predicated multiply on <4 x float> with undef as the fourth operand of
; llvm.arm.mve.mul.predicated. %p is zero-extended to i32 and converted to a
; <4 x i1> mask first. The expected assembly loads the predicate from r0 and
; issues vmult.f32 with q0 as both destination and first source.
; NOTE(review): despite the "_m_" in the name, this function takes no
; %inactive argument and passes undef, the same shape as the "_x_" tests in
; this file. The name may have been meant to be test_vmulq_x_f32 - confirm
; before renaming, since the label line must change with it.
156 define arm_aapcs_vfpcc <4 x float> @test_vmulq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
157 ; CHECK-LABEL: test_vmulq_m_f32:
158 ; CHECK: @ %bb.0: @ %entry
159 ; CHECK-NEXT: vmsr p0, r0
161 ; CHECK-NEXT: vmult.f32 q0, q0, q1
164 %0 = zext i16 %p to i32
165 %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
166 %2 = tail call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
170 declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2
; Vector-by-scalar multiply on <16 x i8>. The scalar %b is broadcast with an
; insertelement into lane 0 followed by a shufflevector using an all-zero
; mask, then multiplied with %a.
; The expected assembly uses the scalar-operand form: vmul.i8 q0, q0, r0.
172 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) {
173 ; CHECK-LABEL: test_vmulq_n_u8:
174 ; CHECK: @ %bb.0: @ %entry
175 ; CHECK-NEXT: vmul.i8 q0, q0, r0
178 %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
179 %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
180 %0 = mul <16 x i8> %.splat, %a
; Vector-by-scalar multiply on <8 x i16>. The scalar %b is broadcast with an
; insertelement into lane 0 followed by a shufflevector using an all-zero
; mask, then multiplied with %a.
; The expected assembly uses the scalar-operand form: vmul.i16 q0, q0, r0.
184 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) {
185 ; CHECK-LABEL: test_vmulq_n_s16:
186 ; CHECK: @ %bb.0: @ %entry
187 ; CHECK-NEXT: vmul.i16 q0, q0, r0
190 %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
191 %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
192 %0 = mul <8 x i16> %.splat, %a
; Vector-by-scalar multiply on <4 x i32>. The scalar %b is broadcast with an
; insertelement into lane 0 followed by a shufflevector using an all-zero
; mask, then multiplied with %a.
; The expected assembly uses the scalar-operand form: vmul.i32 q0, q0, r0.
196 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) {
197 ; CHECK-LABEL: test_vmulq_n_u32:
198 ; CHECK: @ %bb.0: @ %entry
199 ; CHECK-NEXT: vmul.i32 q0, q0, r0
202 %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
203 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
204 %0 = mul <4 x i32> %.splat, %a
; Vector-by-scalar floating-point multiply on <4 x float>. The float %b is
; broadcast with insertelement + an all-zero-mask shufflevector, then
; combined with %a by `fmul`.
; The expected assembly first moves the scalar from s4 into r0 (vmov) and
; then uses the scalar-operand form: vmul.f32 q0, q0, r0.
208 define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
209 ; CHECK-LABEL: test_vmulq_n_f32:
210 ; CHECK: @ %bb.0: @ %entry
211 ; CHECK-NEXT: vmov r0, s4
212 ; CHECK-NEXT: vmul.f32 q0, q0, r0
215 %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
216 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
217 %0 = fmul <4 x float> %.splat, %a
; Predicated vector-by-scalar multiply on <16 x i8> with an explicit
; %inactive vector. %b is broadcast (insertelement + all-zero-mask
; shufflevector), %p is zero-extended and converted to a <16 x i1> mask, and
; both feed llvm.arm.mve.mul.predicated with %a and %inactive.
; The expected assembly takes the predicate from r1 and the scalar from r0:
; vmsr p0, r1 followed by vmult.i8 q0, q1, r0.
221 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
222 ; CHECK-LABEL: test_vmulq_m_n_s8:
223 ; CHECK: @ %bb.0: @ %entry
224 ; CHECK-NEXT: vmsr p0, r1
226 ; CHECK-NEXT: vmult.i8 q0, q1, r0
229 %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
230 %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
231 %0 = zext i16 %p to i32
232 %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
233 %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
; Predicated vector-by-scalar multiply on <8 x i16> with an explicit
; %inactive vector. %b is broadcast (insertelement + all-zero-mask
; shufflevector), %p is zero-extended and converted to a <8 x i1> mask, and
; both feed llvm.arm.mve.mul.predicated with %a and %inactive.
; The expected assembly takes the predicate from r1 and the scalar from r0:
; vmsr p0, r1 followed by vmult.i16 q0, q1, r0.
237 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
238 ; CHECK-LABEL: test_vmulq_m_n_u16:
239 ; CHECK: @ %bb.0: @ %entry
240 ; CHECK-NEXT: vmsr p0, r1
242 ; CHECK-NEXT: vmult.i16 q0, q1, r0
245 %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
246 %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
247 %0 = zext i16 %p to i32
248 %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
249 %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
; Predicated vector-by-scalar multiply on <4 x i32> with an explicit
; %inactive vector. %b is broadcast (insertelement + all-zero-mask
; shufflevector), %p is zero-extended and converted to a <4 x i1> mask, and
; both feed llvm.arm.mve.mul.predicated with %a and %inactive.
; The expected assembly takes the predicate from r1 and the scalar from r0:
; vmsr p0, r1 followed by vmult.i32 q0, q1, r0.
253 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
254 ; CHECK-LABEL: test_vmulq_m_n_s32:
255 ; CHECK: @ %bb.0: @ %entry
256 ; CHECK-NEXT: vmsr p0, r1
258 ; CHECK-NEXT: vmult.i32 q0, q1, r0
261 %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
262 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
263 %0 = zext i16 %p to i32
264 %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
265 %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
; Predicated vector-by-scalar multiply on <8 x half> with an explicit
; %inactive vector. The scalar arrives as a float (%b.coerce); it is bitcast
; to i32, truncated to i16, and bitcast to half before being broadcast with
; insertelement + an all-zero-mask shufflevector. %p is zero-extended and
; converted to a <8 x i1> mask, and both feed llvm.arm.mve.mul.predicated
; with %a and %inactive.
; The expected assembly moves the scalar from s8 into r1, loads the predicate
; from r0, and issues vmult.f16 q0, q1, r1.
269 define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) {
270 ; CHECK-LABEL: test_vmulq_m_n_f16:
271 ; CHECK: @ %bb.0: @ %entry
272 ; CHECK-NEXT: vmov r1, s8
273 ; CHECK-NEXT: vmsr p0, r0
275 ; CHECK-NEXT: vmult.f16 q0, q1, r1
278 %0 = bitcast float %b.coerce to i32
279 %tmp.0.extract.trunc = trunc i32 %0 to i16
280 %1 = bitcast i16 %tmp.0.extract.trunc to half
281 %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
282 %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
283 %2 = zext i16 %p to i32
284 %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
285 %4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive)
; Predicated vector-by-scalar multiply on <16 x i8> with undef as the fourth
; operand of llvm.arm.mve.mul.predicated. %b is broadcast (insertelement +
; all-zero-mask shufflevector); %p is zero-extended and converted to a
; <16 x i1> mask.
; The expected assembly takes the predicate from r1 and the scalar from r0:
; vmsr p0, r1 followed by vmult.i8 q0, q0, r0.
289 define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
290 ; CHECK-LABEL: test_vmulq_x_n_u8:
291 ; CHECK: @ %bb.0: @ %entry
292 ; CHECK-NEXT: vmsr p0, r1
294 ; CHECK-NEXT: vmult.i8 q0, q0, r0
297 %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
298 %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
299 %0 = zext i16 %p to i32
300 %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
301 %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef)
; Predicated vector-by-scalar multiply on <8 x i16> with undef as the fourth
; operand of llvm.arm.mve.mul.predicated. %b is broadcast (insertelement +
; all-zero-mask shufflevector); %p is zero-extended and converted to a
; <8 x i1> mask.
; The expected assembly takes the predicate from r1 and the scalar from r0:
; vmsr p0, r1 followed by vmult.i16 q0, q0, r0.
305 define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
306 ; CHECK-LABEL: test_vmulq_x_n_s16:
307 ; CHECK: @ %bb.0: @ %entry
308 ; CHECK-NEXT: vmsr p0, r1
310 ; CHECK-NEXT: vmult.i16 q0, q0, r0
313 %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
314 %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
315 %0 = zext i16 %p to i32
316 %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
317 %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
; Predicated vector-by-scalar multiply on <4 x i32> with undef as the fourth
; operand of llvm.arm.mve.mul.predicated. %b is broadcast (insertelement +
; all-zero-mask shufflevector); %p is zero-extended and converted to a
; <4 x i1> mask.
; The expected assembly takes the predicate from r1 and the scalar from r0:
; vmsr p0, r1 followed by vmult.i32 q0, q0, r0.
321 define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
322 ; CHECK-LABEL: test_vmulq_x_n_u32:
323 ; CHECK: @ %bb.0: @ %entry
324 ; CHECK-NEXT: vmsr p0, r1
326 ; CHECK-NEXT: vmult.i32 q0, q0, r0
329 %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
330 %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
331 %0 = zext i16 %p to i32
332 %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
333 %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef)
; Predicated vector-by-scalar multiply on <4 x float> with undef as the
; fourth operand of llvm.arm.mve.mul.predicated. The float %b is broadcast
; (insertelement + all-zero-mask shufflevector); %p is zero-extended and
; converted to a <4 x i1> mask.
; The expected assembly moves the scalar from s4 into r1, loads the predicate
; from r0, and issues vmult.f32 q0, q0, r1.
337 define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) {
338 ; CHECK-LABEL: test_vmulq_x_n_f32:
339 ; CHECK: @ %bb.0: @ %entry
340 ; CHECK-NEXT: vmov r1, s4
341 ; CHECK-NEXT: vmsr p0, r0
343 ; CHECK-NEXT: vmult.f32 q0, q0, r1
346 %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
347 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
348 %0 = zext i16 %p to i32
349 %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
350 %2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef)