1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; Signed multiply-high on <2 x i32>: sext to i64, multiply, ashr by 32, trunc.
; The expected assembly is not a single vmulh here: it is vmullb.s32 followed
; by scalar lane moves and asrs #31.
4 define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
5 ; CHECK-LABEL: vmulhs_v2i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vmullb.s32 q2, q0, q1
8 ; CHECK-NEXT: vmov r0, s11
9 ; CHECK-NEXT: vmov r1, s9
10 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
11 ; CHECK-NEXT: asrs r0, r0, #31
12 ; CHECK-NEXT: asrs r1, r1, #31
13 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
16 %s0s = sext <2 x i32> %s0 to <2 x i64>
17 %s1s = sext <2 x i32> %s1 to <2 x i64>
18 %m = mul <2 x i64> %s0s, %s1s
19 %s = ashr <2 x i64> %m, <i64 32, i64 32>
20 %s2 = trunc <2 x i64> %s to <2 x i32>
; Unsigned multiply-high on <2 x i32>: zext to i64, multiply, lshr by 32, trunc.
; The expected assembly is vmullb.u32 plus lane moves, with a zero value loaded
; from the constant-pool entry .LCPI1_0.
24 define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
25 ; CHECK-LABEL: vmulhu_v2i32:
26 ; CHECK: @ %bb.0: @ %entry
27 ; CHECK-NEXT: vmullb.u32 q2, q0, q1
28 ; CHECK-NEXT: vldr s1, .LCPI1_0
29 ; CHECK-NEXT: vmov.f32 s0, s9
30 ; CHECK-NEXT: vmov.f32 s2, s11
31 ; CHECK-NEXT: vmov.f32 s3, s1
33 ; CHECK-NEXT: .p2align 2
34 ; CHECK-NEXT: @ %bb.1:
35 ; CHECK-NEXT: .LCPI1_0:
36 ; CHECK-NEXT: .long 0x00000000 @ float 0
38 %s0s = zext <2 x i32> %s0 to <2 x i64>
39 %s1s = zext <2 x i32> %s1 to <2 x i64>
40 %m = mul <2 x i64> %s0s, %s1s
41 %s = lshr <2 x i64> %m, <i64 32, i64 32>
42 %s2 = trunc <2 x i64> %s to <2 x i32>
; Signed multiply-high on a full <4 x i32> vector (sext, mul, ashr 32, trunc).
; The whole pattern is expected to be selected as a single vmulh.s32.
46 define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
47 ; CHECK-LABEL: vmulhs_v4i32:
48 ; CHECK: @ %bb.0: @ %entry
49 ; CHECK-NEXT: vmulh.s32 q0, q0, q1
52 %s0s = sext <4 x i32> %s0 to <4 x i64>
53 %s1s = sext <4 x i32> %s1 to <4 x i64>
54 %m = mul <4 x i64> %s0s, %s1s
55 %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
56 %s2 = trunc <4 x i64> %s to <4 x i32>
; Unsigned multiply-high on a full <4 x i32> vector (zext, mul, lshr 32, trunc).
; The whole pattern is expected to be selected as a single vmulh.u32.
60 define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
61 ; CHECK-LABEL: vmulhu_v4i32:
62 ; CHECK: @ %bb.0: @ %entry
63 ; CHECK-NEXT: vmulh.u32 q0, q0, q1
66 %s0s = zext <4 x i32> %s0 to <4 x i64>
67 %s1s = zext <4 x i32> %s1 to <4 x i64>
68 %m = mul <4 x i64> %s0s, %s1s
69 %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
70 %s2 = trunc <4 x i64> %s to <4 x i32>
; Signed multiply-high on <4 x i16> (sext to i32, mul, ashr 16, trunc).
; The expected assembly is a widening vmullb.s16 followed by vshr.s32 #16,
; not a vmulh.
74 define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
75 ; CHECK-LABEL: vmulhs_v4i16:
76 ; CHECK: @ %bb.0: @ %entry
77 ; CHECK-NEXT: vmullb.s16 q0, q0, q1
78 ; CHECK-NEXT: vshr.s32 q0, q0, #16
81 %s0s = sext <4 x i16> %s0 to <4 x i32>
82 %s1s = sext <4 x i16> %s1 to <4 x i32>
83 %m = mul <4 x i32> %s0s, %s1s
84 %s = ashr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
85 %s2 = trunc <4 x i32> %s to <4 x i16>
; Unsigned multiply-high on <4 x i16> (zext to i32, mul, lshr 16, trunc).
; The expected assembly is a widening vmullb.u16 followed by vshr.u32 #16,
; not a vmulh.
89 define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
90 ; CHECK-LABEL: vmulhu_v4i16:
91 ; CHECK: @ %bb.0: @ %entry
92 ; CHECK-NEXT: vmullb.u16 q0, q0, q1
93 ; CHECK-NEXT: vshr.u32 q0, q0, #16
96 %s0s = zext <4 x i16> %s0 to <4 x i32>
97 %s1s = zext <4 x i16> %s1 to <4 x i32>
98 %m = mul <4 x i32> %s0s, %s1s
99 %s = lshr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
100 %s2 = trunc <4 x i32> %s to <4 x i16>
; Signed multiply-high on a full <8 x i16> vector (sext, mul, ashr 16, trunc).
; The whole pattern is expected to be selected as a single vmulh.s16.
104 define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
105 ; CHECK-LABEL: vmulhs_v8i16:
106 ; CHECK: @ %bb.0: @ %entry
107 ; CHECK-NEXT: vmulh.s16 q0, q0, q1
110 %s0s = sext <8 x i16> %s0 to <8 x i32>
111 %s1s = sext <8 x i16> %s1 to <8 x i32>
112 %m = mul <8 x i32> %s0s, %s1s
113 %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
114 %s2 = trunc <8 x i32> %s to <8 x i16>
; Unsigned multiply-high on a full <8 x i16> vector (zext, mul, lshr 16, trunc).
; The whole pattern is expected to be selected as a single vmulh.u16.
118 define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
119 ; CHECK-LABEL: vmulhu_v8i16:
120 ; CHECK: @ %bb.0: @ %entry
121 ; CHECK-NEXT: vmulh.u16 q0, q0, q1
124 %s0s = zext <8 x i16> %s0 to <8 x i32>
125 %s1s = zext <8 x i16> %s1 to <8 x i32>
126 %m = mul <8 x i32> %s0s, %s1s
127 %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
128 %s2 = trunc <8 x i32> %s to <8 x i16>
; Signed multiply-high on <4 x i8> (sext to i16, mul, ashr 8, trunc).
; The expected assembly sign-extends both operands with vmovlb.s8 and
; vmovlb.s16, multiplies with vmul.i32 and shifts with vshr.s32 #8; no vmulh or
; vmullb is expected for this type.
132 define arm_aapcs_vfpcc <4 x i8> @vmulhs_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
133 ; CHECK-LABEL: vmulhs_v4i8:
134 ; CHECK: @ %bb.0: @ %entry
135 ; CHECK-NEXT: vmovlb.s8 q1, q1
136 ; CHECK-NEXT: vmovlb.s8 q0, q0
137 ; CHECK-NEXT: vmovlb.s16 q1, q1
138 ; CHECK-NEXT: vmovlb.s16 q0, q0
139 ; CHECK-NEXT: vmul.i32 q0, q0, q1
140 ; CHECK-NEXT: vshr.s32 q0, q0, #8
143 %s0s = sext <4 x i8> %s0 to <4 x i16>
144 %s1s = sext <4 x i8> %s1 to <4 x i16>
145 %m = mul <4 x i16> %s0s, %s1s
146 %s = ashr <4 x i16> %m, <i16 8, i16 8, i16 8, i16 8>
147 %s2 = trunc <4 x i16> %s to <4 x i8>
; Unsigned multiply-high on <4 x i8> (zext to i16, mul, lshr 8, trunc).
; The expected assembly masks both operands with 0xff (vmov.i32 + vand),
; multiplies with vmul.i32 and shifts with vshr.u32 #8; no vmulh or vmullb is
; expected for this type.
151 define arm_aapcs_vfpcc <4 x i8> @vmulhu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
152 ; CHECK-LABEL: vmulhu_v4i8:
153 ; CHECK: @ %bb.0: @ %entry
154 ; CHECK-NEXT: vmov.i32 q2, #0xff
155 ; CHECK-NEXT: vand q1, q1, q2
156 ; CHECK-NEXT: vand q0, q0, q2
157 ; CHECK-NEXT: vmul.i32 q0, q0, q1
158 ; CHECK-NEXT: vshr.u32 q0, q0, #8
161 %s0s = zext <4 x i8> %s0 to <4 x i16>
162 %s1s = zext <4 x i8> %s1 to <4 x i16>
163 %m = mul <4 x i16> %s0s, %s1s
164 %s = lshr <4 x i16> %m, <i16 8, i16 8, i16 8, i16 8>
165 %s2 = trunc <4 x i16> %s to <4 x i8>
; Signed multiply-high on <8 x i8> (sext to i16, mul, ashr 8, trunc).
; The expected assembly is a widening vmullb.s8 followed by vshr.s16 #8,
; not a vmulh.
169 define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
170 ; CHECK-LABEL: vmulhs_v8i8:
171 ; CHECK: @ %bb.0: @ %entry
172 ; CHECK-NEXT: vmullb.s8 q0, q0, q1
173 ; CHECK-NEXT: vshr.s16 q0, q0, #8
176 %s0s = sext <8 x i8> %s0 to <8 x i16>
177 %s1s = sext <8 x i8> %s1 to <8 x i16>
178 %m = mul <8 x i16> %s0s, %s1s
179 %s = ashr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
180 %s2 = trunc <8 x i16> %s to <8 x i8>
; Unsigned multiply-high on <8 x i8> (zext to i16, mul, lshr 8, trunc).
; The expected assembly is a widening vmullb.u8 followed by vshr.u16 #8,
; not a vmulh.
184 define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
185 ; CHECK-LABEL: vmulhu_v8i8:
186 ; CHECK: @ %bb.0: @ %entry
187 ; CHECK-NEXT: vmullb.u8 q0, q0, q1
188 ; CHECK-NEXT: vshr.u16 q0, q0, #8
191 %s0s = zext <8 x i8> %s0 to <8 x i16>
192 %s1s = zext <8 x i8> %s1 to <8 x i16>
193 %m = mul <8 x i16> %s0s, %s1s
194 %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
195 %s2 = trunc <8 x i16> %s to <8 x i8>
; Signed multiply-high on a full <16 x i8> vector (sext, mul, ashr 8, trunc).
; The whole pattern is expected to be selected as a single vmulh.s8.
199 define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
200 ; CHECK-LABEL: vmulhs_v16i8:
201 ; CHECK: @ %bb.0: @ %entry
202 ; CHECK-NEXT: vmulh.s8 q0, q0, q1
205 %s0s = sext <16 x i8> %s0 to <16 x i16>
206 %s1s = sext <16 x i8> %s1 to <16 x i16>
207 %m = mul <16 x i16> %s0s, %s1s
208 %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
209 %s2 = trunc <16 x i16> %s to <16 x i8>
; Unsigned multiply-high on a full <16 x i8> vector (zext, mul, lshr 8, trunc).
; The whole pattern is expected to be selected as a single vmulh.u8.
213 define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
214 ; CHECK-LABEL: vmulhu_v16i8:
215 ; CHECK: @ %bb.0: @ %entry
216 ; CHECK-NEXT: vmulh.u8 q0, q0, q1
219 %s0s = zext <16 x i8> %s0 to <16 x i16>
220 %s1s = zext <16 x i8> %s1 to <16 x i16>
221 %m = mul <16 x i16> %s0s, %s1s
222 %s = lshr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
223 %s2 = trunc <16 x i16> %s to <16 x i8>
; Fixed-trip-count loop over 1024 i8 elements, 16 per iteration: loads from %x
; and %y, sign-extends to i16, multiplies, shifts right by 8, truncates to i8
; and stores to %z.
; The shift is lshr rather than ashr, but the trunc keeps only bits 8..15 of
; the product, so the stored byte is the same either way; the expected loop
; body is vmulh.s8.
; Expected loop form: lr preset to 64 (1024 / 16) and a `le` loop-end branch.
; %n is not referenced by the visible body.
227 define void @vmulh_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
228 ; CHECK-LABEL: vmulh_s8:
229 ; CHECK: @ %bb.0: @ %entry
230 ; CHECK-NEXT: .save {r7, lr}
231 ; CHECK-NEXT: push {r7, lr}
232 ; CHECK-NEXT: mov.w lr, #64
233 ; CHECK-NEXT: .LBB14_1: @ %vector.body
234 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
235 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
236 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
237 ; CHECK-NEXT: vmulh.s8 q0, q1, q0
238 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
239 ; CHECK-NEXT: le lr, .LBB14_1
240 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
241 ; CHECK-NEXT: pop {r7, pc}
243 br label %vector.body
245 vector.body: ; preds = %vector.body, %entry
246 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
247 %0 = getelementptr inbounds i8, ptr %x, i32 %index
248 %wide.load = load <16 x i8>, ptr %0, align 1
249 %1 = sext <16 x i8> %wide.load to <16 x i16>
250 %2 = getelementptr inbounds i8, ptr %y, i32 %index
251 %wide.load17 = load <16 x i8>, ptr %2, align 1
252 %3 = sext <16 x i8> %wide.load17 to <16 x i16>
253 %4 = mul nsw <16 x i16> %3, %1
254 %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
255 %6 = trunc <16 x i16> %5 to <16 x i8>
256 %7 = getelementptr inbounds i8, ptr %z, i32 %index
257 store <16 x i8> %6, ptr %7, align 1
258 %index.next = add i32 %index, 16
259 %8 = icmp eq i32 %index.next, 1024
260 br i1 %8, label %for.cond.cleanup, label %vector.body
262 for.cond.cleanup: ; preds = %vector.body
; Fixed-trip-count loop over 1024 i16 elements, 8 per iteration: loads from %x
; and %y, sign-extends to i32, multiplies, shifts right by 16, truncates to i16
; and stores to %z.
; The shift is lshr rather than ashr, but the trunc keeps only bits 16..31 of
; the product, so the stored value is the same either way; the expected loop
; body is vmulh.s16.
; Expected loop form: lr preset to 128 (1024 / 8) and a `le` loop-end branch.
; The expected store is spelled vstrb.8 even though the elements are 16-bit.
266 define void @vmulh_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
267 ; CHECK-LABEL: vmulh_s16:
268 ; CHECK: @ %bb.0: @ %entry
269 ; CHECK-NEXT: .save {r7, lr}
270 ; CHECK-NEXT: push {r7, lr}
271 ; CHECK-NEXT: mov.w lr, #128
272 ; CHECK-NEXT: .LBB15_1: @ %vector.body
273 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
274 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
275 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
276 ; CHECK-NEXT: vmulh.s16 q0, q1, q0
277 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
278 ; CHECK-NEXT: le lr, .LBB15_1
279 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
280 ; CHECK-NEXT: pop {r7, pc}
282 br label %vector.body
284 vector.body: ; preds = %vector.body, %entry
285 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
286 %0 = getelementptr inbounds i16, ptr %x, i32 %index
287 %wide.load = load <8 x i16>, ptr %0, align 2
288 %1 = sext <8 x i16> %wide.load to <8 x i32>
289 %2 = getelementptr inbounds i16, ptr %y, i32 %index
290 %wide.load17 = load <8 x i16>, ptr %2, align 2
291 %3 = sext <8 x i16> %wide.load17 to <8 x i32>
292 %4 = mul nsw <8 x i32> %3, %1
293 %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
294 %6 = trunc <8 x i32> %5 to <8 x i16>
295 %7 = getelementptr inbounds i16, ptr %z, i32 %index
296 store <8 x i16> %6, ptr %7, align 2
297 %index.next = add i32 %index, 8
298 %8 = icmp eq i32 %index.next, 1024
299 br i1 %8, label %for.cond.cleanup, label %vector.body
301 for.cond.cleanup: ; preds = %vector.body
; Fixed-trip-count loop over 1024 i32 elements, 4 per iteration: loads from %x
; and %y, sign-extends to i64, multiplies, shifts right by 32, truncates to i32
; and stores to %z.
; The shift is lshr rather than ashr, but the trunc keeps only bits 32..63 of
; the product, so the stored value is the same either way; the expected loop
; body is vmulh.s32.
; Expected loop form: lr preset to 256 (1024 / 4) and a `le` loop-end branch.
; The expected store is spelled vstrb.8 even though the elements are 32-bit.
305 define void @vmulh_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
306 ; CHECK-LABEL: vmulh_s32:
307 ; CHECK: @ %bb.0: @ %entry
308 ; CHECK-NEXT: .save {r7, lr}
309 ; CHECK-NEXT: push {r7, lr}
310 ; CHECK-NEXT: mov.w lr, #256
311 ; CHECK-NEXT: .LBB16_1: @ %vector.body
312 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
313 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
314 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
315 ; CHECK-NEXT: vmulh.s32 q0, q1, q0
316 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
317 ; CHECK-NEXT: le lr, .LBB16_1
318 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
319 ; CHECK-NEXT: pop {r7, pc}
321 br label %vector.body
323 vector.body: ; preds = %vector.body, %entry
324 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
325 %0 = getelementptr inbounds i32, ptr %x, i32 %index
326 %wide.load = load <4 x i32>, ptr %0, align 4
327 %1 = sext <4 x i32> %wide.load to <4 x i64>
328 %2 = getelementptr inbounds i32, ptr %y, i32 %index
329 %wide.load17 = load <4 x i32>, ptr %2, align 4
330 %3 = sext <4 x i32> %wide.load17 to <4 x i64>
331 %4 = mul nsw <4 x i64> %3, %1
332 %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
333 %6 = trunc <4 x i64> %5 to <4 x i32>
334 %7 = getelementptr inbounds i32, ptr %z, i32 %index
335 store <4 x i32> %6, ptr %7, align 4
336 %index.next = add i32 %index, 4
337 %8 = icmp eq i32 %index.next, 1024
338 br i1 %8, label %for.cond.cleanup, label %vector.body
340 for.cond.cleanup: ; preds = %vector.body
; Fixed-trip-count loop over 1024 i8 elements, 16 per iteration: loads from %x
; and %y, zero-extends to i16, multiplies, lshr by 8, truncates to i8 and
; stores to %z. The expected loop body is vmulh.u8.
; Expected loop form: lr preset to 64 (1024 / 16) and a `le` loop-end branch.
; %n is not referenced by the visible body.
344 define void @vmulh_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
345 ; CHECK-LABEL: vmulh_u8:
346 ; CHECK: @ %bb.0: @ %entry
347 ; CHECK-NEXT: .save {r7, lr}
348 ; CHECK-NEXT: push {r7, lr}
349 ; CHECK-NEXT: mov.w lr, #64
350 ; CHECK-NEXT: .LBB17_1: @ %vector.body
351 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
352 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
353 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
354 ; CHECK-NEXT: vmulh.u8 q0, q1, q0
355 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
356 ; CHECK-NEXT: le lr, .LBB17_1
357 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
358 ; CHECK-NEXT: pop {r7, pc}
360 br label %vector.body
362 vector.body: ; preds = %vector.body, %entry
363 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
364 %0 = getelementptr inbounds i8, ptr %x, i32 %index
365 %wide.load = load <16 x i8>, ptr %0, align 1
366 %1 = zext <16 x i8> %wide.load to <16 x i16>
367 %2 = getelementptr inbounds i8, ptr %y, i32 %index
368 %wide.load17 = load <16 x i8>, ptr %2, align 1
369 %3 = zext <16 x i8> %wide.load17 to <16 x i16>
370 %4 = mul nuw <16 x i16> %3, %1
371 %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
372 %6 = trunc <16 x i16> %5 to <16 x i8>
373 %7 = getelementptr inbounds i8, ptr %z, i32 %index
374 store <16 x i8> %6, ptr %7, align 1
375 %index.next = add i32 %index, 16
376 %8 = icmp eq i32 %index.next, 1024
377 br i1 %8, label %for.cond.cleanup, label %vector.body
379 for.cond.cleanup: ; preds = %vector.body
; Fixed-trip-count loop over 1024 i16 elements, 8 per iteration: loads from %x
; and %y, zero-extends to i32, multiplies, lshr by 16, truncates to i16 and
; stores to %z. The expected loop body is vmulh.u16.
; Expected loop form: lr preset to 128 (1024 / 8) and a `le` loop-end branch.
; The expected store is spelled vstrb.8 even though the elements are 16-bit.
383 define void @vmulh_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
384 ; CHECK-LABEL: vmulh_u16:
385 ; CHECK: @ %bb.0: @ %entry
386 ; CHECK-NEXT: .save {r7, lr}
387 ; CHECK-NEXT: push {r7, lr}
388 ; CHECK-NEXT: mov.w lr, #128
389 ; CHECK-NEXT: .LBB18_1: @ %vector.body
390 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
391 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
392 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
393 ; CHECK-NEXT: vmulh.u16 q0, q1, q0
394 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
395 ; CHECK-NEXT: le lr, .LBB18_1
396 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
397 ; CHECK-NEXT: pop {r7, pc}
399 br label %vector.body
401 vector.body: ; preds = %vector.body, %entry
402 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
403 %0 = getelementptr inbounds i16, ptr %x, i32 %index
404 %wide.load = load <8 x i16>, ptr %0, align 2
405 %1 = zext <8 x i16> %wide.load to <8 x i32>
406 %2 = getelementptr inbounds i16, ptr %y, i32 %index
407 %wide.load17 = load <8 x i16>, ptr %2, align 2
408 %3 = zext <8 x i16> %wide.load17 to <8 x i32>
409 %4 = mul nuw <8 x i32> %3, %1
410 %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
411 %6 = trunc <8 x i32> %5 to <8 x i16>
412 %7 = getelementptr inbounds i16, ptr %z, i32 %index
413 store <8 x i16> %6, ptr %7, align 2
414 %index.next = add i32 %index, 8
415 %8 = icmp eq i32 %index.next, 1024
416 br i1 %8, label %for.cond.cleanup, label %vector.body
418 for.cond.cleanup: ; preds = %vector.body
; Fixed-trip-count loop over 1024 i32 elements, 4 per iteration: loads from %x
; and %y, zero-extends to i64, multiplies, lshr by 32, truncates to i32 and
; stores to %z. The expected loop body is vmulh.u32.
; Expected loop form: lr preset to 256 (1024 / 4) and a `le` loop-end branch.
; The expected store is spelled vstrb.8 even though the elements are 32-bit.
422 define void @vmulh_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
423 ; CHECK-LABEL: vmulh_u32:
424 ; CHECK: @ %bb.0: @ %entry
425 ; CHECK-NEXT: .save {r7, lr}
426 ; CHECK-NEXT: push {r7, lr}
427 ; CHECK-NEXT: mov.w lr, #256
428 ; CHECK-NEXT: .LBB19_1: @ %vector.body
429 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
430 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
431 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
432 ; CHECK-NEXT: vmulh.u32 q0, q1, q0
433 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
434 ; CHECK-NEXT: le lr, .LBB19_1
435 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
436 ; CHECK-NEXT: pop {r7, pc}
438 br label %vector.body
440 vector.body: ; preds = %vector.body, %entry
441 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
442 %0 = getelementptr inbounds i32, ptr %x, i32 %index
443 %wide.load = load <4 x i32>, ptr %0, align 4
444 %1 = zext <4 x i32> %wide.load to <4 x i64>
445 %2 = getelementptr inbounds i32, ptr %y, i32 %index
446 %wide.load17 = load <4 x i32>, ptr %2, align 4
447 %3 = zext <4 x i32> %wide.load17 to <4 x i64>
448 %4 = mul nuw <4 x i64> %3, %1
449 %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
450 %6 = trunc <4 x i64> %5 to <4 x i32>
451 %7 = getelementptr inbounds i32, ptr %z, i32 %index
452 store <4 x i32> %6, ptr %7, align 4
453 %index.next = add i32 %index, 4
454 %8 = icmp eq i32 %index.next, 1024
455 br i1 %8, label %for.cond.cleanup, label %vector.body
457 for.cond.cleanup: ; preds = %vector.body
; Predicated signed i32 multiply-high loop with a runtime element count %n.
; The loop is skipped when %n <= 0. Otherwise it advances 4 lanes per iteration
; until %index reaches %n rounded up to a multiple of 4; every load and store
; is masked by @llvm.get.active.lane.mask(%index, %n).
; The body is sext to i64, mul, lshr 32, trunc. The trunc keeps only bits
; 32..63, so lshr gives the same stored value as ashr would.
; Expected assembly: a dlstp.32 / letp loop around vmulh.s32.
462 define void @vmulh_s32_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
463 ; CHECK-LABEL: vmulh_s32_pred:
464 ; CHECK: @ %bb.0: @ %entry
465 ; CHECK-NEXT: .save {r7, lr}
466 ; CHECK-NEXT: push {r7, lr}
467 ; CHECK-NEXT: cmp r3, #1
469 ; CHECK-NEXT: poplt {r7, pc}
470 ; CHECK-NEXT: .LBB20_1: @ %vector.ph
471 ; CHECK-NEXT: dlstp.32 lr, r3
472 ; CHECK-NEXT: .LBB20_2: @ %vector.body
473 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
474 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
475 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
476 ; CHECK-NEXT: vmulh.s32 q0, q1, q0
477 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
478 ; CHECK-NEXT: letp lr, .LBB20_2
479 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
480 ; CHECK-NEXT: pop {r7, pc}
482 %cmp10 = icmp sgt i32 %n, 0
483 br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
485 vector.ph: ; preds = %entry
486 %n.rnd.up = add i32 %n, 3
487 %n.vec = and i32 %n.rnd.up, -4
488 br label %vector.body
490 vector.body: ; preds = %vector.body, %vector.ph
491 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
492 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
493 %0 = getelementptr inbounds i32, ptr %x, i32 %index
494 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
495 %1 = sext <4 x i32> %wide.masked.load to <4 x i64>
496 %2 = getelementptr inbounds i32, ptr %y, i32 %index
497 %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
498 %3 = sext <4 x i32> %wide.masked.load12 to <4 x i64>
499 %4 = mul nsw <4 x i64> %3, %1
500 %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
501 %6 = trunc <4 x i64> %5 to <4 x i32>
502 %7 = getelementptr inbounds i32, ptr %d, i32 %index
503 call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %7, i32 4, <4 x i1> %active.lane.mask)
504 %index.next = add i32 %index, 4
505 %8 = icmp eq i32 %index.next, %n.vec
506 br i1 %8, label %for.cond.cleanup, label %vector.body
508 for.cond.cleanup: ; preds = %vector.body, %entry
; Predicated unsigned i32 multiply-high loop with a runtime element count %n.
; The loop is skipped when %n <= 0. Otherwise it advances 4 lanes per iteration
; until %index reaches %n rounded up to a multiple of 4; every load and store
; is masked by @llvm.get.active.lane.mask(%index, %n).
; The body is zext to i64, mul, lshr 32, trunc.
; Expected assembly: a dlstp.32 / letp loop around vmulh.u32.
512 define void @vmulh_u32_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
513 ; CHECK-LABEL: vmulh_u32_pred:
514 ; CHECK: @ %bb.0: @ %entry
515 ; CHECK-NEXT: .save {r7, lr}
516 ; CHECK-NEXT: push {r7, lr}
517 ; CHECK-NEXT: cmp r3, #1
519 ; CHECK-NEXT: poplt {r7, pc}
520 ; CHECK-NEXT: .LBB21_1: @ %vector.ph
521 ; CHECK-NEXT: dlstp.32 lr, r3
522 ; CHECK-NEXT: .LBB21_2: @ %vector.body
523 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
524 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
525 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
526 ; CHECK-NEXT: vmulh.u32 q0, q1, q0
527 ; CHECK-NEXT: vstrw.32 q0, [r0], #16
528 ; CHECK-NEXT: letp lr, .LBB21_2
529 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
530 ; CHECK-NEXT: pop {r7, pc}
532 %cmp10 = icmp sgt i32 %n, 0
533 br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
535 vector.ph: ; preds = %entry
536 %n.rnd.up = add i32 %n, 3
537 %n.vec = and i32 %n.rnd.up, -4
538 br label %vector.body
540 vector.body: ; preds = %vector.body, %vector.ph
541 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
542 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
543 %0 = getelementptr inbounds i32, ptr %x, i32 %index
544 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %0, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
545 %1 = zext <4 x i32> %wide.masked.load to <4 x i64>
546 %2 = getelementptr inbounds i32, ptr %y, i32 %index
547 %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
548 %3 = zext <4 x i32> %wide.masked.load12 to <4 x i64>
549 %4 = mul nuw <4 x i64> %3, %1
550 %5 = lshr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32>
551 %6 = trunc <4 x i64> %5 to <4 x i32>
552 %7 = getelementptr inbounds i32, ptr %d, i32 %index
553 call void @llvm.masked.store.v4i32.p0(<4 x i32> %6, ptr %7, i32 4, <4 x i1> %active.lane.mask)
554 %index.next = add i32 %index, 4
555 %8 = icmp eq i32 %index.next, %n.vec
556 br i1 %8, label %for.cond.cleanup, label %vector.body
558 for.cond.cleanup: ; preds = %vector.body, %entry
; Predicated signed i16 multiply-high loop with a runtime element count %n.
; The loop is skipped when %n <= 0. Otherwise it advances 8 lanes per iteration
; until %index reaches %n rounded up to a multiple of 8; every load and store
; is masked by @llvm.get.active.lane.mask(%index, %n).
; The body is sext to i32, mul, lshr 16, trunc. The trunc keeps only bits
; 16..31, so lshr gives the same stored value as ashr would.
; Expected assembly: a dlstp.16 / letp loop around vmulh.s16.
562 define void @vmulh_s16_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
563 ; CHECK-LABEL: vmulh_s16_pred:
564 ; CHECK: @ %bb.0: @ %entry
565 ; CHECK-NEXT: .save {r7, lr}
566 ; CHECK-NEXT: push {r7, lr}
567 ; CHECK-NEXT: cmp r3, #1
569 ; CHECK-NEXT: poplt {r7, pc}
570 ; CHECK-NEXT: .LBB22_1: @ %vector.ph
571 ; CHECK-NEXT: dlstp.16 lr, r3
572 ; CHECK-NEXT: .LBB22_2: @ %vector.body
573 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
574 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
575 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16
576 ; CHECK-NEXT: vmulh.s16 q0, q1, q0
577 ; CHECK-NEXT: vstrh.16 q0, [r0], #16
578 ; CHECK-NEXT: letp lr, .LBB22_2
579 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
580 ; CHECK-NEXT: pop {r7, pc}
582 %cmp10 = icmp sgt i32 %n, 0
583 br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
585 vector.ph: ; preds = %entry
586 %n.rnd.up = add i32 %n, 7
587 %n.vec = and i32 %n.rnd.up, -8
588 br label %vector.body
590 vector.body: ; preds = %vector.body, %vector.ph
591 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
592 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
593 %0 = getelementptr inbounds i16, ptr %x, i32 %index
594 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
595 %1 = sext <8 x i16> %wide.masked.load to <8 x i32>
596 %2 = getelementptr inbounds i16, ptr %y, i32 %index
597 %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %2, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
598 %3 = sext <8 x i16> %wide.masked.load12 to <8 x i32>
599 %4 = mul nsw <8 x i32> %3, %1
600 %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
601 %6 = trunc <8 x i32> %5 to <8 x i16>
602 %7 = getelementptr inbounds i16, ptr %d, i32 %index
603 call void @llvm.masked.store.v8i16.p0(<8 x i16> %6, ptr %7, i32 2, <8 x i1> %active.lane.mask)
604 %index.next = add i32 %index, 8
605 %8 = icmp eq i32 %index.next, %n.vec
606 br i1 %8, label %for.cond.cleanup, label %vector.body
608 for.cond.cleanup: ; preds = %vector.body, %entry
; Predicated unsigned i16 multiply-high loop with a runtime element count %n.
; The loop is skipped when %n <= 0. Otherwise it advances 8 lanes per iteration
; until %index reaches %n rounded up to a multiple of 8; every load and store
; is masked by @llvm.get.active.lane.mask(%index, %n).
; The body is zext to i32, mul, lshr 16, trunc.
; Expected assembly: a dlstp.16 / letp loop around vmulh.u16.
612 define void @vmulh_u16_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
613 ; CHECK-LABEL: vmulh_u16_pred:
614 ; CHECK: @ %bb.0: @ %entry
615 ; CHECK-NEXT: .save {r7, lr}
616 ; CHECK-NEXT: push {r7, lr}
617 ; CHECK-NEXT: cmp r3, #1
619 ; CHECK-NEXT: poplt {r7, pc}
620 ; CHECK-NEXT: .LBB23_1: @ %vector.ph
621 ; CHECK-NEXT: dlstp.16 lr, r3
622 ; CHECK-NEXT: .LBB23_2: @ %vector.body
623 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
624 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16
625 ; CHECK-NEXT: vldrh.u16 q1, [r2], #16
626 ; CHECK-NEXT: vmulh.u16 q0, q1, q0
627 ; CHECK-NEXT: vstrh.16 q0, [r0], #16
628 ; CHECK-NEXT: letp lr, .LBB23_2
629 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
630 ; CHECK-NEXT: pop {r7, pc}
632 %cmp10 = icmp sgt i32 %n, 0
633 br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
635 vector.ph: ; preds = %entry
636 %n.rnd.up = add i32 %n, 7
637 %n.vec = and i32 %n.rnd.up, -8
638 br label %vector.body
640 vector.body: ; preds = %vector.body, %vector.ph
641 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
642 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
643 %0 = getelementptr inbounds i16, ptr %x, i32 %index
644 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
645 %1 = zext <8 x i16> %wide.masked.load to <8 x i32>
646 %2 = getelementptr inbounds i16, ptr %y, i32 %index
647 %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %2, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
648 %3 = zext <8 x i16> %wide.masked.load12 to <8 x i32>
649 %4 = mul nuw <8 x i32> %3, %1
650 %5 = lshr <8 x i32> %4, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
651 %6 = trunc <8 x i32> %5 to <8 x i16>
652 %7 = getelementptr inbounds i16, ptr %d, i32 %index
653 call void @llvm.masked.store.v8i16.p0(<8 x i16> %6, ptr %7, i32 2, <8 x i1> %active.lane.mask)
654 %index.next = add i32 %index, 8
655 %8 = icmp eq i32 %index.next, %n.vec
656 br i1 %8, label %for.cond.cleanup, label %vector.body
658 for.cond.cleanup: ; preds = %vector.body, %entry
; Predicated signed i8 multiply-high loop with a runtime element count %n.
; The loop is skipped when %n <= 0. Otherwise it advances 16 lanes per
; iteration until %index reaches %n rounded up to a multiple of 16; every load
; and store is masked by @llvm.get.active.lane.mask(%index, %n).
; The body is sext to i16, mul, lshr 8, trunc. The trunc keeps only bits 8..15,
; so lshr gives the same stored value as ashr would.
; Expected assembly: a dlstp.8 / letp loop around vmulh.s8.
662 define void @vmulh_s8_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
663 ; CHECK-LABEL: vmulh_s8_pred:
664 ; CHECK: @ %bb.0: @ %entry
665 ; CHECK-NEXT: .save {r7, lr}
666 ; CHECK-NEXT: push {r7, lr}
667 ; CHECK-NEXT: cmp r3, #1
669 ; CHECK-NEXT: poplt {r7, pc}
670 ; CHECK-NEXT: .LBB24_1: @ %vector.ph
671 ; CHECK-NEXT: dlstp.8 lr, r3
672 ; CHECK-NEXT: .LBB24_2: @ %vector.body
673 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
674 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
675 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16
676 ; CHECK-NEXT: vmulh.s8 q0, q1, q0
677 ; CHECK-NEXT: vstrb.8 q0, [r0], #16
678 ; CHECK-NEXT: letp lr, .LBB24_2
679 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
680 ; CHECK-NEXT: pop {r7, pc}
682 %cmp10 = icmp sgt i32 %n, 0
683 br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
685 vector.ph: ; preds = %entry
686 %n.rnd.up = add i32 %n, 15
687 %n.vec = and i32 %n.rnd.up, -16
688 br label %vector.body
690 vector.body: ; preds = %vector.body, %vector.ph
691 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
692 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
693 %0 = getelementptr inbounds i8, ptr %x, i32 %index
694 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %0, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
695 %1 = sext <16 x i8> %wide.masked.load to <16 x i16>
696 %2 = getelementptr inbounds i8, ptr %y, i32 %index
697 %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %2, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
698 %3 = sext <16 x i8> %wide.masked.load12 to <16 x i16>
699 %4 = mul nsw <16 x i16> %3, %1
700 %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
701 %6 = trunc <16 x i16> %5 to <16 x i8>
702 %7 = getelementptr inbounds i8, ptr %d, i32 %index
703 call void @llvm.masked.store.v16i8.p0(<16 x i8> %6, ptr %7, i32 1, <16 x i1> %active.lane.mask)
704 %index.next = add i32 %index, 16
705 %8 = icmp eq i32 %index.next, %n.vec
706 br i1 %8, label %for.cond.cleanup, label %vector.body
708 for.cond.cleanup: ; preds = %vector.body, %entry
; Predicated unsigned i8 multiply-high loop with a runtime element count %n.
; The loop is skipped when %n <= 0. Otherwise it advances 16 lanes per
; iteration until %index reaches %n rounded up to a multiple of 16; every load
; and store is masked by @llvm.get.active.lane.mask(%index, %n).
; The body is zext to i16, mul, lshr 8, trunc.
; Expected assembly: a dlstp.8 / letp loop around vmulh.u8.
712 define void @vmulh_u8_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
713 ; CHECK-LABEL: vmulh_u8_pred:
714 ; CHECK: @ %bb.0: @ %entry
715 ; CHECK-NEXT: .save {r7, lr}
716 ; CHECK-NEXT: push {r7, lr}
717 ; CHECK-NEXT: cmp r3, #1
719 ; CHECK-NEXT: poplt {r7, pc}
720 ; CHECK-NEXT: .LBB25_1: @ %vector.ph
721 ; CHECK-NEXT: dlstp.8 lr, r3
722 ; CHECK-NEXT: .LBB25_2: @ %vector.body
723 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
724 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16
725 ; CHECK-NEXT: vldrb.u8 q1, [r2], #16
726 ; CHECK-NEXT: vmulh.u8 q0, q1, q0
727 ; CHECK-NEXT: vstrb.8 q0, [r0], #16
728 ; CHECK-NEXT: letp lr, .LBB25_2
729 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
730 ; CHECK-NEXT: pop {r7, pc}
732 %cmp10 = icmp sgt i32 %n, 0
733 br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
735 vector.ph: ; preds = %entry
736 %n.rnd.up = add i32 %n, 15
737 %n.vec = and i32 %n.rnd.up, -16
738 br label %vector.body
740 vector.body: ; preds = %vector.body, %vector.ph
741 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
742 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
743 %0 = getelementptr inbounds i8, ptr %x, i32 %index
744 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %0, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
745 %1 = zext <16 x i8> %wide.masked.load to <16 x i16>
746 %2 = getelementptr inbounds i8, ptr %y, i32 %index
747 %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %2, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
748 %3 = zext <16 x i8> %wide.masked.load12 to <16 x i16>
749 %4 = mul nuw <16 x i16> %3, %1
750 %5 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
751 %6 = trunc <16 x i16> %5 to <16 x i8>
752 %7 = getelementptr inbounds i8, ptr %d, i32 %index
753 call void @llvm.masked.store.v16i8.p0(<16 x i8> %6, ptr %7, i32 1, <16 x i1> %active.lane.mask)
754 %index.next = add i32 %index, 16
755 %8 = icmp eq i32 %index.next, %n.vec
756 br i1 %8, label %for.cond.cleanup, label %vector.body
758 for.cond.cleanup: ; preds = %vector.body, %entry
; Signed multiply-high on <16 x i8> where the shifted i16 products are not
; truncated; they are summed directly with @llvm.vector.reduce.add.v16i16.
; Expected assembly: vmulh.s8 followed by a vaddv.s8 reduction into r0.
763 define arm_aapcs_vfpcc i16 @vmulhs_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
764 ; CHECK-LABEL: vmulhs_reduce_v16i8:
765 ; CHECK: @ %bb.0: @ %entry
766 ; CHECK-NEXT: vmulh.s8 q0, q0, q1
767 ; CHECK-NEXT: vaddv.s8 r0, q0
770 %s0s = sext <16 x i8> %s0 to <16 x i16>
771 %s1s = sext <16 x i8> %s1 to <16 x i16>
772 %m = mul <16 x i16> %s0s, %s1s
773 %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
774 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Mixed-signedness variant: the operands are zero-extended but the i16 product
; is shifted with ashr (not lshr), then summed with
; @llvm.vector.reduce.add.v16i16 without a trunc.
; Expected assembly: an unsigned vmulh.u8 followed by a signed vaddv.s8
; reduction into r0.
778 define arm_aapcs_vfpcc i16 @vmulhu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
779 ; CHECK-LABEL: vmulhu_reduce_v16i8:
780 ; CHECK: @ %bb.0: @ %entry
781 ; CHECK-NEXT: vmulh.u8 q0, q0, q1
782 ; CHECK-NEXT: vaddv.s8 r0, q0
785 %s0s = zext <16 x i8> %s0 to <16 x i16>
786 %s1s = zext <16 x i8> %s1 to <16 x i16>
787 %m = mul <16 x i16> %s0s, %s1s
788 %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
789 %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Intrinsic declarations used by the functions in this file: the i16 add
; reduction for the *_reduce tests, and the lane-mask / masked load / masked
; store triples (4, 8 and 16 lanes) for the *_pred loops.
793 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
796 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
797 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
798 declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
799 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
800 declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
801 declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
802 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
803 declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
804 declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)