1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; Signed high-half multiply of <2 x i32>: no vmulh for v2i32, so it is lowered
; via vmullb.s32 plus lane extraction/sign-extension of the high halves.
define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhs_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s32 q2, q0, q1
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <2 x i32> %s0 to <2 x i64>
  %s1s = sext <2 x i32> %s1 to <2 x i64>
  %m = mul <2 x i64> %s0s, %s1s
  %s = ashr <2 x i64> %m, <i64 32, i64 32>
  %s2 = trunc <2 x i64> %s to <2 x i32>
  ret <2 x i32> %s2
}
; Unsigned high-half multiply of <2 x i32>: lowered via vmullb.u32; the odd
; lanes are zeroed through a constant-pool float zero (.LCPI1_0).
define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhu_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u32 q2, q0, q1
; CHECK-NEXT:    vldr s1, .LCPI1_0
; CHECK-NEXT:    vmov.f32 s0, s9
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vmov.f32 s3, s1
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI1_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  %s0s = zext <2 x i32> %s0 to <2 x i64>
  %s1s = zext <2 x i32> %s1 to <2 x i64>
  %m = mul <2 x i64> %s0s, %s1s
  %s = lshr <2 x i64> %m, <i64 32, i64 32>
  %s2 = trunc <2 x i64> %s to <2 x i32>
  ret <2 x i32> %s2
}
; sext-mul-ashr-trunc on <4 x i32> should select a single vmulh.s32.
define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhs_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i32> %s0 to <4 x i64>
  %s1s = sext <4 x i32> %s1 to <4 x i64>
  %m = mul <4 x i64> %s0s, %s1s
  %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}
; zext-mul-lshr-trunc on <4 x i32> should select a single vmulh.u32.
define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i32> %s0 to <4 x i64>
  %s1s = zext <4 x i32> %s1 to <4 x i64>
  %m = mul <4 x i64> %s0s, %s1s
  %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}
; Non-full-width <4 x i16>: no v4i16 vmulh, so lowered as vmullb.s16 + vshr.
define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhs_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s16 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i16> %s0 to <4 x i32>
  %s1s = sext <4 x i16> %s1 to <4 x i32>
  %m = mul <4 x i32> %s0s, %s1s
  %s = ashr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}
; Non-full-width <4 x i16>, unsigned: lowered as vmullb.u16 + vshr.
define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u16 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i16> %s0 to <4 x i32>
  %s1s = zext <4 x i16> %s1 to <4 x i32>
  %m = mul <4 x i32> %s0s, %s1s
  %s = lshr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}
; Full-width <8 x i16>: single vmulh.s16.
define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhs_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i16> %s0 to <8 x i32>
  %s1s = sext <8 x i16> %s1 to <8 x i32>
  %m = mul <8 x i32> %s0s, %s1s
  %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}
; Full-width <8 x i16>, unsigned: single vmulh.u16.
define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i16> %s0 to <8 x i32>
  %s1s = zext <8 x i16> %s1 to <8 x i32>
  %m = mul <8 x i32> %s0s, %s1s
  %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}
; Non-full-width <8 x i8>: lowered as vmullb.s8 + vshr.
define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhs_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s8 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i8> %s0 to <8 x i16>
  %s1s = sext <8 x i8> %s1 to <8 x i16>
  %m = mul <8 x i16> %s0s, %s1s
  %s = ashr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}
; Non-full-width <8 x i8>, unsigned: lowered as vmullb.u8 + vshr.
define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u8 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %m = mul <8 x i16> %s0s, %s1s
  %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}
; Full-width <16 x i8>: single vmulh.s8.
define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhs_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %m = mul <16 x i16> %s0s, %s1s
  %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}
; Full-width <16 x i8>, unsigned: single vmulh.u8.
define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %m = mul <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}
; Vectorized loop: vmulh.s8 selected inside a low-overhead (le) loop over 1024
; i8 elements (64 iterations of 16 lanes).
define void @vmulh_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB12_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB12_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load17 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Vectorized loop: vmulh.s16 inside a low-overhead loop (128 iterations of 8
; i16 lanes over 1024 elements).
define void @vmulh_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB13_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB13_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load17 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Vectorized loop: vmulh.s32 inside a low-overhead loop (256 iterations of 4
; i32 lanes over 1024 elements).
define void @vmulh_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB14_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB14_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load17 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Vectorized loop, unsigned i8 variant: vmulh.u8 in a low-overhead loop.
define void @vmulh_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load17 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Vectorized loop, unsigned i16 variant: vmulh.u16 in a low-overhead loop.
define void @vmulh_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load17 to <8 x i32>
  %6 = mul nuw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Vectorized loop, unsigned i32 variant: vmulh.u32 in a low-overhead loop.
define void @vmulh_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load17 to <4 x i64>
  %6 = mul nuw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Tail-predicated loop: masked loads/stores with get.active.lane.mask become a
; dlstp/letp loop around vmulh.s32.
define void @vmulh_s32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB18_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB18_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB18_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %5 = sext <4 x i32> %wide.masked.load12 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %d, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Tail-predicated loop, unsigned i32 variant: dlstp/letp around vmulh.u32.
define void @vmulh_u32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB19_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB19_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB19_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %2 = zext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %5 = zext <4 x i32> %wide.masked.load12 to <4 x i64>
  %6 = mul nuw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %d, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Tail-predicated loop, signed i16 variant: dlstp.16/letp around vmulh.s16.
define void @vmulh_s16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB20_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB20_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB20_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %5 = sext <8 x i16> %wide.masked.load12 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %d, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Tail-predicated loop, unsigned i16 variant: dlstp.16/letp around vmulh.u16.
define void @vmulh_u16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB21_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB21_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB21_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %2 = zext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %5 = zext <8 x i16> %wide.masked.load12 to <8 x i32>
  %6 = mul nuw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %d, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Tail-predicated loop, signed i8 variant: dlstp.8/letp around vmulh.s8.
define void @vmulh_s8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB22_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB22_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB22_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %2 = sext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %5 = sext <16 x i8> %wide.masked.load12 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %d, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Tail-predicated loop, unsigned i8 variant: dlstp.8/letp around vmulh.u8.
define void @vmulh_u8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB23_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB23_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB23_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %5 = zext <16 x i8> %wide.masked.load12 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %d, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
761 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
762 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
763 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
764 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
765 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
766 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
767 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
768 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
769 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)