1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; ssatmul_s_q31: signed saturating multiply over %N i32 elements.
; Each pair of inputs is sign-extended to i64, multiplied, arithmetically
; shifted right by 31, clamped to [-2147483648, 2147483647] and truncated
; back to i32 before being stored to %pDst.
; The vector loop processes two elements per iteration (<2 x i32>); the
; scalar for.body loop handles the remainder and the %N == 1 case.
; The assertions below pin the MVE lowering of the <2 x i64> clamp.
4 define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
5 ; CHECK-LABEL: ssatmul_s_q31:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
8 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
9 ; CHECK-NEXT: .pad #12
10 ; CHECK-NEXT: sub sp, #12
11 ; CHECK-NEXT: cmp r3, #0
12 ; CHECK-NEXT: beq.w .LBB0_8
13 ; CHECK-NEXT: @ %bb.1: @ %entry
14 ; CHECK-NEXT: cmp r3, #1
15 ; CHECK-NEXT: bne .LBB0_3
16 ; CHECK-NEXT: @ %bb.2:
17 ; CHECK-NEXT: movs r7, #0
18 ; CHECK-NEXT: mov r12, r0
19 ; CHECK-NEXT: mov r8, r1
20 ; CHECK-NEXT: mov r10, r2
21 ; CHECK-NEXT: b .LBB0_6
22 ; CHECK-NEXT: .LBB0_3: @ %vector.ph
23 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
24 ; CHECK-NEXT: bic r3, r3, #1
25 ; CHECK-NEXT: subs r7, r3, #2
26 ; CHECK-NEXT: movs r6, #1
27 ; CHECK-NEXT: adr r4, .LCPI0_0
28 ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
29 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1
30 ; CHECK-NEXT: add.w r10, r2, r3, lsl #2
31 ; CHECK-NEXT: add.w r8, r1, r3, lsl #2
32 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2
33 ; CHECK-NEXT: vldrw.u32 q0, [r4]
34 ; CHECK-NEXT: vmvn.i32 q1, #0x80000000
35 ; CHECK-NEXT: .LBB0_4: @ %vector.body
36 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
37 ; CHECK-NEXT: ldrd r4, r3, [r0], #8
38 ; CHECK-NEXT: movs r5, #0
39 ; CHECK-NEXT: ldrd r7, r6, [r1], #8
40 ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
41 ; CHECK-NEXT: smull r4, r11, r7, r4
42 ; CHECK-NEXT: asrl r4, r11, #31
43 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648
44 ; CHECK-NEXT: mov.w r9, #-1
45 ; CHECK-NEXT: sbcs.w r3, r9, r11
46 ; CHECK-NEXT: csetm r3, lt
47 ; CHECK-NEXT: bfi r5, r3, #0, #8
48 ; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload
49 ; CHECK-NEXT: smull r6, r3, r6, r3
50 ; CHECK-NEXT: asrl r6, r3, #31
51 ; CHECK-NEXT: rsbs.w r7, r6, #-2147483648
52 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r6
53 ; CHECK-NEXT: sbcs.w r7, r9, r3
54 ; CHECK-NEXT: vmov q2[3], q2[1], r11, r3
55 ; CHECK-NEXT: csetm r7, lt
56 ; CHECK-NEXT: mvn r6, #-2147483648
57 ; CHECK-NEXT: bfi r5, r7, #8, #8
58 ; CHECK-NEXT: vmsr p0, r5
59 ; CHECK-NEXT: vpsel q2, q2, q0
60 ; CHECK-NEXT: vmov r3, r4, d4
61 ; CHECK-NEXT: subs r3, r3, r6
62 ; CHECK-NEXT: sbcs r3, r4, #0
63 ; CHECK-NEXT: mov.w r4, #0
64 ; CHECK-NEXT: csetm r3, lt
65 ; CHECK-NEXT: bfi r4, r3, #0, #8
66 ; CHECK-NEXT: vmov r3, r5, d5
67 ; CHECK-NEXT: subs r3, r3, r6
68 ; CHECK-NEXT: sbcs r3, r5, #0
69 ; CHECK-NEXT: csetm r3, lt
70 ; CHECK-NEXT: bfi r4, r3, #8, #8
71 ; CHECK-NEXT: vmsr p0, r4
72 ; CHECK-NEXT: vpsel q2, q2, q1
73 ; CHECK-NEXT: vmov r3, s10
74 ; CHECK-NEXT: vmov r4, s8
75 ; CHECK-NEXT: strd r4, r3, [r2], #8
76 ; CHECK-NEXT: le lr, .LBB0_4
77 ; CHECK-NEXT: @ %bb.5: @ %middle.block
78 ; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload
79 ; CHECK-NEXT: cmp r7, r3
80 ; CHECK-NEXT: beq .LBB0_8
81 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader
82 ; CHECK-NEXT: sub.w lr, r3, r7
83 ; CHECK-NEXT: mov.w r0, #-1
84 ; CHECK-NEXT: mov.w r1, #-2147483648
85 ; CHECK-NEXT: mvn r2, #-2147483648
86 ; CHECK-NEXT: .LBB0_7: @ %for.body
87 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
88 ; CHECK-NEXT: ldr r3, [r12], #4
89 ; CHECK-NEXT: ldr r4, [r8], #4
90 ; CHECK-NEXT: smull r4, r3, r4, r3
91 ; CHECK-NEXT: asrl r4, r3, #31
92 ; CHECK-NEXT: subs r5, r1, r4
93 ; CHECK-NEXT: sbcs.w r5, r0, r3
94 ; CHECK-NEXT: cset r5, lt
95 ; CHECK-NEXT: cmp r5, #0
96 ; CHECK-NEXT: csel r4, r4, r1, ne
97 ; CHECK-NEXT: csel r3, r3, r0, ne
98 ; CHECK-NEXT: subs r5, r4, r2
99 ; CHECK-NEXT: sbcs r3, r3, #0
100 ; CHECK-NEXT: csel r3, r4, r2, lt
101 ; CHECK-NEXT: str r3, [r10], #4
102 ; CHECK-NEXT: le lr, .LBB0_7
103 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup
104 ; CHECK-NEXT: add sp, #12
105 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
106 ; CHECK-NEXT: .p2align 4
107 ; CHECK-NEXT: @ %bb.9:
108 ; CHECK-NEXT: .LCPI0_0:
109 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
110 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
111 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
112 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
113 entry:
; N == 0 returns immediately, N == 1 goes straight to the scalar loop,
; anything else enters the two-wide vector loop.
114 switch i32 %N, label %vector.ph [
115 i32 0, label %for.cond.cleanup
116 i32 1, label %for.body.preheader
117 ]
119 vector.ph: ; preds = %entry
; %n.vec is %N rounded down to a multiple of 2; the %ind.end* pointers are
; where the scalar remainder loop resumes.
120 %n.vec = and i32 %N, -2
121 %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
122 %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
123 %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
124 br label %vector.body
126 vector.body: ; preds = %vector.body, %vector.ph
127 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
128 %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
129 %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
130 %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
131 %wide.load = load <2 x i32>, ptr %next.gep, align 4
132 %0 = sext <2 x i32> %wide.load to <2 x i64>
133 %wide.load20 = load <2 x i32>, ptr %next.gep18, align 4
134 %1 = sext <2 x i32> %wide.load20 to <2 x i64>
135 %2 = mul nsw <2 x i64> %1, %0
136 %3 = ashr <2 x i64> %2, <i64 31, i64 31>
; Signed clamp: max with -2147483648, then min with 2147483647.
137 %4 = icmp sgt <2 x i64> %3, <i64 -2147483648, i64 -2147483648>
138 %5 = select <2 x i1> %4, <2 x i64> %3, <2 x i64> <i64 -2147483648, i64 -2147483648>
139 %6 = icmp slt <2 x i64> %5, <i64 2147483647, i64 2147483647>
140 %7 = select <2 x i1> %6, <2 x i64> %5, <2 x i64> <i64 2147483647, i64 2147483647>
141 %8 = trunc <2 x i64> %7 to <2 x i32>
142 store <2 x i32> %8, ptr %next.gep19, align 4
143 %index.next = add i32 %index, 2
144 %9 = icmp eq i32 %index.next, %n.vec
145 br i1 %9, label %middle.block, label %vector.body
147 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector loop already covered all %N elements.
148 %cmp.n = icmp eq i32 %n.vec, %N
149 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
151 for.body.preheader: ; preds = %entry, %middle.block
152 %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
153 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
154 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
155 %pDst.addr.09.ph = phi ptr [ %pDst, %entry ], [ %ind.end17, %middle.block ]
156 br label %for.body
158 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
159 ret void
161 for.body: ; preds = %for.body.preheader, %for.body
162 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
163 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
164 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
165 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
166 %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
167 %10 = load i32, ptr %pSrcA.addr.011, align 4
168 %conv = sext i32 %10 to i64
169 %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
170 %11 = load i32, ptr %pSrcB.addr.010, align 4
171 %conv2 = sext i32 %11 to i64
172 %mul = mul nsw i64 %conv2, %conv
173 %shr = ashr i64 %mul, 31
; Scalar form of the same signed clamp used in vector.body.
174 %12 = icmp sgt i64 %shr, -2147483648
175 %.val.i = select i1 %12, i64 %shr, i64 -2147483648
176 %13 = icmp slt i64 %.val.i, 2147483647
177 %retval.0.i = select i1 %13, i64 %.val.i, i64 2147483647
178 %conv3 = trunc i64 %retval.0.i to i32
179 %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
180 store i32 %conv3, ptr %pDst.addr.09, align 4
181 %inc = add nuw i32 %i.012, 1
182 %exitcond = icmp eq i32 %inc, %N
183 br i1 %exitcond, label %for.cond.cleanup, label %for.body
184 }
; ssatmul_4_q31: same signed saturating multiply as the two-wide variant
; (sext to i64, multiply, ashr by 31, clamp to [-2147483648, 2147483647],
; trunc to i32), but the vector loop processes four elements per iteration
; (<4 x i32> loads widened to <4 x i64>).
; Inputs with %N < 4 skip the vector loop and use only the scalar for.body
; loop, which also handles the remainder after the vector loop.
186 define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
187 ; CHECK-LABEL: ssatmul_4_q31:
188 ; CHECK: @ %bb.0: @ %entry
189 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
190 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
191 ; CHECK-NEXT: .pad #4
192 ; CHECK-NEXT: sub sp, #4
193 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
194 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
195 ; CHECK-NEXT: .pad #16
196 ; CHECK-NEXT: sub sp, #16
197 ; CHECK-NEXT: cmp r3, #0
198 ; CHECK-NEXT: beq.w .LBB1_8
199 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
200 ; CHECK-NEXT: mov r5, r1
201 ; CHECK-NEXT: movs r1, #0
202 ; CHECK-NEXT: cmp r3, #3
203 ; CHECK-NEXT: bhi .LBB1_3
204 ; CHECK-NEXT: @ %bb.2:
205 ; CHECK-NEXT: mov r12, r0
206 ; CHECK-NEXT: mov r9, r5
207 ; CHECK-NEXT: mov r11, r2
208 ; CHECK-NEXT: b .LBB1_6
209 ; CHECK-NEXT: .LBB1_3: @ %vector.ph
210 ; CHECK-NEXT: bic r1, r3, #3
211 ; CHECK-NEXT: adr r4, .LCPI1_0
212 ; CHECK-NEXT: subs r7, r1, #4
213 ; CHECK-NEXT: movs r6, #1
214 ; CHECK-NEXT: vldrw.u32 q0, [r4]
215 ; CHECK-NEXT: adr r4, .LCPI1_1
216 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
217 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
218 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
219 ; CHECK-NEXT: add.w r11, r2, r1, lsl #2
220 ; CHECK-NEXT: add.w r9, r5, r1, lsl #2
221 ; CHECK-NEXT: add.w r12, r0, r1, lsl #2
222 ; CHECK-NEXT: vldrw.u32 q1, [r4]
223 ; CHECK-NEXT: .LBB1_4: @ %vector.body
224 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
225 ; CHECK-NEXT: vldrw.u32 q3, [r5], #16
226 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16
227 ; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
228 ; CHECK-NEXT: mov.w r2, #-1
229 ; CHECK-NEXT: vmov.f32 s16, s10
230 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
231 ; CHECK-NEXT: vmov.f32 s20, s14
232 ; CHECK-NEXT: vmov.f32 s18, s11
233 ; CHECK-NEXT: vmov.f32 s22, s15
234 ; CHECK-NEXT: mov.w r8, #0
235 ; CHECK-NEXT: vmullb.s32 q6, q5, q4
236 ; CHECK-NEXT: vmov.f32 s14, s13
237 ; CHECK-NEXT: vmov r4, r7, d12
238 ; CHECK-NEXT: asrl r4, r7, #31
239 ; CHECK-NEXT: vmov.f32 s10, s9
240 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
241 ; CHECK-NEXT: sbcs.w r5, r2, r7
242 ; CHECK-NEXT: csetm r5, lt
243 ; CHECK-NEXT: bfi r8, r5, #0, #8
244 ; CHECK-NEXT: vmov r10, r5, d13
245 ; CHECK-NEXT: asrl r10, r5, #31
246 ; CHECK-NEXT: vmov r6, s14
247 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648
248 ; CHECK-NEXT: vmov q4[2], q4[0], r4, r10
249 ; CHECK-NEXT: sbcs.w r3, r2, r5
250 ; CHECK-NEXT: vmov q4[3], q4[1], r7, r5
251 ; CHECK-NEXT: csetm r3, lt
252 ; CHECK-NEXT: bfi r8, r3, #8, #8
253 ; CHECK-NEXT: vmsr p0, r8
254 ; CHECK-NEXT: mvn r8, #-2147483648
255 ; CHECK-NEXT: vpsel q4, q4, q0
256 ; CHECK-NEXT: vmov r3, r4, d8
257 ; CHECK-NEXT: subs.w r3, r3, r8
258 ; CHECK-NEXT: sbcs r3, r4, #0
259 ; CHECK-NEXT: mov.w r4, #0
260 ; CHECK-NEXT: csetm r3, lt
261 ; CHECK-NEXT: bfi r4, r3, #0, #8
262 ; CHECK-NEXT: vmov r3, r5, d9
263 ; CHECK-NEXT: subs.w r3, r3, r8
264 ; CHECK-NEXT: sbcs r3, r5, #0
265 ; CHECK-NEXT: mov.w r5, #0
266 ; CHECK-NEXT: csetm r3, lt
267 ; CHECK-NEXT: bfi r4, r3, #8, #8
268 ; CHECK-NEXT: vmov r3, s8
269 ; CHECK-NEXT: vmsr p0, r4
270 ; CHECK-NEXT: vmov r4, s12
271 ; CHECK-NEXT: vpsel q4, q4, q1
272 ; CHECK-NEXT: smull r4, r7, r4, r3
273 ; CHECK-NEXT: asrl r4, r7, #31
274 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
275 ; CHECK-NEXT: sbcs.w r3, r2, r7
276 ; CHECK-NEXT: csetm r3, lt
277 ; CHECK-NEXT: bfi r5, r3, #0, #8
278 ; CHECK-NEXT: vmov r3, s10
279 ; CHECK-NEXT: smull r6, r3, r6, r3
280 ; CHECK-NEXT: asrl r6, r3, #31
281 ; CHECK-NEXT: rsbs.w r1, r6, #-2147483648
282 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r6
283 ; CHECK-NEXT: sbcs.w r1, r2, r3
284 ; CHECK-NEXT: vmov q2[3], q2[1], r7, r3
285 ; CHECK-NEXT: csetm r1, lt
286 ; CHECK-NEXT: bfi r5, r1, #8, #8
287 ; CHECK-NEXT: vmsr p0, r5
288 ; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload
289 ; CHECK-NEXT: vpsel q2, q2, q0
290 ; CHECK-NEXT: vmov r1, r3, d4
291 ; CHECK-NEXT: subs.w r1, r1, r8
292 ; CHECK-NEXT: sbcs r1, r3, #0
293 ; CHECK-NEXT: mov.w r3, #0
294 ; CHECK-NEXT: csetm r1, lt
295 ; CHECK-NEXT: bfi r3, r1, #0, #8
296 ; CHECK-NEXT: vmov r1, r4, d5
297 ; CHECK-NEXT: subs.w r1, r1, r8
298 ; CHECK-NEXT: sbcs r1, r4, #0
299 ; CHECK-NEXT: csetm r1, lt
300 ; CHECK-NEXT: bfi r3, r1, #8, #8
301 ; CHECK-NEXT: vmsr p0, r3
302 ; CHECK-NEXT: vpsel q2, q2, q1
303 ; CHECK-NEXT: vmov.f32 s9, s10
304 ; CHECK-NEXT: vmov.f32 s10, s16
305 ; CHECK-NEXT: vmov.f32 s11, s18
306 ; CHECK-NEXT: vstrb.8 q2, [r2], #16
307 ; CHECK-NEXT: le lr, .LBB1_4
308 ; CHECK-NEXT: @ %bb.5: @ %middle.block
309 ; CHECK-NEXT: ldrd r1, r3, [sp] @ 8-byte Folded Reload
310 ; CHECK-NEXT: cmp r1, r3
311 ; CHECK-NEXT: beq .LBB1_8
312 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21
313 ; CHECK-NEXT: sub.w lr, r3, r1
314 ; CHECK-NEXT: mov.w r0, #-1
315 ; CHECK-NEXT: mov.w r3, #-2147483648
316 ; CHECK-NEXT: mvn r2, #-2147483648
317 ; CHECK-NEXT: .LBB1_7: @ %for.body
318 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
319 ; CHECK-NEXT: ldr r1, [r12], #4
320 ; CHECK-NEXT: ldr r4, [r9], #4
321 ; CHECK-NEXT: smull r4, r1, r4, r1
322 ; CHECK-NEXT: asrl r4, r1, #31
323 ; CHECK-NEXT: subs r5, r3, r4
324 ; CHECK-NEXT: sbcs.w r5, r0, r1
325 ; CHECK-NEXT: cset r5, lt
326 ; CHECK-NEXT: cmp r5, #0
327 ; CHECK-NEXT: csel r4, r4, r3, ne
328 ; CHECK-NEXT: csel r1, r1, r0, ne
329 ; CHECK-NEXT: subs r5, r4, r2
330 ; CHECK-NEXT: sbcs r1, r1, #0
331 ; CHECK-NEXT: csel r1, r4, r2, lt
332 ; CHECK-NEXT: str r1, [r11], #4
333 ; CHECK-NEXT: le lr, .LBB1_7
334 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
335 ; CHECK-NEXT: add sp, #16
336 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
337 ; CHECK-NEXT: add sp, #4
338 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
339 ; CHECK-NEXT: .p2align 4
340 ; CHECK-NEXT: @ %bb.9:
341 ; CHECK-NEXT: .LCPI1_0:
342 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
343 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
344 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
345 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
346 ; CHECK-NEXT: .LCPI1_1:
347 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
348 ; CHECK-NEXT: .long 0 @ 0x0
349 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
350 ; CHECK-NEXT: .long 0 @ 0x0
351 entry:
352 %cmp8 = icmp eq i32 %N, 0
353 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
355 for.body.preheader: ; preds = %entry
; Fewer than four elements: bypass the vector loop entirely.
356 %min.iters.check = icmp ult i32 %N, 4
357 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
359 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
360 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
361 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
362 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
363 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
364 br label %for.body
366 vector.ph: ; preds = %for.body.preheader
; %n.vec is %N rounded down to a multiple of 4; the %ind.end* pointers are
; where the scalar remainder loop resumes.
367 %n.vec = and i32 %N, -4
368 %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
369 %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
370 %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
371 br label %vector.body
373 vector.body: ; preds = %vector.body, %vector.ph
374 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
375 %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
376 %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
377 %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
378 %wide.load = load <4 x i32>, ptr %next.gep, align 4
379 %0 = sext <4 x i32> %wide.load to <4 x i64>
380 %wide.load20 = load <4 x i32>, ptr %next.gep18, align 4
381 %1 = sext <4 x i32> %wide.load20 to <4 x i64>
382 %2 = mul nsw <4 x i64> %1, %0
383 %3 = ashr <4 x i64> %2, <i64 31, i64 31, i64 31, i64 31>
; Signed clamp: max with -2147483648, then min with 2147483647.
384 %4 = icmp sgt <4 x i64> %3, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
385 %5 = select <4 x i1> %4, <4 x i64> %3, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
386 %6 = icmp slt <4 x i64> %5, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
387 %7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
388 %8 = trunc <4 x i64> %7 to <4 x i32>
389 store <4 x i32> %8, ptr %next.gep19, align 4
390 %index.next = add i32 %index, 4
391 %9 = icmp eq i32 %index.next, %n.vec
392 br i1 %9, label %middle.block, label %vector.body
394 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector loop already covered all %N elements.
395 %cmp.n = icmp eq i32 %n.vec, %N
396 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
398 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
399 ret void
401 for.body: ; preds = %for.body.preheader21, %for.body
402 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
403 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
404 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
405 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
406 %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
407 %10 = load i32, ptr %pSrcA.addr.011, align 4
408 %conv = sext i32 %10 to i64
409 %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
410 %11 = load i32, ptr %pSrcB.addr.010, align 4
411 %conv2 = sext i32 %11 to i64
412 %mul = mul nsw i64 %conv2, %conv
413 %shr = ashr i64 %mul, 31
; Scalar form of the same signed clamp used in vector.body.
414 %12 = icmp sgt i64 %shr, -2147483648
415 %.val.i = select i1 %12, i64 %shr, i64 -2147483648
416 %13 = icmp slt i64 %.val.i, 2147483647
417 %retval.0.i = select i1 %13, i64 %.val.i, i64 2147483647
418 %conv3 = trunc i64 %retval.0.i to i32
419 %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
420 store i32 %conv3, ptr %pDst.addr.09, align 4
421 %inc = add nuw i32 %i.012, 1
422 %exitcond = icmp eq i32 %inc, %N
423 br i1 %exitcond, label %for.cond.cleanup, label %for.body
424 }
; ssatmul_4t_q31: four-wide signed saturating multiply with a per-lane mask
; instead of a scalar remainder loop.
; The trip count is %N rounded up to a multiple of 4. Each iteration builds
; the lane indices (%index | <0,1,2,3>) and enables only lanes whose index is
; unsigned-less-or-equal to %N - 1; that mask guards both masked loads and
; the masked store.
; The arithmetic is sext to i64, multiply, ashr by 31, clamp to
; [-2147483648, 2147483647], trunc to i32.
426 define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
427 ; CHECK-LABEL: ssatmul_4t_q31:
428 ; CHECK: @ %bb.0: @ %entry
429 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
430 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
431 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
432 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
433 ; CHECK-NEXT: .pad #24
434 ; CHECK-NEXT: sub sp, #24
435 ; CHECK-NEXT: cmp r3, #0
436 ; CHECK-NEXT: beq.w .LBB2_3
437 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
438 ; CHECK-NEXT: adds r6, r3, #3
439 ; CHECK-NEXT: movs r5, #1
440 ; CHECK-NEXT: bic r6, r6, #3
441 ; CHECK-NEXT: adr r4, .LCPI2_1
442 ; CHECK-NEXT: subs r6, #4
443 ; CHECK-NEXT: vldrw.u32 q2, [r4]
444 ; CHECK-NEXT: mov.w r9, #0
445 ; CHECK-NEXT: mov.w r12, #-1
446 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
447 ; CHECK-NEXT: adr r5, .LCPI2_0
448 ; CHECK-NEXT: vldrw.u32 q0, [r5]
449 ; CHECK-NEXT: adr r5, .LCPI2_2
450 ; CHECK-NEXT: subs r6, r3, #1
451 ; CHECK-NEXT: vldrw.u32 q3, [r5]
452 ; CHECK-NEXT: vdup.32 q1, r6
453 ; CHECK-NEXT: mvn r8, #-2147483648
454 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
455 ; CHECK-NEXT: .LBB2_2: @ %vector.body
456 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
457 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
458 ; CHECK-NEXT: vdup.32 q4, r9
459 ; CHECK-NEXT: movs r4, #0
460 ; CHECK-NEXT: add.w r9, r9, #4
461 ; CHECK-NEXT: vorr q4, q4, q0
462 ; CHECK-NEXT: vcmp.u32 cs, q1, q4
463 ; CHECK-NEXT: vstr p0, [sp, #20] @ 4-byte Spill
464 ; CHECK-NEXT: vpstt
465 ; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
466 ; CHECK-NEXT: vldrwt.u32 q5, [r1], #16
467 ; CHECK-NEXT: vmov.f32 s24, s18
468 ; CHECK-NEXT: vmov.f32 s26, s19
469 ; CHECK-NEXT: vmov.f32 s28, s22
470 ; CHECK-NEXT: vmov.f32 s30, s23
471 ; CHECK-NEXT: vmullb.s32 q0, q7, q6
472 ; CHECK-NEXT: vmov.f32 s18, s21
473 ; CHECK-NEXT: vmov r10, r5, d0
474 ; CHECK-NEXT: asrl r10, r5, #31
475 ; CHECK-NEXT: rsbs.w r7, r10, #-2147483648
476 ; CHECK-NEXT: sbcs.w r7, r12, r5
477 ; CHECK-NEXT: csetm r7, lt
478 ; CHECK-NEXT: bfi r4, r7, #0, #8
479 ; CHECK-NEXT: vmov r6, r7, d1
480 ; CHECK-NEXT: asrl r6, r7, #31
481 ; CHECK-NEXT: rsbs.w r3, r6, #-2147483648
482 ; CHECK-NEXT: vmov q0[2], q0[0], r10, r6
483 ; CHECK-NEXT: sbcs.w r3, r12, r7
484 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r7
485 ; CHECK-NEXT: csetm r3, lt
486 ; CHECK-NEXT: vmov r7, s18
487 ; CHECK-NEXT: bfi r4, r3, #8, #8
488 ; CHECK-NEXT: vmsr p0, r4
489 ; CHECK-NEXT: vpsel q0, q0, q2
490 ; CHECK-NEXT: vmov r3, r4, d0
491 ; CHECK-NEXT: subs.w r3, r3, r8
492 ; CHECK-NEXT: sbcs r3, r4, #0
493 ; CHECK-NEXT: mov.w r4, #0
494 ; CHECK-NEXT: csetm r3, lt
495 ; CHECK-NEXT: bfi r4, r3, #0, #8
496 ; CHECK-NEXT: vmov r3, r5, d1
497 ; CHECK-NEXT: subs.w r3, r3, r8
498 ; CHECK-NEXT: sbcs r3, r5, #0
499 ; CHECK-NEXT: csetm r3, lt
500 ; CHECK-NEXT: bfi r4, r3, #8, #8
501 ; CHECK-NEXT: vmov r3, s16
502 ; CHECK-NEXT: vmsr p0, r4
503 ; CHECK-NEXT: vmov r4, s20
504 ; CHECK-NEXT: vpsel q6, q0, q3
505 ; CHECK-NEXT: vmov.f32 s2, s17
506 ; CHECK-NEXT: smull r10, r5, r4, r3
507 ; CHECK-NEXT: movs r4, #0
508 ; CHECK-NEXT: asrl r10, r5, #31
509 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648
510 ; CHECK-NEXT: sbcs.w r3, r12, r5
511 ; CHECK-NEXT: csetm r3, lt
512 ; CHECK-NEXT: bfi r4, r3, #0, #8
513 ; CHECK-NEXT: vmov r3, s2
514 ; CHECK-NEXT: smull r6, r3, r7, r3
515 ; CHECK-NEXT: asrl r6, r3, #31
516 ; CHECK-NEXT: rsbs.w r7, r6, #-2147483648
517 ; CHECK-NEXT: vmov q0[2], q0[0], r10, r6
518 ; CHECK-NEXT: sbcs.w r7, r12, r3
519 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r3
520 ; CHECK-NEXT: csetm r7, lt
521 ; CHECK-NEXT: bfi r4, r7, #8, #8
522 ; CHECK-NEXT: vmsr p0, r4
523 ; CHECK-NEXT: vpsel q0, q0, q2
524 ; CHECK-NEXT: vmov r3, r4, d0
525 ; CHECK-NEXT: subs.w r3, r3, r8
526 ; CHECK-NEXT: sbcs r3, r4, #0
527 ; CHECK-NEXT: mov.w r4, #0
528 ; CHECK-NEXT: csetm r3, lt
529 ; CHECK-NEXT: bfi r4, r3, #0, #8
530 ; CHECK-NEXT: vmov r3, r5, d1
531 ; CHECK-NEXT: subs.w r3, r3, r8
532 ; CHECK-NEXT: sbcs r3, r5, #0
533 ; CHECK-NEXT: csetm r3, lt
534 ; CHECK-NEXT: bfi r4, r3, #8, #8
535 ; CHECK-NEXT: vmsr p0, r4
536 ; CHECK-NEXT: vpsel q0, q0, q3
537 ; CHECK-NEXT: vldr p0, [sp, #20] @ 4-byte Reload
538 ; CHECK-NEXT: vmov.f32 s1, s2
539 ; CHECK-NEXT: vmov.f32 s2, s24
540 ; CHECK-NEXT: vmov.f32 s3, s26
541 ; CHECK-NEXT: vpst
542 ; CHECK-NEXT: vstrwt.32 q0, [r2], #16
543 ; CHECK-NEXT: le lr, .LBB2_2
544 ; CHECK-NEXT: .LBB2_3: @ %for.cond.cleanup
545 ; CHECK-NEXT: add sp, #24
546 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
547 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
548 ; CHECK-NEXT: .p2align 4
549 ; CHECK-NEXT: @ %bb.4:
550 ; CHECK-NEXT: .LCPI2_0:
551 ; CHECK-NEXT: .long 0 @ 0x0
552 ; CHECK-NEXT: .long 1 @ 0x1
553 ; CHECK-NEXT: .long 2 @ 0x2
554 ; CHECK-NEXT: .long 3 @ 0x3
555 ; CHECK-NEXT: .LCPI2_1:
556 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
557 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
558 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
559 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
560 ; CHECK-NEXT: .LCPI2_2:
561 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
562 ; CHECK-NEXT: .long 0 @ 0x0
563 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
564 ; CHECK-NEXT: .long 0 @ 0x0
565 entry:
566 %cmp8 = icmp eq i32 %N, 0
567 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
569 vector.ph: ; preds = %entry
; %n.vec is %N rounded up to a multiple of 4; the splat of %N - 1 is the
; upper bound the lane indices are compared against.
570 %n.rnd.up = add i32 %N, 3
571 %n.vec = and i32 %n.rnd.up, -4
572 %trip.count.minus.1 = add i32 %N, -1
573 %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
574 %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
575 br label %vector.body
577 vector.body: ; preds = %vector.body, %vector.ph
578 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
579 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
580 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
581 %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
582 %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
583 %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
584 %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
; Lane mask: active while the lane index is <= %N - 1 (unsigned).
585 %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
586 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %next.gep, i32 4, <4 x i1> %0, <4 x i32> undef)
587 %1 = sext <4 x i32> %wide.masked.load to <4 x i64>
588 %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %next.gep18, i32 4, <4 x i1> %0, <4 x i32> undef)
589 %2 = sext <4 x i32> %wide.masked.load22 to <4 x i64>
590 %3 = mul nsw <4 x i64> %2, %1
591 %4 = ashr <4 x i64> %3, <i64 31, i64 31, i64 31, i64 31>
; Signed clamp: max with -2147483648, then min with 2147483647.
592 %5 = icmp sgt <4 x i64> %4, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
593 %6 = select <4 x i1> %5, <4 x i64> %4, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
594 %7 = icmp slt <4 x i64> %6, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
595 %8 = select <4 x i1> %7, <4 x i64> %6, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
596 %9 = trunc <4 x i64> %8 to <4 x i32>
597 call void @llvm.masked.store.v4i32.p0(<4 x i32> %9, ptr %next.gep19, i32 4, <4 x i1> %0)
598 %index.next = add i32 %index, 4
599 %10 = icmp eq i32 %index.next, %n.vec
600 br i1 %10, label %for.cond.cleanup, label %vector.body
602 for.cond.cleanup: ; preds = %vector.body, %entry
603 ret void
604 }
; usatmul_2_q31: unsigned saturating multiply over %N i32 elements.
; Each pair of inputs is zero-extended to i64, multiplied, logically shifted
; right by 31, clamped to at most 4294967295 (unsigned min) and truncated
; back to i32 before being stored to %pDst.
; The vector loop processes two elements per iteration (<2 x i32>); the
; scalar for.body loop handles the remainder and the %N == 1 case.
606 define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
607 ; CHECK-LABEL: usatmul_2_q31:
608 ; CHECK: @ %bb.0: @ %entry
609 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
610 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
611 ; CHECK-NEXT: .pad #4
612 ; CHECK-NEXT: sub sp, #4
613 ; CHECK-NEXT: cmp r3, #0
614 ; CHECK-NEXT: beq .LBB3_8
615 ; CHECK-NEXT: @ %bb.1: @ %entry
616 ; CHECK-NEXT: mov r8, r2
617 ; CHECK-NEXT: cmp r3, #1
618 ; CHECK-NEXT: bne .LBB3_3
619 ; CHECK-NEXT: @ %bb.2:
620 ; CHECK-NEXT: movs r7, #0
621 ; CHECK-NEXT: mov r12, r0
622 ; CHECK-NEXT: mov r11, r1
623 ; CHECK-NEXT: mov r2, r8
624 ; CHECK-NEXT: b .LBB3_6
625 ; CHECK-NEXT: .LBB3_3: @ %vector.ph
626 ; CHECK-NEXT: bic r5, r3, #1
627 ; CHECK-NEXT: movs r6, #1
628 ; CHECK-NEXT: subs r7, r5, #2
629 ; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
630 ; CHECK-NEXT: add.w r2, r8, r5, lsl #2
631 ; CHECK-NEXT: add.w r11, r1, r5, lsl #2
632 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1
633 ; CHECK-NEXT: add.w r12, r0, r5, lsl #2
634 ; CHECK-NEXT: vmov.i8 q0, #0xff
635 ; CHECK-NEXT: .LBB3_4: @ %vector.body
636 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
637 ; CHECK-NEXT: ldrd r4, r9, [r0], #8
638 ; CHECK-NEXT: ldrd r5, r10, [r1], #8
639 ; CHECK-NEXT: umull r4, r5, r5, r4
640 ; CHECK-NEXT: lsrl r4, r5, #31
641 ; CHECK-NEXT: subs.w r6, r4, #-1
642 ; CHECK-NEXT: sbcs r5, r5, #0
643 ; CHECK-NEXT: mov.w r6, #0
644 ; CHECK-NEXT: csetm r5, lo
645 ; CHECK-NEXT: bfi r6, r5, #0, #8
646 ; CHECK-NEXT: umull r10, r5, r10, r9
647 ; CHECK-NEXT: lsrl r10, r5, #31
648 ; CHECK-NEXT: subs.w r7, r10, #-1
649 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r10
650 ; CHECK-NEXT: sbcs r5, r5, #0
651 ; CHECK-NEXT: csetm r5, lo
652 ; CHECK-NEXT: bfi r6, r5, #8, #8
653 ; CHECK-NEXT: vmsr p0, r6
654 ; CHECK-NEXT: vpsel q1, q1, q0
655 ; CHECK-NEXT: vmov r4, s6
656 ; CHECK-NEXT: vmov r5, s4
657 ; CHECK-NEXT: strd r5, r4, [r8], #8
658 ; CHECK-NEXT: le lr, .LBB3_4
659 ; CHECK-NEXT: @ %bb.5: @ %middle.block
660 ; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload
661 ; CHECK-NEXT: cmp r7, r3
662 ; CHECK-NEXT: beq .LBB3_8
663 ; CHECK-NEXT: .LBB3_6: @ %for.body.preheader
664 ; CHECK-NEXT: sub.w lr, r3, r7
665 ; CHECK-NEXT: .LBB3_7: @ %for.body
666 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
667 ; CHECK-NEXT: ldr r0, [r12], #4
668 ; CHECK-NEXT: ldr r1, [r11], #4
669 ; CHECK-NEXT: umull r0, r1, r1, r0
670 ; CHECK-NEXT: lsrl r0, r1, #31
671 ; CHECK-NEXT: subs.w r3, r0, #-1
672 ; CHECK-NEXT: sbcs r1, r1, #0
673 ; CHECK-NEXT: it hs
674 ; CHECK-NEXT: movhs.w r0, #-1
675 ; CHECK-NEXT: str r0, [r2], #4
676 ; CHECK-NEXT: le lr, .LBB3_7
677 ; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup
678 ; CHECK-NEXT: add sp, #4
679 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
680 entry:
; N == 0 returns immediately, N == 1 goes straight to the scalar loop,
; anything else enters the two-wide vector loop.
681 switch i32 %N, label %vector.ph [
682 i32 0, label %for.cond.cleanup
683 i32 1, label %for.body.preheader
684 ]
686 vector.ph: ; preds = %entry
; %n.vec is %N rounded down to a multiple of 2; the %ind.end* pointers are
; where the scalar remainder loop resumes.
687 %n.vec = and i32 %N, -2
688 %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
689 %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
690 %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
691 br label %vector.body
693 vector.body: ; preds = %vector.body, %vector.ph
694 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
695 %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
696 %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
697 %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
698 %wide.load = load <2 x i32>, ptr %next.gep, align 4
699 %0 = zext <2 x i32> %wide.load to <2 x i64>
700 %wide.load20 = load <2 x i32>, ptr %next.gep18, align 4
701 %1 = zext <2 x i32> %wide.load20 to <2 x i64>
702 %2 = mul nuw <2 x i64> %1, %0
703 %3 = lshr <2 x i64> %2, <i64 31, i64 31>
; Unsigned clamp: min with 4294967295.
704 %4 = icmp ult <2 x i64> %3, <i64 4294967295, i64 4294967295>
705 %5 = select <2 x i1> %4, <2 x i64> %3, <2 x i64> <i64 4294967295, i64 4294967295>
706 %6 = trunc <2 x i64> %5 to <2 x i32>
707 store <2 x i32> %6, ptr %next.gep19, align 4
708 %index.next = add i32 %index, 2
709 %7 = icmp eq i32 %index.next, %n.vec
710 br i1 %7, label %middle.block, label %vector.body
712 middle.block: ; preds = %vector.body
; Skip the scalar loop when the vector loop already covered all %N elements.
713 %cmp.n = icmp eq i32 %n.vec, %N
714 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
716 for.body.preheader: ; preds = %entry, %middle.block
717 %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
718 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
719 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
720 %pDst.addr.09.ph = phi ptr [ %pDst, %entry ], [ %ind.end17, %middle.block ]
721 br label %for.body
723 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
724 ret void
726 for.body: ; preds = %for.body.preheader, %for.body
727 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
728 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
729 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
730 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
731 %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
732 %8 = load i32, ptr %pSrcA.addr.011, align 4
733 %conv = zext i32 %8 to i64
734 %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
735 %9 = load i32, ptr %pSrcB.addr.010, align 4
736 %conv2 = zext i32 %9 to i64
737 %mul = mul nuw i64 %conv2, %conv
738 %shr = lshr i64 %mul, 31
; Scalar form of the same unsigned clamp used in vector.body.
739 %10 = icmp ult i64 %shr, 4294967295
740 %retval.0.i = select i1 %10, i64 %shr, i64 4294967295
741 %conv3 = trunc i64 %retval.0.i to i32
742 %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
743 store i32 %conv3, ptr %pDst.addr.09, align 4
744 %inc = add nuw i32 %i.012, 1
745 %exitcond = icmp eq i32 %inc, %N
746 br i1 %exitcond, label %for.cond.cleanup, label %for.body
747 }
; usatmul_4_q31: unsigned saturating Q31 multiply of N i32 elements from pSrcA
; and pSrcB into pDst, vectorized by 4 with a scalar remainder loop.
;   Per element: zext both operands to i64, multiply, lshr by 31, clamp to
;   4294967295 (icmp ult + select, i.e. an unsigned min), trunc to i32, store.
;   - N == 0 branches directly to for.cond.cleanup.
;   - vector.body handles N & -4 elements, four per iteration (only when N >= 4).
;   - for.body handles the remaining elements one at a time.
; The expected Thumb/MVE output keeps the 64-bit clamp explicit: vmullb.u32 plus
; lsrl/subs/sbcs compares feeding vpsel in the vector loop, and umull plus a
; conditional movhs in the scalar loop.
; NOTE(review): as shown here there is no entry label, for.body.preheader21 and
; for.cond.cleanup have no terminator, the function has no closing brace, and
; the conditional movhs.w has no preceding IT line. Confirm against the complete
; file; if anything is really missing, restore the IR and regenerate the
; assertions with utils/update_llc_test_checks.py.
749 define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
750 ; CHECK-LABEL: usatmul_4_q31:
751 ; CHECK: @ %bb.0: @ %entry
752 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
753 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
754 ; CHECK-NEXT: .pad #4
755 ; CHECK-NEXT: sub sp, #4
756 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
757 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
758 ; CHECK-NEXT: cmp r3, #0
759 ; CHECK-NEXT: beq.w .LBB4_8
760 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
761 ; CHECK-NEXT: mov.w r8, #0
762 ; CHECK-NEXT: cmp r3, #3
763 ; CHECK-NEXT: bhi .LBB4_3
764 ; CHECK-NEXT: @ %bb.2:
765 ; CHECK-NEXT: mov r12, r0
766 ; CHECK-NEXT: mov r9, r1
767 ; CHECK-NEXT: mov r11, r2
768 ; CHECK-NEXT: b .LBB4_6
769 ; CHECK-NEXT: .LBB4_3: @ %vector.ph
770 ; CHECK-NEXT: bic r8, r3, #3
771 ; CHECK-NEXT: movs r6, #1
772 ; CHECK-NEXT: sub.w r7, r8, #4
773 ; CHECK-NEXT: vmov.i64 q0, #0xffffffff
774 ; CHECK-NEXT: add.w r11, r2, r8, lsl #2
775 ; CHECK-NEXT: add.w r9, r1, r8, lsl #2
776 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
777 ; CHECK-NEXT: add.w r12, r0, r8, lsl #2
778 ; CHECK-NEXT: .LBB4_4: @ %vector.body
779 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
780 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
781 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16
782 ; CHECK-NEXT: vmov.f32 s12, s6
783 ; CHECK-NEXT: vmov.f32 s16, s10
784 ; CHECK-NEXT: vmov.f32 s14, s7
785 ; CHECK-NEXT: vmov.f32 s18, s11
786 ; CHECK-NEXT: vmullb.u32 q5, q4, q3
787 ; CHECK-NEXT: vmov.f32 s6, s5
788 ; CHECK-NEXT: vmov r10, r5, d10
789 ; CHECK-NEXT: lsrl r10, r5, #31
790 ; CHECK-NEXT: vmov.f32 s10, s9
791 ; CHECK-NEXT: subs.w r6, r10, #-1
792 ; CHECK-NEXT: sbcs r5, r5, #0
793 ; CHECK-NEXT: mov.w r6, #0
794 ; CHECK-NEXT: csetm r5, lo
795 ; CHECK-NEXT: bfi r6, r5, #0, #8
796 ; CHECK-NEXT: vmov r4, r5, d11
797 ; CHECK-NEXT: lsrl r4, r5, #31
798 ; CHECK-NEXT: subs.w r7, r4, #-1
799 ; CHECK-NEXT: vmov q3[2], q3[0], r10, r4
800 ; CHECK-NEXT: sbcs r5, r5, #0
801 ; CHECK-NEXT: csetm r5, lo
802 ; CHECK-NEXT: bfi r6, r5, #8, #8
803 ; CHECK-NEXT: vmsr p0, r6
804 ; CHECK-NEXT: vpsel q3, q3, q0
805 ; CHECK-NEXT: vmullb.u32 q4, q2, q1
806 ; CHECK-NEXT: vmov r10, r5, d8
807 ; CHECK-NEXT: lsrl r10, r5, #31
808 ; CHECK-NEXT: subs.w r6, r10, #-1
809 ; CHECK-NEXT: sbcs r5, r5, #0
810 ; CHECK-NEXT: mov.w r6, #0
811 ; CHECK-NEXT: csetm r5, lo
812 ; CHECK-NEXT: bfi r6, r5, #0, #8
813 ; CHECK-NEXT: vmov r4, r5, d9
814 ; CHECK-NEXT: lsrl r4, r5, #31
815 ; CHECK-NEXT: subs.w r7, r4, #-1
816 ; CHECK-NEXT: vmov q1[2], q1[0], r10, r4
817 ; CHECK-NEXT: sbcs r5, r5, #0
818 ; CHECK-NEXT: csetm r5, lo
819 ; CHECK-NEXT: bfi r6, r5, #8, #8
820 ; CHECK-NEXT: vmsr p0, r6
821 ; CHECK-NEXT: vpsel q1, q1, q0
822 ; CHECK-NEXT: vmov.f32 s5, s6
823 ; CHECK-NEXT: vmov.f32 s6, s12
824 ; CHECK-NEXT: vmov.f32 s7, s14
825 ; CHECK-NEXT: vstrb.8 q1, [r2], #16
826 ; CHECK-NEXT: le lr, .LBB4_4
827 ; CHECK-NEXT: @ %bb.5: @ %middle.block
828 ; CHECK-NEXT: cmp r8, r3
829 ; CHECK-NEXT: beq .LBB4_8
830 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21
831 ; CHECK-NEXT: sub.w lr, r3, r8
832 ; CHECK-NEXT: .LBB4_7: @ %for.body
833 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
834 ; CHECK-NEXT: ldr r0, [r12], #4
835 ; CHECK-NEXT: ldr r1, [r9], #4
836 ; CHECK-NEXT: umull r0, r1, r1, r0
837 ; CHECK-NEXT: lsrl r0, r1, #31
838 ; CHECK-NEXT: subs.w r2, r0, #-1
839 ; CHECK-NEXT: sbcs r1, r1, #0
841 ; CHECK-NEXT: movhs.w r0, #-1
842 ; CHECK-NEXT: str r0, [r11], #4
843 ; CHECK-NEXT: le lr, .LBB4_7
844 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
845 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
846 ; CHECK-NEXT: add sp, #4
847 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
849 %cmp8 = icmp eq i32 %N, 0
850 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
852 for.body.preheader: ; preds = %entry
853 %min.iters.check = icmp ult i32 %N, 4
854 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
856 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
857 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
858 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
859 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
860 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
863 vector.ph: ; preds = %for.body.preheader
864 %n.vec = and i32 %N, -4
865 %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
866 %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
867 %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
868 br label %vector.body
870 vector.body: ; preds = %vector.body, %vector.ph
871 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
872 %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
873 %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
874 %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
875 %wide.load = load <4 x i32>, ptr %next.gep, align 4
876 %0 = zext <4 x i32> %wide.load to <4 x i64>
877 %wide.load20 = load <4 x i32>, ptr %next.gep18, align 4
878 %1 = zext <4 x i32> %wide.load20 to <4 x i64>
879 %2 = mul nuw <4 x i64> %1, %0
880 %3 = lshr <4 x i64> %2, <i64 31, i64 31, i64 31, i64 31>
881 %4 = icmp ult <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
882 %5 = select <4 x i1> %4, <4 x i64> %3, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
883 %6 = trunc <4 x i64> %5 to <4 x i32>
884 store <4 x i32> %6, ptr %next.gep19, align 4
885 %index.next = add i32 %index, 4
886 %7 = icmp eq i32 %index.next, %n.vec
887 br i1 %7, label %middle.block, label %vector.body
889 middle.block: ; preds = %vector.body
890 %cmp.n = icmp eq i32 %n.vec, %N
891 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
893 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
896 for.body: ; preds = %for.body.preheader21, %for.body
897 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
898 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
899 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
900 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
901 %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
902 %8 = load i32, ptr %pSrcA.addr.011, align 4
903 %conv = zext i32 %8 to i64
904 %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
905 %9 = load i32, ptr %pSrcB.addr.010, align 4
906 %conv2 = zext i32 %9 to i64
907 %mul = mul nuw i64 %conv2, %conv
908 %shr = lshr i64 %mul, 31
909 %10 = icmp ult i64 %shr, 4294967295
910 %retval.0.i = select i1 %10, i64 %shr, i64 4294967295
911 %conv3 = trunc i64 %retval.0.i to i32
912 %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
913 store i32 %conv3, ptr %pDst.addr.09, align 4
914 %inc = add nuw i32 %i.012, 1
915 %exitcond = icmp eq i32 %inc, %N
916 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_4_q15: signed saturating Q15 multiply of N i16 elements from pSrcA
; and pSrcB into pDst, vectorized by 4 with a scalar remainder loop.
;   Per element: sext both operands to i32, mul nsw, ashr by 15, clamp to
;   [-32768, 32767] with two icmp/select pairs, trunc to i16, store.
;   - N == 0 branches directly to for.cond.cleanup.
;   - vector.body handles N & -4 elements, four per iteration (only when N >= 4).
;   - for.body handles the remaining elements one at a time.
; The expected Thumb/MVE output folds shift + clamp + truncate into a single
; saturating narrowing shift: vldrh.s32 / vmul.i32 / vqshrnb.s32 #15 / vstrh.32
; in the vector loop, and muls / ssat #16 (asr #15) in the scalar loop.
; NOTE(review): as shown here there is no entry label, for.body.preheader21 and
; for.cond.cleanup have no terminator, the function has no closing brace, and
; the conditional popeq has no preceding IT line. Confirm against the complete
; file; if anything is really missing, restore the IR and regenerate the
; assertions with utils/update_llc_test_checks.py.
922 define arm_aapcs_vfpcc void @ssatmul_4_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
923 ; CHECK-LABEL: ssatmul_4_q15:
924 ; CHECK: @ %bb.0: @ %entry
925 ; CHECK-NEXT: .save {r4, r5, r6, lr}
926 ; CHECK-NEXT: push {r4, r5, r6, lr}
927 ; CHECK-NEXT: cbz r3, .LBB5_8
928 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
929 ; CHECK-NEXT: cmp r3, #3
930 ; CHECK-NEXT: bhi .LBB5_3
931 ; CHECK-NEXT: @ %bb.2:
932 ; CHECK-NEXT: movs r5, #0
933 ; CHECK-NEXT: mov r12, r0
934 ; CHECK-NEXT: mov r6, r1
935 ; CHECK-NEXT: mov r4, r2
936 ; CHECK-NEXT: b .LBB5_6
937 ; CHECK-NEXT: .LBB5_3: @ %vector.ph
938 ; CHECK-NEXT: bic r5, r3, #3
939 ; CHECK-NEXT: movs r4, #1
940 ; CHECK-NEXT: subs r6, r5, #4
941 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
942 ; CHECK-NEXT: add.w lr, r4, r6, lsr #2
943 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
944 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
945 ; CHECK-NEXT: .LBB5_4: @ %vector.body
946 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
947 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8
948 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8
949 ; CHECK-NEXT: vmul.i32 q0, q1, q0
950 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
951 ; CHECK-NEXT: vstrh.32 q0, [r2], #8
952 ; CHECK-NEXT: le lr, .LBB5_4
953 ; CHECK-NEXT: @ %bb.5: @ %middle.block
954 ; CHECK-NEXT: cmp r5, r3
956 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
957 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader21
958 ; CHECK-NEXT: sub.w lr, r3, r5
959 ; CHECK-NEXT: .LBB5_7: @ %for.body
960 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
961 ; CHECK-NEXT: ldrsh r0, [r12], #2
962 ; CHECK-NEXT: ldrsh r1, [r6], #2
963 ; CHECK-NEXT: muls r0, r1, r0
964 ; CHECK-NEXT: ssat r0, #16, r0, asr #15
965 ; CHECK-NEXT: strh r0, [r4], #2
966 ; CHECK-NEXT: le lr, .LBB5_7
967 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
968 ; CHECK-NEXT: pop {r4, r5, r6, pc}
970 %cmp8 = icmp eq i32 %N, 0
971 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
973 for.body.preheader: ; preds = %entry
974 %min.iters.check = icmp ult i32 %N, 4
975 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
977 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
978 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
979 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
980 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
981 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
984 vector.ph: ; preds = %for.body.preheader
985 %n.vec = and i32 %N, -4
986 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
987 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
988 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
989 br label %vector.body
991 vector.body: ; preds = %vector.body, %vector.ph
992 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
993 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
994 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
995 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
996 %wide.load = load <4 x i16>, ptr %next.gep, align 2
997 %0 = sext <4 x i16> %wide.load to <4 x i32>
998 %wide.load20 = load <4 x i16>, ptr %next.gep18, align 2
999 %1 = sext <4 x i16> %wide.load20 to <4 x i32>
1000 %2 = mul nsw <4 x i32> %1, %0
1001 %3 = ashr <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
1002 %4 = icmp sgt <4 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1003 %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1004 %6 = icmp slt <4 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767>
1005 %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1006 %8 = trunc <4 x i32> %7 to <4 x i16>
1007 store <4 x i16> %8, ptr %next.gep19, align 2
1008 %index.next = add i32 %index, 4
1009 %9 = icmp eq i32 %index.next, %n.vec
1010 br i1 %9, label %middle.block, label %vector.body
1012 middle.block: ; preds = %vector.body
1013 %cmp.n = icmp eq i32 %n.vec, %N
1014 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1016 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1019 for.body: ; preds = %for.body.preheader21, %for.body
1020 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1021 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1022 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1023 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1024 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1025 %10 = load i16, ptr %pSrcA.addr.011, align 2
1026 %conv = sext i16 %10 to i32
1027 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1028 %11 = load i16, ptr %pSrcB.addr.010, align 2
1029 %conv2 = sext i16 %11 to i32
1030 %mul = mul nsw i32 %conv2, %conv
1031 %shr = ashr i32 %mul, 15
1032 %12 = icmp sgt i32 %shr, -32768
1033 %.val.i = select i1 %12, i32 %shr, i32 -32768
1034 %13 = icmp slt i32 %.val.i, 32767
1035 %retval.0.i = select i1 %13, i32 %.val.i, i32 32767
1036 %conv3 = trunc i32 %retval.0.i to i16
1037 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1038 store i16 %conv3, ptr %pDst.addr.09, align 2
1039 %inc = add nuw i32 %i.012, 1
1040 %exitcond = icmp eq i32 %inc, %N
1041 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_8_q15: signed saturating Q15 multiply of N i16 elements from pSrcA
; and pSrcB into pDst, vectorized by 8 with a scalar remainder loop.
;   Per element: sext both operands to i32, mul nsw, ashr by 15, clamp to
;   [-32768, 32767] with two icmp/select pairs, trunc to i16, store.
;   - N == 0 branches directly to for.cond.cleanup.
;   - vector.body handles N & -8 elements, eight per iteration (only when N >= 8).
;   - for.body handles the remaining elements one at a time.
; The expected Thumb/MVE output multiplies the top and bottom halves separately
; (vmullt.s16 / vmullb.s16) and narrows each with a saturating shift
; (vqshrnb.s32 / vqshrnt.s32 #15) into one result register; the scalar loop
; uses muls / ssat #16 (asr #15).
; NOTE(review): as shown here there is no entry label, for.body.preheader21 and
; for.cond.cleanup have no terminator, the function has no closing brace, and
; the conditional popeq has no preceding IT line. Confirm against the complete
; file; if anything is really missing, restore the IR and regenerate the
; assertions with utils/update_llc_test_checks.py.
1044 define arm_aapcs_vfpcc void @ssatmul_8_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1045 ; CHECK-LABEL: ssatmul_8_q15:
1046 ; CHECK: @ %bb.0: @ %entry
1047 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1048 ; CHECK-NEXT: push {r4, r5, r6, lr}
1049 ; CHECK-NEXT: cbz r3, .LBB6_8
1050 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1051 ; CHECK-NEXT: cmp r3, #7
1052 ; CHECK-NEXT: bhi .LBB6_3
1053 ; CHECK-NEXT: @ %bb.2:
1054 ; CHECK-NEXT: movs r5, #0
1055 ; CHECK-NEXT: mov r12, r0
1056 ; CHECK-NEXT: mov r6, r1
1057 ; CHECK-NEXT: mov r4, r2
1058 ; CHECK-NEXT: b .LBB6_6
1059 ; CHECK-NEXT: .LBB6_3: @ %vector.ph
1060 ; CHECK-NEXT: bic r5, r3, #7
1061 ; CHECK-NEXT: movs r4, #1
1062 ; CHECK-NEXT: sub.w r6, r5, #8
1063 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1064 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
1065 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1066 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1067 ; CHECK-NEXT: .LBB6_4: @ %vector.body
1068 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1069 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1070 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1071 ; CHECK-NEXT: vmullt.s16 q2, q1, q0
1072 ; CHECK-NEXT: vmullb.s16 q0, q1, q0
1073 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
1074 ; CHECK-NEXT: vqshrnt.s32 q0, q2, #15
1075 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
1076 ; CHECK-NEXT: le lr, .LBB6_4
1077 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1078 ; CHECK-NEXT: cmp r5, r3
1080 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1081 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader21
1082 ; CHECK-NEXT: sub.w lr, r3, r5
1083 ; CHECK-NEXT: .LBB6_7: @ %for.body
1084 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1085 ; CHECK-NEXT: ldrsh r0, [r12], #2
1086 ; CHECK-NEXT: ldrsh r1, [r6], #2
1087 ; CHECK-NEXT: muls r0, r1, r0
1088 ; CHECK-NEXT: ssat r0, #16, r0, asr #15
1089 ; CHECK-NEXT: strh r0, [r4], #2
1090 ; CHECK-NEXT: le lr, .LBB6_7
1091 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
1092 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1094 %cmp8 = icmp eq i32 %N, 0
1095 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1097 for.body.preheader: ; preds = %entry
1098 %min.iters.check = icmp ult i32 %N, 8
1099 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1101 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
1102 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1103 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1104 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1105 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1108 vector.ph: ; preds = %for.body.preheader
1109 %n.vec = and i32 %N, -8
1110 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1111 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1112 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1113 br label %vector.body
1115 vector.body: ; preds = %vector.body, %vector.ph
1116 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1117 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1118 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1119 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1120 %wide.load = load <8 x i16>, ptr %next.gep, align 2
1121 %0 = sext <8 x i16> %wide.load to <8 x i32>
1122 %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2
1123 %1 = sext <8 x i16> %wide.load20 to <8 x i32>
1124 %2 = mul nsw <8 x i32> %1, %0
1125 %3 = ashr <8 x i32> %2, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1126 %4 = icmp sgt <8 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1127 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1128 %6 = icmp slt <8 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1129 %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1130 %8 = trunc <8 x i32> %7 to <8 x i16>
1131 store <8 x i16> %8, ptr %next.gep19, align 2
1132 %index.next = add i32 %index, 8
1133 %9 = icmp eq i32 %index.next, %n.vec
1134 br i1 %9, label %middle.block, label %vector.body
1136 middle.block: ; preds = %vector.body
1137 %cmp.n = icmp eq i32 %n.vec, %N
1138 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1140 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1143 for.body: ; preds = %for.body.preheader21, %for.body
1144 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1145 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1146 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1147 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1148 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1149 %10 = load i16, ptr %pSrcA.addr.011, align 2
1150 %conv = sext i16 %10 to i32
1151 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1152 %11 = load i16, ptr %pSrcB.addr.010, align 2
1153 %conv2 = sext i16 %11 to i32
1154 %mul = mul nsw i32 %conv2, %conv
1155 %shr = ashr i32 %mul, 15
1156 %12 = icmp sgt i32 %shr, -32768
1157 %.val.i = select i1 %12, i32 %shr, i32 -32768
1158 %13 = icmp slt i32 %.val.i, 32767
1159 %retval.0.i = select i1 %13, i32 %.val.i, i32 32767
1160 %conv3 = trunc i32 %retval.0.i to i16
1161 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1162 store i16 %conv3, ptr %pDst.addr.09, align 2
1163 %inc = add nuw i32 %i.012, 1
1164 %exitcond = icmp eq i32 %inc, %N
1165 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_8i_q15: interleaved form of the signed saturating Q15 multiply,
; vectorized by 8 with a scalar remainder loop.
;   vector.body loads <8 x i16> from each source and splits it with
;   shufflevector into even lanes <0,2,4,6> and odd lanes <1,3,5,7>. Each half
;   is sign-extended to <4 x i32>, multiplied (plain mul, no nsw), shifted
;   right by 15 and clamped to [-32768, 32767] independently. The two halves
;   are re-interleaved with mask <0,4,1,5,2,6,3,7>, truncated to <8 x i16> and
;   stored to pDst.
;   - N == 0 branches directly to for.cond.cleanup.
;   - vector.body handles N & -8 elements (only when N >= 8); for.body handles
;     the remaining elements one at a time using the scalar mul nsw / ashr /
;     clamp sequence.
; The expected Thumb/MVE output needs no explicit lane shuffles: the even/odd
; split maps onto vmullb.s16 / vmullt.s16 and the re-interleave onto
; vqshrnb.s32 / vqshrnt.s32 #15.
; NOTE(review): as shown here there is no entry label, for.body.preheader21 and
; for.cond.cleanup have no terminator, the function has no closing brace, and
; the conditional popeq has no preceding IT line. Confirm against the complete
; file; if anything is really missing, restore the IR and regenerate the
; assertions with utils/update_llc_test_checks.py.
1168 define arm_aapcs_vfpcc void @ssatmul_8i_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1169 ; CHECK-LABEL: ssatmul_8i_q15:
1170 ; CHECK: @ %bb.0: @ %entry
1171 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1172 ; CHECK-NEXT: push {r4, r5, r6, lr}
1173 ; CHECK-NEXT: cbz r3, .LBB7_8
1174 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1175 ; CHECK-NEXT: cmp r3, #7
1176 ; CHECK-NEXT: bhi .LBB7_3
1177 ; CHECK-NEXT: @ %bb.2:
1178 ; CHECK-NEXT: movs r5, #0
1179 ; CHECK-NEXT: mov r12, r0
1180 ; CHECK-NEXT: mov r6, r1
1181 ; CHECK-NEXT: mov r4, r2
1182 ; CHECK-NEXT: b .LBB7_6
1183 ; CHECK-NEXT: .LBB7_3: @ %vector.ph
1184 ; CHECK-NEXT: bic r5, r3, #7
1185 ; CHECK-NEXT: movs r4, #1
1186 ; CHECK-NEXT: sub.w r6, r5, #8
1187 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1188 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
1189 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1190 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1191 ; CHECK-NEXT: .LBB7_4: @ %vector.body
1192 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1193 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1194 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1195 ; CHECK-NEXT: vmullt.s16 q2, q1, q0
1196 ; CHECK-NEXT: vmullb.s16 q0, q1, q0
1197 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
1198 ; CHECK-NEXT: vqshrnt.s32 q0, q2, #15
1199 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
1200 ; CHECK-NEXT: le lr, .LBB7_4
1201 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1202 ; CHECK-NEXT: cmp r5, r3
1204 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1205 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21
1206 ; CHECK-NEXT: sub.w lr, r3, r5
1207 ; CHECK-NEXT: .LBB7_7: @ %for.body
1208 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1209 ; CHECK-NEXT: ldrsh r0, [r12], #2
1210 ; CHECK-NEXT: ldrsh r1, [r6], #2
1211 ; CHECK-NEXT: muls r0, r1, r0
1212 ; CHECK-NEXT: ssat r0, #16, r0, asr #15
1213 ; CHECK-NEXT: strh r0, [r4], #2
1214 ; CHECK-NEXT: le lr, .LBB7_7
1215 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
1216 ; CHECK-NEXT: pop {r4, r5, r6, pc}
1218 %cmp8 = icmp eq i32 %N, 0
1219 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1221 for.body.preheader: ; preds = %entry
1222 %min.iters.check = icmp ult i32 %N, 8
1223 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1225 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
1226 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1227 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1228 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1229 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1232 vector.ph: ; preds = %for.body.preheader
1233 %n.vec = and i32 %N, -8
1234 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1235 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1236 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1237 br label %vector.body
1239 vector.body: ; preds = %vector.body, %vector.ph
1240 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1241 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1242 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1243 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1244 %wide.load = load <8 x i16>, ptr %next.gep, align 2
1245 %0 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1246 %1 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1247 %2 = sext <4 x i16> %0 to <4 x i32>
1248 %3 = sext <4 x i16> %1 to <4 x i32>
1249 %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2
1250 %4 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1251 %5 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1252 %6 = sext <4 x i16> %4 to <4 x i32>
1253 %7 = sext <4 x i16> %5 to <4 x i32>
1254 %8 = mul <4 x i32> %6, %2
1255 %9 = mul <4 x i32> %7, %3
1256 %10 = ashr <4 x i32> %8, <i32 15, i32 15, i32 15, i32 15>
1257 %11 = ashr <4 x i32> %9, <i32 15, i32 15, i32 15, i32 15>
1258 %12 = icmp sgt <4 x i32> %10, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1259 %13 = icmp sgt <4 x i32> %11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1260 %14 = select <4 x i1> %12, <4 x i32> %10, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1261 %15 = select <4 x i1> %13, <4 x i32> %11, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1262 %16 = icmp slt <4 x i32> %14, <i32 32767, i32 32767, i32 32767, i32 32767>
1263 %17 = icmp slt <4 x i32> %15, <i32 32767, i32 32767, i32 32767, i32 32767>
1264 %18 = select <4 x i1> %16, <4 x i32> %14, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1265 %19 = select <4 x i1> %17, <4 x i32> %15, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1266 %20 = shufflevector <4 x i32> %18, <4 x i32> %19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1267 %21 = trunc <8 x i32> %20 to <8 x i16>
1268 store <8 x i16> %21, ptr %next.gep19, align 2
1269 %index.next = add i32 %index, 8
1270 %22 = icmp eq i32 %index.next, %n.vec
1271 br i1 %22, label %middle.block, label %vector.body
1273 middle.block: ; preds = %vector.body
1274 %cmp.n = icmp eq i32 %n.vec, %N
1275 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1277 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1280 for.body: ; preds = %for.body, %for.body.preheader21
1281 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1282 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1283 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1284 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1285 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1286 %23 = load i16, ptr %pSrcA.addr.011, align 2
1287 %conv = sext i16 %23 to i32
1288 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1289 %24 = load i16, ptr %pSrcB.addr.010, align 2
1290 %conv2 = sext i16 %24 to i32
1291 %mul = mul nsw i32 %conv2, %conv
1292 %shr = ashr i32 %mul, 15
1293 %25 = icmp sgt i32 %shr, -32768
1294 %.val.i = select i1 %25, i32 %shr, i32 -32768
1295 %26 = icmp slt i32 %.val.i, 32767
1296 %retval.0.i = select i1 %26, i32 %.val.i, i32 32767
1297 %conv3 = trunc i32 %retval.0.i to i16
1298 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1299 store i16 %conv3, ptr %pDst.addr.09, align 2
1300 %inc = add nuw i32 %i.012, 1
1301 %exitcond = icmp eq i32 %inc, %N
1302 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_s4t_q15: masked (tail-folded) form of the signed saturating Q15
; multiply, vectorized by 4 with no scalar remainder loop.
;   - N == 0 branches directly to for.cond.cleanup.
;   - The loop runs until index reaches n.vec = (N + 3) & -4.
;   - The lane mask is (splat(index) | <0,1,2,3>) ule splat(N - 1); it guards
;     both llvm.masked.load calls and the llvm.masked.store call.
;   Per active lane: sext i16 to i32, mul nsw, ashr by 15, clamp to
;   [-32768, 32767] with two icmp/select pairs, trunc to i16.
; The expected Thumb/MVE output builds the lane index with vdup.32 + vorr
; against the constant pool <0,1,2,3>, predicates the two loads with
; vptt.u32 cs, and uses vmul.i32 / vqshrnb.s32 #15 followed by a predicated
; vstrht.32.
; NOTE(review): as shown here there is no entry label, for.cond.cleanup has no
; terminator, the function has no closing brace, the conditional popeq has no
; preceding IT line, and the predicated vstrht.32 has no preceding VPST line.
; Confirm against the complete file; if anything is really missing, restore the
; IR and regenerate the assertions with utils/update_llc_test_checks.py.
1305 define arm_aapcs_vfpcc void @ssatmul_s4t_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1306 ; CHECK-LABEL: ssatmul_s4t_q15:
1307 ; CHECK: @ %bb.0: @ %entry
1308 ; CHECK-NEXT: .save {r4, lr}
1309 ; CHECK-NEXT: push {r4, lr}
1310 ; CHECK-NEXT: cmp r3, #0
1312 ; CHECK-NEXT: popeq {r4, pc}
1313 ; CHECK-NEXT: .LBB8_1: @ %vector.ph
1314 ; CHECK-NEXT: add.w r12, r3, #3
1315 ; CHECK-NEXT: mov.w lr, #1
1316 ; CHECK-NEXT: bic r12, r12, #3
1317 ; CHECK-NEXT: adr r4, .LCPI8_0
1318 ; CHECK-NEXT: sub.w r12, r12, #4
1319 ; CHECK-NEXT: vldrw.u32 q0, [r4]
1320 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2
1321 ; CHECK-NEXT: sub.w r12, r3, #1
1322 ; CHECK-NEXT: movs r3, #0
1323 ; CHECK-NEXT: vdup.32 q1, r12
1324 ; CHECK-NEXT: .LBB8_2: @ %vector.body
1325 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1326 ; CHECK-NEXT: vdup.32 q2, r3
1327 ; CHECK-NEXT: adds r3, #4
1328 ; CHECK-NEXT: vorr q2, q2, q0
1329 ; CHECK-NEXT: vptt.u32 cs, q1, q2
1330 ; CHECK-NEXT: vldrht.s32 q2, [r0], #8
1331 ; CHECK-NEXT: vldrht.s32 q3, [r1], #8
1332 ; CHECK-NEXT: vmul.i32 q2, q3, q2
1333 ; CHECK-NEXT: vqshrnb.s32 q2, q2, #15
1335 ; CHECK-NEXT: vstrht.32 q2, [r2], #8
1336 ; CHECK-NEXT: le lr, .LBB8_2
1337 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1338 ; CHECK-NEXT: pop {r4, pc}
1339 ; CHECK-NEXT: .p2align 4
1340 ; CHECK-NEXT: @ %bb.4:
1341 ; CHECK-NEXT: .LCPI8_0:
1342 ; CHECK-NEXT: .long 0 @ 0x0
1343 ; CHECK-NEXT: .long 1 @ 0x1
1344 ; CHECK-NEXT: .long 2 @ 0x2
1345 ; CHECK-NEXT: .long 3 @ 0x3
1347 %cmp8 = icmp eq i32 %N, 0
1348 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1350 vector.ph: ; preds = %entry
1351 %n.rnd.up = add i32 %N, 3
1352 %n.vec = and i32 %n.rnd.up, -4
1353 %trip.count.minus.1 = add i32 %N, -1
1354 %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1355 %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
1356 br label %vector.body
1358 vector.body: ; preds = %vector.body, %vector.ph
1359 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1360 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1361 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1362 %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1363 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1364 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1365 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1366 %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
1367 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %next.gep, i32 2, <4 x i1> %0, <4 x i16> undef)
1368 %1 = sext <4 x i16> %wide.masked.load to <4 x i32>
1369 %wide.masked.load22 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %next.gep18, i32 2, <4 x i1> %0, <4 x i16> undef)
1370 %2 = sext <4 x i16> %wide.masked.load22 to <4 x i32>
1371 %3 = mul nsw <4 x i32> %2, %1
1372 %4 = ashr <4 x i32> %3, <i32 15, i32 15, i32 15, i32 15>
1373 %5 = icmp sgt <4 x i32> %4, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1374 %6 = select <4 x i1> %5, <4 x i32> %4, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1375 %7 = icmp slt <4 x i32> %6, <i32 32767, i32 32767, i32 32767, i32 32767>
1376 %8 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1377 %9 = trunc <4 x i32> %8 to <4 x i16>
1378 call void @llvm.masked.store.v4i16.p0(<4 x i16> %9, ptr %next.gep19, i32 2, <4 x i1> %0)
1379 %index.next = add i32 %index, 4
1380 %10 = icmp eq i32 %index.next, %n.vec
1381 br i1 %10, label %for.cond.cleanup, label %vector.body
1383 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_8t_q15: masked (tail-folded) form of the signed saturating Q15
; multiply, vectorized by 8 with no scalar remainder loop.
;   - N == 0 branches directly to for.cond.cleanup.
;   - The loop runs until index reaches n.vec = (N + 7) & -8.
;   - The lane mask is (splat(index) | <0..7>) ule splat(N - 1), computed on
;     <8 x i32>; it guards both llvm.masked.load calls and the
;     llvm.masked.store call.
;   Per active lane: sext i16 to i32, mul nsw, ashr by 15, clamp to
;   [-32768, 32767] with two icmp/select pairs, trunc to i16.
; The expected Thumb/MVE output forms the 8-lane predicate in two 4-lane
; halves: each vcmp.u32 result is turned into a 0xff/0x00 vector with vpsel,
; both are narrowed through a 16-byte stack buffer (vstrh.32 stores, one
; vldrw.u32 reload) and tested with vptt.i16 ne against zero. The arithmetic is
; vmullt.s16 / vmullb.s16 plus vqshrnb.s32 / vqshrnt.s32 #15, followed by a
; predicated vstrht.16. The constant pools hold the lane offsets <4,5,6,7> and
; <0,1,2,3>.
; NOTE(review): as shown here there is no entry label, for.cond.cleanup has no
; terminator, the function has no closing brace, and the predicated vstrht.16
; has no preceding VPST line. Confirm against the complete file; if anything is
; really missing, restore the IR and regenerate the assertions with
; utils/update_llc_test_checks.py.
1387 define arm_aapcs_vfpcc void @ssatmul_8t_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1388 ; CHECK-LABEL: ssatmul_8t_q15:
1389 ; CHECK: @ %bb.0: @ %entry
1390 ; CHECK-NEXT: .save {r4, r5, r7, lr}
1391 ; CHECK-NEXT: push {r4, r5, r7, lr}
1392 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1393 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1394 ; CHECK-NEXT: .pad #16
1395 ; CHECK-NEXT: sub sp, #16
1396 ; CHECK-NEXT: cmp r3, #0
1397 ; CHECK-NEXT: beq .LBB9_3
1398 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1399 ; CHECK-NEXT: adds r4, r3, #7
1400 ; CHECK-NEXT: vmov.i8 q2, #0x0
1401 ; CHECK-NEXT: bic r4, r4, #7
1402 ; CHECK-NEXT: vmov.i8 q3, #0xff
1403 ; CHECK-NEXT: sub.w r12, r4, #8
1404 ; CHECK-NEXT: movs r4, #1
1405 ; CHECK-NEXT: mov r5, sp
1406 ; CHECK-NEXT: add.w lr, r4, r12, lsr #3
1407 ; CHECK-NEXT: adr r4, .LCPI9_0
1408 ; CHECK-NEXT: vldrw.u32 q0, [r4]
1409 ; CHECK-NEXT: adr r4, .LCPI9_1
1410 ; CHECK-NEXT: sub.w r12, r3, #1
1411 ; CHECK-NEXT: vldrw.u32 q4, [r4]
1412 ; CHECK-NEXT: movs r3, #0
1413 ; CHECK-NEXT: vdup.32 q1, r12
1414 ; CHECK-NEXT: .LBB9_2: @ %vector.body
1415 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1416 ; CHECK-NEXT: vdup.32 q5, r3
1417 ; CHECK-NEXT: adds r3, #8
1418 ; CHECK-NEXT: vorr q6, q5, q0
1419 ; CHECK-NEXT: vorr q5, q5, q4
1420 ; CHECK-NEXT: vcmp.u32 cs, q1, q6
1421 ; CHECK-NEXT: vpsel q6, q3, q2
1422 ; CHECK-NEXT: vcmp.u32 cs, q1, q5
1423 ; CHECK-NEXT: vpsel q5, q3, q2
1424 ; CHECK-NEXT: vstrh.32 q6, [r5, #8]
1425 ; CHECK-NEXT: vstrh.32 q5, [r5]
1426 ; CHECK-NEXT: vldrw.u32 q5, [r5]
1427 ; CHECK-NEXT: vptt.i16 ne, q5, zr
1428 ; CHECK-NEXT: vldrht.u16 q5, [r0], #16
1429 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16
1430 ; CHECK-NEXT: vmullt.s16 q7, q6, q5
1431 ; CHECK-NEXT: vmullb.s16 q5, q6, q5
1432 ; CHECK-NEXT: vqshrnb.s32 q5, q5, #15
1433 ; CHECK-NEXT: vqshrnt.s32 q5, q7, #15
1435 ; CHECK-NEXT: vstrht.16 q5, [r2], #16
1436 ; CHECK-NEXT: le lr, .LBB9_2
1437 ; CHECK-NEXT: .LBB9_3: @ %for.cond.cleanup
1438 ; CHECK-NEXT: add sp, #16
1439 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1440 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1441 ; CHECK-NEXT: .p2align 4
1442 ; CHECK-NEXT: @ %bb.4:
1443 ; CHECK-NEXT: .LCPI9_0:
1444 ; CHECK-NEXT: .long 4 @ 0x4
1445 ; CHECK-NEXT: .long 5 @ 0x5
1446 ; CHECK-NEXT: .long 6 @ 0x6
1447 ; CHECK-NEXT: .long 7 @ 0x7
1448 ; CHECK-NEXT: .LCPI9_1:
1449 ; CHECK-NEXT: .long 0 @ 0x0
1450 ; CHECK-NEXT: .long 1 @ 0x1
1451 ; CHECK-NEXT: .long 2 @ 0x2
1452 ; CHECK-NEXT: .long 3 @ 0x3
1454 %cmp8 = icmp eq i32 %N, 0
1455 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1457 vector.ph: ; preds = %entry
1458 %n.rnd.up = add i32 %N, 7
1459 %n.vec = and i32 %n.rnd.up, -8
1460 %trip.count.minus.1 = add i32 %N, -1
1461 %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
1462 %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
1463 br label %vector.body
1465 vector.body: ; preds = %vector.body, %vector.ph
1466 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1467 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
1468 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1469 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1470 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1471 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1472 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
1473 %0 = icmp ule <8 x i32> %induction, %broadcast.splat21
1474 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %0, <8 x i16> undef)
1475 %1 = sext <8 x i16> %wide.masked.load to <8 x i32>
1476 %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep18, i32 2, <8 x i1> %0, <8 x i16> undef)
1477 %2 = sext <8 x i16> %wide.masked.load22 to <8 x i32>
1478 %3 = mul nsw <8 x i32> %2, %1
1479 %4 = ashr <8 x i32> %3, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1480 %5 = icmp sgt <8 x i32> %4, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1481 %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1482 %7 = icmp slt <8 x i32> %6, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1483 %8 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1484 %9 = trunc <8 x i32> %8 to <8 x i16>
1485 call void @llvm.masked.store.v8i16.p0(<8 x i16> %9, ptr %next.gep19, i32 2, <8 x i1> %0)
1486 %index.next = add i32 %index, 8
1487 %10 = icmp eq i32 %index.next, %n.vec
1488 br i1 %10, label %for.cond.cleanup, label %vector.body
1490 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_8ti_q15: signed saturating Q15 multiply, 8 x i16 lanes per iteration,
; using masked loads/stores so no scalar remainder loop is needed.
;   %pSrcA, %pSrcB - read-only i16 inputs; %pDst - noalias i16 output;
;   %N - element count (the loop is skipped when it is 0).
; The even and odd lanes of each loaded vector are extracted with
; shufflevector, sign-extended to i32, multiplied, arithmetically shifted right
; by 15, clamped to [-32768, 32767], re-interleaved and truncated back to i16.
; The expected output below performs the multiply with vmullb/vmullt.s16 and
; the shift+saturate+narrow with vqshrnb/vqshrnt.s32 #15.
1494 define arm_aapcs_vfpcc void @ssatmul_8ti_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1495 ; CHECK-LABEL: ssatmul_8ti_q15:
1496 ; CHECK: @ %bb.0: @ %entry
1497 ; CHECK-NEXT: .save {r4, r5, r7, lr}
1498 ; CHECK-NEXT: push {r4, r5, r7, lr}
1499 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1500 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1501 ; CHECK-NEXT: .pad #16
1502 ; CHECK-NEXT: sub sp, #16
1503 ; CHECK-NEXT: cmp r3, #0
1504 ; CHECK-NEXT: beq .LBB10_3
1505 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1506 ; CHECK-NEXT: adds r4, r3, #7
1507 ; CHECK-NEXT: vmov.i8 q2, #0x0
1508 ; CHECK-NEXT: bic r4, r4, #7
1509 ; CHECK-NEXT: vmov.i8 q3, #0xff
1510 ; CHECK-NEXT: sub.w r12, r4, #8
1511 ; CHECK-NEXT: movs r4, #1
1512 ; CHECK-NEXT: mov r5, sp
1513 ; CHECK-NEXT: add.w lr, r4, r12, lsr #3
1514 ; CHECK-NEXT: adr r4, .LCPI10_0
1515 ; CHECK-NEXT: vldrw.u32 q0, [r4]
1516 ; CHECK-NEXT: adr r4, .LCPI10_1
1517 ; CHECK-NEXT: sub.w r12, r3, #1
1518 ; CHECK-NEXT: vldrw.u32 q4, [r4]
1519 ; CHECK-NEXT: movs r3, #0
1520 ; CHECK-NEXT: vdup.32 q1, r12
1521 ; CHECK-NEXT: .LBB10_2: @ %vector.body
1522 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1523 ; CHECK-NEXT: vdup.32 q5, r3
1524 ; CHECK-NEXT: adds r3, #8
1525 ; CHECK-NEXT: vorr q6, q5, q0
1526 ; CHECK-NEXT: vorr q5, q5, q4
1527 ; CHECK-NEXT: vcmp.u32 cs, q1, q6
1528 ; CHECK-NEXT: vpsel q6, q3, q2
1529 ; CHECK-NEXT: vcmp.u32 cs, q1, q5
1530 ; CHECK-NEXT: vpsel q5, q3, q2
1531 ; CHECK-NEXT: vstrh.32 q6, [r5, #8]
1532 ; CHECK-NEXT: vstrh.32 q5, [r5]
1533 ; CHECK-NEXT: vldrw.u32 q5, [r5]
1534 ; CHECK-NEXT: vptt.i16 ne, q5, zr
1535 ; CHECK-NEXT: vldrht.u16 q5, [r0], #16
1536 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16
1537 ; CHECK-NEXT: vmullt.s16 q7, q6, q5
1538 ; CHECK-NEXT: vmullb.s16 q5, q6, q5
1539 ; CHECK-NEXT: vqshrnb.s32 q5, q5, #15
1540 ; CHECK-NEXT: vqshrnt.s32 q5, q7, #15
1542 ; CHECK-NEXT: vstrht.16 q5, [r2], #16
1543 ; CHECK-NEXT: le lr, .LBB10_2
1544 ; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup
1545 ; CHECK-NEXT: add sp, #16
1546 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1547 ; CHECK-NEXT: pop {r4, r5, r7, pc}
1548 ; CHECK-NEXT: .p2align 4
1549 ; CHECK-NEXT: @ %bb.4:
1550 ; CHECK-NEXT: .LCPI10_0:
1551 ; CHECK-NEXT: .long 4 @ 0x4
1552 ; CHECK-NEXT: .long 5 @ 0x5
1553 ; CHECK-NEXT: .long 6 @ 0x6
1554 ; CHECK-NEXT: .long 7 @ 0x7
1555 ; CHECK-NEXT: .LCPI10_1:
1556 ; CHECK-NEXT: .long 0 @ 0x0
1557 ; CHECK-NEXT: .long 1 @ 0x1
1558 ; CHECK-NEXT: .long 2 @ 0x2
1559 ; CHECK-NEXT: .long 3 @ 0x3
; Nothing to do for N == 0: branch straight to the exit block.
1561 %cmp8 = icmp eq i32 %N, 0
1562 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1564 vector.ph: ; preds = %entry
; %n.vec is N rounded up to a multiple of 8 (the loop bound). N-1 is splatted
; so each lane can compare its element index against the last valid index.
1565 %n.rnd.up = add i32 %N, 7
1566 %n.vec = and i32 %n.rnd.up, -8
1567 %trip.count.minus.1 = add i32 %N, -1
1568 %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
1569 %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
1570 br label %vector.body
1572 vector.body: ; preds = %vector.body, %vector.ph
1573 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Per-lane element index: %index starts at 0 and advances by 8, so OR-ing in
; 0..7 yields index+0 .. index+7.
1574 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
1575 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1576 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1577 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1578 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1579 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
; Lane mask: a lane is active while its element index is <= N-1 (unsigned).
1580 %0 = icmp ule <8 x i32> %induction, %broadcast.splat21
; Load A under the mask, then split into even lanes (%1) and odd lanes (%2)
; and sign-extend each half to i32.
1581 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %0, <8 x i16> undef)
1582 %1 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1583 %2 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1584 %3 = sext <4 x i16> %1 to <4 x i32>
1585 %4 = sext <4 x i16> %2 to <4 x i32>
; Same even/odd split and widening for B.
1586 %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep18, i32 2, <8 x i1> %0, <8 x i16> undef)
1587 %5 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1588 %6 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1589 %7 = sext <4 x i16> %5 to <4 x i32>
1590 %8 = sext <4 x i16> %6 to <4 x i32>
; Widened products, then arithmetic shift right by 15.
1591 %9 = mul <4 x i32> %7, %3
1592 %10 = mul <4 x i32> %8, %4
1593 %11 = ashr <4 x i32> %9, <i32 15, i32 15, i32 15, i32 15>
1594 %12 = ashr <4 x i32> %10, <i32 15, i32 15, i32 15, i32 15>
; Clamp from below at -32768 (sgt + select) ...
1595 %13 = icmp sgt <4 x i32> %11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1596 %14 = icmp sgt <4 x i32> %12, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1597 %15 = select <4 x i1> %13, <4 x i32> %11, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1598 %16 = select <4 x i1> %14, <4 x i32> %12, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
; ... and from above at 32767 (slt + select).
1599 %17 = icmp slt <4 x i32> %15, <i32 32767, i32 32767, i32 32767, i32 32767>
1600 %18 = icmp slt <4 x i32> %16, <i32 32767, i32 32767, i32 32767, i32 32767>
1601 %19 = select <4 x i1> %17, <4 x i32> %15, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1602 %20 = select <4 x i1> %18, <4 x i32> %16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
; Re-interleave the even-lane and odd-lane results into the original lane
; order, narrow to i16 and store under the same mask used for the loads.
1603 %21 = shufflevector <4 x i32> %19, <4 x i32> %20, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1604 %22 = trunc <8 x i32> %21 to <8 x i16>
1605 call void @llvm.masked.store.v8i16.p0(<8 x i16> %22, ptr %next.gep19, i32 2, <8 x i1> %0)
; Advance by 8 elements; exit once the rounded-up bound is reached.
1606 %index.next = add i32 %index, 8
1607 %23 = icmp eq i32 %index.next, %n.vec
1608 br i1 %23, label %for.cond.cleanup, label %vector.body
1610 for.cond.cleanup: ; preds = %vector.body, %entry
; usatmul_4_q15: unsigned saturating Q15 multiply with a 4 x i16 vector loop
; plus a scalar remainder loop.
;   %pSrcA, %pSrcB - read-only i16 inputs; %pDst - noalias i16 output;
;   %N - element count (the function does nothing when it is 0).
; Each element is zero-extended to i32, multiplied, logically shifted right by
; 15, clamped to at most 65535 and truncated back to i16.
; The expected output below uses vmul.i32 + vqshrnb.u32 #15 in the vector loop
; and a compare against 65535 with a conditional lsr in the scalar loop.
1614 define arm_aapcs_vfpcc void @usatmul_4_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1615 ; CHECK-LABEL: usatmul_4_q15:
1616 ; CHECK: @ %bb.0: @ %entry
1617 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1618 ; CHECK-NEXT: push {r4, r5, r6, lr}
1619 ; CHECK-NEXT: cmp r3, #0
1620 ; CHECK-NEXT: beq .LBB11_8
1621 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1622 ; CHECK-NEXT: cmp r3, #3
1623 ; CHECK-NEXT: bhi .LBB11_3
1624 ; CHECK-NEXT: @ %bb.2:
1625 ; CHECK-NEXT: movs r5, #0
1626 ; CHECK-NEXT: mov r12, r0
1627 ; CHECK-NEXT: mov r6, r1
1628 ; CHECK-NEXT: mov r4, r2
1629 ; CHECK-NEXT: b .LBB11_6
1630 ; CHECK-NEXT: .LBB11_3: @ %vector.ph
1631 ; CHECK-NEXT: bic r5, r3, #3
1632 ; CHECK-NEXT: movs r4, #1
1633 ; CHECK-NEXT: subs r6, r5, #4
1634 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1635 ; CHECK-NEXT: add.w lr, r4, r6, lsr #2
1636 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1637 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1638 ; CHECK-NEXT: .LBB11_4: @ %vector.body
1639 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1640 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8
1641 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8
1642 ; CHECK-NEXT: vmul.i32 q0, q1, q0
1643 ; CHECK-NEXT: vqshrnb.u32 q0, q0, #15
1644 ; CHECK-NEXT: vstrh.32 q0, [r2], #8
1645 ; CHECK-NEXT: le lr, .LBB11_4
1646 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1647 ; CHECK-NEXT: cmp r5, r3
1649 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1650 ; CHECK-NEXT: .LBB11_6: @ %for.body.preheader21
1651 ; CHECK-NEXT: sub.w lr, r3, r5
1652 ; CHECK-NEXT: movw r0, #65535
1653 ; CHECK-NEXT: .LBB11_7: @ %for.body
1654 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1655 ; CHECK-NEXT: ldrh r1, [r12], #2
1656 ; CHECK-NEXT: ldrh r2, [r6], #2
1657 ; CHECK-NEXT: muls r1, r2, r1
1658 ; CHECK-NEXT: lsrs r2, r1, #15
1659 ; CHECK-NEXT: cmp r2, r0
1660 ; CHECK-NEXT: movw r2, #65535
1662 ; CHECK-NEXT: lsrlo r2, r1, #15
1663 ; CHECK-NEXT: strh r2, [r4], #2
1664 ; CHECK-NEXT: le lr, .LBB11_7
1665 ; CHECK-NEXT: .LBB11_8: @ %for.cond.cleanup
1666 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do for N == 0.
1668 %cmp8 = icmp eq i32 %N, 0
1669 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1671 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1672 %min.iters.check = icmp ult i32 %N, 4
1673 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1675 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Starting state for the scalar loop: either the original index/pointers (no
; vector iterations ran) or the values just past the vectorised prefix.
1676 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1677 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1678 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1679 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1682 vector.ph: ; preds = %for.body.preheader
; %n.vec is N rounded down to a multiple of 4; %ind.end* point just past the
; part of each array handled by the vector loop.
1683 %n.vec = and i32 %N, -4
1684 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1685 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1686 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1687 br label %vector.body
1689 vector.body: ; preds = %vector.body, %vector.ph
1690 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1691 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1692 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1693 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
; Load 4 x i16 from each input and zero-extend to i32.
1694 %wide.load = load <4 x i16>, ptr %next.gep, align 2
1695 %0 = zext <4 x i16> %wide.load to <4 x i32>
1696 %wide.load20 = load <4 x i16>, ptr %next.gep18, align 2
1697 %1 = zext <4 x i16> %wide.load20 to <4 x i32>
; Product, logical shift right by 15, then unsigned clamp to 65535.
1698 %2 = mul nuw <4 x i32> %1, %0
1699 %3 = lshr <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
1700 %4 = icmp ult <4 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535>
1701 %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
1702 %6 = trunc <4 x i32> %5 to <4 x i16>
1703 store <4 x i16> %6, ptr %next.gep19, align 2
1704 %index.next = add i32 %index, 4
1705 %7 = icmp eq i32 %index.next, %n.vec
1706 br i1 %7, label %middle.block, label %vector.body
1708 middle.block: ; preds = %vector.body
; Done if the vector loop covered every element; otherwise finish in scalar.
1709 %cmp.n = icmp eq i32 %n.vec, %N
1710 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1712 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1715 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar loop: one element per iteration, same arithmetic as the vector body.
1716 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1717 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1718 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1719 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1720 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1721 %8 = load i16, ptr %pSrcA.addr.011, align 2
1722 %conv = zext i16 %8 to i32
1723 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1724 %9 = load i16, ptr %pSrcB.addr.010, align 2
1725 %conv2 = zext i16 %9 to i32
1726 %mul = mul nuw i32 %conv2, %conv
1727 %shr = lshr i32 %mul, 15
; Unsigned clamp of the shifted product to 65535 before narrowing to i16.
1728 %10 = icmp ult i32 %shr, 65535
1729 %retval.0.i = select i1 %10, i32 %shr, i32 65535
1730 %conv3 = trunc i32 %retval.0.i to i16
1731 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1732 store i16 %conv3, ptr %pDst.addr.09, align 2
1733 %inc = add nuw i32 %i.012, 1
1734 %exitcond = icmp eq i32 %inc, %N
1735 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; usatmul_8_q15: unsigned saturating Q15 multiply with an 8 x i16 vector loop
; plus a scalar remainder loop.
;   %pSrcA, %pSrcB - read-only i16 inputs; %pDst - noalias i16 output;
;   %N - element count (the function does nothing when it is 0).
; Each element is zero-extended to i32, multiplied, logically shifted right by
; 15, clamped to at most 65535 and truncated back to i16.
; The expected output below uses vmullb/vmullt.u16 for the widening multiply
; and vqshrnb/vqshrnt.u32 #15 for the shift+saturate+narrow in the vector loop.
1738 define arm_aapcs_vfpcc void @usatmul_8_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1739 ; CHECK-LABEL: usatmul_8_q15:
1740 ; CHECK: @ %bb.0: @ %entry
1741 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1742 ; CHECK-NEXT: push {r4, r5, r6, lr}
1743 ; CHECK-NEXT: cmp r3, #0
1744 ; CHECK-NEXT: beq .LBB12_8
1745 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1746 ; CHECK-NEXT: cmp r3, #7
1747 ; CHECK-NEXT: bhi .LBB12_3
1748 ; CHECK-NEXT: @ %bb.2:
1749 ; CHECK-NEXT: movs r5, #0
1750 ; CHECK-NEXT: mov r12, r0
1751 ; CHECK-NEXT: mov r6, r1
1752 ; CHECK-NEXT: mov r4, r2
1753 ; CHECK-NEXT: b .LBB12_6
1754 ; CHECK-NEXT: .LBB12_3: @ %vector.ph
1755 ; CHECK-NEXT: bic r5, r3, #7
1756 ; CHECK-NEXT: movs r4, #1
1757 ; CHECK-NEXT: sub.w r6, r5, #8
1758 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1759 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
1760 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1761 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1762 ; CHECK-NEXT: .LBB12_4: @ %vector.body
1763 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1764 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1765 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1766 ; CHECK-NEXT: vmullt.u16 q2, q1, q0
1767 ; CHECK-NEXT: vmullb.u16 q0, q1, q0
1768 ; CHECK-NEXT: vqshrnb.u32 q0, q0, #15
1769 ; CHECK-NEXT: vqshrnt.u32 q0, q2, #15
1770 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
1771 ; CHECK-NEXT: le lr, .LBB12_4
1772 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1773 ; CHECK-NEXT: cmp r5, r3
1775 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1776 ; CHECK-NEXT: .LBB12_6: @ %for.body.preheader21
1777 ; CHECK-NEXT: sub.w lr, r3, r5
1778 ; CHECK-NEXT: movw r0, #65535
1779 ; CHECK-NEXT: .LBB12_7: @ %for.body
1780 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1781 ; CHECK-NEXT: ldrh r1, [r12], #2
1782 ; CHECK-NEXT: ldrh r2, [r6], #2
1783 ; CHECK-NEXT: muls r1, r2, r1
1784 ; CHECK-NEXT: lsrs r2, r1, #15
1785 ; CHECK-NEXT: cmp r2, r0
1786 ; CHECK-NEXT: movw r2, #65535
1788 ; CHECK-NEXT: lsrlo r2, r1, #15
1789 ; CHECK-NEXT: strh r2, [r4], #2
1790 ; CHECK-NEXT: le lr, .LBB12_7
1791 ; CHECK-NEXT: .LBB12_8: @ %for.cond.cleanup
1792 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do for N == 0.
1794 %cmp8 = icmp eq i32 %N, 0
1795 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1797 for.body.preheader: ; preds = %entry
; Fewer than 8 elements: bypass the vector loop and run only the scalar loop.
1798 %min.iters.check = icmp ult i32 %N, 8
1799 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1801 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Starting state for the scalar loop: either the original index/pointers (no
; vector iterations ran) or the values just past the vectorised prefix.
1802 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1803 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1804 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1805 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1808 vector.ph: ; preds = %for.body.preheader
; %n.vec is N rounded down to a multiple of 8; %ind.end* point just past the
; part of each array handled by the vector loop.
1809 %n.vec = and i32 %N, -8
1810 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
1811 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
1812 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
1813 br label %vector.body
1815 vector.body: ; preds = %vector.body, %vector.ph
1816 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1817 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
1818 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
1819 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
; Load 8 x i16 from each input and zero-extend to i32.
1820 %wide.load = load <8 x i16>, ptr %next.gep, align 2
1821 %0 = zext <8 x i16> %wide.load to <8 x i32>
1822 %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2
1823 %1 = zext <8 x i16> %wide.load20 to <8 x i32>
; Product, logical shift right by 15, then unsigned clamp to 65535.
1824 %2 = mul nuw <8 x i32> %1, %0
1825 %3 = lshr <8 x i32> %2, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1826 %4 = icmp ult <8 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1827 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1828 %6 = trunc <8 x i32> %5 to <8 x i16>
1829 store <8 x i16> %6, ptr %next.gep19, align 2
1830 %index.next = add i32 %index, 8
1831 %7 = icmp eq i32 %index.next, %n.vec
1832 br i1 %7, label %middle.block, label %vector.body
1834 middle.block: ; preds = %vector.body
; Done if the vector loop covered every element; otherwise finish in scalar.
1835 %cmp.n = icmp eq i32 %n.vec, %N
1836 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1838 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1841 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar loop: one element per iteration, same arithmetic as the vector body.
1842 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1843 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1844 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1845 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1846 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
1847 %8 = load i16, ptr %pSrcA.addr.011, align 2
1848 %conv = zext i16 %8 to i32
1849 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
1850 %9 = load i16, ptr %pSrcB.addr.010, align 2
1851 %conv2 = zext i16 %9 to i32
1852 %mul = mul nuw i32 %conv2, %conv
1853 %shr = lshr i32 %mul, 15
; Unsigned clamp of the shifted product to 65535 before narrowing to i16.
1854 %10 = icmp ult i32 %shr, 65535
1855 %retval.0.i = select i1 %10, i32 %shr, i32 65535
1856 %conv3 = trunc i32 %retval.0.i to i16
1857 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
1858 store i16 %conv3, ptr %pDst.addr.09, align 2
1859 %inc = add nuw i32 %i.012, 1
1860 %exitcond = icmp eq i32 %inc, %N
1861 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_4_q7: signed saturating Q7 multiply with a 4 x i8 vector loop plus a
; scalar remainder loop.
;   %pSrcA, %pSrcB - read-only i8 inputs; %pDst - noalias i8 output;
;   %N - element count (the function does nothing when it is 0).
; Each element is sign-extended to i32, multiplied, arithmetically shifted
; right by 7, clamped to [-128, 127] and truncated back to i8.
; The expected output below keeps the vector clamp as explicit vmax.s32 /
; vmin.s32 against splatted bounds after vmul.i32 + vshr.s32 #7, while the
; scalar loop folds shift and clamp into ssat #8 with asr #7.
1867 define arm_aapcs_vfpcc void @ssatmul_4_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1868 ; CHECK-LABEL: ssatmul_4_q7:
1869 ; CHECK: @ %bb.0: @ %entry
1870 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1871 ; CHECK-NEXT: push {r4, r5, r6, lr}
1872 ; CHECK-NEXT: cmp r3, #0
1873 ; CHECK-NEXT: beq .LBB13_8
1874 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1875 ; CHECK-NEXT: cmp r3, #3
1876 ; CHECK-NEXT: bhi .LBB13_3
1877 ; CHECK-NEXT: @ %bb.2:
1878 ; CHECK-NEXT: movs r5, #0
1879 ; CHECK-NEXT: mov r12, r0
1880 ; CHECK-NEXT: mov r6, r1
1881 ; CHECK-NEXT: mov r4, r2
1882 ; CHECK-NEXT: b .LBB13_6
1883 ; CHECK-NEXT: .LBB13_3: @ %vector.ph
1884 ; CHECK-NEXT: bic r5, r3, #3
1885 ; CHECK-NEXT: movs r4, #1
1886 ; CHECK-NEXT: subs r6, r5, #4
1887 ; CHECK-NEXT: add.w r12, r0, r5
1888 ; CHECK-NEXT: vmvn.i32 q0, #0x7f
1889 ; CHECK-NEXT: vmov.i32 q1, #0x7f
1890 ; CHECK-NEXT: add.w lr, r4, r6, lsr #2
1891 ; CHECK-NEXT: adds r4, r2, r5
1892 ; CHECK-NEXT: adds r6, r1, r5
1893 ; CHECK-NEXT: .LBB13_4: @ %vector.body
1894 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1895 ; CHECK-NEXT: vldrb.s32 q2, [r0], #4
1896 ; CHECK-NEXT: vldrb.s32 q3, [r1], #4
1897 ; CHECK-NEXT: vmul.i32 q2, q3, q2
1898 ; CHECK-NEXT: vshr.s32 q2, q2, #7
1899 ; CHECK-NEXT: vmax.s32 q2, q2, q0
1900 ; CHECK-NEXT: vmin.s32 q2, q2, q1
1901 ; CHECK-NEXT: vstrb.32 q2, [r2], #4
1902 ; CHECK-NEXT: le lr, .LBB13_4
1903 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1904 ; CHECK-NEXT: cmp r5, r3
1906 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1907 ; CHECK-NEXT: .LBB13_6: @ %for.body.preheader21
1908 ; CHECK-NEXT: sub.w lr, r3, r5
1909 ; CHECK-NEXT: .LBB13_7: @ %for.body
1910 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1911 ; CHECK-NEXT: ldrsb r0, [r12], #1
1912 ; CHECK-NEXT: ldrsb r1, [r6], #1
1913 ; CHECK-NEXT: muls r0, r1, r0
1914 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
1915 ; CHECK-NEXT: strb r0, [r4], #1
1916 ; CHECK-NEXT: le lr, .LBB13_7
1917 ; CHECK-NEXT: .LBB13_8: @ %for.cond.cleanup
1918 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do for N == 0.
1920 %cmp8 = icmp eq i32 %N, 0
1921 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1923 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1924 %min.iters.check = icmp ult i32 %N, 4
1925 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1927 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Starting state for the scalar loop: either the original index/pointers (no
; vector iterations ran) or the values just past the vectorised prefix.
1928 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1929 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1930 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1931 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1934 vector.ph: ; preds = %for.body.preheader
; %n.vec is N rounded down to a multiple of 4; %ind.end* point just past the
; part of each array handled by the vector loop.
1935 %n.vec = and i32 %N, -4
1936 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
1937 %ind.end15 = getelementptr i8, ptr %pSrcB, i32 %n.vec
1938 %ind.end17 = getelementptr i8, ptr %pDst, i32 %n.vec
1939 br label %vector.body
1941 vector.body: ; preds = %vector.body, %vector.ph
1942 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1943 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
1944 %next.gep18 = getelementptr i8, ptr %pSrcB, i32 %index
1945 %next.gep19 = getelementptr i8, ptr %pDst, i32 %index
; Load 4 x i8 from each input and sign-extend to i32.
1946 %wide.load = load <4 x i8>, ptr %next.gep, align 1
1947 %0 = sext <4 x i8> %wide.load to <4 x i32>
1948 %wide.load20 = load <4 x i8>, ptr %next.gep18, align 1
1949 %1 = sext <4 x i8> %wide.load20 to <4 x i32>
; Product and arithmetic shift right by 7.
1950 %2 = mul nsw <4 x i32> %1, %0
1951 %3 = ashr <4 x i32> %2, <i32 7, i32 7, i32 7, i32 7>
; Clamp from below at -128, then from above at 127, and narrow to i8.
1952 %4 = icmp sgt <4 x i32> %3, <i32 -128, i32 -128, i32 -128, i32 -128>
1953 %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>
1954 %6 = icmp slt <4 x i32> %5, <i32 127, i32 127, i32 127, i32 127>
1955 %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
1956 %8 = trunc <4 x i32> %7 to <4 x i8>
1957 store <4 x i8> %8, ptr %next.gep19, align 1
1958 %index.next = add i32 %index, 4
1959 %9 = icmp eq i32 %index.next, %n.vec
1960 br i1 %9, label %middle.block, label %vector.body
1962 middle.block: ; preds = %vector.body
; Done if the vector loop covered every element; otherwise finish in scalar.
1963 %cmp.n = icmp eq i32 %n.vec, %N
1964 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1966 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1969 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar loop: one element per iteration, same arithmetic as the vector body.
1970 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1971 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1972 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1973 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1974 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1
1975 %10 = load i8, ptr %pSrcA.addr.011, align 1
1976 %conv = sext i8 %10 to i32
1977 %incdec.ptr1 = getelementptr inbounds i8, ptr %pSrcB.addr.010, i32 1
1978 %11 = load i8, ptr %pSrcB.addr.010, align 1
1979 %conv2 = sext i8 %11 to i32
1980 %mul = mul nsw i32 %conv2, %conv
1981 %shr = ashr i32 %mul, 7
; Signed clamp of the shifted product to [-128, 127] before narrowing to i8.
1982 %12 = icmp sgt i32 %shr, -128
1983 %.val.i = select i1 %12, i32 %shr, i32 -128
1984 %13 = icmp slt i32 %.val.i, 127
1985 %retval.0.i = select i1 %13, i32 %.val.i, i32 127
1986 %conv3 = trunc i32 %retval.0.i to i8
1987 %incdec.ptr4 = getelementptr inbounds i8, ptr %pDst.addr.09, i32 1
1988 store i8 %conv3, ptr %pDst.addr.09, align 1
1989 %inc = add nuw i32 %i.012, 1
1990 %exitcond = icmp eq i32 %inc, %N
1991 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_8_q7: signed saturating Q7 multiply with an 8 x i8 vector loop plus a
; scalar remainder loop.
;   %pSrcA, %pSrcB - read-only i8 inputs; %pDst - noalias i8 output;
;   %N - element count (the function does nothing when it is 0).
; Each element is sign-extended to i16 (not i32), multiplied, arithmetically
; shifted right by 7, clamped to [-128, 127] and truncated back to i8.
; The expected output below uses vmul.i16 followed by vqshrnb.s16 #7 in the
; vector loop, and ssat #8 with asr #7 in the scalar loop.
1994 define arm_aapcs_vfpcc void @ssatmul_8_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
1995 ; CHECK-LABEL: ssatmul_8_q7:
1996 ; CHECK: @ %bb.0: @ %entry
1997 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1998 ; CHECK-NEXT: push {r4, r5, r6, lr}
1999 ; CHECK-NEXT: cbz r3, .LBB14_8
2000 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2001 ; CHECK-NEXT: cmp r3, #7
2002 ; CHECK-NEXT: bhi .LBB14_3
2003 ; CHECK-NEXT: @ %bb.2:
2004 ; CHECK-NEXT: movs r5, #0
2005 ; CHECK-NEXT: mov r12, r0
2006 ; CHECK-NEXT: mov r6, r1
2007 ; CHECK-NEXT: mov r4, r2
2008 ; CHECK-NEXT: b .LBB14_6
2009 ; CHECK-NEXT: .LBB14_3: @ %vector.ph
2010 ; CHECK-NEXT: bic r5, r3, #7
2011 ; CHECK-NEXT: movs r4, #1
2012 ; CHECK-NEXT: sub.w r6, r5, #8
2013 ; CHECK-NEXT: add.w r12, r0, r5
2014 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
2015 ; CHECK-NEXT: adds r4, r2, r5
2016 ; CHECK-NEXT: adds r6, r1, r5
2017 ; CHECK-NEXT: .LBB14_4: @ %vector.body
2018 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2019 ; CHECK-NEXT: vldrb.s16 q0, [r0], #8
2020 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8
2021 ; CHECK-NEXT: vmul.i16 q0, q1, q0
2022 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2023 ; CHECK-NEXT: vstrb.16 q0, [r2], #8
2024 ; CHECK-NEXT: le lr, .LBB14_4
2025 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2026 ; CHECK-NEXT: cmp r5, r3
2028 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2029 ; CHECK-NEXT: .LBB14_6: @ %for.body.preheader23
2030 ; CHECK-NEXT: sub.w lr, r3, r5
2031 ; CHECK-NEXT: .LBB14_7: @ %for.body
2032 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2033 ; CHECK-NEXT: ldrsb r0, [r12], #1
2034 ; CHECK-NEXT: ldrsb r1, [r6], #1
2035 ; CHECK-NEXT: muls r0, r1, r0
2036 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
2037 ; CHECK-NEXT: strb r0, [r4], #1
2038 ; CHECK-NEXT: le lr, .LBB14_7
2039 ; CHECK-NEXT: .LBB14_8: @ %for.cond.cleanup
2040 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do for N == 0.
2042 %cmp10 = icmp eq i32 %N, 0
2043 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2045 for.body.preheader: ; preds = %entry
; Fewer than 8 elements: bypass the vector loop and run only the scalar loop.
2046 %min.iters.check = icmp ult i32 %N, 8
2047 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2049 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
; Starting state for the scalar loop: either the original index/pointers (no
; vector iterations ran) or the values just past the vectorised prefix.
2050 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2051 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2052 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2053 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2056 vector.ph: ; preds = %for.body.preheader
; %n.vec is N rounded down to a multiple of 8; %ind.end* point just past the
; part of each array handled by the vector loop.
2057 %n.vec = and i32 %N, -8
2058 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2059 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2060 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2061 br label %vector.body
2063 vector.body: ; preds = %vector.body, %vector.ph
2064 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2065 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2066 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2067 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
; Load 8 x i8 from each input and sign-extend to i16.
2068 %wide.load = load <8 x i8>, ptr %next.gep, align 1
2069 %0 = sext <8 x i8> %wide.load to <8 x i16>
2070 %wide.load22 = load <8 x i8>, ptr %next.gep20, align 1
2071 %1 = sext <8 x i8> %wide.load22 to <8 x i16>
; Product and arithmetic shift right by 7, both in i16.
2072 %2 = mul nsw <8 x i16> %1, %0
2073 %3 = ashr <8 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; Clamp from below at -128, then from above at 127, and narrow to i8.
2074 %4 = icmp sgt <8 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2075 %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2076 %6 = icmp slt <8 x i16> %5, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2077 %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2078 %8 = trunc <8 x i16> %7 to <8 x i8>
2079 store <8 x i8> %8, ptr %next.gep21, align 1
2080 %index.next = add i32 %index, 8
2081 %9 = icmp eq i32 %index.next, %n.vec
2082 br i1 %9, label %middle.block, label %vector.body
2084 middle.block: ; preds = %vector.body
; Done if the vector loop covered every element; otherwise finish in scalar.
2085 %cmp.n = icmp eq i32 %n.vec, %N
2086 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2088 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
2091 for.body: ; preds = %for.body.preheader23, %for.body
; Scalar loop: one element per iteration, same arithmetic as the vector body.
2092 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2093 %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2094 %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2095 %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2096 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2097 %10 = load i8, ptr %pSrcA.addr.013, align 1
2098 %conv1 = sext i8 %10 to i16
2099 %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2100 %11 = load i8, ptr %pSrcB.addr.012, align 1
2101 %conv3 = sext i8 %11 to i16
2102 %mul = mul nsw i16 %conv3, %conv1
2103 %shr = ashr i16 %mul, 7
; Signed clamp of the shifted product to [-128, 127] before narrowing to i8.
2104 %12 = icmp sgt i16 %shr, -128
2105 %.val.i = select i1 %12, i16 %shr, i16 -128
2106 %13 = icmp slt i16 %.val.i, 127
2107 %retval.0.i = select i1 %13, i16 %.val.i, i16 127
2108 %conv5 = trunc i16 %retval.0.i to i8
2109 %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2110 store i8 %conv5, ptr %pDst.addr.011, align 1
2111 %inc = add nuw i32 %i.014, 1
2112 %exitcond = icmp eq i32 %inc, %N
2113 br i1 %exitcond, label %for.cond.cleanup, label %for.body
2116 define arm_aapcs_vfpcc void @ssatmul_16_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2117 ; CHECK-LABEL: ssatmul_16_q7:
2118 ; CHECK: @ %bb.0: @ %entry
2119 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2120 ; CHECK-NEXT: push {r4, r5, r6, lr}
2121 ; CHECK-NEXT: cbz r3, .LBB15_8
2122 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2123 ; CHECK-NEXT: cmp r3, #15
2124 ; CHECK-NEXT: bhi .LBB15_3
2125 ; CHECK-NEXT: @ %bb.2:
2126 ; CHECK-NEXT: movs r5, #0
2127 ; CHECK-NEXT: mov r12, r0
2128 ; CHECK-NEXT: mov r6, r1
2129 ; CHECK-NEXT: mov r4, r2
2130 ; CHECK-NEXT: b .LBB15_6
2131 ; CHECK-NEXT: .LBB15_3: @ %vector.ph
2132 ; CHECK-NEXT: bic r5, r3, #15
2133 ; CHECK-NEXT: movs r4, #1
2134 ; CHECK-NEXT: sub.w r6, r5, #16
2135 ; CHECK-NEXT: add.w r12, r0, r5
2136 ; CHECK-NEXT: add.w lr, r4, r6, lsr #4
2137 ; CHECK-NEXT: adds r4, r2, r5
2138 ; CHECK-NEXT: adds r6, r1, r5
2139 ; CHECK-NEXT: .LBB15_4: @ %vector.body
2140 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2141 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2142 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2143 ; CHECK-NEXT: vmullt.s8 q2, q1, q0
2144 ; CHECK-NEXT: vmullb.s8 q0, q1, q0
2145 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2146 ; CHECK-NEXT: vqshrnt.s16 q0, q2, #7
2147 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
2148 ; CHECK-NEXT: le lr, .LBB15_4
2149 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2150 ; CHECK-NEXT: cmp r5, r3
2152 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2153 ; CHECK-NEXT: .LBB15_6: @ %for.body.preheader23
2154 ; CHECK-NEXT: sub.w lr, r3, r5
2155 ; CHECK-NEXT: .LBB15_7: @ %for.body
2156 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2157 ; CHECK-NEXT: ldrsb r0, [r12], #1
2158 ; CHECK-NEXT: ldrsb r1, [r6], #1
2159 ; CHECK-NEXT: muls r0, r1, r0
2160 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
2161 ; CHECK-NEXT: strb r0, [r4], #1
2162 ; CHECK-NEXT: le lr, .LBB15_7
2163 ; CHECK-NEXT: .LBB15_8: @ %for.cond.cleanup
2164 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2166 %cmp10 = icmp eq i32 %N, 0
2167 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2169 for.body.preheader: ; preds = %entry
2170 %min.iters.check = icmp ult i32 %N, 16
2171 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2173 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
2174 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2175 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2176 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2177 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2180 vector.ph: ; preds = %for.body.preheader
2181 %n.vec = and i32 %N, -16
2182 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2183 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2184 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2185 br label %vector.body
2187 vector.body: ; preds = %vector.body, %vector.ph
2188 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2189 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2190 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2191 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2192 %wide.load = load <16 x i8>, ptr %next.gep, align 1
2193 %0 = sext <16 x i8> %wide.load to <16 x i16>
2194 %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1
2195 %1 = sext <16 x i8> %wide.load22 to <16 x i16>
2196 %2 = mul nsw <16 x i16> %1, %0
2197 %3 = ashr <16 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2198 %4 = icmp sgt <16 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2199 %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2200 %6 = icmp slt <16 x i16> %5, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2201 %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2202 %8 = trunc <16 x i16> %7 to <16 x i8>
2203 store <16 x i8> %8, ptr %next.gep21, align 1
2204 %index.next = add i32 %index, 16
2205 %9 = icmp eq i32 %index.next, %n.vec
2206 br i1 %9, label %middle.block, label %vector.body
2208 middle.block: ; preds = %vector.body
2209 %cmp.n = icmp eq i32 %n.vec, %N
2210 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2212 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
2215 for.body: ; preds = %for.body.preheader23, %for.body
2216 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2217 %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2218 %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2219 %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2220 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2221 %10 = load i8, ptr %pSrcA.addr.013, align 1
2222 %conv1 = sext i8 %10 to i16
2223 %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2224 %11 = load i8, ptr %pSrcB.addr.012, align 1
2225 %conv3 = sext i8 %11 to i16
2226 %mul = mul nsw i16 %conv3, %conv1
2227 %shr = ashr i16 %mul, 7
2228 %12 = icmp sgt i16 %shr, -128
2229 %.val.i = select i1 %12, i16 %shr, i16 -128
2230 %13 = icmp slt i16 %.val.i, 127
2231 %retval.0.i = select i1 %13, i16 %.val.i, i16 127
2232 %conv5 = trunc i16 %retval.0.i to i8
2233 %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2234 store i8 %conv5, ptr %pDst.addr.011, align 1
2235 %inc = add nuw i32 %i.014, 1
2236 %exitcond = icmp eq i32 %inc, %N
2237 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_16i_q7: element-wise signed saturating multiply of two i8 arrays.
; For each of the %N elements the IR sign-extends one byte from %pSrcA and one
; byte from %pSrcB to i16, multiplies them, arithmetic-shifts the product right
; by 7, clamps the result to [-128, 127] and stores the truncated i8 to %pDst.
; Control flow: %N == 0 branches straight to for.cond.cleanup; %N < 16 runs
; only the scalar for.body loop; otherwise vector.body handles
; %n.vec = %N & -16 elements, 16 per iteration, and for.body handles the rest.
; In vector.body each 16-byte load is split into even and odd lanes with
; shufflevector, processed as two <8 x i16> halves, and re-interleaved before
; the truncating store.
; The autogenerated assertions below expect the vector loop to use
; vmullb.s8/vmullt.s8 followed by vqshrnb.s16/vqshrnt.s16 #7, and the scalar
; loop to use ssat #8 with asr #7.
2240 define arm_aapcs_vfpcc void @ssatmul_16i_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2241 ; CHECK-LABEL: ssatmul_16i_q7:
2242 ; CHECK: @ %bb.0: @ %entry
2243 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2244 ; CHECK-NEXT: push {r4, r5, r6, lr}
2245 ; CHECK-NEXT: cbz r3, .LBB16_8
2246 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2247 ; CHECK-NEXT: cmp r3, #15
2248 ; CHECK-NEXT: bhi .LBB16_3
2249 ; CHECK-NEXT: @ %bb.2:
2250 ; CHECK-NEXT: movs r5, #0
2251 ; CHECK-NEXT: mov r12, r0
2252 ; CHECK-NEXT: mov r6, r1
2253 ; CHECK-NEXT: mov r4, r2
2254 ; CHECK-NEXT: b .LBB16_6
2255 ; CHECK-NEXT: .LBB16_3: @ %vector.ph
2256 ; CHECK-NEXT: bic r5, r3, #15
2257 ; CHECK-NEXT: movs r4, #1
2258 ; CHECK-NEXT: sub.w r6, r5, #16
2259 ; CHECK-NEXT: add.w r12, r0, r5
2260 ; CHECK-NEXT: add.w lr, r4, r6, lsr #4
2261 ; CHECK-NEXT: adds r4, r2, r5
2262 ; CHECK-NEXT: adds r6, r1, r5
2263 ; CHECK-NEXT: .LBB16_4: @ %vector.body
2264 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2265 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2266 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2267 ; CHECK-NEXT: vmullt.s8 q2, q1, q0
2268 ; CHECK-NEXT: vmullb.s8 q0, q1, q0
2269 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2270 ; CHECK-NEXT: vqshrnt.s16 q0, q2, #7
2271 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
2272 ; CHECK-NEXT: le lr, .LBB16_4
2273 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2274 ; CHECK-NEXT: cmp r5, r3
2276 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2277 ; CHECK-NEXT: .LBB16_6: @ %for.body.preheader23
2278 ; CHECK-NEXT: sub.w lr, r3, r5
2279 ; CHECK-NEXT: .LBB16_7: @ %for.body
2280 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2281 ; CHECK-NEXT: ldrsb r0, [r12], #1
2282 ; CHECK-NEXT: ldrsb r1, [r6], #1
2283 ; CHECK-NEXT: muls r0, r1, r0
2284 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
2285 ; CHECK-NEXT: strb r0, [r4], #1
2286 ; CHECK-NEXT: le lr, .LBB16_7
2287 ; CHECK-NEXT: .LBB16_8: @ %for.cond.cleanup
2288 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2290 %cmp10 = icmp eq i32 %N, 0
2291 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2293 for.body.preheader: ; preds = %entry
2294 %min.iters.check = icmp ult i32 %N, 16
2295 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
; Entry to the scalar loop: the phis start at index 0 with the original
; pointers when the vector loop was bypassed, or at %n.vec with the advanced
; %ind.end* pointers when arriving from middle.block.
2297 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
2298 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2299 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2300 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2301 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
; %n.vec is %N with its low four bits cleared (a multiple of 16); the
; %ind.end* values are the three base pointers advanced by %n.vec bytes.
2304 vector.ph: ; preds = %for.body.preheader
2305 %n.vec = and i32 %N, -16
2306 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2307 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2308 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2309 br label %vector.body
; Vector loop: 16 bytes per iteration. Even lanes (0, 2, ..., 14) and odd lanes
; (1, 3, ..., 15) of each input are extracted separately, widened to i16,
; multiplied, shifted right by 7 and clamped, then interleaved back into their
; original lane order.
2311 vector.body: ; preds = %vector.body, %vector.ph
2312 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2313 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2314 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2315 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2316 %wide.load = load <16 x i8>, ptr %next.gep, align 1
2317 %0 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2318 %1 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2319 %2 = sext <8 x i8> %0 to <8 x i16>
2320 %3 = sext <8 x i8> %1 to <8 x i16>
2321 %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1
2322 %4 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2323 %5 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2324 %6 = sext <8 x i8> %4 to <8 x i16>
2325 %7 = sext <8 x i8> %5 to <8 x i16>
2326 %8 = mul <8 x i16> %6, %2
2327 %9 = mul <8 x i16> %7, %3
2328 %10 = ashr <8 x i16> %8, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2329 %11 = ashr <8 x i16> %9, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2330 %12 = icmp sgt <8 x i16> %10, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2331 %13 = icmp sgt <8 x i16> %11, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2332 %14 = select <8 x i1> %12, <8 x i16> %10, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2333 %15 = select <8 x i1> %13, <8 x i16> %11, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2334 %16 = icmp slt <8 x i16> %14, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2335 %17 = icmp slt <8 x i16> %15, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2336 %18 = select <8 x i1> %16, <8 x i16> %14, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2337 %19 = select <8 x i1> %17, <8 x i16> %15, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2338 %20 = shufflevector <8 x i16> %18, <8 x i16> %19, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2339 %21 = trunc <16 x i16> %20 to <16 x i8>
2340 store <16 x i8> %21, ptr %next.gep21, align 1
2341 %index.next = add i32 %index, 16
2342 %22 = icmp eq i32 %index.next, %n.vec
2343 br i1 %22, label %middle.block, label %vector.body
; The scalar loop is skipped entirely when %N is already a multiple of 16.
2345 middle.block: ; preds = %vector.body
2346 %cmp.n = icmp eq i32 %n.vec, %N
2347 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2349 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, using the same widen, multiply,
; shift-by-7 and clamp sequence as the vector loop, until %inc reaches %N.
2352 for.body: ; preds = %for.body, %for.body.preheader23
2353 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2354 %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2355 %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2356 %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2357 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2358 %23 = load i8, ptr %pSrcA.addr.013, align 1
2359 %conv1 = sext i8 %23 to i16
2360 %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2361 %24 = load i8, ptr %pSrcB.addr.012, align 1
2362 %conv3 = sext i8 %24 to i16
2363 %mul = mul nsw i16 %conv3, %conv1
2364 %shr = ashr i16 %mul, 7
2365 %25 = icmp sgt i16 %shr, -128
2366 %.val.i = select i1 %25, i16 %shr, i16 -128
2367 %26 = icmp slt i16 %.val.i, 127
2368 %retval.0.i = select i1 %26, i16 %.val.i, i16 127
2369 %conv5 = trunc i16 %retval.0.i to i8
2370 %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2371 store i8 %conv5, ptr %pDst.addr.011, align 1
2372 %inc = add nuw i32 %i.014, 1
2373 %exitcond = icmp eq i32 %inc, %N
2374 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_8t_q7: signed saturating i8 multiply written as a single masked
; vector loop with no scalar remainder loop.
; %N == 0 branches straight to for.cond.cleanup. Otherwise vector.body runs
; until %index reaches %n.vec = (%N + 7) & -8, handling 8 lanes per iteration.
; A lane is active when its element index (%index | <0..7>) is unsigned
; less-than-or-equal to %N - 1; that <8 x i1> mask guards both masked loads and
; the masked store.
; Per active lane: sign-extend both bytes to i16, multiply, arithmetic shift
; right by 7, clamp to [-128, 127], truncate to i8.
; The autogenerated assertions below expect the mask to be formed from two
; vcmp.u32/vpsel pairs written to stack scratch with vstrh.32 and reloaded with
; vldrw.u32, then vptt.i16 with vldrbt.s16 loads, vmul.i16, vqshrnb.s16 #7 and
; a vstrbt.16 store.
2377 define arm_aapcs_vfpcc void @ssatmul_8t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2378 ; CHECK-LABEL: ssatmul_8t_q7:
2379 ; CHECK: @ %bb.0: @ %entry
2380 ; CHECK-NEXT: .save {r4, r5, r7, lr}
2381 ; CHECK-NEXT: push {r4, r5, r7, lr}
2382 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
2383 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
2384 ; CHECK-NEXT: .pad #16
2385 ; CHECK-NEXT: sub sp, #16
2386 ; CHECK-NEXT: cmp r3, #0
2387 ; CHECK-NEXT: beq .LBB17_3
2388 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2389 ; CHECK-NEXT: adds r4, r3, #7
2390 ; CHECK-NEXT: vmov.i8 q2, #0x0
2391 ; CHECK-NEXT: bic r4, r4, #7
2392 ; CHECK-NEXT: vmov.i8 q3, #0xff
2393 ; CHECK-NEXT: sub.w r12, r4, #8
2394 ; CHECK-NEXT: movs r4, #1
2395 ; CHECK-NEXT: mov r5, sp
2396 ; CHECK-NEXT: add.w lr, r4, r12, lsr #3
2397 ; CHECK-NEXT: adr r4, .LCPI17_0
2398 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2399 ; CHECK-NEXT: adr r4, .LCPI17_1
2400 ; CHECK-NEXT: sub.w r12, r3, #1
2401 ; CHECK-NEXT: vldrw.u32 q4, [r4]
2402 ; CHECK-NEXT: movs r3, #0
2403 ; CHECK-NEXT: vdup.32 q1, r12
2404 ; CHECK-NEXT: .LBB17_2: @ %vector.body
2405 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2406 ; CHECK-NEXT: vdup.32 q5, r3
2407 ; CHECK-NEXT: adds r3, #8
2408 ; CHECK-NEXT: vorr q6, q5, q0
2409 ; CHECK-NEXT: vorr q5, q5, q4
2410 ; CHECK-NEXT: vcmp.u32 cs, q1, q6
2411 ; CHECK-NEXT: vpsel q6, q3, q2
2412 ; CHECK-NEXT: vcmp.u32 cs, q1, q5
2413 ; CHECK-NEXT: vpsel q5, q3, q2
2414 ; CHECK-NEXT: vstrh.32 q6, [r5, #8]
2415 ; CHECK-NEXT: vstrh.32 q5, [r5]
2416 ; CHECK-NEXT: vldrw.u32 q5, [r5]
2417 ; CHECK-NEXT: vptt.i16 ne, q5, zr
2418 ; CHECK-NEXT: vldrbt.s16 q5, [r0], #8
2419 ; CHECK-NEXT: vldrbt.s16 q6, [r1], #8
2420 ; CHECK-NEXT: vmul.i16 q5, q6, q5
2421 ; CHECK-NEXT: vqshrnb.s16 q5, q5, #7
2423 ; CHECK-NEXT: vstrbt.16 q5, [r2], #8
2424 ; CHECK-NEXT: le lr, .LBB17_2
2425 ; CHECK-NEXT: .LBB17_3: @ %for.cond.cleanup
2426 ; CHECK-NEXT: add sp, #16
2427 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
2428 ; CHECK-NEXT: pop {r4, r5, r7, pc}
2429 ; CHECK-NEXT: .p2align 4
2430 ; CHECK-NEXT: @ %bb.4:
2431 ; CHECK-NEXT: .LCPI17_0:
2432 ; CHECK-NEXT: .long 4 @ 0x4
2433 ; CHECK-NEXT: .long 5 @ 0x5
2434 ; CHECK-NEXT: .long 6 @ 0x6
2435 ; CHECK-NEXT: .long 7 @ 0x7
2436 ; CHECK-NEXT: .LCPI17_1:
2437 ; CHECK-NEXT: .long 0 @ 0x0
2438 ; CHECK-NEXT: .long 1 @ 0x1
2439 ; CHECK-NEXT: .long 2 @ 0x2
2440 ; CHECK-NEXT: .long 3 @ 0x3
2442 %cmp10 = icmp eq i32 %N, 0
2443 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; %n.vec is %N + 7 with its low three bits cleared; %broadcast.splat23 holds
; %N - 1 in every lane and is the upper bound used for the lane mask.
2445 vector.ph: ; preds = %entry
2446 %n.rnd.up = add i32 %N, 7
2447 %n.vec = and i32 %n.rnd.up, -8
2448 %trip.count.minus.1 = add i32 %N, -1
2449 %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
2450 %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
2451 br label %vector.body
; Masked loop body: %0 is the per-lane "index <= N - 1" predicate shared by
; the two masked loads and the masked store.
2453 vector.body: ; preds = %vector.body, %vector.ph
2454 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2455 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
2456 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
2457 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2458 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2459 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2460 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2461 %0 = icmp ule <8 x i32> %induction, %broadcast.splat23
2462 %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %next.gep, i32 1, <8 x i1> %0, <8 x i8> undef)
2463 %1 = sext <8 x i8> %wide.masked.load to <8 x i16>
2464 %wide.masked.load24 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %next.gep20, i32 1, <8 x i1> %0, <8 x i8> undef)
2465 %2 = sext <8 x i8> %wide.masked.load24 to <8 x i16>
2466 %3 = mul nsw <8 x i16> %2, %1
2467 %4 = ashr <8 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2468 %5 = icmp sgt <8 x i16> %4, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2469 %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2470 %7 = icmp slt <8 x i16> %6, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2471 %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2472 %9 = trunc <8 x i16> %8 to <8 x i8>
2473 call void @llvm.masked.store.v8i8.p0(<8 x i8> %9, ptr %next.gep21, i32 1, <8 x i1> %0)
2474 %index.next = add i32 %index, 8
2475 %10 = icmp eq i32 %index.next, %n.vec
2476 br i1 %10, label %for.cond.cleanup, label %vector.body
2478 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_16t_q7: signed saturating i8 multiply written as a single masked
; vector loop, 16 lanes per iteration, with no scalar remainder loop.
; %N == 0 branches straight to for.cond.cleanup. Otherwise vector.body runs
; until %index reaches %n.vec = (%N + 15) & -16.
; A lane is active when its element index (%index | <0..15>) is unsigned
; less-than-or-equal to %N - 1; that <16 x i1> mask guards both masked loads
; and the masked store.
; Per active lane: sign-extend both bytes to i16, multiply, arithmetic shift
; right by 7, clamp to [-128, 127], truncate to i8.
; The autogenerated assertions below expect four vcmp.u32/vpsel results to be
; written to stack scratch with vstrh.32, narrowed again through
; vcmp.i16/vstrb.16, and then used by vptt.i8 to predicate the vldrbt.u8 loads.
; The arithmetic is vmullb.s8/vmullt.s8 followed by vqshrnb.s16/vqshrnt.s16 #7,
; stored with vstrbt.8. Two of the lane-index constant vectors are spilled
; before the loop and reloaded inside it.
2482 define arm_aapcs_vfpcc void @ssatmul_16t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2483 ; CHECK-LABEL: ssatmul_16t_q7:
2484 ; CHECK: @ %bb.0: @ %entry
2485 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2486 ; CHECK-NEXT: push {r4, r5, r6, lr}
2487 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2488 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2489 ; CHECK-NEXT: .pad #80
2490 ; CHECK-NEXT: sub sp, #80
2491 ; CHECK-NEXT: cmp r3, #0
2492 ; CHECK-NEXT: beq .LBB18_3
2493 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2494 ; CHECK-NEXT: add.w r6, r3, #15
2495 ; CHECK-NEXT: movs r5, #1
2496 ; CHECK-NEXT: bic r6, r6, #15
2497 ; CHECK-NEXT: add r4, sp, #48
2498 ; CHECK-NEXT: subs r6, #16
2499 ; CHECK-NEXT: vmov.i8 q2, #0x0
2500 ; CHECK-NEXT: vmov.i8 q3, #0xff
2501 ; CHECK-NEXT: add.w lr, r5, r6, lsr #4
2502 ; CHECK-NEXT: adr r5, .LCPI18_0
2503 ; CHECK-NEXT: subs r6, r3, #1
2504 ; CHECK-NEXT: vldrw.u32 q0, [r5]
2505 ; CHECK-NEXT: vdup.32 q1, r6
2506 ; CHECK-NEXT: adr r6, .LCPI18_1
2507 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
2508 ; CHECK-NEXT: vldrw.u32 q0, [r6]
2509 ; CHECK-NEXT: adr r6, .LCPI18_2
2510 ; CHECK-NEXT: vldrw.u32 q5, [r6]
2511 ; CHECK-NEXT: adr r6, .LCPI18_3
2512 ; CHECK-NEXT: vldrw.u32 q6, [r6]
2513 ; CHECK-NEXT: add r5, sp, #32
2514 ; CHECK-NEXT: add r6, sp, #64
2515 ; CHECK-NEXT: movs r3, #0
2516 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
2517 ; CHECK-NEXT: .LBB18_2: @ %vector.body
2518 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2519 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2520 ; CHECK-NEXT: vdup.32 q7, r3
2521 ; CHECK-NEXT: adds r3, #16
2522 ; CHECK-NEXT: vorr q0, q7, q0
2523 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2524 ; CHECK-NEXT: vpsel q0, q3, q2
2525 ; CHECK-NEXT: vstrh.32 q0, [r4, #8]
2526 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
2527 ; CHECK-NEXT: vorr q0, q7, q0
2528 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2529 ; CHECK-NEXT: vpsel q0, q3, q2
2530 ; CHECK-NEXT: vstrh.32 q0, [r4]
2531 ; CHECK-NEXT: vorr q0, q7, q5
2532 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2533 ; CHECK-NEXT: vpsel q0, q3, q2
2534 ; CHECK-NEXT: vstrh.32 q0, [r5, #8]
2535 ; CHECK-NEXT: vorr q0, q7, q6
2536 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2537 ; CHECK-NEXT: vpsel q0, q3, q2
2538 ; CHECK-NEXT: vstrh.32 q0, [r5]
2539 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2540 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
2541 ; CHECK-NEXT: vpsel q0, q3, q2
2542 ; CHECK-NEXT: vstrb.16 q0, [r6, #8]
2543 ; CHECK-NEXT: vldrw.u32 q0, [r5]
2544 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
2545 ; CHECK-NEXT: vpsel q0, q3, q2
2546 ; CHECK-NEXT: vstrb.16 q0, [r6]
2547 ; CHECK-NEXT: vldrw.u32 q0, [r6]
2548 ; CHECK-NEXT: vptt.i8 ne, q0, zr
2549 ; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
2550 ; CHECK-NEXT: vldrbt.u8 q7, [r1], #16
2551 ; CHECK-NEXT: vmullt.s8 q4, q7, q0
2552 ; CHECK-NEXT: vmullb.s8 q0, q7, q0
2553 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2554 ; CHECK-NEXT: vqshrnt.s16 q0, q4, #7
2556 ; CHECK-NEXT: vstrbt.8 q0, [r2], #16
2557 ; CHECK-NEXT: le lr, .LBB18_2
2558 ; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup
2559 ; CHECK-NEXT: add sp, #80
2560 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2561 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2562 ; CHECK-NEXT: .p2align 4
2563 ; CHECK-NEXT: @ %bb.4:
2564 ; CHECK-NEXT: .LCPI18_0:
2565 ; CHECK-NEXT: .long 12 @ 0xc
2566 ; CHECK-NEXT: .long 13 @ 0xd
2567 ; CHECK-NEXT: .long 14 @ 0xe
2568 ; CHECK-NEXT: .long 15 @ 0xf
2569 ; CHECK-NEXT: .LCPI18_1:
2570 ; CHECK-NEXT: .long 8 @ 0x8
2571 ; CHECK-NEXT: .long 9 @ 0x9
2572 ; CHECK-NEXT: .long 10 @ 0xa
2573 ; CHECK-NEXT: .long 11 @ 0xb
2574 ; CHECK-NEXT: .LCPI18_2:
2575 ; CHECK-NEXT: .long 4 @ 0x4
2576 ; CHECK-NEXT: .long 5 @ 0x5
2577 ; CHECK-NEXT: .long 6 @ 0x6
2578 ; CHECK-NEXT: .long 7 @ 0x7
2579 ; CHECK-NEXT: .LCPI18_3:
2580 ; CHECK-NEXT: .long 0 @ 0x0
2581 ; CHECK-NEXT: .long 1 @ 0x1
2582 ; CHECK-NEXT: .long 2 @ 0x2
2583 ; CHECK-NEXT: .long 3 @ 0x3
2585 %cmp10 = icmp eq i32 %N, 0
2586 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; %n.vec is %N + 15 with its low four bits cleared; %broadcast.splat23 holds
; %N - 1 in every lane and is the upper bound used for the lane mask.
2588 vector.ph: ; preds = %entry
2589 %n.rnd.up = add i32 %N, 15
2590 %n.vec = and i32 %n.rnd.up, -16
2591 %trip.count.minus.1 = add i32 %N, -1
2592 %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
2593 %broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
2594 br label %vector.body
; Masked loop body: %0 is the per-lane "index <= N - 1" predicate shared by
; the two masked loads and the masked store.
2596 vector.body: ; preds = %vector.body, %vector.ph
2597 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2598 %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
2599 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
2600 %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2601 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2602 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2603 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2604 %0 = icmp ule <16 x i32> %induction, %broadcast.splat23
2605 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep, i32 1, <16 x i1> %0, <16 x i8> undef)
2606 %1 = sext <16 x i8> %wide.masked.load to <16 x i16>
2607 %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep20, i32 1, <16 x i1> %0, <16 x i8> undef)
2608 %2 = sext <16 x i8> %wide.masked.load24 to <16 x i16>
2609 %3 = mul nsw <16 x i16> %2, %1
2610 %4 = ashr <16 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2611 %5 = icmp sgt <16 x i16> %4, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2612 %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2613 %7 = icmp slt <16 x i16> %6, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2614 %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2615 %9 = trunc <16 x i16> %8 to <16 x i8>
2616 call void @llvm.masked.store.v16i8.p0(<16 x i8> %9, ptr %next.gep21, i32 1, <16 x i1> %0)
2617 %index.next = add i32 %index, 16
2618 %10 = icmp eq i32 %index.next, %n.vec
2619 br i1 %10, label %for.cond.cleanup, label %vector.body
2621 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_16ti_q7: signed saturating i8 multiply written as a single masked
; vector loop, 16 lanes per iteration, where the inputs are additionally split
; into even and odd lanes before the arithmetic and re-interleaved afterwards.
; %N == 0 branches straight to for.cond.cleanup. Otherwise vector.body runs
; until %index reaches %n.vec = (%N + 15) & -16.
; A lane is active when its element index (%index | <0..15>) is unsigned
; less-than-or-equal to %N - 1; that <16 x i1> mask guards both masked loads
; and the masked store.
; Each <8 x i16> half is formed by sign-extending the selected bytes; the
; halves are multiplied, arithmetic-shifted right by 7 and clamped to
; [-128, 127], then interleaved back to the original lane order and truncated
; to i8.
; The autogenerated assertions below expect four vcmp.u32/vpsel results to be
; written to stack scratch with vstrh.32, narrowed again through
; vcmp.i16/vstrb.16, and then used by vptt.i8 to predicate the vldrbt.u8 loads.
; The arithmetic is vmullb.s8/vmullt.s8 followed by vqshrnb.s16/vqshrnt.s16 #7,
; stored with vstrbt.8.
2625 define arm_aapcs_vfpcc void @ssatmul_16ti_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2626 ; CHECK-LABEL: ssatmul_16ti_q7:
2627 ; CHECK: @ %bb.0: @ %entry
2628 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2629 ; CHECK-NEXT: push {r4, r5, r6, lr}
2630 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2631 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2632 ; CHECK-NEXT: .pad #80
2633 ; CHECK-NEXT: sub sp, #80
2634 ; CHECK-NEXT: cmp r3, #0
2635 ; CHECK-NEXT: beq .LBB19_3
2636 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2637 ; CHECK-NEXT: add.w r6, r3, #15
2638 ; CHECK-NEXT: movs r5, #1
2639 ; CHECK-NEXT: bic r6, r6, #15
2640 ; CHECK-NEXT: add r4, sp, #48
2641 ; CHECK-NEXT: subs r6, #16
2642 ; CHECK-NEXT: vmov.i8 q2, #0x0
2643 ; CHECK-NEXT: vmov.i8 q3, #0xff
2644 ; CHECK-NEXT: add.w lr, r5, r6, lsr #4
2645 ; CHECK-NEXT: adr r5, .LCPI19_0
2646 ; CHECK-NEXT: subs r6, r3, #1
2647 ; CHECK-NEXT: vldrw.u32 q0, [r5]
2648 ; CHECK-NEXT: vdup.32 q1, r6
2649 ; CHECK-NEXT: adr r6, .LCPI19_1
2650 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
2651 ; CHECK-NEXT: vldrw.u32 q0, [r6]
2652 ; CHECK-NEXT: adr r6, .LCPI19_2
2653 ; CHECK-NEXT: vldrw.u32 q5, [r6]
2654 ; CHECK-NEXT: adr r6, .LCPI19_3
2655 ; CHECK-NEXT: vldrw.u32 q6, [r6]
2656 ; CHECK-NEXT: add r5, sp, #32
2657 ; CHECK-NEXT: add r6, sp, #64
2658 ; CHECK-NEXT: movs r3, #0
2659 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
2660 ; CHECK-NEXT: .LBB19_2: @ %vector.body
2661 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2662 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2663 ; CHECK-NEXT: vdup.32 q7, r3
2664 ; CHECK-NEXT: adds r3, #16
2665 ; CHECK-NEXT: vorr q0, q7, q0
2666 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2667 ; CHECK-NEXT: vpsel q0, q3, q2
2668 ; CHECK-NEXT: vstrh.32 q0, [r4, #8]
2669 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
2670 ; CHECK-NEXT: vorr q0, q7, q0
2671 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2672 ; CHECK-NEXT: vpsel q0, q3, q2
2673 ; CHECK-NEXT: vstrh.32 q0, [r4]
2674 ; CHECK-NEXT: vorr q0, q7, q5
2675 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2676 ; CHECK-NEXT: vpsel q0, q3, q2
2677 ; CHECK-NEXT: vstrh.32 q0, [r5, #8]
2678 ; CHECK-NEXT: vorr q0, q7, q6
2679 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2680 ; CHECK-NEXT: vpsel q0, q3, q2
2681 ; CHECK-NEXT: vstrh.32 q0, [r5]
2682 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2683 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
2684 ; CHECK-NEXT: vpsel q0, q3, q2
2685 ; CHECK-NEXT: vstrb.16 q0, [r6, #8]
2686 ; CHECK-NEXT: vldrw.u32 q0, [r5]
2687 ; CHECK-NEXT: vcmp.i16 ne, q0, zr
2688 ; CHECK-NEXT: vpsel q0, q3, q2
2689 ; CHECK-NEXT: vstrb.16 q0, [r6]
2690 ; CHECK-NEXT: vldrw.u32 q0, [r6]
2691 ; CHECK-NEXT: vptt.i8 ne, q0, zr
2692 ; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
2693 ; CHECK-NEXT: vldrbt.u8 q7, [r1], #16
2694 ; CHECK-NEXT: vmullt.s8 q4, q7, q0
2695 ; CHECK-NEXT: vmullb.s8 q0, q7, q0
2696 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2697 ; CHECK-NEXT: vqshrnt.s16 q0, q4, #7
2699 ; CHECK-NEXT: vstrbt.8 q0, [r2], #16
2700 ; CHECK-NEXT: le lr, .LBB19_2
2701 ; CHECK-NEXT: .LBB19_3: @ %for.cond.cleanup
2702 ; CHECK-NEXT: add sp, #80
2703 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2704 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2705 ; CHECK-NEXT: .p2align 4
2706 ; CHECK-NEXT: @ %bb.4:
2707 ; CHECK-NEXT: .LCPI19_0:
2708 ; CHECK-NEXT: .long 12 @ 0xc
2709 ; CHECK-NEXT: .long 13 @ 0xd
2710 ; CHECK-NEXT: .long 14 @ 0xe
2711 ; CHECK-NEXT: .long 15 @ 0xf
2712 ; CHECK-NEXT: .LCPI19_1:
2713 ; CHECK-NEXT: .long 8 @ 0x8
2714 ; CHECK-NEXT: .long 9 @ 0x9
2715 ; CHECK-NEXT: .long 10 @ 0xa
2716 ; CHECK-NEXT: .long 11 @ 0xb
2717 ; CHECK-NEXT: .LCPI19_2:
2718 ; CHECK-NEXT: .long 4 @ 0x4
2719 ; CHECK-NEXT: .long 5 @ 0x5
2720 ; CHECK-NEXT: .long 6 @ 0x6
2721 ; CHECK-NEXT: .long 7 @ 0x7
2722 ; CHECK-NEXT: .LCPI19_3:
2723 ; CHECK-NEXT: .long 0 @ 0x0
2724 ; CHECK-NEXT: .long 1 @ 0x1
2725 ; CHECK-NEXT: .long 2 @ 0x2
2726 ; CHECK-NEXT: .long 3 @ 0x3
2728 %cmp10 = icmp eq i32 %N, 0
2729 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; %n.vec is %N + 15 with its low four bits cleared; %broadcast.splat23 holds
; %N - 1 in every lane and is the upper bound used for the lane mask.
2731 vector.ph: ; preds = %entry
2732 %n.rnd.up = add i32 %N, 15
2733 %n.vec = and i32 %n.rnd.up, -16
2734 %trip.count.minus.1 = add i32 %N, -1
2735 %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
2736 %broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
2737 br label %vector.body
; Masked loop body: %0 is the per-lane "index <= N - 1" predicate shared by
; the two masked loads and the masked store. Even lanes (0, 2, ..., 14) and odd
; lanes (1, 3, ..., 15) of each loaded vector are processed as separate
; <8 x i16> halves and merged again by the final shufflevector.
2739 vector.body: ; preds = %vector.body, %vector.ph
2740 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2741 %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
2742 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
2743 %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2744 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2745 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2746 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2747 %0 = icmp ule <16 x i32> %induction, %broadcast.splat23
2748 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep, i32 1, <16 x i1> %0, <16 x i8> undef)
2749 %1 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2750 %2 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2751 %3 = sext <8 x i8> %1 to <8 x i16>
2752 %4 = sext <8 x i8> %2 to <8 x i16>
2753 %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep20, i32 1, <16 x i1> %0, <16 x i8> undef)
2754 %5 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2755 %6 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2756 %7 = sext <8 x i8> %5 to <8 x i16>
2757 %8 = sext <8 x i8> %6 to <8 x i16>
2758 %9 = mul <8 x i16> %7, %3
2759 %10 = mul <8 x i16> %8, %4
2760 %11 = ashr <8 x i16> %9, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2761 %12 = ashr <8 x i16> %10, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2762 %13 = icmp sgt <8 x i16> %11, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2763 %14 = icmp sgt <8 x i16> %12, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2764 %15 = select <8 x i1> %13, <8 x i16> %11, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2765 %16 = select <8 x i1> %14, <8 x i16> %12, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2766 %17 = icmp slt <8 x i16> %15, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2767 %18 = icmp slt <8 x i16> %16, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2768 %19 = select <8 x i1> %17, <8 x i16> %15, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2769 %20 = select <8 x i1> %18, <8 x i16> %16, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2770 %21 = shufflevector <8 x i16> %19, <8 x i16> %20, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2771 %22 = trunc <16 x i16> %21 to <16 x i8>
2772 call void @llvm.masked.store.v16i8.p0(<16 x i8> %22, ptr %next.gep21, i32 1, <16 x i1> %0)
2773 %index.next = add i32 %index, 16
2774 %23 = icmp eq i32 %index.next, %n.vec
2775 br i1 %23, label %for.cond.cleanup, label %vector.body
2777 for.cond.cleanup: ; preds = %vector.body, %entry
; usatmul_8_q7: element-wise unsigned multiply / shift / clamp over N bytes.
; For each i in [0, N):
;   pDst[i] = trunc_i8(umin((zext16(pSrcA[i]) * zext16(pSrcB[i])) >> 7, 255))
; The IR is already in vectorized form: a <8 x i8> main loop covers N rounded
; down to a multiple of 8, and a scalar loop handles the remaining elements
; (or all of them when N < 8).
; The assertions that follow the define line expect the vector body to be
; selected as vldrb.u16 / vmul.i16 / vqshrnb.u16 #7 / vstrb.16 in a loop closed
; by `le lr`, and the scalar loop as ldrb / muls / lsrs / cmp / conditional
; lsr / strb.
; NOTE(review): the leading numbers on the lines of this function are not
; contiguous (2813 -> 2815, 2825 -> 2827, 2831 -> 2833, 2844 -> 2847,
; 2877 -> 2880, 2900 -> 2903). As shown there is no `entry:` label, no
; terminator in for.body.preheader23 or for.cond.cleanup, no closing brace,
; and the conditional popeq / lsrlo assertions have no preceding IT-instruction
; assertion. Presumably lines were dropped when this listing was produced;
; confirm against the original test before editing or regenerating.
2781 define arm_aapcs_vfpcc void @usatmul_8_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2782 ; CHECK-LABEL: usatmul_8_q7:
2783 ; CHECK: @ %bb.0: @ %entry
2784 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2785 ; CHECK-NEXT: push {r4, r5, r6, lr}
2786 ; CHECK-NEXT: cbz r3, .LBB20_8
2787 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2788 ; CHECK-NEXT: cmp r3, #7
2789 ; CHECK-NEXT: bhi .LBB20_3
2790 ; CHECK-NEXT: @ %bb.2:
2791 ; CHECK-NEXT: movs r5, #0
2792 ; CHECK-NEXT: mov r12, r0
2793 ; CHECK-NEXT: mov r6, r1
2794 ; CHECK-NEXT: mov r4, r2
2795 ; CHECK-NEXT: b .LBB20_6
2796 ; CHECK-NEXT: .LBB20_3: @ %vector.ph
2797 ; CHECK-NEXT: bic r5, r3, #7
2798 ; CHECK-NEXT: movs r4, #1
2799 ; CHECK-NEXT: sub.w r6, r5, #8
2800 ; CHECK-NEXT: add.w r12, r0, r5
2801 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
2802 ; CHECK-NEXT: adds r4, r2, r5
2803 ; CHECK-NEXT: adds r6, r1, r5
2804 ; CHECK-NEXT: .LBB20_4: @ %vector.body
2805 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2806 ; CHECK-NEXT: vldrb.u16 q0, [r0], #8
2807 ; CHECK-NEXT: vldrb.u16 q1, [r1], #8
2808 ; CHECK-NEXT: vmul.i16 q0, q1, q0
2809 ; CHECK-NEXT: vqshrnb.u16 q0, q0, #7
2810 ; CHECK-NEXT: vstrb.16 q0, [r2], #8
2811 ; CHECK-NEXT: le lr, .LBB20_4
2812 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2813 ; CHECK-NEXT: cmp r5, r3
2815 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2816 ; CHECK-NEXT: .LBB20_6: @ %for.body.preheader23
2817 ; CHECK-NEXT: sub.w lr, r3, r5
2818 ; CHECK-NEXT: .LBB20_7: @ %for.body
2819 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2820 ; CHECK-NEXT: ldrb r0, [r12], #1
2821 ; CHECK-NEXT: ldrb r1, [r6], #1
2822 ; CHECK-NEXT: muls r0, r1, r0
2823 ; CHECK-NEXT: lsrs r1, r0, #7
2824 ; CHECK-NEXT: cmp r1, #255
2825 ; CHECK-NEXT: mov.w r1, #255
2827 ; CHECK-NEXT: lsrlo r1, r0, #7
2828 ; CHECK-NEXT: strb r1, [r4], #1
2829 ; CHECK-NEXT: le lr, .LBB20_7
2830 ; CHECK-NEXT: .LBB20_8: @ %for.cond.cleanup
2831 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; N == 0: nothing to process, go straight to the exit block.
2833 %cmp10 = icmp eq i32 %N, 0
2834 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 8 elements: bypass the vector loop and run only the scalar loop.
2836 for.body.preheader: ; preds = %entry
2837 %min.iters.check = icmp ult i32 %N, 8
2838 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
; Scalar-loop start state: index 0 with the original pointers when the vector
; loop was bypassed, or index n.vec with the advanced pointers after it ran.
2840 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
2841 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2842 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2843 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2844 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
; n.vec = N with the low three bits cleared (N rounded down to a multiple of
; 8); the ind.end* values point just past the part the vector loop covers.
2847 vector.ph: ; preds = %for.body.preheader
2848 %n.vec = and i32 %N, -8
2849 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2850 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2851 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2852 br label %vector.body
; Vector loop: each iteration processes 8 bytes from each source.
2854 vector.body: ; preds = %vector.body, %vector.ph
2855 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2856 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2857 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2858 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2859 %wide.load = load <8 x i8>, ptr %next.gep, align 1
2860 %0 = zext <8 x i8> %wide.load to <8 x i16>
2861 %wide.load22 = load <8 x i8>, ptr %next.gep20, align 1
2862 %1 = zext <8 x i8> %wide.load22 to <8 x i16>
; Both operands are zero-extended bytes, so each i16 product is at most
; 255 * 255 = 65025 and cannot wrap, which is what `nuw` records.
2863 %2 = mul nuw <8 x i16> %1, %0
; Shift right by 7, then clamp to 255 (unsigned) so the truncation to i8
; below does not discard any set bits.
2864 %3 = lshr <8 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2865 %4 = icmp ult <8 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2866 %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2867 %6 = trunc <8 x i16> %5 to <8 x i8>
2868 store <8 x i8> %6, ptr %next.gep21, align 1
2869 %index.next = add i32 %index, 8
2870 %7 = icmp eq i32 %index.next, %n.vec
2871 br i1 %7, label %middle.block, label %vector.body
; If N was already a multiple of 8 there is no remainder to process.
2873 middle.block: ; preds = %vector.body
2874 %cmp.n = icmp eq i32 %n.vec, %N
2875 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
; Common exit block.
2877 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar remainder loop: the same multiply / shift / clamp / truncate sequence,
; one byte per iteration, until the index reaches N.
2880 for.body: ; preds = %for.body.preheader23, %for.body
2881 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2882 %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2883 %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2884 %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2885 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
2886 %8 = load i8, ptr %pSrcA.addr.013, align 1
2887 %conv1 = zext i8 %8 to i16
2888 %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
2889 %9 = load i8, ptr %pSrcB.addr.012, align 1
2890 %conv3 = zext i8 %9 to i16
2891 %mul = mul nuw i16 %conv3, %conv1
2892 %10 = lshr i16 %mul, 7
2893 %11 = icmp ult i16 %10, 255
2894 %retval.0.i = select i1 %11, i16 %10, i16 255
2895 %conv5 = trunc i16 %retval.0.i to i8
2896 %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
2897 store i8 %conv5, ptr %pDst.addr.011, align 1
2898 %inc = add nuw i32 %i.014, 1
2899 %exitcond = icmp eq i32 %inc, %N
2900 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; usatmul_16_q7: element-wise unsigned multiply / shift / clamp over N bytes,
; with a 16-lane vector loop.
; For each i in [0, N):
;   pDst[i] = trunc_i8(umin((zext16(pSrcA[i]) * zext16(pSrcB[i])) >> 7, 255))
; The <16 x i8> main loop covers N rounded down to a multiple of 16, and a
; scalar loop handles the remaining elements (or all of them when N < 16).
; The <16 x i16> intermediate is wider than one 128-bit q register, and the
; assertions that follow the define line expect it to be split: vldrb.u8
; loads, vmullb.u8 + vmullt.u8 for the widened products, vqshrnb.u16 +
; vqshrnt.u16 #7 both writing q0, then one vstrb.8 store, in a loop closed by
; `le lr`. (In MVE naming the b/t suffixes select the bottom/top lanes of each
; pair; confirm details against the Arm reference.)
; NOTE(review): the leading numbers on the lines of this function are not
; contiguous (2938 -> 2940, 2950 -> 2952, 2956 -> 2958, 2969 -> 2972,
; 3002 -> 3005, 3025 -> 3028). As shown there is no `entry:` label, no
; terminator in for.body.preheader23 or for.cond.cleanup, no closing brace,
; and the conditional popeq / lsrlo assertions have no preceding IT-instruction
; assertion. Presumably lines were dropped when this listing was produced;
; confirm against the original test before editing or regenerating.
2903 define arm_aapcs_vfpcc void @usatmul_16_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
2904 ; CHECK-LABEL: usatmul_16_q7:
2905 ; CHECK: @ %bb.0: @ %entry
2906 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2907 ; CHECK-NEXT: push {r4, r5, r6, lr}
2908 ; CHECK-NEXT: cmp r3, #0
2909 ; CHECK-NEXT: beq .LBB21_8
2910 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2911 ; CHECK-NEXT: cmp r3, #15
2912 ; CHECK-NEXT: bhi .LBB21_3
2913 ; CHECK-NEXT: @ %bb.2:
2914 ; CHECK-NEXT: movs r5, #0
2915 ; CHECK-NEXT: mov r12, r0
2916 ; CHECK-NEXT: mov r6, r1
2917 ; CHECK-NEXT: mov r4, r2
2918 ; CHECK-NEXT: b .LBB21_6
2919 ; CHECK-NEXT: .LBB21_3: @ %vector.ph
2920 ; CHECK-NEXT: bic r5, r3, #15
2921 ; CHECK-NEXT: movs r4, #1
2922 ; CHECK-NEXT: sub.w r6, r5, #16
2923 ; CHECK-NEXT: add.w r12, r0, r5
2924 ; CHECK-NEXT: add.w lr, r4, r6, lsr #4
2925 ; CHECK-NEXT: adds r4, r2, r5
2926 ; CHECK-NEXT: adds r6, r1, r5
2927 ; CHECK-NEXT: .LBB21_4: @ %vector.body
2928 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2929 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2930 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2931 ; CHECK-NEXT: vmullt.u8 q2, q1, q0
2932 ; CHECK-NEXT: vmullb.u8 q0, q1, q0
2933 ; CHECK-NEXT: vqshrnb.u16 q0, q0, #7
2934 ; CHECK-NEXT: vqshrnt.u16 q0, q2, #7
2935 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
2936 ; CHECK-NEXT: le lr, .LBB21_4
2937 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2938 ; CHECK-NEXT: cmp r5, r3
2940 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2941 ; CHECK-NEXT: .LBB21_6: @ %for.body.preheader23
2942 ; CHECK-NEXT: sub.w lr, r3, r5
2943 ; CHECK-NEXT: .LBB21_7: @ %for.body
2944 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2945 ; CHECK-NEXT: ldrb r0, [r12], #1
2946 ; CHECK-NEXT: ldrb r1, [r6], #1
2947 ; CHECK-NEXT: muls r0, r1, r0
2948 ; CHECK-NEXT: lsrs r1, r0, #7
2949 ; CHECK-NEXT: cmp r1, #255
2950 ; CHECK-NEXT: mov.w r1, #255
2952 ; CHECK-NEXT: lsrlo r1, r0, #7
2953 ; CHECK-NEXT: strb r1, [r4], #1
2954 ; CHECK-NEXT: le lr, .LBB21_7
2955 ; CHECK-NEXT: .LBB21_8: @ %for.cond.cleanup
2956 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; N == 0: nothing to process, go straight to the exit block.
2958 %cmp10 = icmp eq i32 %N, 0
2959 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 16 elements: bypass the vector loop and run only the scalar loop.
2961 for.body.preheader: ; preds = %entry
2962 %min.iters.check = icmp ult i32 %N, 16
2963 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
; Scalar-loop start state: index 0 with the original pointers when the vector
; loop was bypassed, or index n.vec with the advanced pointers after it ran.
2965 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
2966 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2967 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2968 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2969 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
; n.vec = N with the low four bits cleared (N rounded down to a multiple of
; 16); the ind.end* values point just past the part the vector loop covers.
2972 vector.ph: ; preds = %for.body.preheader
2973 %n.vec = and i32 %N, -16
2974 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
2975 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
2976 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
2977 br label %vector.body
; Vector loop: each iteration processes 16 bytes from each source.
2979 vector.body: ; preds = %vector.body, %vector.ph
2980 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2981 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
2982 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
2983 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
2984 %wide.load = load <16 x i8>, ptr %next.gep, align 1
2985 %0 = zext <16 x i8> %wide.load to <16 x i16>
2986 %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1
2987 %1 = zext <16 x i8> %wide.load22 to <16 x i16>
; Both operands are zero-extended bytes, so each i16 product is at most
; 255 * 255 = 65025 and cannot wrap, which is what `nuw` records.
2988 %2 = mul nuw <16 x i16> %1, %0
; Shift right by 7, then clamp to 255 (unsigned) so the truncation to i8
; below does not discard any set bits.
2989 %3 = lshr <16 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2990 %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2991 %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
2992 %6 = trunc <16 x i16> %5 to <16 x i8>
2993 store <16 x i8> %6, ptr %next.gep21, align 1
2994 %index.next = add i32 %index, 16
2995 %7 = icmp eq i32 %index.next, %n.vec
2996 br i1 %7, label %middle.block, label %vector.body
; If N was already a multiple of 16 there is no remainder to process.
2998 middle.block: ; preds = %vector.body
2999 %cmp.n = icmp eq i32 %n.vec, %N
3000 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
; Common exit block.
3002 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar remainder loop: the same multiply / shift / clamp / truncate sequence,
; one byte per iteration, until the index reaches N.
3005 for.body: ; preds = %for.body.preheader23, %for.body
3006 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
3007 %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
3008 %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
3009 %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
3010 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
3011 %8 = load i8, ptr %pSrcA.addr.013, align 1
3012 %conv1 = zext i8 %8 to i16
3013 %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
3014 %9 = load i8, ptr %pSrcB.addr.012, align 1
3015 %conv3 = zext i8 %9 to i16
3016 %mul = mul nuw i16 %conv3, %conv1
3017 %10 = lshr i16 %mul, 7
3018 %11 = icmp ult i16 %10, 255
3019 %retval.0.i = select i1 %11, i16 %10, i16 255
3020 %conv5 = trunc i16 %retval.0.i to i8
3021 %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
3022 store i8 %conv5, ptr %pDst.addr.011, align 1
3023 %inc = add nuw i32 %i.014, 1
3024 %exitcond = icmp eq i32 %inc, %N
3025 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Declarations of the llvm.masked.load / llvm.masked.store intrinsics for the
; <4 x i32>, <8 x i16>, <4 x i16>, <8 x i8> and <16 x i8> vector types.
; Loads take (ptr, i32, <N x i1> mask, <N x T>) and return <N x T>; stores
; take (<N x T> value, ptr, i32, <N x i1> mask) and return void. Per the LLVM
; LangRef the i32 is the alignment and the trailing load operand is the
; pass-through value for masked-off lanes.
; Presumably these are used by the masked-loop test functions earlier in the
; file; only some of those call sites are visible from this part of it.
3028 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
3029 declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
3030 declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
3031 declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
3032 declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
3033 declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
3034 declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
3035 declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
3036 declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
3037 declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)