1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; Signed saturating Q31 multiply over %N elements: each pair of i32 inputs from
; %pSrcA and %pSrcB is sign-extended to i64, multiplied, arithmetically shifted
; right by 31, clamped to [-2147483648, 2147483647] and truncated to i32 before
; being stored to %pDst. The main loop handles two elements per iteration
; (<2 x i32> loads/stores) and is followed by a scalar loop for any remainder.
4 define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
5 ; CHECK-LABEL: ssatmul_s_q31:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
8 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
10 ; CHECK-NEXT: sub sp, #4
11 ; CHECK-NEXT: .vsave {d8, d9}
12 ; CHECK-NEXT: vpush {d8, d9}
14 ; CHECK-NEXT: sub sp, #8
15 ; CHECK-NEXT: cmp r3, #0
16 ; CHECK-NEXT: beq.w .LBB0_8
17 ; CHECK-NEXT: @ %bb.1: @ %entry
18 ; CHECK-NEXT: cmp r3, #1
19 ; CHECK-NEXT: bne .LBB0_3
20 ; CHECK-NEXT: @ %bb.2:
21 ; CHECK-NEXT: movs r7, #0
22 ; CHECK-NEXT: mov r12, r0
23 ; CHECK-NEXT: mov r10, r1
24 ; CHECK-NEXT: mov r11, r2
25 ; CHECK-NEXT: b .LBB0_6
26 ; CHECK-NEXT: .LBB0_3: @ %vector.ph
27 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
28 ; CHECK-NEXT: bic r3, r3, #1
29 ; CHECK-NEXT: subs r7, r3, #2
30 ; CHECK-NEXT: movs r6, #1
31 ; CHECK-NEXT: adr r4, .LCPI0_0
32 ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
33 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1
34 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2
35 ; CHECK-NEXT: add.w r10, r1, r3, lsl #2
36 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2
37 ; CHECK-NEXT: vldrw.u32 q0, [r4]
38 ; CHECK-NEXT: vmvn.i32 q1, #0x80000000
39 ; CHECK-NEXT: .LBB0_4: @ %vector.body
40 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
41 ; CHECK-NEXT: ldrd r5, r4, [r0], #8
42 ; CHECK-NEXT: mov.w r3, #-1
43 ; CHECK-NEXT: ldrd r8, r7, [r1], #8
44 ; CHECK-NEXT: smull r4, r7, r7, r4
45 ; CHECK-NEXT: asrl r4, r7, #31
46 ; CHECK-NEXT: smull r6, r5, r8, r5
47 ; CHECK-NEXT: rsbs.w r9, r4, #-2147483648
48 ; CHECK-NEXT: sbcs r3, r7
49 ; CHECK-NEXT: mov.w r3, #0
50 ; CHECK-NEXT: asrl r6, r5, #31
52 ; CHECK-NEXT: movlt r3, #1
53 ; CHECK-NEXT: cmp r3, #0
54 ; CHECK-NEXT: csetm r9, ne
55 ; CHECK-NEXT: rsbs.w r3, r6, #-2147483648
56 ; CHECK-NEXT: mov.w r3, #-1
57 ; CHECK-NEXT: vmov q4[2], q4[0], r6, r4
58 ; CHECK-NEXT: sbcs r3, r5
59 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r7
60 ; CHECK-NEXT: mov.w r3, #0
61 ; CHECK-NEXT: mvn r6, #-2147483648
63 ; CHECK-NEXT: movlt r3, #1
64 ; CHECK-NEXT: cmp r3, #0
65 ; CHECK-NEXT: csetm r3, ne
66 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r9
67 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r9
68 ; CHECK-NEXT: vbic q3, q0, q2
69 ; CHECK-NEXT: vand q2, q4, q2
70 ; CHECK-NEXT: vorr q2, q2, q3
71 ; CHECK-NEXT: vmov r3, r4, d4
72 ; CHECK-NEXT: subs r3, r3, r6
73 ; CHECK-NEXT: sbcs r3, r4, #0
74 ; CHECK-NEXT: vmov r4, r5, d5
75 ; CHECK-NEXT: mov.w r3, #0
77 ; CHECK-NEXT: movlt r3, #1
78 ; CHECK-NEXT: cmp r3, #0
79 ; CHECK-NEXT: csetm r3, ne
80 ; CHECK-NEXT: vmov.32 q3[1], r3
81 ; CHECK-NEXT: subs r4, r4, r6
82 ; CHECK-NEXT: sbcs r4, r5, #0
83 ; CHECK-NEXT: mov.w r4, #0
85 ; CHECK-NEXT: movlt r4, #1
86 ; CHECK-NEXT: cmp r4, #0
87 ; CHECK-NEXT: csetm r4, ne
88 ; CHECK-NEXT: vmov q3[2], q3[0], r3, r4
89 ; CHECK-NEXT: vbic q4, q1, q3
90 ; CHECK-NEXT: vand q2, q2, q3
91 ; CHECK-NEXT: vorr q2, q2, q4
92 ; CHECK-NEXT: vmov r3, s10
93 ; CHECK-NEXT: vmov r4, s8
94 ; CHECK-NEXT: strd r4, r3, [r2], #8
95 ; CHECK-NEXT: le lr, .LBB0_4
96 ; CHECK-NEXT: @ %bb.5: @ %middle.block
97 ; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload
98 ; CHECK-NEXT: cmp r7, r3
99 ; CHECK-NEXT: beq .LBB0_8
100 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader
101 ; CHECK-NEXT: sub.w lr, r3, r7
102 ; CHECK-NEXT: mov.w r0, #-1
103 ; CHECK-NEXT: mov.w r1, #-2147483648
104 ; CHECK-NEXT: mvn r2, #-2147483648
105 ; CHECK-NEXT: .LBB0_7: @ %for.body
106 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
107 ; CHECK-NEXT: ldr r3, [r12], #4
108 ; CHECK-NEXT: ldr r4, [r10], #4
109 ; CHECK-NEXT: smull r4, r3, r4, r3
110 ; CHECK-NEXT: asrl r4, r3, #31
111 ; CHECK-NEXT: subs r5, r1, r4
112 ; CHECK-NEXT: sbcs.w r5, r0, r3
113 ; CHECK-NEXT: mov.w r5, #0
115 ; CHECK-NEXT: movlt r5, #1
116 ; CHECK-NEXT: cmp r5, #0
117 ; CHECK-NEXT: csel r4, r4, r1, ne
118 ; CHECK-NEXT: csel r3, r3, r0, ne
119 ; CHECK-NEXT: subs r5, r4, r2
120 ; CHECK-NEXT: sbcs r3, r3, #0
121 ; CHECK-NEXT: csel r3, r4, r2, lt
122 ; CHECK-NEXT: str r3, [r11], #4
123 ; CHECK-NEXT: le lr, .LBB0_7
124 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup
125 ; CHECK-NEXT: add sp, #8
126 ; CHECK-NEXT: vpop {d8, d9}
127 ; CHECK-NEXT: add sp, #4
128 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
129 ; CHECK-NEXT: .p2align 4
130 ; CHECK-NEXT: @ %bb.9:
131 ; CHECK-NEXT: .LCPI0_0:
132 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
133 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
134 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
135 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
; Dispatch on %N: 0 exits immediately, 1 goes straight to the scalar loop,
; any other value enters the 2-wide vector loop.
137 switch i32 %N, label %vector.ph [
138 i32 0, label %for.cond.cleanup
139 i32 1, label %for.body.preheader
; %n.vec is %N rounded down to a multiple of 2; the %ind.end* pointers are
; where the scalar loop resumes after the vector loop.
142 vector.ph: ; preds = %entry
143 %n.vec = and i32 %N, -2
144 %ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
145 %ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
146 %ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
147 br label %vector.body
; Vector loop: two elements per iteration. The two icmp/select pairs implement
; max(x, -2147483648) followed by min(x, 2147483647) on the i64 lanes.
149 vector.body: ; preds = %vector.body, %vector.ph
150 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
151 %next.gep = getelementptr i32, i32* %pSrcA, i32 %index
152 %next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
153 %next.gep19 = getelementptr i32, i32* %pDst, i32 %index
154 %0 = bitcast i32* %next.gep to <2 x i32>*
155 %wide.load = load <2 x i32>, <2 x i32>* %0, align 4
156 %1 = sext <2 x i32> %wide.load to <2 x i64>
157 %2 = bitcast i32* %next.gep18 to <2 x i32>*
158 %wide.load20 = load <2 x i32>, <2 x i32>* %2, align 4
159 %3 = sext <2 x i32> %wide.load20 to <2 x i64>
160 %4 = mul nsw <2 x i64> %3, %1
161 %5 = ashr <2 x i64> %4, <i64 31, i64 31>
162 %6 = icmp sgt <2 x i64> %5, <i64 -2147483648, i64 -2147483648>
163 %7 = select <2 x i1> %6, <2 x i64> %5, <2 x i64> <i64 -2147483648, i64 -2147483648>
164 %8 = icmp slt <2 x i64> %7, <i64 2147483647, i64 2147483647>
165 %9 = select <2 x i1> %8, <2 x i64> %7, <2 x i64> <i64 2147483647, i64 2147483647>
166 %10 = trunc <2 x i64> %9 to <2 x i32>
167 %11 = bitcast i32* %next.gep19 to <2 x i32>*
168 store <2 x i32> %10, <2 x i32>* %11, align 4
169 %index.next = add i32 %index, 2
170 %12 = icmp eq i32 %index.next, %n.vec
171 br i1 %12, label %middle.block, label %vector.body
; Skip the scalar loop when the vector loop already covered all %N elements.
173 middle.block: ; preds = %vector.body
174 %cmp.n = icmp eq i32 %n.vec, %N
175 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
; Starting index and pointers for the scalar loop: the original arguments when
; coming from %entry, or the post-vector-loop values when coming from
; %middle.block.
177 for.body.preheader: ; preds = %entry, %middle.block
178 %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
179 %pSrcA.addr.011.ph = phi i32* [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
180 %pSrcB.addr.010.ph = phi i32* [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
181 %pDst.addr.09.ph = phi i32* [ %pDst, %entry ], [ %ind.end17, %middle.block ]
184 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: the same sext/mul/ashr/clamp/trunc sequence, one element per
; iteration, until the index reaches %N.
187 for.body: ; preds = %for.body.preheader, %for.body
188 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
189 %pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
190 %pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
191 %pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
192 %incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
193 %13 = load i32, i32* %pSrcA.addr.011, align 4
194 %conv = sext i32 %13 to i64
195 %incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
196 %14 = load i32, i32* %pSrcB.addr.010, align 4
197 %conv2 = sext i32 %14 to i64
198 %mul = mul nsw i64 %conv2, %conv
199 %shr = ashr i64 %mul, 31
200 %15 = icmp sgt i64 %shr, -2147483648
201 %.val.i = select i1 %15, i64 %shr, i64 -2147483648
202 %16 = icmp slt i64 %.val.i, 2147483647
203 %retval.0.i = select i1 %16, i64 %.val.i, i64 2147483647
204 %conv3 = trunc i64 %retval.0.i to i32
205 %incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
206 store i32 %conv3, i32* %pDst.addr.09, align 4
207 %inc = add nuw i32 %i.012, 1
208 %exitcond = icmp eq i32 %inc, %N
209 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Signed saturating Q31 multiply over %N elements, vectorized by four: inputs
; are sign-extended to i64, multiplied, arithmetically shifted right by 31,
; clamped to [-2147483648, 2147483647] and truncated to i32. Uses <4 x i32>
; loads/stores in the main loop and a scalar loop for the remaining elements
; (or for all elements when %N < 4).
212 define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
213 ; CHECK-LABEL: ssatmul_4_q31:
214 ; CHECK: @ %bb.0: @ %entry
215 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
216 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
217 ; CHECK-NEXT: .pad #4
218 ; CHECK-NEXT: sub sp, #4
219 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
220 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
221 ; CHECK-NEXT: .pad #16
222 ; CHECK-NEXT: sub sp, #16
223 ; CHECK-NEXT: cmp r3, #0
224 ; CHECK-NEXT: beq.w .LBB1_8
225 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
226 ; CHECK-NEXT: movs r7, #0
227 ; CHECK-NEXT: cmp r3, #3
228 ; CHECK-NEXT: bhi .LBB1_3
229 ; CHECK-NEXT: @ %bb.2:
230 ; CHECK-NEXT: mov r12, r0
231 ; CHECK-NEXT: mov r9, r1
232 ; CHECK-NEXT: mov r11, r2
233 ; CHECK-NEXT: b .LBB1_6
234 ; CHECK-NEXT: .LBB1_3: @ %vector.ph
235 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
236 ; CHECK-NEXT: bic r3, r3, #3
237 ; CHECK-NEXT: subs r7, r3, #4
238 ; CHECK-NEXT: adr r4, .LCPI1_0
239 ; CHECK-NEXT: movs r6, #1
240 ; CHECK-NEXT: vldrw.u32 q0, [r4]
241 ; CHECK-NEXT: adr r4, .LCPI1_1
242 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
243 ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
244 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2
245 ; CHECK-NEXT: add.w r9, r1, r3, lsl #2
246 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2
247 ; CHECK-NEXT: vldrw.u32 q1, [r4]
248 ; CHECK-NEXT: .LBB1_4: @ %vector.body
249 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
250 ; CHECK-NEXT: vldrw.u32 q3, [r1], #16
251 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16
252 ; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
253 ; CHECK-NEXT: mov.w r2, #-1
254 ; CHECK-NEXT: vmov.f32 s16, s10
255 ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
256 ; CHECK-NEXT: vmov.f32 s18, s11
257 ; CHECK-NEXT: vmov.f32 s20, s14
258 ; CHECK-NEXT: vmov.f32 s22, s15
259 ; CHECK-NEXT: vmullb.s32 q6, q5, q4
260 ; CHECK-NEXT: vmov.f32 s10, s9
261 ; CHECK-NEXT: vmov r4, r7, d13
262 ; CHECK-NEXT: asrl r4, r7, #31
263 ; CHECK-NEXT: vmov r6, s12
264 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
265 ; CHECK-NEXT: sbcs.w r5, r2, r7
266 ; CHECK-NEXT: mov.w r5, #0
268 ; CHECK-NEXT: movlt r5, #1
269 ; CHECK-NEXT: cmp r5, #0
270 ; CHECK-NEXT: vmov r10, r5, d12
271 ; CHECK-NEXT: csetm r8, ne
272 ; CHECK-NEXT: asrl r10, r5, #31
273 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648
274 ; CHECK-NEXT: vmov q6[2], q6[0], r10, r4
275 ; CHECK-NEXT: sbcs.w r3, r2, r5
276 ; CHECK-NEXT: vmov q6[3], q6[1], r5, r7
277 ; CHECK-NEXT: mov.w r3, #0
279 ; CHECK-NEXT: movlt r3, #1
280 ; CHECK-NEXT: cmp r3, #0
281 ; CHECK-NEXT: csetm r3, ne
282 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r8
283 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r8
284 ; CHECK-NEXT: mvn r8, #-2147483648
285 ; CHECK-NEXT: vbic q5, q0, q4
286 ; CHECK-NEXT: vand q4, q6, q4
287 ; CHECK-NEXT: vorr q4, q4, q5
288 ; CHECK-NEXT: vmov r3, r4, d8
289 ; CHECK-NEXT: subs.w r3, r3, r8
290 ; CHECK-NEXT: sbcs r3, r4, #0
291 ; CHECK-NEXT: vmov r4, r5, d9
292 ; CHECK-NEXT: mov.w r3, #0
294 ; CHECK-NEXT: movlt r3, #1
295 ; CHECK-NEXT: cmp r3, #0
296 ; CHECK-NEXT: csetm r3, ne
297 ; CHECK-NEXT: vmov.32 q5[1], r3
298 ; CHECK-NEXT: subs.w r4, r4, r8
299 ; CHECK-NEXT: sbcs r4, r5, #0
300 ; CHECK-NEXT: vmov r5, s8
301 ; CHECK-NEXT: mov.w r4, #0
303 ; CHECK-NEXT: movlt r4, #1
304 ; CHECK-NEXT: cmp r4, #0
305 ; CHECK-NEXT: csetm r4, ne
306 ; CHECK-NEXT: vmov q5[2], q5[0], r3, r4
307 ; CHECK-NEXT: vmov r3, s10
308 ; CHECK-NEXT: vmov.f32 s10, s13
309 ; CHECK-NEXT: vbic q6, q1, q5
310 ; CHECK-NEXT: vand q4, q4, q5
311 ; CHECK-NEXT: vorr q4, q4, q6
312 ; CHECK-NEXT: vmov r4, s10
313 ; CHECK-NEXT: smull r6, r5, r6, r5
314 ; CHECK-NEXT: asrl r6, r5, #31
315 ; CHECK-NEXT: smull r4, r7, r4, r3
316 ; CHECK-NEXT: asrl r4, r7, #31
317 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
318 ; CHECK-NEXT: vmov q5[2], q5[0], r6, r4
319 ; CHECK-NEXT: sbcs.w r3, r2, r7
320 ; CHECK-NEXT: vmov q5[3], q5[1], r5, r7
321 ; CHECK-NEXT: mov.w r3, #0
323 ; CHECK-NEXT: movlt r3, #1
324 ; CHECK-NEXT: cmp r3, #0
325 ; CHECK-NEXT: csetm r3, ne
326 ; CHECK-NEXT: rsbs.w r1, r6, #-2147483648
327 ; CHECK-NEXT: sbcs.w r1, r2, r5
328 ; CHECK-NEXT: mov.w r1, #0
330 ; CHECK-NEXT: movlt r1, #1
331 ; CHECK-NEXT: cmp r1, #0
332 ; CHECK-NEXT: csetm r1, ne
333 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r3
334 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r3
335 ; CHECK-NEXT: ldrd r1, r2, [sp, #8] @ 8-byte Folded Reload
336 ; CHECK-NEXT: vbic q3, q0, q2
337 ; CHECK-NEXT: vand q2, q5, q2
338 ; CHECK-NEXT: vorr q2, q2, q3
339 ; CHECK-NEXT: vmov r4, r3, d4
340 ; CHECK-NEXT: subs.w r4, r4, r8
341 ; CHECK-NEXT: sbcs r3, r3, #0
342 ; CHECK-NEXT: mov.w r3, #0
344 ; CHECK-NEXT: movlt r3, #1
345 ; CHECK-NEXT: cmp r3, #0
346 ; CHECK-NEXT: vmov r3, r4, d5
347 ; CHECK-NEXT: csetm r5, ne
348 ; CHECK-NEXT: vmov.32 q3[1], r5
349 ; CHECK-NEXT: subs.w r3, r3, r8
350 ; CHECK-NEXT: sbcs r3, r4, #0
351 ; CHECK-NEXT: mov.w r3, #0
353 ; CHECK-NEXT: movlt r3, #1
354 ; CHECK-NEXT: cmp r3, #0
355 ; CHECK-NEXT: csetm r3, ne
356 ; CHECK-NEXT: vmov q3[2], q3[0], r5, r3
357 ; CHECK-NEXT: vbic q5, q1, q3
358 ; CHECK-NEXT: vand q2, q2, q3
359 ; CHECK-NEXT: vorr q2, q2, q5
360 ; CHECK-NEXT: vmov.f32 s9, s10
361 ; CHECK-NEXT: vmov.f32 s10, s16
362 ; CHECK-NEXT: vmov.f32 s11, s18
363 ; CHECK-NEXT: vstrb.8 q2, [r2], #16
364 ; CHECK-NEXT: le lr, .LBB1_4
365 ; CHECK-NEXT: @ %bb.5: @ %middle.block
366 ; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload
367 ; CHECK-NEXT: cmp r7, r3
368 ; CHECK-NEXT: beq .LBB1_8
369 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21
370 ; CHECK-NEXT: sub.w lr, r3, r7
371 ; CHECK-NEXT: mov.w r0, #-1
372 ; CHECK-NEXT: mov.w r3, #-2147483648
373 ; CHECK-NEXT: mvn r2, #-2147483648
374 ; CHECK-NEXT: .LBB1_7: @ %for.body
375 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
376 ; CHECK-NEXT: ldr r1, [r12], #4
377 ; CHECK-NEXT: ldr r4, [r9], #4
378 ; CHECK-NEXT: smull r4, r1, r4, r1
379 ; CHECK-NEXT: asrl r4, r1, #31
380 ; CHECK-NEXT: subs r5, r3, r4
381 ; CHECK-NEXT: sbcs.w r5, r0, r1
382 ; CHECK-NEXT: mov.w r5, #0
384 ; CHECK-NEXT: movlt r5, #1
385 ; CHECK-NEXT: cmp r5, #0
386 ; CHECK-NEXT: csel r4, r4, r3, ne
387 ; CHECK-NEXT: csel r1, r1, r0, ne
388 ; CHECK-NEXT: subs r5, r4, r2
389 ; CHECK-NEXT: sbcs r1, r1, #0
390 ; CHECK-NEXT: csel r1, r4, r2, lt
391 ; CHECK-NEXT: str r1, [r11], #4
392 ; CHECK-NEXT: le lr, .LBB1_7
393 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
394 ; CHECK-NEXT: add sp, #16
395 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
396 ; CHECK-NEXT: add sp, #4
397 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
398 ; CHECK-NEXT: .p2align 4
399 ; CHECK-NEXT: @ %bb.9:
400 ; CHECK-NEXT: .LCPI1_0:
401 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
402 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
403 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
404 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
405 ; CHECK-NEXT: .LCPI1_1:
406 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
407 ; CHECK-NEXT: .long 0 @ 0x0
408 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
409 ; CHECK-NEXT: .long 0 @ 0x0
; Nothing to do when %N is zero.
411 %cmp8 = icmp eq i32 %N, 0
412 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
414 for.body.preheader: ; preds = %entry
415 %min.iters.check = icmp ult i32 %N, 4
416 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
; Starting index and pointers for the scalar loop: the original arguments when
; the vector loop was bypassed, or the post-vector-loop values otherwise.
418 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
419 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
420 %pSrcA.addr.011.ph = phi i32* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
421 %pSrcB.addr.010.ph = phi i32* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
422 %pDst.addr.09.ph = phi i32* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
; %n.vec is %N rounded down to a multiple of 4; the %ind.end* pointers are
; where the scalar loop resumes after the vector loop.
425 vector.ph: ; preds = %for.body.preheader
426 %n.vec = and i32 %N, -4
427 %ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
428 %ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
429 %ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
430 br label %vector.body
; Vector loop: four elements per iteration. The two icmp/select pairs implement
; max(x, -2147483648) followed by min(x, 2147483647) on the i64 lanes.
432 vector.body: ; preds = %vector.body, %vector.ph
433 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
434 %next.gep = getelementptr i32, i32* %pSrcA, i32 %index
435 %next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
436 %next.gep19 = getelementptr i32, i32* %pDst, i32 %index
437 %0 = bitcast i32* %next.gep to <4 x i32>*
438 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
439 %1 = sext <4 x i32> %wide.load to <4 x i64>
440 %2 = bitcast i32* %next.gep18 to <4 x i32>*
441 %wide.load20 = load <4 x i32>, <4 x i32>* %2, align 4
442 %3 = sext <4 x i32> %wide.load20 to <4 x i64>
443 %4 = mul nsw <4 x i64> %3, %1
444 %5 = ashr <4 x i64> %4, <i64 31, i64 31, i64 31, i64 31>
445 %6 = icmp sgt <4 x i64> %5, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
446 %7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
447 %8 = icmp slt <4 x i64> %7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
448 %9 = select <4 x i1> %8, <4 x i64> %7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
449 %10 = trunc <4 x i64> %9 to <4 x i32>
450 %11 = bitcast i32* %next.gep19 to <4 x i32>*
451 store <4 x i32> %10, <4 x i32>* %11, align 4
452 %index.next = add i32 %index, 4
453 %12 = icmp eq i32 %index.next, %n.vec
454 br i1 %12, label %middle.block, label %vector.body
; Skip the scalar loop when the vector loop already covered all %N elements.
456 middle.block: ; preds = %vector.body
457 %cmp.n = icmp eq i32 %n.vec, %N
458 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
460 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: the same sext/mul/ashr/clamp/trunc sequence, one element per
; iteration, until the index reaches %N.
463 for.body: ; preds = %for.body.preheader21, %for.body
464 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
465 %pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
466 %pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
467 %pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
468 %incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
469 %13 = load i32, i32* %pSrcA.addr.011, align 4
470 %conv = sext i32 %13 to i64
471 %incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
472 %14 = load i32, i32* %pSrcB.addr.010, align 4
473 %conv2 = sext i32 %14 to i64
474 %mul = mul nsw i64 %conv2, %conv
475 %shr = ashr i64 %mul, 31
476 %15 = icmp sgt i64 %shr, -2147483648
477 %.val.i = select i1 %15, i64 %shr, i64 -2147483648
478 %16 = icmp slt i64 %.val.i, 2147483647
479 %retval.0.i = select i1 %16, i64 %.val.i, i64 2147483647
480 %conv3 = trunc i64 %retval.0.i to i32
481 %incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
482 store i32 %conv3, i32* %pDst.addr.09, align 4
483 %inc = add nuw i32 %i.012, 1
484 %exitcond = icmp eq i32 %inc, %N
485 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Signed saturating Q31 multiply over %N elements using a single 4-wide loop
; with masked loads and a masked store instead of a scalar remainder loop. Each
; lane is sign-extended to i64, multiplied, arithmetically shifted right by 31,
; clamped to [-2147483648, 2147483647] and truncated to i32.
488 define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
489 ; CHECK-LABEL: ssatmul_4t_q31:
490 ; CHECK: @ %bb.0: @ %entry
491 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
492 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
493 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
494 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
495 ; CHECK-NEXT: .pad #16
496 ; CHECK-NEXT: sub sp, #16
497 ; CHECK-NEXT: cmp r3, #0
498 ; CHECK-NEXT: beq.w .LBB2_3
499 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
500 ; CHECK-NEXT: adds r7, r3, #3
501 ; CHECK-NEXT: movs r6, #1
502 ; CHECK-NEXT: bic r7, r7, #3
503 ; CHECK-NEXT: adr r4, .LCPI2_1
504 ; CHECK-NEXT: subs r7, #4
505 ; CHECK-NEXT: adr r5, .LCPI2_2
506 ; CHECK-NEXT: vldrw.u32 q2, [r4]
507 ; CHECK-NEXT: vldrw.u32 q3, [r5]
508 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
509 ; CHECK-NEXT: adr r6, .LCPI2_0
510 ; CHECK-NEXT: subs r7, r3, #1
511 ; CHECK-NEXT: vldrw.u32 q0, [r6]
512 ; CHECK-NEXT: mov.w r9, #0
513 ; CHECK-NEXT: vdup.32 q1, r7
514 ; CHECK-NEXT: mov.w r12, #-1
515 ; CHECK-NEXT: mvn r8, #-2147483648
516 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
517 ; CHECK-NEXT: .LBB2_2: @ %vector.body
518 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
519 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
520 ; CHECK-NEXT: vdup.32 q4, r9
521 ; CHECK-NEXT: add.w r9, r9, #4
522 ; CHECK-NEXT: vorr q4, q4, q0
523 ; CHECK-NEXT: vpt.u32 cs, q1, q4
524 ; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
526 ; CHECK-NEXT: vldrwt.u32 q5, [r1], #16
527 ; CHECK-NEXT: vmov.f32 s24, s18
528 ; CHECK-NEXT: vmov.f32 s26, s19
529 ; CHECK-NEXT: vmov.f32 s28, s22
530 ; CHECK-NEXT: vmov.f32 s30, s23
531 ; CHECK-NEXT: vmullb.s32 q0, q7, q6
532 ; CHECK-NEXT: vmov r6, r5, d1
533 ; CHECK-NEXT: asrl r6, r5, #31
534 ; CHECK-NEXT: rsbs.w r7, r6, #-2147483648
535 ; CHECK-NEXT: sbcs.w r7, r12, r5
536 ; CHECK-NEXT: mov.w r7, #0
538 ; CHECK-NEXT: movlt r7, #1
539 ; CHECK-NEXT: cmp r7, #0
540 ; CHECK-NEXT: vmov r4, r7, d0
541 ; CHECK-NEXT: csetm r10, ne
542 ; CHECK-NEXT: asrl r4, r7, #31
543 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
544 ; CHECK-NEXT: vmov q7[2], q7[0], r4, r6
545 ; CHECK-NEXT: sbcs.w r3, r12, r7
546 ; CHECK-NEXT: vmov q7[3], q7[1], r7, r5
547 ; CHECK-NEXT: mov.w r3, #0
548 ; CHECK-NEXT: vmov r7, s20
550 ; CHECK-NEXT: movlt r3, #1
551 ; CHECK-NEXT: cmp r3, #0
552 ; CHECK-NEXT: csetm r3, ne
553 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r10
554 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r10
555 ; CHECK-NEXT: vbic q6, q2, q0
556 ; CHECK-NEXT: vand q0, q7, q0
557 ; CHECK-NEXT: vorr q6, q0, q6
558 ; CHECK-NEXT: vmov r3, r4, d12
559 ; CHECK-NEXT: subs.w r3, r3, r8
560 ; CHECK-NEXT: sbcs r3, r4, #0
561 ; CHECK-NEXT: vmov r4, r5, d13
562 ; CHECK-NEXT: mov.w r3, #0
564 ; CHECK-NEXT: movlt r3, #1
565 ; CHECK-NEXT: cmp r3, #0
566 ; CHECK-NEXT: csetm r3, ne
567 ; CHECK-NEXT: vmov.32 q0[1], r3
568 ; CHECK-NEXT: subs.w r4, r4, r8
569 ; CHECK-NEXT: sbcs r4, r5, #0
570 ; CHECK-NEXT: mov.w r4, #0
572 ; CHECK-NEXT: movlt r4, #1
573 ; CHECK-NEXT: cmp r4, #0
574 ; CHECK-NEXT: csetm r4, ne
575 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r4
576 ; CHECK-NEXT: vbic q7, q3, q0
577 ; CHECK-NEXT: vand q0, q6, q0
578 ; CHECK-NEXT: vorr q6, q0, q7
579 ; CHECK-NEXT: vmov.f32 s2, s17
580 ; CHECK-NEXT: vmov r3, s2
581 ; CHECK-NEXT: vmov.f32 s2, s21
582 ; CHECK-NEXT: vmov r4, s2
583 ; CHECK-NEXT: smull r6, r5, r4, r3
584 ; CHECK-NEXT: vmov r4, s16
585 ; CHECK-NEXT: asrl r6, r5, #31
586 ; CHECK-NEXT: rsbs.w r3, r6, #-2147483648
587 ; CHECK-NEXT: sbcs.w r3, r12, r5
588 ; CHECK-NEXT: mov.w r3, #0
590 ; CHECK-NEXT: movlt r3, #1
591 ; CHECK-NEXT: cmp r3, #0
592 ; CHECK-NEXT: csetm r10, ne
593 ; CHECK-NEXT: smull r4, r7, r7, r4
594 ; CHECK-NEXT: asrl r4, r7, #31
595 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
596 ; CHECK-NEXT: vmov q5[2], q5[0], r4, r6
597 ; CHECK-NEXT: sbcs.w r3, r12, r7
598 ; CHECK-NEXT: vmov q5[3], q5[1], r7, r5
599 ; CHECK-NEXT: mov.w r3, #0
601 ; CHECK-NEXT: movlt r3, #1
602 ; CHECK-NEXT: cmp r3, #0
603 ; CHECK-NEXT: csetm r3, ne
604 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r10
605 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r10
606 ; CHECK-NEXT: vbic q4, q2, q0
607 ; CHECK-NEXT: vand q0, q5, q0
608 ; CHECK-NEXT: vorr q4, q0, q4
609 ; CHECK-NEXT: vmov r3, r4, d8
610 ; CHECK-NEXT: subs.w r3, r3, r8
611 ; CHECK-NEXT: sbcs r3, r4, #0
612 ; CHECK-NEXT: vmov r4, r5, d9
613 ; CHECK-NEXT: mov.w r3, #0
615 ; CHECK-NEXT: movlt r3, #1
616 ; CHECK-NEXT: cmp r3, #0
617 ; CHECK-NEXT: csetm r3, ne
618 ; CHECK-NEXT: vmov.32 q0[1], r3
619 ; CHECK-NEXT: subs.w r4, r4, r8
620 ; CHECK-NEXT: sbcs r4, r5, #0
621 ; CHECK-NEXT: mov.w r4, #0
623 ; CHECK-NEXT: movlt r4, #1
624 ; CHECK-NEXT: cmp r4, #0
625 ; CHECK-NEXT: csetm r4, ne
626 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r4
627 ; CHECK-NEXT: vbic q5, q3, q0
628 ; CHECK-NEXT: vand q0, q4, q0
629 ; CHECK-NEXT: vorr q0, q0, q5
630 ; CHECK-NEXT: vmov.f32 s1, s2
631 ; CHECK-NEXT: vmov.f32 s2, s24
632 ; CHECK-NEXT: vmov.f32 s3, s26
634 ; CHECK-NEXT: vstrwt.32 q0, [r2], #16
635 ; CHECK-NEXT: le lr, .LBB2_2
636 ; CHECK-NEXT: .LBB2_3: @ %for.cond.cleanup
637 ; CHECK-NEXT: add sp, #16
638 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
639 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
640 ; CHECK-NEXT: .p2align 4
641 ; CHECK-NEXT: @ %bb.4:
642 ; CHECK-NEXT: .LCPI2_0:
643 ; CHECK-NEXT: .long 0 @ 0x0
644 ; CHECK-NEXT: .long 1 @ 0x1
645 ; CHECK-NEXT: .long 2 @ 0x2
646 ; CHECK-NEXT: .long 3 @ 0x3
647 ; CHECK-NEXT: .LCPI2_1:
648 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
649 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
650 ; CHECK-NEXT: .long 2147483648 @ 0x80000000
651 ; CHECK-NEXT: .long 4294967295 @ 0xffffffff
652 ; CHECK-NEXT: .LCPI2_2:
653 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
654 ; CHECK-NEXT: .long 0 @ 0x0
655 ; CHECK-NEXT: .long 2147483647 @ 0x7fffffff
656 ; CHECK-NEXT: .long 0 @ 0x0
; Nothing to do when %N is zero.
658 %cmp8 = icmp eq i32 %N, 0
659 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
; %n.vec is %N rounded up to a multiple of 4 (the loop bound); %broadcast.splat21
; holds %N - 1 in every lane and is the upper limit for active lane indices.
661 vector.ph: ; preds = %entry
662 %n.rnd.up = add i32 %N, 3
663 %n.vec = and i32 %n.rnd.up, -4
664 %trip.count.minus.1 = add i32 %N, -1
665 %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
666 %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
667 br label %vector.body
; Each iteration builds the per-lane element indices (%index is always a
; multiple of 4, so or-ing in <0,1,2,3> yields %index + lane), derives the lane
; mask %0 = (index <= %N - 1), and uses it for both masked loads and the masked
; store. The two icmp/select pairs implement max(x, -2147483648) followed by
; min(x, 2147483647) on the i64 lanes.
669 vector.body: ; preds = %vector.body, %vector.ph
670 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
671 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
672 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
673 %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
674 %next.gep = getelementptr i32, i32* %pSrcA, i32 %index
675 %next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
676 %next.gep19 = getelementptr i32, i32* %pDst, i32 %index
677 %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
678 %1 = bitcast i32* %next.gep to <4 x i32>*
679 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> undef)
680 %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
681 %3 = bitcast i32* %next.gep18 to <4 x i32>*
682 %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> undef)
683 %4 = sext <4 x i32> %wide.masked.load22 to <4 x i64>
684 %5 = mul nsw <4 x i64> %4, %2
685 %6 = ashr <4 x i64> %5, <i64 31, i64 31, i64 31, i64 31>
686 %7 = icmp sgt <4 x i64> %6, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
687 %8 = select <4 x i1> %7, <4 x i64> %6, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
688 %9 = icmp slt <4 x i64> %8, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
689 %10 = select <4 x i1> %9, <4 x i64> %8, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
690 %11 = trunc <4 x i64> %10 to <4 x i32>
691 %12 = bitcast i32* %next.gep19 to <4 x i32>*
692 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %11, <4 x i32>* %12, i32 4, <4 x i1> %0)
693 %index.next = add i32 %index, 4
694 %13 = icmp eq i32 %index.next, %n.vec
695 br i1 %13, label %for.cond.cleanup, label %vector.body
697 for.cond.cleanup: ; preds = %vector.body, %entry
; Unsigned saturating multiply over %N elements: each pair of i32 inputs from
; %pSrcA and %pSrcB is zero-extended to i64, multiplied, logically shifted right
; by 31, clamped to at most 4294967295 and truncated to i32 before being stored
; to %pDst. The main loop handles two elements per iteration (<2 x i32>
; loads/stores) and is followed by a scalar loop for any remainder.
701 define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
702 ; CHECK-LABEL: usatmul_2_q31:
703 ; CHECK: @ %bb.0: @ %entry
704 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
705 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
706 ; CHECK-NEXT: .pad #4
707 ; CHECK-NEXT: sub sp, #4
708 ; CHECK-NEXT: cmp r3, #0
709 ; CHECK-NEXT: beq .LBB3_8
710 ; CHECK-NEXT: @ %bb.1: @ %entry
711 ; CHECK-NEXT: cmp r3, #1
712 ; CHECK-NEXT: bne .LBB3_3
713 ; CHECK-NEXT: @ %bb.2:
714 ; CHECK-NEXT: movs r7, #0
715 ; CHECK-NEXT: mov r12, r0
716 ; CHECK-NEXT: mov r11, r1
717 ; CHECK-NEXT: mov r8, r2
718 ; CHECK-NEXT: b .LBB3_6
719 ; CHECK-NEXT: .LBB3_3: @ %vector.ph
720 ; CHECK-NEXT: bic r5, r3, #1
721 ; CHECK-NEXT: movs r6, #1
722 ; CHECK-NEXT: subs r7, r5, #2
723 ; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
724 ; CHECK-NEXT: add.w r8, r2, r5, lsl #2
725 ; CHECK-NEXT: add.w r11, r1, r5, lsl #2
726 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1
727 ; CHECK-NEXT: add.w r12, r0, r5, lsl #2
728 ; CHECK-NEXT: .LBB3_4: @ %vector.body
729 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
730 ; CHECK-NEXT: ldrd r4, r7, [r0], #8
731 ; CHECK-NEXT: ldrd r5, r10, [r1], #8
732 ; CHECK-NEXT: umull r4, r5, r5, r4
733 ; CHECK-NEXT: lsrl r4, r5, #31
734 ; CHECK-NEXT: subs.w r6, r4, #-1
735 ; CHECK-NEXT: umull r6, r7, r10, r7
736 ; CHECK-NEXT: sbcs r5, r5, #0
737 ; CHECK-NEXT: mov.w r5, #0
739 ; CHECK-NEXT: movlo r5, #1
740 ; CHECK-NEXT: cmp r5, #0
741 ; CHECK-NEXT: lsrl r6, r7, #31
742 ; CHECK-NEXT: csetm r9, ne
743 ; CHECK-NEXT: subs.w r5, r6, #-1
744 ; CHECK-NEXT: vmov.32 q0[1], r9
745 ; CHECK-NEXT: sbcs r5, r7, #0
746 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r6
747 ; CHECK-NEXT: mov.w r5, #0
749 ; CHECK-NEXT: movlo r5, #1
750 ; CHECK-NEXT: cmp r5, #0
751 ; CHECK-NEXT: csetm r5, ne
752 ; CHECK-NEXT: vmov q0[2], q0[0], r9, r5
753 ; CHECK-NEXT: vand q1, q1, q0
754 ; CHECK-NEXT: vorn q0, q1, q0
755 ; CHECK-NEXT: vmov r4, s2
756 ; CHECK-NEXT: vmov r5, s0
757 ; CHECK-NEXT: strd r5, r4, [r2], #8
758 ; CHECK-NEXT: le lr, .LBB3_4
759 ; CHECK-NEXT: @ %bb.5: @ %middle.block
760 ; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload
761 ; CHECK-NEXT: cmp r7, r3
762 ; CHECK-NEXT: beq .LBB3_8
763 ; CHECK-NEXT: .LBB3_6: @ %for.body.preheader
764 ; CHECK-NEXT: sub.w lr, r3, r7
765 ; CHECK-NEXT: .LBB3_7: @ %for.body
766 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
767 ; CHECK-NEXT: ldr r0, [r12], #4
768 ; CHECK-NEXT: ldr r1, [r11], #4
769 ; CHECK-NEXT: umull r0, r1, r1, r0
770 ; CHECK-NEXT: lsrl r0, r1, #31
771 ; CHECK-NEXT: subs.w r2, r0, #-1
772 ; CHECK-NEXT: sbcs r1, r1, #0
774 ; CHECK-NEXT: movhs.w r0, #-1
775 ; CHECK-NEXT: str r0, [r8], #4
776 ; CHECK-NEXT: le lr, .LBB3_7
777 ; CHECK-NEXT: .LBB3_8: @ %for.cond.cleanup
778 ; CHECK-NEXT: add sp, #4
779 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; Dispatch on %N: 0 exits immediately, 1 goes straight to the scalar loop,
; any other value enters the 2-wide vector loop.
781 switch i32 %N, label %vector.ph [
782 i32 0, label %for.cond.cleanup
783 i32 1, label %for.body.preheader
; %n.vec is %N rounded down to a multiple of 2; the %ind.end* pointers are
; where the scalar loop resumes after the vector loop.
786 vector.ph: ; preds = %entry
787 %n.vec = and i32 %N, -2
788 %ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
789 %ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
790 %ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
791 br label %vector.body
; Vector loop: two elements per iteration. The icmp ult/select pair implements
; an unsigned min(x, 4294967295) on the i64 lanes.
793 vector.body: ; preds = %vector.body, %vector.ph
794 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
795 %next.gep = getelementptr i32, i32* %pSrcA, i32 %index
796 %next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
797 %next.gep19 = getelementptr i32, i32* %pDst, i32 %index
798 %0 = bitcast i32* %next.gep to <2 x i32>*
799 %wide.load = load <2 x i32>, <2 x i32>* %0, align 4
800 %1 = zext <2 x i32> %wide.load to <2 x i64>
801 %2 = bitcast i32* %next.gep18 to <2 x i32>*
802 %wide.load20 = load <2 x i32>, <2 x i32>* %2, align 4
803 %3 = zext <2 x i32> %wide.load20 to <2 x i64>
804 %4 = mul nuw <2 x i64> %3, %1
805 %5 = lshr <2 x i64> %4, <i64 31, i64 31>
806 %6 = icmp ult <2 x i64> %5, <i64 4294967295, i64 4294967295>
807 %7 = select <2 x i1> %6, <2 x i64> %5, <2 x i64> <i64 4294967295, i64 4294967295>
808 %8 = trunc <2 x i64> %7 to <2 x i32>
809 %9 = bitcast i32* %next.gep19 to <2 x i32>*
810 store <2 x i32> %8, <2 x i32>* %9, align 4
811 %index.next = add i32 %index, 2
812 %10 = icmp eq i32 %index.next, %n.vec
813 br i1 %10, label %middle.block, label %vector.body
; Skip the scalar loop when the vector loop already covered all %N elements.
815 middle.block: ; preds = %vector.body
816 %cmp.n = icmp eq i32 %n.vec, %N
817 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
; Starting index and pointers for the scalar loop: the original arguments when
; coming from %entry, or the post-vector-loop values when coming from
; %middle.block.
819 for.body.preheader: ; preds = %entry, %middle.block
820 %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
821 %pSrcA.addr.011.ph = phi i32* [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
822 %pSrcB.addr.010.ph = phi i32* [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
823 %pDst.addr.09.ph = phi i32* [ %pDst, %entry ], [ %ind.end17, %middle.block ]
826 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: the same zext/mul/lshr/clamp/trunc sequence, one element per
; iteration, until the index reaches %N.
829 for.body: ; preds = %for.body.preheader, %for.body
830 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
831 %pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
832 %pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
833 %pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
834 %incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
835 %11 = load i32, i32* %pSrcA.addr.011, align 4
836 %conv = zext i32 %11 to i64
837 %incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
838 %12 = load i32, i32* %pSrcB.addr.010, align 4
839 %conv2 = zext i32 %12 to i64
840 %mul = mul nuw i64 %conv2, %conv
841 %shr = lshr i64 %mul, 31
842 %13 = icmp ult i64 %shr, 4294967295
843 %retval.0.i = select i1 %13, i64 %shr, i64 4294967295
844 %conv3 = trunc i64 %retval.0.i to i32
845 %incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
846 store i32 %conv3, i32* %pDst.addr.09, align 4
847 %inc = add nuw i32 %i.012, 1
848 %exitcond = icmp eq i32 %inc, %N
849 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; usatmul_4_q31: element-wise unsigned multiply of two i32 arrays. Each pair is
; zero-extended to i64, multiplied, shifted right by 31, clamped to 4294967295
; (unsigned min) and truncated back to i32 before being stored.
;   %pSrcA, %pSrcB - read-only inputs; %pDst - noalias output; %N - element count.
; The IR is pre-vectorized by 4: vector.body covers N & -4 elements and for.body
; is the scalar loop for the remainder (or for everything when N < 4).
; The expected vector body below widens with vmullb.u32 and then does the shift
; and compare per 64-bit lane in core registers (lsrl / subs / sbcs / csetm),
; rebuilding the vector with vmov / vand / vorn.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
852 define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) {
853 ; CHECK-LABEL: usatmul_4_q31:
854 ; CHECK: @ %bb.0: @ %entry
855 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
856 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
857 ; CHECK-NEXT: .pad #4
858 ; CHECK-NEXT: sub sp, #4
859 ; CHECK-NEXT: .vsave {d8, d9}
860 ; CHECK-NEXT: vpush {d8, d9}
861 ; CHECK-NEXT: cmp r3, #0
862 ; CHECK-NEXT: beq.w .LBB4_8
863 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
864 ; CHECK-NEXT: mov.w r8, #0
865 ; CHECK-NEXT: cmp r3, #3
866 ; CHECK-NEXT: bhi .LBB4_3
867 ; CHECK-NEXT: @ %bb.2:
868 ; CHECK-NEXT: mov r12, r0
869 ; CHECK-NEXT: mov r9, r1
870 ; CHECK-NEXT: mov r10, r2
871 ; CHECK-NEXT: b .LBB4_6
872 ; CHECK-NEXT: .LBB4_3: @ %vector.ph
873 ; CHECK-NEXT: bic r8, r3, #3
874 ; CHECK-NEXT: movs r6, #1
875 ; CHECK-NEXT: sub.w r7, r8, #4
876 ; CHECK-NEXT: add.w r10, r2, r8, lsl #2
877 ; CHECK-NEXT: add.w r9, r1, r8, lsl #2
878 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
879 ; CHECK-NEXT: add.w r12, r0, r8, lsl #2
880 ; CHECK-NEXT: .LBB4_4: @ %vector.body
881 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
882 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
883 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16
884 ; CHECK-NEXT: vmov.f32 s4, s2
885 ; CHECK-NEXT: vmov.f32 s12, s10
886 ; CHECK-NEXT: vmov.f32 s6, s3
887 ; CHECK-NEXT: vmov.f32 s14, s11
888 ; CHECK-NEXT: vmullb.u32 q4, q3, q1
889 ; CHECK-NEXT: vmov.f32 s2, s1
890 ; CHECK-NEXT: vmov r4, r5, d8
891 ; CHECK-NEXT: lsrl r4, r5, #31
892 ; CHECK-NEXT: vmov.f32 s10, s9
893 ; CHECK-NEXT: subs.w r6, r4, #-1
894 ; CHECK-NEXT: sbcs r5, r5, #0
895 ; CHECK-NEXT: vmov r6, r7, d9
896 ; CHECK-NEXT: mov.w r5, #0
897 ; CHECK-NEXT: lsrl r6, r7, #31
899 ; CHECK-NEXT: movlo r5, #1
900 ; CHECK-NEXT: cmp r5, #0
901 ; CHECK-NEXT: csetm r11, ne
902 ; CHECK-NEXT: subs.w r5, r6, #-1
903 ; CHECK-NEXT: sbcs r5, r7, #0
904 ; CHECK-NEXT: vmov.32 q1[1], r11
905 ; CHECK-NEXT: mov.w r5, #0
906 ; CHECK-NEXT: vmov q3[2], q3[0], r4, r6
908 ; CHECK-NEXT: movlo r5, #1
909 ; CHECK-NEXT: cmp r5, #0
910 ; CHECK-NEXT: csetm r5, ne
911 ; CHECK-NEXT: vmov q1[2], q1[0], r11, r5
912 ; CHECK-NEXT: vand q3, q3, q1
913 ; CHECK-NEXT: vorn q1, q3, q1
914 ; CHECK-NEXT: vmullb.u32 q3, q2, q0
915 ; CHECK-NEXT: vmov r4, r5, d6
916 ; CHECK-NEXT: lsrl r4, r5, #31
917 ; CHECK-NEXT: subs.w r6, r4, #-1
918 ; CHECK-NEXT: sbcs r5, r5, #0
919 ; CHECK-NEXT: vmov r6, r7, d7
920 ; CHECK-NEXT: mov.w r5, #0
921 ; CHECK-NEXT: lsrl r6, r7, #31
923 ; CHECK-NEXT: movlo r5, #1
924 ; CHECK-NEXT: cmp r5, #0
925 ; CHECK-NEXT: csetm r11, ne
926 ; CHECK-NEXT: subs.w r5, r6, #-1
927 ; CHECK-NEXT: sbcs r5, r7, #0
928 ; CHECK-NEXT: vmov.32 q0[1], r11
929 ; CHECK-NEXT: mov.w r5, #0
930 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r6
932 ; CHECK-NEXT: movlo r5, #1
933 ; CHECK-NEXT: cmp r5, #0
934 ; CHECK-NEXT: csetm r5, ne
935 ; CHECK-NEXT: vmov q0[2], q0[0], r11, r5
936 ; CHECK-NEXT: vand q2, q2, q0
937 ; CHECK-NEXT: vorn q0, q2, q0
938 ; CHECK-NEXT: vmov.f32 s1, s2
939 ; CHECK-NEXT: vmov.f32 s2, s4
940 ; CHECK-NEXT: vmov.f32 s3, s6
941 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
942 ; CHECK-NEXT: le lr, .LBB4_4
943 ; CHECK-NEXT: @ %bb.5: @ %middle.block
944 ; CHECK-NEXT: cmp r8, r3
945 ; CHECK-NEXT: beq .LBB4_8
946 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21
947 ; CHECK-NEXT: sub.w lr, r3, r8
948 ; CHECK-NEXT: .LBB4_7: @ %for.body
949 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
950 ; CHECK-NEXT: ldr r0, [r12], #4
951 ; CHECK-NEXT: ldr r1, [r9], #4
952 ; CHECK-NEXT: umull r0, r1, r1, r0
953 ; CHECK-NEXT: lsrl r0, r1, #31
954 ; CHECK-NEXT: subs.w r2, r0, #-1
955 ; CHECK-NEXT: sbcs r1, r1, #0
957 ; CHECK-NEXT: movhs.w r0, #-1
958 ; CHECK-NEXT: str r0, [r10], #4
959 ; CHECK-NEXT: le lr, .LBB4_7
960 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup
961 ; CHECK-NEXT: vpop {d8, d9}
962 ; CHECK-NEXT: add sp, #4
963 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; N == 0 goes straight to the exit block.
965 %cmp8 = icmp eq i32 %N, 0
966 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
968 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: skip the vector loop and run only the scalar loop.
969 %min.iters.check = icmp ult i32 %N, 4
970 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
972 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Start state of the scalar loop: either the original arguments or the
; positions where the vector loop stopped.
973 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
974 %pSrcA.addr.011.ph = phi i32* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
975 %pSrcB.addr.010.ph = phi i32* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
976 %pDst.addr.09.ph = phi i32* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
979 vector.ph: ; preds = %for.body.preheader
; n.vec = N rounded down to a multiple of 4; ind.end* are the three pointers
; advanced by n.vec elements, used when resuming in the scalar loop.
980 %n.vec = and i32 %N, -4
981 %ind.end = getelementptr i32, i32* %pSrcA, i32 %n.vec
982 %ind.end15 = getelementptr i32, i32* %pSrcB, i32 %n.vec
983 %ind.end17 = getelementptr i32, i32* %pDst, i32 %n.vec
984 br label %vector.body
986 vector.body: ; preds = %vector.body, %vector.ph
987 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
988 %next.gep = getelementptr i32, i32* %pSrcA, i32 %index
989 %next.gep18 = getelementptr i32, i32* %pSrcB, i32 %index
990 %next.gep19 = getelementptr i32, i32* %pDst, i32 %index
991 %0 = bitcast i32* %next.gep to <4 x i32>*
992 %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
993 %1 = zext <4 x i32> %wide.load to <4 x i64>
994 %2 = bitcast i32* %next.gep18 to <4 x i32>*
995 %wide.load20 = load <4 x i32>, <4 x i32>* %2, align 4
996 %3 = zext <4 x i32> %wide.load20 to <4 x i64>
; 64-bit product of the zero-extended inputs, then logical shift right by 31.
997 %4 = mul nuw <4 x i64> %3, %1
998 %5 = lshr <4 x i64> %4, <i64 31, i64 31, i64 31, i64 31>
; Unsigned clamp: min(%5, 4294967295), so the truncate below cannot wrap.
999 %6 = icmp ult <4 x i64> %5, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1000 %7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1001 %8 = trunc <4 x i64> %7 to <4 x i32>
1002 %9 = bitcast i32* %next.gep19 to <4 x i32>*
1003 store <4 x i32> %8, <4 x i32>* %9, align 4
1004 %index.next = add i32 %index, 4
1005 %10 = icmp eq i32 %index.next, %n.vec
1006 br i1 %10, label %middle.block, label %vector.body
1008 middle.block: ; preds = %vector.body
; If N was a multiple of 4 there is no remainder to process.
1009 %cmp.n = icmp eq i32 %n.vec, %N
1010 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1012 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1015 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar loop: same computation as vector.body, one element per iteration.
1016 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1017 %pSrcA.addr.011 = phi i32* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1018 %pSrcB.addr.010 = phi i32* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1019 %pDst.addr.09 = phi i32* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1020 %incdec.ptr = getelementptr inbounds i32, i32* %pSrcA.addr.011, i32 1
1021 %11 = load i32, i32* %pSrcA.addr.011, align 4
1022 %conv = zext i32 %11 to i64
1023 %incdec.ptr1 = getelementptr inbounds i32, i32* %pSrcB.addr.010, i32 1
1024 %12 = load i32, i32* %pSrcB.addr.010, align 4
1025 %conv2 = zext i32 %12 to i64
1026 %mul = mul nuw i64 %conv2, %conv
1027 %shr = lshr i64 %mul, 31
1028 %13 = icmp ult i64 %shr, 4294967295
1029 %retval.0.i = select i1 %13, i64 %shr, i64 4294967295
1030 %conv3 = trunc i64 %retval.0.i to i32
1031 %incdec.ptr4 = getelementptr inbounds i32, i32* %pDst.addr.09, i32 1
1032 store i32 %conv3, i32* %pDst.addr.09, align 4
1033 %inc = add nuw i32 %i.012, 1
1034 %exitcond = icmp eq i32 %inc, %N
1035 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_4_q15: element-wise signed multiply of two i16 arrays. Each pair is
; sign-extended to i32, multiplied, arithmetically shifted right by 15, clamped
; to [-32768, 32767] and truncated back to i16 before being stored.
;   %pSrcA, %pSrcB - read-only inputs; %pDst - noalias output; %N - element count.
; The IR is pre-vectorized by 4: vector.body covers N & -4 elements and for.body
; is the scalar loop for the remainder (or for everything when N < 4).
; The expected vector body below is vldrh.s32 loads, vmul.i32, a single
; vqshrnb.s32 #15 and a vstrh.32 store; the scalar loop uses muls followed by
; ssat #16 with an asr #15 operand shift.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1041 define arm_aapcs_vfpcc void @ssatmul_4_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1042 ; CHECK-LABEL: ssatmul_4_q15:
1043 ; CHECK: @ %bb.0: @ %entry
1044 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1045 ; CHECK-NEXT: push {r4, r5, r6, lr}
1046 ; CHECK-NEXT: cbz r3, .LBB5_8
1047 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1048 ; CHECK-NEXT: cmp r3, #3
1049 ; CHECK-NEXT: bhi .LBB5_3
1050 ; CHECK-NEXT: @ %bb.2:
1051 ; CHECK-NEXT: movs r5, #0
1052 ; CHECK-NEXT: mov r12, r0
1053 ; CHECK-NEXT: mov r6, r1
1054 ; CHECK-NEXT: mov r4, r2
1055 ; CHECK-NEXT: b .LBB5_6
1056 ; CHECK-NEXT: .LBB5_3: @ %vector.ph
1057 ; CHECK-NEXT: bic r5, r3, #3
1058 ; CHECK-NEXT: movs r4, #1
1059 ; CHECK-NEXT: subs r6, r5, #4
1060 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1061 ; CHECK-NEXT: add.w lr, r4, r6, lsr #2
1062 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1063 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1064 ; CHECK-NEXT: .LBB5_4: @ %vector.body
1065 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1066 ; CHECK-NEXT: vldrh.s32 q0, [r0], #8
1067 ; CHECK-NEXT: vldrh.s32 q1, [r1], #8
1068 ; CHECK-NEXT: vmul.i32 q0, q1, q0
1069 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
1070 ; CHECK-NEXT: vstrh.32 q0, [r2], #8
1071 ; CHECK-NEXT: le lr, .LBB5_4
1072 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1073 ; CHECK-NEXT: cmp r5, r3
1075 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1076 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader21
1077 ; CHECK-NEXT: sub.w lr, r3, r5
1078 ; CHECK-NEXT: .LBB5_7: @ %for.body
1079 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1080 ; CHECK-NEXT: ldrsh r0, [r12], #2
1081 ; CHECK-NEXT: ldrsh r1, [r6], #2
1082 ; CHECK-NEXT: muls r0, r1, r0
1083 ; CHECK-NEXT: ssat r0, #16, r0, asr #15
1084 ; CHECK-NEXT: strh r0, [r4], #2
1085 ; CHECK-NEXT: le lr, .LBB5_7
1086 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup
1087 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; N == 0 goes straight to the exit block.
1089 %cmp8 = icmp eq i32 %N, 0
1090 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1092 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: skip the vector loop and run only the scalar loop.
1093 %min.iters.check = icmp ult i32 %N, 4
1094 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1096 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Start state of the scalar loop: either the original arguments or the
; positions where the vector loop stopped.
1097 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1098 %pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1099 %pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1100 %pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1103 vector.ph: ; preds = %for.body.preheader
; n.vec = N rounded down to a multiple of 4; ind.end* are the three pointers
; advanced by n.vec elements, used when resuming in the scalar loop.
1104 %n.vec = and i32 %N, -4
1105 %ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
1106 %ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
1107 %ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
1108 br label %vector.body
1110 vector.body: ; preds = %vector.body, %vector.ph
1111 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1112 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1113 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1114 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
1115 %0 = bitcast i16* %next.gep to <4 x i16>*
1116 %wide.load = load <4 x i16>, <4 x i16>* %0, align 2
1117 %1 = sext <4 x i16> %wide.load to <4 x i32>
1118 %2 = bitcast i16* %next.gep18 to <4 x i16>*
1119 %wide.load20 = load <4 x i16>, <4 x i16>* %2, align 2
1120 %3 = sext <4 x i16> %wide.load20 to <4 x i32>
; 32-bit product of the sign-extended inputs, then arithmetic shift right by 15.
1121 %4 = mul nsw <4 x i32> %3, %1
1122 %5 = ashr <4 x i32> %4, <i32 15, i32 15, i32 15, i32 15>
; Signed clamp, lower bound first: max(%5, -32768) ...
1123 %6 = icmp sgt <4 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1124 %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
; ... then the upper bound: min(%7, 32767).
1125 %8 = icmp slt <4 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767>
1126 %9 = select <4 x i1> %8, <4 x i32> %7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1127 %10 = trunc <4 x i32> %9 to <4 x i16>
1128 %11 = bitcast i16* %next.gep19 to <4 x i16>*
1129 store <4 x i16> %10, <4 x i16>* %11, align 2
1130 %index.next = add i32 %index, 4
1131 %12 = icmp eq i32 %index.next, %n.vec
1132 br i1 %12, label %middle.block, label %vector.body
1134 middle.block: ; preds = %vector.body
; If N was a multiple of 4 there is no remainder to process.
1135 %cmp.n = icmp eq i32 %n.vec, %N
1136 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1138 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1141 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar loop: same computation as vector.body, one element per iteration.
1142 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1143 %pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1144 %pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1145 %pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1146 %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
1147 %13 = load i16, i16* %pSrcA.addr.011, align 2
1148 %conv = sext i16 %13 to i32
1149 %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
1150 %14 = load i16, i16* %pSrcB.addr.010, align 2
1151 %conv2 = sext i16 %14 to i32
1152 %mul = mul nsw i32 %conv2, %conv
1153 %shr = ashr i32 %mul, 15
1154 %15 = icmp sgt i32 %shr, -32768
1155 %.val.i = select i1 %15, i32 %shr, i32 -32768
1156 %16 = icmp slt i32 %.val.i, 32767
1157 %retval.0.i = select i1 %16, i32 %.val.i, i32 32767
1158 %conv3 = trunc i32 %retval.0.i to i16
1159 %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
1160 store i16 %conv3, i16* %pDst.addr.09, align 2
1161 %inc = add nuw i32 %i.012, 1
1162 %exitcond = icmp eq i32 %inc, %N
1163 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_8_q15: element-wise signed multiply of two i16 arrays. Each pair is
; sign-extended to i32, multiplied, arithmetically shifted right by 15, clamped
; to [-32768, 32767] and truncated back to i16 before being stored.
;   %pSrcA, %pSrcB - read-only inputs; %pDst - noalias output; %N - element count.
; The IR is pre-vectorized by 8 using <8 x i32> intermediates: vector.body
; covers N & -8 elements and for.body is the scalar loop for the remainder (or
; for everything when N < 8).
; The expected vector body below splits the work into top/bottom halves:
; vmullt.s16 / vmullb.s16 for the widening multiply and vqshrnb.s32 /
; vqshrnt.s32 #15 for the narrowing shift.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1166 define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1167 ; CHECK-LABEL: ssatmul_8_q15:
1168 ; CHECK: @ %bb.0: @ %entry
1169 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1170 ; CHECK-NEXT: push {r4, r5, r6, lr}
1171 ; CHECK-NEXT: cbz r3, .LBB6_8
1172 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1173 ; CHECK-NEXT: cmp r3, #7
1174 ; CHECK-NEXT: bhi .LBB6_3
1175 ; CHECK-NEXT: @ %bb.2:
1176 ; CHECK-NEXT: movs r5, #0
1177 ; CHECK-NEXT: mov r12, r0
1178 ; CHECK-NEXT: mov r6, r1
1179 ; CHECK-NEXT: mov r4, r2
1180 ; CHECK-NEXT: b .LBB6_6
1181 ; CHECK-NEXT: .LBB6_3: @ %vector.ph
1182 ; CHECK-NEXT: bic r5, r3, #7
1183 ; CHECK-NEXT: movs r4, #1
1184 ; CHECK-NEXT: sub.w r6, r5, #8
1185 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1186 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
1187 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1188 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1189 ; CHECK-NEXT: .LBB6_4: @ %vector.body
1190 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1191 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1192 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1193 ; CHECK-NEXT: vmullt.s16 q2, q1, q0
1194 ; CHECK-NEXT: vmullb.s16 q0, q1, q0
1195 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
1196 ; CHECK-NEXT: vqshrnt.s32 q0, q2, #15
1197 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
1198 ; CHECK-NEXT: le lr, .LBB6_4
1199 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1200 ; CHECK-NEXT: cmp r5, r3
1202 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1203 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader21
1204 ; CHECK-NEXT: sub.w lr, r3, r5
1205 ; CHECK-NEXT: .LBB6_7: @ %for.body
1206 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1207 ; CHECK-NEXT: ldrsh r0, [r12], #2
1208 ; CHECK-NEXT: ldrsh r1, [r6], #2
1209 ; CHECK-NEXT: muls r0, r1, r0
1210 ; CHECK-NEXT: ssat r0, #16, r0, asr #15
1211 ; CHECK-NEXT: strh r0, [r4], #2
1212 ; CHECK-NEXT: le lr, .LBB6_7
1213 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup
1214 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; N == 0 goes straight to the exit block.
1216 %cmp8 = icmp eq i32 %N, 0
1217 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1219 for.body.preheader: ; preds = %entry
; Fewer than 8 elements: skip the vector loop and run only the scalar loop.
1220 %min.iters.check = icmp ult i32 %N, 8
1221 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1223 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Start state of the scalar loop: either the original arguments or the
; positions where the vector loop stopped.
1224 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1225 %pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1226 %pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1227 %pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1230 vector.ph: ; preds = %for.body.preheader
; n.vec = N rounded down to a multiple of 8; ind.end* are the three pointers
; advanced by n.vec elements, used when resuming in the scalar loop.
1231 %n.vec = and i32 %N, -8
1232 %ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
1233 %ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
1234 %ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
1235 br label %vector.body
1237 vector.body: ; preds = %vector.body, %vector.ph
1238 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1239 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1240 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1241 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
1242 %0 = bitcast i16* %next.gep to <8 x i16>*
1243 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
1244 %1 = sext <8 x i16> %wide.load to <8 x i32>
1245 %2 = bitcast i16* %next.gep18 to <8 x i16>*
1246 %wide.load20 = load <8 x i16>, <8 x i16>* %2, align 2
1247 %3 = sext <8 x i16> %wide.load20 to <8 x i32>
; 32-bit product of the sign-extended inputs, then arithmetic shift right by 15.
1248 %4 = mul nsw <8 x i32> %3, %1
1249 %5 = ashr <8 x i32> %4, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; Signed clamp, lower bound first: max(%5, -32768) ...
1250 %6 = icmp sgt <8 x i32> %5, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1251 %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
; ... then the upper bound: min(%7, 32767).
1252 %8 = icmp slt <8 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1253 %9 = select <8 x i1> %8, <8 x i32> %7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1254 %10 = trunc <8 x i32> %9 to <8 x i16>
1255 %11 = bitcast i16* %next.gep19 to <8 x i16>*
1256 store <8 x i16> %10, <8 x i16>* %11, align 2
1257 %index.next = add i32 %index, 8
1258 %12 = icmp eq i32 %index.next, %n.vec
1259 br i1 %12, label %middle.block, label %vector.body
1261 middle.block: ; preds = %vector.body
; If N was a multiple of 8 there is no remainder to process.
1262 %cmp.n = icmp eq i32 %n.vec, %N
1263 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1265 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1268 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar loop: same computation as vector.body, one element per iteration.
1269 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1270 %pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1271 %pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1272 %pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1273 %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
1274 %13 = load i16, i16* %pSrcA.addr.011, align 2
1275 %conv = sext i16 %13 to i32
1276 %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
1277 %14 = load i16, i16* %pSrcB.addr.010, align 2
1278 %conv2 = sext i16 %14 to i32
1279 %mul = mul nsw i32 %conv2, %conv
1280 %shr = ashr i32 %mul, 15
1281 %15 = icmp sgt i32 %shr, -32768
1282 %.val.i = select i1 %15, i32 %shr, i32 -32768
1283 %16 = icmp slt i32 %.val.i, 32767
1284 %retval.0.i = select i1 %16, i32 %.val.i, i32 32767
1285 %conv3 = trunc i32 %retval.0.i to i16
1286 %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
1287 store i16 %conv3, i16* %pDst.addr.09, align 2
1288 %inc = add nuw i32 %i.012, 1
1289 %exitcond = icmp eq i32 %inc, %N
1290 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_8i_q15: element-wise signed multiply of two i16 arrays with the same
; per-element result as the plain 8-wide form (sign-extend, multiply, ashr 15,
; clamp to [-32768, 32767], truncate), but with the vector body written in an
; explicitly interleaved shape: each <8 x i16> load is split into even and odd
; lanes by shufflevector, the two <4 x i32> halves are processed separately,
; and the results are re-interleaved by a final shufflevector before the store.
;   %pSrcA, %pSrcB - read-only inputs; %pDst - noalias output; %N - element count.
; vector.body covers N & -8 elements; for.body is the scalar loop for the
; remainder (or for everything when N < 8).
; The expected vector body below uses vmullt.s16 / vmullb.s16 followed by
; vqshrnb.s32 / vqshrnt.s32 #15, with no separate shuffle instructions.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1293 define arm_aapcs_vfpcc void @ssatmul_8i_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1294 ; CHECK-LABEL: ssatmul_8i_q15:
1295 ; CHECK: @ %bb.0: @ %entry
1296 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1297 ; CHECK-NEXT: push {r4, r5, r6, lr}
1298 ; CHECK-NEXT: cbz r3, .LBB7_8
1299 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1300 ; CHECK-NEXT: cmp r3, #7
1301 ; CHECK-NEXT: bhi .LBB7_3
1302 ; CHECK-NEXT: @ %bb.2:
1303 ; CHECK-NEXT: movs r5, #0
1304 ; CHECK-NEXT: mov r12, r0
1305 ; CHECK-NEXT: mov r6, r1
1306 ; CHECK-NEXT: mov r4, r2
1307 ; CHECK-NEXT: b .LBB7_6
1308 ; CHECK-NEXT: .LBB7_3: @ %vector.ph
1309 ; CHECK-NEXT: bic r5, r3, #7
1310 ; CHECK-NEXT: movs r4, #1
1311 ; CHECK-NEXT: sub.w r6, r5, #8
1312 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1313 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
1314 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1315 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1316 ; CHECK-NEXT: .LBB7_4: @ %vector.body
1317 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1318 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1319 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1320 ; CHECK-NEXT: vmullt.s16 q2, q1, q0
1321 ; CHECK-NEXT: vmullb.s16 q0, q1, q0
1322 ; CHECK-NEXT: vqshrnb.s32 q0, q0, #15
1323 ; CHECK-NEXT: vqshrnt.s32 q0, q2, #15
1324 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
1325 ; CHECK-NEXT: le lr, .LBB7_4
1326 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1327 ; CHECK-NEXT: cmp r5, r3
1329 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1330 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21
1331 ; CHECK-NEXT: sub.w lr, r3, r5
1332 ; CHECK-NEXT: .LBB7_7: @ %for.body
1333 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1334 ; CHECK-NEXT: ldrsh r0, [r12], #2
1335 ; CHECK-NEXT: ldrsh r1, [r6], #2
1336 ; CHECK-NEXT: muls r0, r1, r0
1337 ; CHECK-NEXT: ssat r0, #16, r0, asr #15
1338 ; CHECK-NEXT: strh r0, [r4], #2
1339 ; CHECK-NEXT: le lr, .LBB7_7
1340 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup
1341 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; N == 0 goes straight to the exit block.
1343 %cmp8 = icmp eq i32 %N, 0
1344 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1346 for.body.preheader: ; preds = %entry
; Fewer than 8 elements: skip the vector loop and run only the scalar loop.
1347 %min.iters.check = icmp ult i32 %N, 8
1348 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1350 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Start state of the scalar loop: either the original arguments or the
; positions where the vector loop stopped.
1351 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1352 %pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1353 %pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1354 %pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1357 vector.ph: ; preds = %for.body.preheader
; n.vec = N rounded down to a multiple of 8; ind.end* are the three pointers
; advanced by n.vec elements, used when resuming in the scalar loop.
1358 %n.vec = and i32 %N, -8
1359 %ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
1360 %ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
1361 %ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
1362 br label %vector.body
1364 vector.body: ; preds = %vector.body, %vector.ph
1365 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1366 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1367 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1368 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
1369 %0 = bitcast i16* %next.gep to <8 x i16>*
1370 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
; De-interleave the first input: %1 = even lanes (0,2,4,6), %2 = odd lanes (1,3,5,7).
1371 %1 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1372 %2 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1373 %3 = sext <4 x i16> %1 to <4 x i32>
1374 %4 = sext <4 x i16> %2 to <4 x i32>
1375 %5 = bitcast i16* %next.gep18 to <8 x i16>*
1376 %wide.load20 = load <8 x i16>, <8 x i16>* %5, align 2
; Same even/odd split for the second input.
1377 %6 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1378 %7 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1379 %8 = sext <4 x i16> %6 to <4 x i32>
1380 %9 = sext <4 x i16> %7 to <4 x i32>
; Even-lane and odd-lane products; note these multiplies carry no nsw flag,
; unlike the scalar loop's multiply.
1381 %10 = mul <4 x i32> %8, %3
1382 %11 = mul <4 x i32> %9, %4
1383 %12 = ashr <4 x i32> %10, <i32 15, i32 15, i32 15, i32 15>
1384 %13 = ashr <4 x i32> %11, <i32 15, i32 15, i32 15, i32 15>
; Signed clamp of both halves, lower bound first: max(x, -32768) ...
1385 %14 = icmp sgt <4 x i32> %12, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1386 %15 = icmp sgt <4 x i32> %13, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1387 %16 = select <4 x i1> %14, <4 x i32> %12, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1388 %17 = select <4 x i1> %15, <4 x i32> %13, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
; ... then the upper bound: min(x, 32767).
1389 %18 = icmp slt <4 x i32> %16, <i32 32767, i32 32767, i32 32767, i32 32767>
1390 %19 = icmp slt <4 x i32> %17, <i32 32767, i32 32767, i32 32767, i32 32767>
1391 %20 = select <4 x i1> %18, <4 x i32> %16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1392 %21 = select <4 x i1> %19, <4 x i32> %17, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
; Re-interleave even (%20) and odd (%21) results back into memory order.
1393 %22 = shufflevector <4 x i32> %20, <4 x i32> %21, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1394 %23 = trunc <8 x i32> %22 to <8 x i16>
1395 %24 = bitcast i16* %next.gep19 to <8 x i16>*
1396 store <8 x i16> %23, <8 x i16>* %24, align 2
1397 %index.next = add i32 %index, 8
1398 %25 = icmp eq i32 %index.next, %n.vec
1399 br i1 %25, label %middle.block, label %vector.body
1401 middle.block: ; preds = %vector.body
; If N was a multiple of 8 there is no remainder to process.
1402 %cmp.n = icmp eq i32 %n.vec, %N
1403 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1405 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1408 for.body: ; preds = %for.body, %for.body.preheader21
; Scalar loop: one element per iteration, without any lane shuffling.
1409 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1410 %pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1411 %pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1412 %pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1413 %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
1414 %26 = load i16, i16* %pSrcA.addr.011, align 2
1415 %conv = sext i16 %26 to i32
1416 %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
1417 %27 = load i16, i16* %pSrcB.addr.010, align 2
1418 %conv2 = sext i16 %27 to i32
1419 %mul = mul nsw i32 %conv2, %conv
1420 %shr = ashr i32 %mul, 15
1421 %28 = icmp sgt i32 %shr, -32768
1422 %.val.i = select i1 %28, i32 %shr, i32 -32768
1423 %29 = icmp slt i32 %.val.i, 32767
1424 %retval.0.i = select i1 %29, i32 %.val.i, i32 32767
1425 %conv3 = trunc i32 %retval.0.i to i16
1426 %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
1427 store i16 %conv3, i16* %pDst.addr.09, align 2
1428 %inc = add nuw i32 %i.012, 1
1429 %exitcond = icmp eq i32 %inc, %N
1430 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_s4t_q15: 4-wide signed i16 multiply with ashr 15 and clamp to
; [-32768, 32767], written with masked loads and a masked store instead of a
; scalar remainder loop.
;   %pSrcA, %pSrcB - read-only inputs; %pDst - noalias output; %N - element count.
; The loop runs over (N + 3) & -4 lanes in steps of 4; a lane is active only
; when its index is <= N - 1, so the final partial vector is masked off.
; The expected loop below builds the lane indices from the {0,1,2,3} constant
; pool entry, forms the predicate with vptt.u32 and uses the predicated
; vldrht.s32 / vstrht.32 forms around vmul.i32 and vqshrnb.s32 #15.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1433 define arm_aapcs_vfpcc void @ssatmul_s4t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1434 ; CHECK-LABEL: ssatmul_s4t_q15:
1435 ; CHECK: @ %bb.0: @ %entry
1436 ; CHECK-NEXT: .save {r4, lr}
1437 ; CHECK-NEXT: push {r4, lr}
1438 ; CHECK-NEXT: cmp r3, #0
1440 ; CHECK-NEXT: popeq {r4, pc}
1441 ; CHECK-NEXT: .LBB8_1: @ %vector.ph
1442 ; CHECK-NEXT: add.w r12, r3, #3
1443 ; CHECK-NEXT: mov.w lr, #1
1444 ; CHECK-NEXT: bic r12, r12, #3
1445 ; CHECK-NEXT: adr r4, .LCPI8_0
1446 ; CHECK-NEXT: sub.w r12, r12, #4
1447 ; CHECK-NEXT: vldrw.u32 q0, [r4]
1448 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2
1449 ; CHECK-NEXT: sub.w r12, r3, #1
1450 ; CHECK-NEXT: movs r3, #0
1451 ; CHECK-NEXT: vdup.32 q1, r12
1452 ; CHECK-NEXT: .LBB8_2: @ %vector.body
1453 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1454 ; CHECK-NEXT: vdup.32 q2, r3
1455 ; CHECK-NEXT: adds r3, #4
1456 ; CHECK-NEXT: vorr q2, q2, q0
1457 ; CHECK-NEXT: vptt.u32 cs, q1, q2
1458 ; CHECK-NEXT: vldrht.s32 q2, [r0], #8
1459 ; CHECK-NEXT: vldrht.s32 q3, [r1], #8
1460 ; CHECK-NEXT: vmul.i32 q2, q3, q2
1461 ; CHECK-NEXT: vqshrnb.s32 q2, q2, #15
1463 ; CHECK-NEXT: vstrht.32 q2, [r2], #8
1464 ; CHECK-NEXT: le lr, .LBB8_2
1465 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1466 ; CHECK-NEXT: pop {r4, pc}
1467 ; CHECK-NEXT: .p2align 4
1468 ; CHECK-NEXT: @ %bb.4:
1469 ; CHECK-NEXT: .LCPI8_0:
1470 ; CHECK-NEXT: .long 0 @ 0x0
1471 ; CHECK-NEXT: .long 1 @ 0x1
1472 ; CHECK-NEXT: .long 2 @ 0x2
1473 ; CHECK-NEXT: .long 3 @ 0x3
; N == 0 goes straight to the exit block.
1475 %cmp8 = icmp eq i32 %N, 0
1476 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1478 vector.ph: ; preds = %entry
; n.vec = N rounded up to a multiple of 4 (loop bound); the splat of N - 1 is
; the per-lane upper limit used to build the mask.
1479 %n.rnd.up = add i32 %N, 3
1480 %n.vec = and i32 %n.rnd.up, -4
1481 %trip.count.minus.1 = add i32 %N, -1
1482 %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1483 %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
1484 br label %vector.body
1486 vector.body: ; preds = %vector.body, %vector.ph
1487 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Per-lane element indices: splat(index) | {0,1,2,3}. index starts at 0 and
; advances by 4, so its low two bits are clear and the or acts as an add.
1488 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1489 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1490 %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1491 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1492 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1493 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
; Lane mask: active while the lane's index is <= N - 1 (unsigned).
1494 %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
1495 %1 = bitcast i16* %next.gep to <4 x i16>*
1496 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> undef)
1497 %2 = sext <4 x i16> %wide.masked.load to <4 x i32>
1498 %3 = bitcast i16* %next.gep18 to <4 x i16>*
1499 %wide.masked.load22 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %3, i32 2, <4 x i1> %0, <4 x i16> undef)
1500 %4 = sext <4 x i16> %wide.masked.load22 to <4 x i32>
; 32-bit product of the sign-extended inputs, then arithmetic shift right by 15.
1501 %5 = mul nsw <4 x i32> %4, %2
1502 %6 = ashr <4 x i32> %5, <i32 15, i32 15, i32 15, i32 15>
; Signed clamp, lower bound first: max(%6, -32768) ...
1503 %7 = icmp sgt <4 x i32> %6, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1504 %8 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
; ... then the upper bound: min(%8, 32767).
1505 %9 = icmp slt <4 x i32> %8, <i32 32767, i32 32767, i32 32767, i32 32767>
1506 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1507 %11 = trunc <4 x i32> %10 to <4 x i16>
1508 %12 = bitcast i16* %next.gep19 to <4 x i16>*
; Only the active lanes are written back.
1509 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %11, <4 x i16>* %12, i32 2, <4 x i1> %0)
1510 %index.next = add i32 %index, 4
1511 %13 = icmp eq i32 %index.next, %n.vec
1512 br i1 %13, label %for.cond.cleanup, label %vector.body
1514 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_8t_q15: 8-lane signed saturating multiply on i16 data where every
; memory access in the loop is a masked load/store (no scalar remainder loop).
; Per iteration: masked-load <8 x i16> from %pSrcA and %pSrcB, sign-extend to
; i32, multiply, arithmetic-shift right by 15, clamp to [-32768, 32767],
; truncate to i16 and masked-store to %pDst.
; The expected assembly below builds the products with vmullb/vmullt.s16 and
; does the shift+clamp+narrow with vqshrnb/vqshrnt.s32 #15; the loads and the
; store are the predicated forms (vldrht/vstrht).
1518 define arm_aapcs_vfpcc void @ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1519 ; CHECK-LABEL: ssatmul_8t_q15:
1520 ; CHECK: @ %bb.0: @ %entry
1521 ; CHECK-NEXT: .save {r4, lr}
1522 ; CHECK-NEXT: push {r4, lr}
1523 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1524 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1525 ; CHECK-NEXT: cmp r3, #0
1526 ; CHECK-NEXT: beq .LBB9_3
1527 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1528 ; CHECK-NEXT: add.w r12, r3, #7
1529 ; CHECK-NEXT: adr r4, .LCPI9_0
1530 ; CHECK-NEXT: bic r12, r12, #7
1531 ; CHECK-NEXT: mov.w lr, #1
1532 ; CHECK-NEXT: sub.w r12, r12, #8
1533 ; CHECK-NEXT: vldrw.u32 q0, [r4]
1534 ; CHECK-NEXT: adr r4, .LCPI9_1
1535 ; CHECK-NEXT: vmov.i8 q2, #0x0
1536 ; CHECK-NEXT: add.w lr, lr, r12, lsr #3
1537 ; CHECK-NEXT: sub.w r12, r3, #1
1538 ; CHECK-NEXT: vldrw.u32 q4, [r4]
1539 ; CHECK-NEXT: movs r3, #0
1540 ; CHECK-NEXT: vdup.32 q1, r12
1541 ; CHECK-NEXT: vmov.i8 q3, #0xff
1542 ; CHECK-NEXT: .LBB9_2: @ %vector.body
1543 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1544 ; CHECK-NEXT: vdup.32 q6, r3
1545 ; CHECK-NEXT: adds r3, #8
1546 ; CHECK-NEXT: vorr q5, q6, q0
1547 ; CHECK-NEXT: vorr q6, q6, q4
1548 ; CHECK-NEXT: vcmp.u32 cs, q1, q5
1549 ; CHECK-NEXT: vpsel q7, q3, q2
1550 ; CHECK-NEXT: vcmp.u32 cs, q1, q6
1551 ; CHECK-NEXT: vmov r4, r12, d14
1552 ; CHECK-NEXT: vpsel q6, q3, q2
1553 ; CHECK-NEXT: vmov.16 q5[0], r4
1554 ; CHECK-NEXT: vmov.16 q5[1], r12
1555 ; CHECK-NEXT: vmov r4, r12, d15
1556 ; CHECK-NEXT: vmov.16 q5[2], r4
1557 ; CHECK-NEXT: vmov.16 q5[3], r12
1558 ; CHECK-NEXT: vmov r4, r12, d12
1559 ; CHECK-NEXT: vmov.16 q5[4], r4
1560 ; CHECK-NEXT: vmov.16 q5[5], r12
1561 ; CHECK-NEXT: vmov r4, r12, d13
1562 ; CHECK-NEXT: vmov.16 q5[6], r4
1563 ; CHECK-NEXT: vmov.16 q5[7], r12
1564 ; CHECK-NEXT: vptt.i16 ne, q5, zr
1565 ; CHECK-NEXT: vldrht.u16 q5, [r0], #16
1566 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16
1567 ; CHECK-NEXT: vmullt.s16 q7, q6, q5
1568 ; CHECK-NEXT: vmullb.s16 q5, q6, q5
1569 ; CHECK-NEXT: vqshrnb.s32 q5, q5, #15
1570 ; CHECK-NEXT: vqshrnt.s32 q5, q7, #15
1572 ; CHECK-NEXT: vstrht.16 q5, [r2], #16
1573 ; CHECK-NEXT: le lr, .LBB9_2
1574 ; CHECK-NEXT: .LBB9_3: @ %for.cond.cleanup
1575 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1576 ; CHECK-NEXT: pop {r4, pc}
1577 ; CHECK-NEXT: .p2align 4
1578 ; CHECK-NEXT: @ %bb.4:
1579 ; CHECK-NEXT: .LCPI9_0:
1580 ; CHECK-NEXT: .long 0 @ 0x0
1581 ; CHECK-NEXT: .long 1 @ 0x1
1582 ; CHECK-NEXT: .long 2 @ 0x2
1583 ; CHECK-NEXT: .long 3 @ 0x3
1584 ; CHECK-NEXT: .LCPI9_1:
1585 ; CHECK-NEXT: .long 4 @ 0x4
1586 ; CHECK-NEXT: .long 5 @ 0x5
1587 ; CHECK-NEXT: .long 6 @ 0x6
1588 ; CHECK-NEXT: .long 7 @ 0x7
; IR under test starts here. N == 0 branches straight to the exit block.
1590 %cmp8 = icmp eq i32 %N, 0
1591 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1593 vector.ph: ; preds = %entry
; n.vec is N rounded up to a multiple of 8 (the loop exit bound); the splat of
; N-1 is the per-lane bound used to build the mask.
1594 %n.rnd.up = add i32 %N, 7
1595 %n.vec = and i32 %n.rnd.up, -8
1596 %trip.count.minus.1 = add i32 %N, -1
1597 %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
1598 %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
1599 br label %vector.body
1601 vector.body: ; preds = %vector.body, %vector.ph
1602 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1603 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
1604 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1605 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1606 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1607 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1608 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
; Lane mask %0: a lane is active while its element index is <= N-1 (unsigned).
1609 %0 = icmp ule <8 x i32> %induction, %broadcast.splat21
1610 %1 = bitcast i16* %next.gep to <8 x i16>*
1611 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %0, <8 x i16> undef)
1612 %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
1613 %3 = bitcast i16* %next.gep18 to <8 x i16>*
1614 %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %0, <8 x i16> undef)
1615 %4 = sext <8 x i16> %wide.masked.load22 to <8 x i32>
; (a * b) >> 15, then clamp: select-on-sgt against -32768 followed by
; select-on-slt against 32767, before narrowing back to i16.
1616 %5 = mul nsw <8 x i32> %4, %2
1617 %6 = ashr <8 x i32> %5, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1618 %7 = icmp sgt <8 x i32> %6, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1619 %8 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1620 %9 = icmp slt <8 x i32> %8, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1621 %10 = select <8 x i1> %9, <8 x i32> %8, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
1622 %11 = trunc <8 x i32> %10 to <8 x i16>
1623 %12 = bitcast i16* %next.gep19 to <8 x i16>*
1624 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %11, <8 x i16>* %12, i32 2, <8 x i1> %0)
1625 %index.next = add i32 %index, 8
1626 %13 = icmp eq i32 %index.next, %n.vec
1627 br i1 %13, label %for.cond.cleanup, label %vector.body
1629 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_8ti_q15: same masked 8-lane signed saturating i16 multiply as a loop
; with masked loads/stores, but written in "interleaved" form: each loaded
; <8 x i16> is split by shufflevector into its even lanes (0,2,4,6) and odd
; lanes (1,3,5,7), the two <4 x i32> halves are multiplied, shifted right by 15
; and clamped to [-32768, 32767] separately, then re-interleaved before the
; truncating masked store. Note the multiplies here carry no nsw flag.
; The expected assembly below uses vmullb/vmullt.s16 followed by
; vqshrnb/vqshrnt.s32 #15, with predicated vldrht/vstrht memory accesses.
1633 define arm_aapcs_vfpcc void @ssatmul_8ti_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1634 ; CHECK-LABEL: ssatmul_8ti_q15:
1635 ; CHECK: @ %bb.0: @ %entry
1636 ; CHECK-NEXT: .save {r4, lr}
1637 ; CHECK-NEXT: push {r4, lr}
1638 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1639 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1640 ; CHECK-NEXT: cmp r3, #0
1641 ; CHECK-NEXT: beq .LBB10_3
1642 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1643 ; CHECK-NEXT: add.w r12, r3, #7
1644 ; CHECK-NEXT: adr r4, .LCPI10_0
1645 ; CHECK-NEXT: bic r12, r12, #7
1646 ; CHECK-NEXT: mov.w lr, #1
1647 ; CHECK-NEXT: sub.w r12, r12, #8
1648 ; CHECK-NEXT: vldrw.u32 q0, [r4]
1649 ; CHECK-NEXT: adr r4, .LCPI10_1
1650 ; CHECK-NEXT: vmov.i8 q2, #0x0
1651 ; CHECK-NEXT: add.w lr, lr, r12, lsr #3
1652 ; CHECK-NEXT: sub.w r12, r3, #1
1653 ; CHECK-NEXT: vldrw.u32 q4, [r4]
1654 ; CHECK-NEXT: movs r3, #0
1655 ; CHECK-NEXT: vdup.32 q1, r12
1656 ; CHECK-NEXT: vmov.i8 q3, #0xff
1657 ; CHECK-NEXT: .LBB10_2: @ %vector.body
1658 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1659 ; CHECK-NEXT: vdup.32 q6, r3
1660 ; CHECK-NEXT: adds r3, #8
1661 ; CHECK-NEXT: vorr q5, q6, q0
1662 ; CHECK-NEXT: vorr q6, q6, q4
1663 ; CHECK-NEXT: vcmp.u32 cs, q1, q5
1664 ; CHECK-NEXT: vpsel q7, q3, q2
1665 ; CHECK-NEXT: vcmp.u32 cs, q1, q6
1666 ; CHECK-NEXT: vmov r4, r12, d14
1667 ; CHECK-NEXT: vpsel q6, q3, q2
1668 ; CHECK-NEXT: vmov.16 q5[0], r4
1669 ; CHECK-NEXT: vmov.16 q5[1], r12
1670 ; CHECK-NEXT: vmov r4, r12, d15
1671 ; CHECK-NEXT: vmov.16 q5[2], r4
1672 ; CHECK-NEXT: vmov.16 q5[3], r12
1673 ; CHECK-NEXT: vmov r4, r12, d12
1674 ; CHECK-NEXT: vmov.16 q5[4], r4
1675 ; CHECK-NEXT: vmov.16 q5[5], r12
1676 ; CHECK-NEXT: vmov r4, r12, d13
1677 ; CHECK-NEXT: vmov.16 q5[6], r4
1678 ; CHECK-NEXT: vmov.16 q5[7], r12
1679 ; CHECK-NEXT: vptt.i16 ne, q5, zr
1680 ; CHECK-NEXT: vldrht.u16 q5, [r0], #16
1681 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16
1682 ; CHECK-NEXT: vmullt.s16 q7, q6, q5
1683 ; CHECK-NEXT: vmullb.s16 q5, q6, q5
1684 ; CHECK-NEXT: vqshrnb.s32 q5, q5, #15
1685 ; CHECK-NEXT: vqshrnt.s32 q5, q7, #15
1687 ; CHECK-NEXT: vstrht.16 q5, [r2], #16
1688 ; CHECK-NEXT: le lr, .LBB10_2
1689 ; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup
1690 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1691 ; CHECK-NEXT: pop {r4, pc}
1692 ; CHECK-NEXT: .p2align 4
1693 ; CHECK-NEXT: @ %bb.4:
1694 ; CHECK-NEXT: .LCPI10_0:
1695 ; CHECK-NEXT: .long 0 @ 0x0
1696 ; CHECK-NEXT: .long 1 @ 0x1
1697 ; CHECK-NEXT: .long 2 @ 0x2
1698 ; CHECK-NEXT: .long 3 @ 0x3
1699 ; CHECK-NEXT: .LCPI10_1:
1700 ; CHECK-NEXT: .long 4 @ 0x4
1701 ; CHECK-NEXT: .long 5 @ 0x5
1702 ; CHECK-NEXT: .long 6 @ 0x6
1703 ; CHECK-NEXT: .long 7 @ 0x7
; IR under test starts here. N == 0 branches straight to the exit block.
1705 %cmp8 = icmp eq i32 %N, 0
1706 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
1708 vector.ph: ; preds = %entry
; n.vec is N rounded up to a multiple of 8 (the loop exit bound); the splat of
; N-1 is the per-lane bound used to build the mask.
1709 %n.rnd.up = add i32 %N, 7
1710 %n.vec = and i32 %n.rnd.up, -8
1711 %trip.count.minus.1 = add i32 %N, -1
1712 %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
1713 %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
1714 br label %vector.body
1716 vector.body: ; preds = %vector.body, %vector.ph
1717 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1718 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
1719 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
1720 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1721 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1722 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1723 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
; Lane mask %0: a lane is active while its element index is <= N-1 (unsigned).
1724 %0 = icmp ule <8 x i32> %induction, %broadcast.splat21
1725 %1 = bitcast i16* %next.gep to <8 x i16>*
1726 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %0, <8 x i16> undef)
; Deinterleave the first operand: %2 = even lanes, %3 = odd lanes.
1727 %2 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1728 %3 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1729 %4 = sext <4 x i16> %2 to <4 x i32>
1730 %5 = sext <4 x i16> %3 to <4 x i32>
1731 %6 = bitcast i16* %next.gep18 to <8 x i16>*
1732 %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %6, i32 2, <8 x i1> %0, <8 x i16> undef)
; Deinterleave the second operand the same way: %7 = even lanes, %8 = odd lanes.
1733 %7 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1734 %8 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1735 %9 = sext <4 x i16> %7 to <4 x i32>
1736 %10 = sext <4 x i16> %8 to <4 x i32>
; Even and odd halves are multiplied, shifted by 15 and clamped independently.
1737 %11 = mul <4 x i32> %9, %4
1738 %12 = mul <4 x i32> %10, %5
1739 %13 = ashr <4 x i32> %11, <i32 15, i32 15, i32 15, i32 15>
1740 %14 = ashr <4 x i32> %12, <i32 15, i32 15, i32 15, i32 15>
1741 %15 = icmp sgt <4 x i32> %13, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1742 %16 = icmp sgt <4 x i32> %14, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1743 %17 = select <4 x i1> %15, <4 x i32> %13, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1744 %18 = select <4 x i1> %16, <4 x i32> %14, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
1745 %19 = icmp slt <4 x i32> %17, <i32 32767, i32 32767, i32 32767, i32 32767>
1746 %20 = icmp slt <4 x i32> %18, <i32 32767, i32 32767, i32 32767, i32 32767>
1747 %21 = select <4 x i1> %19, <4 x i32> %17, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
1748 %22 = select <4 x i1> %20, <4 x i32> %18, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
; Re-interleave even/odd results back into original lane order before narrowing.
1749 %23 = shufflevector <4 x i32> %21, <4 x i32> %22, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1750 %24 = trunc <8 x i32> %23 to <8 x i16>
1751 %25 = bitcast i16* %next.gep19 to <8 x i16>*
1752 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %24, <8 x i16>* %25, i32 2, <8 x i1> %0)
1753 %index.next = add i32 %index, 8
1754 %26 = icmp eq i32 %index.next, %n.vec
1755 br i1 %26, label %for.cond.cleanup, label %vector.body
1757 for.cond.cleanup: ; preds = %vector.body, %entry
; usatmul_4_q15: unsigned saturating multiply on i16 data, vectorized 4 lanes
; wide with plain (unmasked) loads/stores plus a scalar remainder loop.
; Per element: zero-extend both i16 inputs to i32, multiply (nuw), logical
; shift right by 15, clamp to at most 65535, truncate to i16 and store.
; The expected assembly below uses widening vldrh.u32 loads, vmul.i32 and a
; single vqshrnb.u32 #15 for shift+clamp+narrow in the vector loop; the scalar
; loop does the clamp with cmp / movw #65535 / lsrlo.
1761 define arm_aapcs_vfpcc void @usatmul_4_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1762 ; CHECK-LABEL: usatmul_4_q15:
1763 ; CHECK: @ %bb.0: @ %entry
1764 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1765 ; CHECK-NEXT: push {r4, r5, r6, lr}
1766 ; CHECK-NEXT: cmp r3, #0
1767 ; CHECK-NEXT: beq .LBB11_8
1768 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1769 ; CHECK-NEXT: cmp r3, #3
1770 ; CHECK-NEXT: bhi .LBB11_3
1771 ; CHECK-NEXT: @ %bb.2:
1772 ; CHECK-NEXT: movs r5, #0
1773 ; CHECK-NEXT: mov r12, r0
1774 ; CHECK-NEXT: mov r6, r1
1775 ; CHECK-NEXT: mov r4, r2
1776 ; CHECK-NEXT: b .LBB11_6
1777 ; CHECK-NEXT: .LBB11_3: @ %vector.ph
1778 ; CHECK-NEXT: bic r5, r3, #3
1779 ; CHECK-NEXT: movs r4, #1
1780 ; CHECK-NEXT: subs r6, r5, #4
1781 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1782 ; CHECK-NEXT: add.w lr, r4, r6, lsr #2
1783 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1784 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1785 ; CHECK-NEXT: .LBB11_4: @ %vector.body
1786 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1787 ; CHECK-NEXT: vldrh.u32 q0, [r0], #8
1788 ; CHECK-NEXT: vldrh.u32 q1, [r1], #8
1789 ; CHECK-NEXT: vmul.i32 q0, q1, q0
1790 ; CHECK-NEXT: vqshrnb.u32 q0, q0, #15
1791 ; CHECK-NEXT: vstrh.32 q0, [r2], #8
1792 ; CHECK-NEXT: le lr, .LBB11_4
1793 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1794 ; CHECK-NEXT: cmp r5, r3
1796 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1797 ; CHECK-NEXT: .LBB11_6: @ %for.body.preheader21
1798 ; CHECK-NEXT: sub.w lr, r3, r5
1799 ; CHECK-NEXT: movw r0, #65535
1800 ; CHECK-NEXT: .LBB11_7: @ %for.body
1801 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1802 ; CHECK-NEXT: ldrh r1, [r12], #2
1803 ; CHECK-NEXT: ldrh r2, [r6], #2
1804 ; CHECK-NEXT: muls r1, r2, r1
1805 ; CHECK-NEXT: lsrs r2, r1, #15
1806 ; CHECK-NEXT: cmp r2, r0
1807 ; CHECK-NEXT: movw r2, #65535
1809 ; CHECK-NEXT: lsrlo r2, r1, #15
1810 ; CHECK-NEXT: strh r2, [r4], #2
1811 ; CHECK-NEXT: le lr, .LBB11_7
1812 ; CHECK-NEXT: .LBB11_8: @ %for.cond.cleanup
1813 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; IR under test starts here. N == 0 branches straight to the exit block.
1815 %cmp8 = icmp eq i32 %N, 0
1816 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1818 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: skip the vector loop and run only the scalar loop.
1819 %min.iters.check = icmp ult i32 %N, 4
1820 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1822 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Scalar-loop start state: either the untouched inputs (index 0) or the
; positions reached after the vector loop consumed n.vec elements.
1823 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1824 %pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1825 %pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1826 %pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1829 vector.ph: ; preds = %for.body.preheader
; n.vec is N rounded down to a multiple of 4; ind.end* are the pointers just
; past the part handled by the vector loop.
1830 %n.vec = and i32 %N, -4
1831 %ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
1832 %ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
1833 %ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
1834 br label %vector.body
1836 vector.body: ; preds = %vector.body, %vector.ph
1837 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1838 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1839 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1840 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
1841 %0 = bitcast i16* %next.gep to <4 x i16>*
1842 %wide.load = load <4 x i16>, <4 x i16>* %0, align 2
1843 %1 = zext <4 x i16> %wide.load to <4 x i32>
1844 %2 = bitcast i16* %next.gep18 to <4 x i16>*
1845 %wide.load20 = load <4 x i16>, <4 x i16>* %2, align 2
1846 %3 = zext <4 x i16> %wide.load20 to <4 x i32>
; (a * b) >> 15 (logical), then unsigned clamp to 65535 via icmp ult + select.
1847 %4 = mul nuw <4 x i32> %3, %1
1848 %5 = lshr <4 x i32> %4, <i32 15, i32 15, i32 15, i32 15>
1849 %6 = icmp ult <4 x i32> %5, <i32 65535, i32 65535, i32 65535, i32 65535>
1850 %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
1851 %8 = trunc <4 x i32> %7 to <4 x i16>
1852 %9 = bitcast i16* %next.gep19 to <4 x i16>*
1853 store <4 x i16> %8, <4 x i16>* %9, align 2
1854 %index.next = add i32 %index, 4
1855 %10 = icmp eq i32 %index.next, %n.vec
1856 br i1 %10, label %middle.block, label %vector.body
1858 middle.block: ; preds = %vector.body
; If N was already a multiple of 4 there is no remainder to process.
1859 %cmp.n = icmp eq i32 %n.vec, %N
1860 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1862 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1865 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar remainder loop: same zext / mul / lshr 15 / clamp-to-65535 sequence,
; one element per iteration.
1866 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1867 %pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1868 %pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1869 %pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1870 %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
1871 %11 = load i16, i16* %pSrcA.addr.011, align 2
1872 %conv = zext i16 %11 to i32
1873 %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
1874 %12 = load i16, i16* %pSrcB.addr.010, align 2
1875 %conv2 = zext i16 %12 to i32
1876 %mul = mul nuw i32 %conv2, %conv
1877 %shr = lshr i32 %mul, 15
1878 %13 = icmp ult i32 %shr, 65535
1879 %retval.0.i = select i1 %13, i32 %shr, i32 65535
1880 %conv3 = trunc i32 %retval.0.i to i16
1881 %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
1882 store i16 %conv3, i16* %pDst.addr.09, align 2
1883 %inc = add nuw i32 %i.012, 1
1884 %exitcond = icmp eq i32 %inc, %N
1885 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; usatmul_8_q15: unsigned saturating multiply on i16 data, vectorized 8 lanes
; wide with plain (unmasked) loads/stores plus a scalar remainder loop.
; Per element: zero-extend both i16 inputs to i32, multiply (nuw), logical
; shift right by 15, clamp to at most 65535, truncate to i16 and store.
; The expected assembly below splits the 8-lane multiply into
; vmullb/vmullt.u16 and narrows with vqshrnb/vqshrnt.u32 #15; the scalar loop
; does the clamp with cmp / movw #65535 / lsrlo.
1888 define arm_aapcs_vfpcc void @usatmul_8_q15(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* noalias nocapture %pDst, i32 %N) {
1889 ; CHECK-LABEL: usatmul_8_q15:
1890 ; CHECK: @ %bb.0: @ %entry
1891 ; CHECK-NEXT: .save {r4, r5, r6, lr}
1892 ; CHECK-NEXT: push {r4, r5, r6, lr}
1893 ; CHECK-NEXT: cmp r3, #0
1894 ; CHECK-NEXT: beq .LBB12_8
1895 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1896 ; CHECK-NEXT: cmp r3, #7
1897 ; CHECK-NEXT: bhi .LBB12_3
1898 ; CHECK-NEXT: @ %bb.2:
1899 ; CHECK-NEXT: movs r5, #0
1900 ; CHECK-NEXT: mov r12, r0
1901 ; CHECK-NEXT: mov r6, r1
1902 ; CHECK-NEXT: mov r4, r2
1903 ; CHECK-NEXT: b .LBB12_6
1904 ; CHECK-NEXT: .LBB12_3: @ %vector.ph
1905 ; CHECK-NEXT: bic r5, r3, #7
1906 ; CHECK-NEXT: movs r4, #1
1907 ; CHECK-NEXT: sub.w r6, r5, #8
1908 ; CHECK-NEXT: add.w r12, r0, r5, lsl #1
1909 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
1910 ; CHECK-NEXT: add.w r4, r2, r5, lsl #1
1911 ; CHECK-NEXT: add.w r6, r1, r5, lsl #1
1912 ; CHECK-NEXT: .LBB12_4: @ %vector.body
1913 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1914 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1915 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1916 ; CHECK-NEXT: vmullt.u16 q2, q1, q0
1917 ; CHECK-NEXT: vmullb.u16 q0, q1, q0
1918 ; CHECK-NEXT: vqshrnb.u32 q0, q0, #15
1919 ; CHECK-NEXT: vqshrnt.u32 q0, q2, #15
1920 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
1921 ; CHECK-NEXT: le lr, .LBB12_4
1922 ; CHECK-NEXT: @ %bb.5: @ %middle.block
1923 ; CHECK-NEXT: cmp r5, r3
1925 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
1926 ; CHECK-NEXT: .LBB12_6: @ %for.body.preheader21
1927 ; CHECK-NEXT: sub.w lr, r3, r5
1928 ; CHECK-NEXT: movw r0, #65535
1929 ; CHECK-NEXT: .LBB12_7: @ %for.body
1930 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1931 ; CHECK-NEXT: ldrh r1, [r12], #2
1932 ; CHECK-NEXT: ldrh r2, [r6], #2
1933 ; CHECK-NEXT: muls r1, r2, r1
1934 ; CHECK-NEXT: lsrs r2, r1, #15
1935 ; CHECK-NEXT: cmp r2, r0
1936 ; CHECK-NEXT: movw r2, #65535
1938 ; CHECK-NEXT: lsrlo r2, r1, #15
1939 ; CHECK-NEXT: strh r2, [r4], #2
1940 ; CHECK-NEXT: le lr, .LBB12_7
1941 ; CHECK-NEXT: .LBB12_8: @ %for.cond.cleanup
1942 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; IR under test starts here. N == 0 branches straight to the exit block.
1944 %cmp8 = icmp eq i32 %N, 0
1945 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1947 for.body.preheader: ; preds = %entry
; Fewer than 8 elements: skip the vector loop and run only the scalar loop.
1948 %min.iters.check = icmp ult i32 %N, 8
1949 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
1951 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Scalar-loop start state: either the untouched inputs (index 0) or the
; positions reached after the vector loop consumed n.vec elements.
1952 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1953 %pSrcA.addr.011.ph = phi i16* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
1954 %pSrcB.addr.010.ph = phi i16* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
1955 %pDst.addr.09.ph = phi i16* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
1958 vector.ph: ; preds = %for.body.preheader
; n.vec is N rounded down to a multiple of 8; ind.end* are the pointers just
; past the part handled by the vector loop.
1959 %n.vec = and i32 %N, -8
1960 %ind.end = getelementptr i16, i16* %pSrcA, i32 %n.vec
1961 %ind.end15 = getelementptr i16, i16* %pSrcB, i32 %n.vec
1962 %ind.end17 = getelementptr i16, i16* %pDst, i32 %n.vec
1963 br label %vector.body
1965 vector.body: ; preds = %vector.body, %vector.ph
1966 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1967 %next.gep = getelementptr i16, i16* %pSrcA, i32 %index
1968 %next.gep18 = getelementptr i16, i16* %pSrcB, i32 %index
1969 %next.gep19 = getelementptr i16, i16* %pDst, i32 %index
1970 %0 = bitcast i16* %next.gep to <8 x i16>*
1971 %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
1972 %1 = zext <8 x i16> %wide.load to <8 x i32>
1973 %2 = bitcast i16* %next.gep18 to <8 x i16>*
1974 %wide.load20 = load <8 x i16>, <8 x i16>* %2, align 2
1975 %3 = zext <8 x i16> %wide.load20 to <8 x i32>
; (a * b) >> 15 (logical), then unsigned clamp to 65535 via icmp ult + select.
1976 %4 = mul nuw <8 x i32> %3, %1
1977 %5 = lshr <8 x i32> %4, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
1978 %6 = icmp ult <8 x i32> %5, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1979 %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
1980 %8 = trunc <8 x i32> %7 to <8 x i16>
1981 %9 = bitcast i16* %next.gep19 to <8 x i16>*
1982 store <8 x i16> %8, <8 x i16>* %9, align 2
1983 %index.next = add i32 %index, 8
1984 %10 = icmp eq i32 %index.next, %n.vec
1985 br i1 %10, label %middle.block, label %vector.body
1987 middle.block: ; preds = %vector.body
; If N was already a multiple of 8 there is no remainder to process.
1988 %cmp.n = icmp eq i32 %n.vec, %N
1989 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
1991 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1994 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar remainder loop: same zext / mul / lshr 15 / clamp-to-65535 sequence,
; one element per iteration.
1995 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
1996 %pSrcA.addr.011 = phi i16* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
1997 %pSrcB.addr.010 = phi i16* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
1998 %pDst.addr.09 = phi i16* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
1999 %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.011, i32 1
2000 %11 = load i16, i16* %pSrcA.addr.011, align 2
2001 %conv = zext i16 %11 to i32
2002 %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.010, i32 1
2003 %12 = load i16, i16* %pSrcB.addr.010, align 2
2004 %conv2 = zext i16 %12 to i32
2005 %mul = mul nuw i32 %conv2, %conv
2006 %shr = lshr i32 %mul, 15
2007 %13 = icmp ult i32 %shr, 65535
2008 %retval.0.i = select i1 %13, i32 %shr, i32 65535
2009 %conv3 = trunc i32 %retval.0.i to i16
2010 %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i32 1
2011 store i16 %conv3, i16* %pDst.addr.09, align 2
2012 %inc = add nuw i32 %i.012, 1
2013 %exitcond = icmp eq i32 %inc, %N
2014 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_4_q7: signed saturating multiply on i8 data, vectorized 4 lanes wide
; with plain (unmasked) loads/stores plus a scalar remainder loop.
; Per element: sign-extend both i8 inputs to i32, multiply (nsw), arithmetic
; shift right by 7, clamp to [-128, 127], truncate to i8 and store.
; In the expected assembly below the vector loop keeps the shift and the clamp
; as separate instructions (vshr.s32 #7, then vmax.s32/vmin.s32 against splats
; of -128 and 127) rather than a saturating narrowing shift; the scalar loop
; uses ssat #8 with an asr #7 operand shift.
2020 define arm_aapcs_vfpcc void @ssatmul_4_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
2021 ; CHECK-LABEL: ssatmul_4_q7:
2022 ; CHECK: @ %bb.0: @ %entry
2023 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2024 ; CHECK-NEXT: push {r4, r5, r6, lr}
2025 ; CHECK-NEXT: cmp r3, #0
2026 ; CHECK-NEXT: beq .LBB13_8
2027 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2028 ; CHECK-NEXT: cmp r3, #3
2029 ; CHECK-NEXT: bhi .LBB13_3
2030 ; CHECK-NEXT: @ %bb.2:
2031 ; CHECK-NEXT: movs r5, #0
2032 ; CHECK-NEXT: mov r12, r0
2033 ; CHECK-NEXT: mov r6, r1
2034 ; CHECK-NEXT: mov r4, r2
2035 ; CHECK-NEXT: b .LBB13_6
2036 ; CHECK-NEXT: .LBB13_3: @ %vector.ph
2037 ; CHECK-NEXT: bic r5, r3, #3
2038 ; CHECK-NEXT: movs r4, #1
2039 ; CHECK-NEXT: subs r6, r5, #4
2040 ; CHECK-NEXT: add.w r12, r0, r5
2041 ; CHECK-NEXT: vmvn.i32 q0, #0x7f
2042 ; CHECK-NEXT: vmov.i32 q1, #0x7f
2043 ; CHECK-NEXT: add.w lr, r4, r6, lsr #2
2044 ; CHECK-NEXT: adds r4, r2, r5
2045 ; CHECK-NEXT: adds r6, r1, r5
2046 ; CHECK-NEXT: .LBB13_4: @ %vector.body
2047 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2048 ; CHECK-NEXT: vldrb.s32 q2, [r0], #4
2049 ; CHECK-NEXT: vldrb.s32 q3, [r1], #4
2050 ; CHECK-NEXT: vmul.i32 q2, q3, q2
2051 ; CHECK-NEXT: vshr.s32 q2, q2, #7
2052 ; CHECK-NEXT: vmax.s32 q2, q2, q0
2053 ; CHECK-NEXT: vmin.s32 q2, q2, q1
2054 ; CHECK-NEXT: vstrb.32 q2, [r2], #4
2055 ; CHECK-NEXT: le lr, .LBB13_4
2056 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2057 ; CHECK-NEXT: cmp r5, r3
2059 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2060 ; CHECK-NEXT: .LBB13_6: @ %for.body.preheader21
2061 ; CHECK-NEXT: sub.w lr, r3, r5
2062 ; CHECK-NEXT: .LBB13_7: @ %for.body
2063 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2064 ; CHECK-NEXT: ldrsb r0, [r12], #1
2065 ; CHECK-NEXT: ldrsb r1, [r6], #1
2066 ; CHECK-NEXT: muls r0, r1, r0
2067 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
2068 ; CHECK-NEXT: strb r0, [r4], #1
2069 ; CHECK-NEXT: le lr, .LBB13_7
2070 ; CHECK-NEXT: .LBB13_8: @ %for.cond.cleanup
2071 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; IR under test starts here. N == 0 branches straight to the exit block.
2073 %cmp8 = icmp eq i32 %N, 0
2074 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
2076 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: skip the vector loop and run only the scalar loop.
2077 %min.iters.check = icmp ult i32 %N, 4
2078 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph
2080 for.body.preheader21: ; preds = %middle.block, %for.body.preheader
; Scalar-loop start state: either the untouched inputs (index 0) or the
; positions reached after the vector loop consumed n.vec elements.
2081 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2082 %pSrcA.addr.011.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2083 %pSrcB.addr.010.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
2084 %pDst.addr.09.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
2087 vector.ph: ; preds = %for.body.preheader
; n.vec is N rounded down to a multiple of 4; ind.end* are the pointers just
; past the part handled by the vector loop.
2088 %n.vec = and i32 %N, -4
2089 %ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
2090 %ind.end15 = getelementptr i8, i8* %pSrcB, i32 %n.vec
2091 %ind.end17 = getelementptr i8, i8* %pDst, i32 %n.vec
2092 br label %vector.body
2094 vector.body: ; preds = %vector.body, %vector.ph
2095 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2096 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
2097 %next.gep18 = getelementptr i8, i8* %pSrcB, i32 %index
2098 %next.gep19 = getelementptr i8, i8* %pDst, i32 %index
2099 %0 = bitcast i8* %next.gep to <4 x i8>*
2100 %wide.load = load <4 x i8>, <4 x i8>* %0, align 1
2101 %1 = sext <4 x i8> %wide.load to <4 x i32>
2102 %2 = bitcast i8* %next.gep18 to <4 x i8>*
2103 %wide.load20 = load <4 x i8>, <4 x i8>* %2, align 1
2104 %3 = sext <4 x i8> %wide.load20 to <4 x i32>
; (a * b) >> 7 (arithmetic), then clamp: select-on-sgt against -128 followed
; by select-on-slt against 127, before narrowing back to i8.
2105 %4 = mul nsw <4 x i32> %3, %1
2106 %5 = ashr <4 x i32> %4, <i32 7, i32 7, i32 7, i32 7>
2107 %6 = icmp sgt <4 x i32> %5, <i32 -128, i32 -128, i32 -128, i32 -128>
2108 %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>
2109 %8 = icmp slt <4 x i32> %7, <i32 127, i32 127, i32 127, i32 127>
2110 %9 = select <4 x i1> %8, <4 x i32> %7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
2111 %10 = trunc <4 x i32> %9 to <4 x i8>
2112 %11 = bitcast i8* %next.gep19 to <4 x i8>*
2113 store <4 x i8> %10, <4 x i8>* %11, align 1
2114 %index.next = add i32 %index, 4
2115 %12 = icmp eq i32 %index.next, %n.vec
2116 br i1 %12, label %middle.block, label %vector.body
2118 middle.block: ; preds = %vector.body
; If N was already a multiple of 4 there is no remainder to process.
2119 %cmp.n = icmp eq i32 %n.vec, %N
2120 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21
2122 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
2125 for.body: ; preds = %for.body.preheader21, %for.body
; Scalar remainder loop: same sext / mul / ashr 7 / clamp-to-[-128,127]
; sequence, one element per iteration.
2126 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
2127 %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
2128 %pSrcB.addr.010 = phi i8* [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
2129 %pDst.addr.09 = phi i8* [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
2130 %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
2131 %13 = load i8, i8* %pSrcA.addr.011, align 1
2132 %conv = sext i8 %13 to i32
2133 %incdec.ptr1 = getelementptr inbounds i8, i8* %pSrcB.addr.010, i32 1
2134 %14 = load i8, i8* %pSrcB.addr.010, align 1
2135 %conv2 = sext i8 %14 to i32
2136 %mul = mul nsw i32 %conv2, %conv
2137 %shr = ashr i32 %mul, 7
2138 %15 = icmp sgt i32 %shr, -128
2139 %.val.i = select i1 %15, i32 %shr, i32 -128
2140 %16 = icmp slt i32 %.val.i, 127
2141 %retval.0.i = select i1 %16, i32 %.val.i, i32 127
2142 %conv3 = trunc i32 %retval.0.i to i8
2143 %incdec.ptr4 = getelementptr inbounds i8, i8* %pDst.addr.09, i32 1
2144 store i8 %conv3, i8* %pDst.addr.09, align 1
2145 %inc = add nuw i32 %i.012, 1
2146 %exitcond = icmp eq i32 %inc, %N
2147 br i1 %exitcond, label %for.cond.cleanup, label %for.body
2150 define arm_aapcs_vfpcc void @ssatmul_8_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
2151 ; CHECK-LABEL: ssatmul_8_q7:
2152 ; CHECK: @ %bb.0: @ %entry
2153 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2154 ; CHECK-NEXT: push {r4, r5, r6, lr}
2155 ; CHECK-NEXT: cbz r3, .LBB14_8
2156 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2157 ; CHECK-NEXT: cmp r3, #7
2158 ; CHECK-NEXT: bhi .LBB14_3
2159 ; CHECK-NEXT: @ %bb.2:
2160 ; CHECK-NEXT: movs r5, #0
2161 ; CHECK-NEXT: mov r12, r0
2162 ; CHECK-NEXT: mov r6, r1
2163 ; CHECK-NEXT: mov r4, r2
2164 ; CHECK-NEXT: b .LBB14_6
2165 ; CHECK-NEXT: .LBB14_3: @ %vector.ph
2166 ; CHECK-NEXT: bic r5, r3, #7
2167 ; CHECK-NEXT: movs r4, #1
2168 ; CHECK-NEXT: sub.w r6, r5, #8
2169 ; CHECK-NEXT: add.w r12, r0, r5
2170 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
2171 ; CHECK-NEXT: adds r4, r2, r5
2172 ; CHECK-NEXT: adds r6, r1, r5
2173 ; CHECK-NEXT: .LBB14_4: @ %vector.body
2174 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2175 ; CHECK-NEXT: vldrb.s16 q0, [r0], #8
2176 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8
2177 ; CHECK-NEXT: vmul.i16 q0, q1, q0
2178 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2179 ; CHECK-NEXT: vstrb.16 q0, [r2], #8
2180 ; CHECK-NEXT: le lr, .LBB14_4
2181 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2182 ; CHECK-NEXT: cmp r5, r3
2184 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2185 ; CHECK-NEXT: .LBB14_6: @ %for.body.preheader23
2186 ; CHECK-NEXT: sub.w lr, r3, r5
2187 ; CHECK-NEXT: .LBB14_7: @ %for.body
2188 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2189 ; CHECK-NEXT: ldrsb r0, [r12], #1
2190 ; CHECK-NEXT: ldrsb r1, [r6], #1
2191 ; CHECK-NEXT: muls r0, r1, r0
2192 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
2193 ; CHECK-NEXT: strb r0, [r4], #1
2194 ; CHECK-NEXT: le lr, .LBB14_7
2195 ; CHECK-NEXT: .LBB14_8: @ %for.cond.cleanup
2196 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2198 %cmp10 = icmp eq i32 %N, 0
2199 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
2201 for.body.preheader: ; preds = %entry
2202 %min.iters.check = icmp ult i32 %N, 8
2203 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
2205 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
2206 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2207 %pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2208 %pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2209 %pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
2212 vector.ph: ; preds = %for.body.preheader
2213 %n.vec = and i32 %N, -8
2214 %ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
2215 %ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
2216 %ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
2217 br label %vector.body
2219 vector.body: ; preds = %vector.body, %vector.ph
2220 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2221 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
2222 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
2223 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
2224 %0 = bitcast i8* %next.gep to <8 x i8>*
2225 %wide.load = load <8 x i8>, <8 x i8>* %0, align 1
2226 %1 = sext <8 x i8> %wide.load to <8 x i16>
2227 %2 = bitcast i8* %next.gep20 to <8 x i8>*
2228 %wide.load22 = load <8 x i8>, <8 x i8>* %2, align 1
2229 %3 = sext <8 x i8> %wide.load22 to <8 x i16>
2230 %4 = mul nsw <8 x i16> %3, %1
2231 %5 = ashr <8 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2232 %6 = icmp sgt <8 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2233 %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2234 %8 = icmp slt <8 x i16> %7, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2235 %9 = select <8 x i1> %8, <8 x i16> %7, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2236 %10 = trunc <8 x i16> %9 to <8 x i8>
2237 %11 = bitcast i8* %next.gep21 to <8 x i8>*
2238 store <8 x i8> %10, <8 x i8>* %11, align 1
2239 %index.next = add i32 %index, 8
2240 %12 = icmp eq i32 %index.next, %n.vec
2241 br i1 %12, label %middle.block, label %vector.body
2243 middle.block: ; preds = %vector.body
2244 %cmp.n = icmp eq i32 %n.vec, %N
2245 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2247 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
2250 for.body: ; preds = %for.body.preheader23, %for.body
2251 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2252 %pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2253 %pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2254 %pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2255 %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
2256 %13 = load i8, i8* %pSrcA.addr.013, align 1
2257 %conv1 = sext i8 %13 to i16
2258 %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
2259 %14 = load i8, i8* %pSrcB.addr.012, align 1
2260 %conv3 = sext i8 %14 to i16
2261 %mul = mul nsw i16 %conv3, %conv1
2262 %shr = ashr i16 %mul, 7
2263 %15 = icmp sgt i16 %shr, -128
2264 %.val.i = select i1 %15, i16 %shr, i16 -128
2265 %16 = icmp slt i16 %.val.i, 127
2266 %retval.0.i = select i1 %16, i16 %.val.i, i16 127
2267 %conv5 = trunc i16 %retval.0.i to i8
2268 %incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
2269 store i8 %conv5, i8* %pDst.addr.011, align 1
2270 %inc = add nuw i32 %i.014, 1
2271 %exitcond = icmp eq i32 %inc, %N
2272 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_16_q7: element-wise saturating Q7 multiply of two i8 arrays, vectorized 16 lanes wide.
; Per element: sign-extend both inputs to i16, multiply, arithmetic-shift right by 7,
; clamp to [-128, 127], truncate to i8 and store to %pDst.
; Control flow: N == 0 branches to for.cond.cleanup; N < 16 skips the vector loop and uses
; only the scalar loop; otherwise the vector loop covers (N & -16) elements and the scalar
; loop handles any remainder.
; The generated assertions expect the vector body to be selected as vmullb.s8/vmullt.s8
; followed by vqshrnb.s16/vqshrnt.s16 #7, and the scalar body as muls + ssat #8 (asr #7).
2275 define arm_aapcs_vfpcc void @ssatmul_16_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
2276 ; CHECK-LABEL: ssatmul_16_q7:
2277 ; CHECK: @ %bb.0: @ %entry
2278 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2279 ; CHECK-NEXT: push {r4, r5, r6, lr}
2280 ; CHECK-NEXT: cbz r3, .LBB15_8
2281 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2282 ; CHECK-NEXT: cmp r3, #15
2283 ; CHECK-NEXT: bhi .LBB15_3
2284 ; CHECK-NEXT: @ %bb.2:
2285 ; CHECK-NEXT: movs r5, #0
2286 ; CHECK-NEXT: mov r12, r0
2287 ; CHECK-NEXT: mov r6, r1
2288 ; CHECK-NEXT: mov r4, r2
2289 ; CHECK-NEXT: b .LBB15_6
2290 ; CHECK-NEXT: .LBB15_3: @ %vector.ph
2291 ; CHECK-NEXT: bic r5, r3, #15
2292 ; CHECK-NEXT: movs r4, #1
2293 ; CHECK-NEXT: sub.w r6, r5, #16
2294 ; CHECK-NEXT: add.w r12, r0, r5
2295 ; CHECK-NEXT: add.w lr, r4, r6, lsr #4
2296 ; CHECK-NEXT: adds r4, r2, r5
2297 ; CHECK-NEXT: adds r6, r1, r5
2298 ; CHECK-NEXT: .LBB15_4: @ %vector.body
2299 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2300 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2301 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2302 ; CHECK-NEXT: vmullt.s8 q2, q1, q0
2303 ; CHECK-NEXT: vmullb.s8 q0, q1, q0
2304 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2305 ; CHECK-NEXT: vqshrnt.s16 q0, q2, #7
2306 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
2307 ; CHECK-NEXT: le lr, .LBB15_4
2308 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2309 ; CHECK-NEXT: cmp r5, r3
2311 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2312 ; CHECK-NEXT: .LBB15_6: @ %for.body.preheader23
2313 ; CHECK-NEXT: sub.w lr, r3, r5
2314 ; CHECK-NEXT: .LBB15_7: @ %for.body
2315 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2316 ; CHECK-NEXT: ldrsb r0, [r12], #1
2317 ; CHECK-NEXT: ldrsb r1, [r6], #1
2318 ; CHECK-NEXT: muls r0, r1, r0
2319 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
2320 ; CHECK-NEXT: strb r0, [r4], #1
2321 ; CHECK-NEXT: le lr, .LBB15_7
2322 ; CHECK-NEXT: .LBB15_8: @ %for.cond.cleanup
2323 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2325 %cmp10 = icmp eq i32 %N, 0
2326 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 16 elements: bypass the vector loop entirely.
2328 for.body.preheader: ; preds = %entry
2329 %min.iters.check = icmp ult i32 %N, 16
2330 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
; Scalar-loop entry state: start at index 0 with the original pointers, or resume at
; %n.vec with the advanced pointers once the vector loop has finished.
2332 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
2333 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2334 %pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2335 %pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2336 %pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
; %n.vec is N rounded down to a multiple of 16; the %ind.end* values are the three
; pointers advanced by that many bytes.
2339 vector.ph: ; preds = %for.body.preheader
2340 %n.vec = and i32 %N, -16
2341 %ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
2342 %ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
2343 %ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
2344 br label %vector.body
; Vector loop: 16 elements per iteration, clamped with icmp/select pairs before the trunc.
2346 vector.body: ; preds = %vector.body, %vector.ph
2347 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2348 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
2349 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
2350 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
2351 %0 = bitcast i8* %next.gep to <16 x i8>*
2352 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
2353 %1 = sext <16 x i8> %wide.load to <16 x i16>
2354 %2 = bitcast i8* %next.gep20 to <16 x i8>*
2355 %wide.load22 = load <16 x i8>, <16 x i8>* %2, align 1
2356 %3 = sext <16 x i8> %wide.load22 to <16 x i16>
2357 %4 = mul nsw <16 x i16> %3, %1
2358 %5 = ashr <16 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2359 %6 = icmp sgt <16 x i16> %5, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2360 %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2361 %8 = icmp slt <16 x i16> %7, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2362 %9 = select <16 x i1> %8, <16 x i16> %7, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2363 %10 = trunc <16 x i16> %9 to <16 x i8>
2364 %11 = bitcast i8* %next.gep21 to <16 x i8>*
2365 store <16 x i8> %10, <16 x i8>* %11, align 1
2366 %index.next = add i32 %index, 16
2367 %12 = icmp eq i32 %index.next, %n.vec
2368 br i1 %12, label %middle.block, label %vector.body
; Finished if N was an exact multiple of 16; otherwise run the scalar loop on the rest.
2370 middle.block: ; preds = %vector.body
2371 %cmp.n = icmp eq i32 %n.vec, %N
2372 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2374 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, same multiply/shift/clamp/trunc sequence.
2377 for.body: ; preds = %for.body.preheader23, %for.body
2378 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2379 %pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2380 %pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2381 %pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2382 %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
2383 %13 = load i8, i8* %pSrcA.addr.013, align 1
2384 %conv1 = sext i8 %13 to i16
2385 %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
2386 %14 = load i8, i8* %pSrcB.addr.012, align 1
2387 %conv3 = sext i8 %14 to i16
2388 %mul = mul nsw i16 %conv3, %conv1
2389 %shr = ashr i16 %mul, 7
2390 %15 = icmp sgt i16 %shr, -128
2391 %.val.i = select i1 %15, i16 %shr, i16 -128
2392 %16 = icmp slt i16 %.val.i, 127
2393 %retval.0.i = select i1 %16, i16 %.val.i, i16 127
2394 %conv5 = trunc i16 %retval.0.i to i8
2395 %incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
2396 store i8 %conv5, i8* %pDst.addr.011, align 1
2397 %inc = add nuw i32 %i.014, 1
2398 %exitcond = icmp eq i32 %inc, %N
2399 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_16i_q7: interleaved variant of the 16-lane saturating Q7 multiply.
; The vector body splits each <16 x i8> load into even and odd lanes with shufflevector,
; runs the sext/mul/ashr-by-7/clamp-to-[-128,127] sequence on each <8 x i16> half, then
; re-interleaves the two halves before truncating to <16 x i8> and storing.
; Unlike the scalar loop, the vector multiplies here carry no nsw flag.
; Control flow: N == 0 branches to for.cond.cleanup; N < 16 uses only the scalar loop;
; otherwise the vector loop covers (N & -16) elements and the scalar loop handles the rest.
; The generated assertions expect the same selection as the non-interleaved form:
; vmullb.s8/vmullt.s8 followed by vqshrnb.s16/vqshrnt.s16 #7.
2402 define arm_aapcs_vfpcc void @ssatmul_16i_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
2403 ; CHECK-LABEL: ssatmul_16i_q7:
2404 ; CHECK: @ %bb.0: @ %entry
2405 ; CHECK-NEXT: .save {r4, r5, r6, lr}
2406 ; CHECK-NEXT: push {r4, r5, r6, lr}
2407 ; CHECK-NEXT: cbz r3, .LBB16_8
2408 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
2409 ; CHECK-NEXT: cmp r3, #15
2410 ; CHECK-NEXT: bhi .LBB16_3
2411 ; CHECK-NEXT: @ %bb.2:
2412 ; CHECK-NEXT: movs r5, #0
2413 ; CHECK-NEXT: mov r12, r0
2414 ; CHECK-NEXT: mov r6, r1
2415 ; CHECK-NEXT: mov r4, r2
2416 ; CHECK-NEXT: b .LBB16_6
2417 ; CHECK-NEXT: .LBB16_3: @ %vector.ph
2418 ; CHECK-NEXT: bic r5, r3, #15
2419 ; CHECK-NEXT: movs r4, #1
2420 ; CHECK-NEXT: sub.w r6, r5, #16
2421 ; CHECK-NEXT: add.w r12, r0, r5
2422 ; CHECK-NEXT: add.w lr, r4, r6, lsr #4
2423 ; CHECK-NEXT: adds r4, r2, r5
2424 ; CHECK-NEXT: adds r6, r1, r5
2425 ; CHECK-NEXT: .LBB16_4: @ %vector.body
2426 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2427 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2428 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2429 ; CHECK-NEXT: vmullt.s8 q2, q1, q0
2430 ; CHECK-NEXT: vmullb.s8 q0, q1, q0
2431 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2432 ; CHECK-NEXT: vqshrnt.s16 q0, q2, #7
2433 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
2434 ; CHECK-NEXT: le lr, .LBB16_4
2435 ; CHECK-NEXT: @ %bb.5: @ %middle.block
2436 ; CHECK-NEXT: cmp r5, r3
2438 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
2439 ; CHECK-NEXT: .LBB16_6: @ %for.body.preheader23
2440 ; CHECK-NEXT: sub.w lr, r3, r5
2441 ; CHECK-NEXT: .LBB16_7: @ %for.body
2442 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2443 ; CHECK-NEXT: ldrsb r0, [r12], #1
2444 ; CHECK-NEXT: ldrsb r1, [r6], #1
2445 ; CHECK-NEXT: muls r0, r1, r0
2446 ; CHECK-NEXT: ssat r0, #8, r0, asr #7
2447 ; CHECK-NEXT: strb r0, [r4], #1
2448 ; CHECK-NEXT: le lr, .LBB16_7
2449 ; CHECK-NEXT: .LBB16_8: @ %for.cond.cleanup
2450 ; CHECK-NEXT: pop {r4, r5, r6, pc}
2452 %cmp10 = icmp eq i32 %N, 0
2453 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 16 elements: bypass the vector loop entirely.
2455 for.body.preheader: ; preds = %entry
2456 %min.iters.check = icmp ult i32 %N, 16
2457 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
; Scalar-loop entry state: start at index 0 with the original pointers, or resume at
; %n.vec with the advanced pointers once the vector loop has finished.
2459 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
2460 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
2461 %pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
2462 %pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
2463 %pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
; %n.vec is N rounded down to a multiple of 16; the %ind.end* values are the three
; pointers advanced by that many bytes.
2466 vector.ph: ; preds = %for.body.preheader
2467 %n.vec = and i32 %N, -16
2468 %ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
2469 %ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
2470 %ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
2471 br label %vector.body
; Vector loop: shuffle masks <0,2,..,14> and <1,3,..,15> pick the even and odd lanes;
; the final mask <0,8,1,9,..,7,15> interleaves the two clamped halves back together.
2473 vector.body: ; preds = %vector.body, %vector.ph
2474 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2475 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
2476 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
2477 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
2478 %0 = bitcast i8* %next.gep to <16 x i8>*
2479 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
2480 %1 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2481 %2 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2482 %3 = sext <8 x i8> %1 to <8 x i16>
2483 %4 = sext <8 x i8> %2 to <8 x i16>
2484 %5 = bitcast i8* %next.gep20 to <16 x i8>*
2485 %wide.load22 = load <16 x i8>, <16 x i8>* %5, align 1
2486 %6 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2487 %7 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2488 %8 = sext <8 x i8> %6 to <8 x i16>
2489 %9 = sext <8 x i8> %7 to <8 x i16>
2490 %10 = mul <8 x i16> %8, %3
2491 %11 = mul <8 x i16> %9, %4
2492 %12 = ashr <8 x i16> %10, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2493 %13 = ashr <8 x i16> %11, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2494 %14 = icmp sgt <8 x i16> %12, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2495 %15 = icmp sgt <8 x i16> %13, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2496 %16 = select <8 x i1> %14, <8 x i16> %12, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2497 %17 = select <8 x i1> %15, <8 x i16> %13, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2498 %18 = icmp slt <8 x i16> %16, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2499 %19 = icmp slt <8 x i16> %17, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2500 %20 = select <8 x i1> %18, <8 x i16> %16, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2501 %21 = select <8 x i1> %19, <8 x i16> %17, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2502 %22 = shufflevector <8 x i16> %20, <8 x i16> %21, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2503 %23 = trunc <16 x i16> %22 to <16 x i8>
2504 %24 = bitcast i8* %next.gep21 to <16 x i8>*
2505 store <16 x i8> %23, <16 x i8>* %24, align 1
2506 %index.next = add i32 %index, 16
2507 %25 = icmp eq i32 %index.next, %n.vec
2508 br i1 %25, label %middle.block, label %vector.body
; Finished if N was an exact multiple of 16; otherwise run the scalar loop on the rest.
2510 middle.block: ; preds = %vector.body
2511 %cmp.n = icmp eq i32 %n.vec, %N
2512 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
2514 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, multiply/shift/clamp/trunc without interleaving.
2517 for.body: ; preds = %for.body, %for.body.preheader23
2518 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
2519 %pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
2520 %pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
2521 %pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
2522 %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
2523 %26 = load i8, i8* %pSrcA.addr.013, align 1
2524 %conv1 = sext i8 %26 to i16
2525 %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
2526 %27 = load i8, i8* %pSrcB.addr.012, align 1
2527 %conv3 = sext i8 %27 to i16
2528 %mul = mul nsw i16 %conv3, %conv1
2529 %shr = ashr i16 %mul, 7
2530 %28 = icmp sgt i16 %shr, -128
2531 %.val.i = select i1 %28, i16 %shr, i16 -128
2532 %29 = icmp slt i16 %.val.i, 127
2533 %retval.0.i = select i1 %29, i16 %.val.i, i16 127
2534 %conv5 = trunc i16 %retval.0.i to i8
2535 %incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
2536 store i8 %conv5, i8* %pDst.addr.011, align 1
2537 %inc = add nuw i32 %i.014, 1
2538 %exitcond = icmp eq i32 %inc, %N
2539 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; ssatmul_8t_q7: tail-folded 8-lane saturating Q7 multiply (there is no scalar remainder loop).
; The trip count is N rounded up to a multiple of 8. Each iteration builds a lane mask
; (index | <0..7>) ule (N - 1) and uses it for both masked loads (undef passthru, align 1)
; and the masked store, so lanes at or beyond N are neither read nor written.
; Per active lane: sext i8 to i16, multiply, ashr by 7, clamp to [-128, 127], trunc to i8.
; N == 0 branches straight to for.cond.cleanup.
; The generated assertions expect the mask to be materialised from two vcmp.u32 compares
; against lane-index constants loaded from the literal pool, then applied through a
; vptt.i16 block around the vldrbt.s16 loads and through the predicated vstrbt.16 store,
; with vmul.i16 + vqshrnb.s16 #7 doing the arithmetic.
2542 define arm_aapcs_vfpcc void @ssatmul_8t_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
2543 ; CHECK-LABEL: ssatmul_8t_q7:
2544 ; CHECK: @ %bb.0: @ %entry
2545 ; CHECK-NEXT: .save {r4, lr}
2546 ; CHECK-NEXT: push {r4, lr}
2547 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2548 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2549 ; CHECK-NEXT: cmp r3, #0
2550 ; CHECK-NEXT: beq .LBB17_3
2551 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2552 ; CHECK-NEXT: add.w r12, r3, #7
2553 ; CHECK-NEXT: adr r4, .LCPI17_0
2554 ; CHECK-NEXT: bic r12, r12, #7
2555 ; CHECK-NEXT: mov.w lr, #1
2556 ; CHECK-NEXT: sub.w r12, r12, #8
2557 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2558 ; CHECK-NEXT: adr r4, .LCPI17_1
2559 ; CHECK-NEXT: vmov.i8 q2, #0x0
2560 ; CHECK-NEXT: add.w lr, lr, r12, lsr #3
2561 ; CHECK-NEXT: sub.w r12, r3, #1
2562 ; CHECK-NEXT: vldrw.u32 q4, [r4]
2563 ; CHECK-NEXT: movs r3, #0
2564 ; CHECK-NEXT: vdup.32 q1, r12
2565 ; CHECK-NEXT: vmov.i8 q3, #0xff
2566 ; CHECK-NEXT: .LBB17_2: @ %vector.body
2567 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2568 ; CHECK-NEXT: vdup.32 q6, r3
2569 ; CHECK-NEXT: adds r3, #8
2570 ; CHECK-NEXT: vorr q5, q6, q0
2571 ; CHECK-NEXT: vorr q6, q6, q4
2572 ; CHECK-NEXT: vcmp.u32 cs, q1, q5
2573 ; CHECK-NEXT: vpsel q7, q3, q2
2574 ; CHECK-NEXT: vcmp.u32 cs, q1, q6
2575 ; CHECK-NEXT: vmov r4, r12, d14
2576 ; CHECK-NEXT: vpsel q6, q3, q2
2577 ; CHECK-NEXT: vmov.16 q5[0], r4
2578 ; CHECK-NEXT: vmov.16 q5[1], r12
2579 ; CHECK-NEXT: vmov r4, r12, d15
2580 ; CHECK-NEXT: vmov.16 q5[2], r4
2581 ; CHECK-NEXT: vmov.16 q5[3], r12
2582 ; CHECK-NEXT: vmov r4, r12, d12
2583 ; CHECK-NEXT: vmov.16 q5[4], r4
2584 ; CHECK-NEXT: vmov.16 q5[5], r12
2585 ; CHECK-NEXT: vmov r4, r12, d13
2586 ; CHECK-NEXT: vmov.16 q5[6], r4
2587 ; CHECK-NEXT: vmov.16 q5[7], r12
2588 ; CHECK-NEXT: vptt.i16 ne, q5, zr
2589 ; CHECK-NEXT: vldrbt.s16 q5, [r0], #8
2590 ; CHECK-NEXT: vldrbt.s16 q6, [r1], #8
2591 ; CHECK-NEXT: vmul.i16 q5, q6, q5
2592 ; CHECK-NEXT: vqshrnb.s16 q5, q5, #7
2594 ; CHECK-NEXT: vstrbt.16 q5, [r2], #8
2595 ; CHECK-NEXT: le lr, .LBB17_2
2596 ; CHECK-NEXT: .LBB17_3: @ %for.cond.cleanup
2597 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2598 ; CHECK-NEXT: pop {r4, pc}
2599 ; CHECK-NEXT: .p2align 4
2600 ; CHECK-NEXT: @ %bb.4:
2601 ; CHECK-NEXT: .LCPI17_0:
2602 ; CHECK-NEXT: .long 0 @ 0x0
2603 ; CHECK-NEXT: .long 1 @ 0x1
2604 ; CHECK-NEXT: .long 2 @ 0x2
2605 ; CHECK-NEXT: .long 3 @ 0x3
2606 ; CHECK-NEXT: .LCPI17_1:
2607 ; CHECK-NEXT: .long 4 @ 0x4
2608 ; CHECK-NEXT: .long 5 @ 0x5
2609 ; CHECK-NEXT: .long 6 @ 0x6
2610 ; CHECK-NEXT: .long 7 @ 0x7
2612 %cmp10 = icmp eq i32 %N, 0
2613 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; %n.vec is N rounded up to a multiple of 8; %broadcast.splat23 is N-1 in every lane and
; serves as the upper bound for the per-lane index compare.
2615 vector.ph: ; preds = %entry
2616 %n.rnd.up = add i32 %N, 7
2617 %n.vec = and i32 %n.rnd.up, -8
2618 %trip.count.minus.1 = add i32 %N, -1
2619 %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
2620 %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
2621 br label %vector.body
; Predicated vector loop: %0 is the active-lane mask shared by both loads and the store.
2623 vector.body: ; preds = %vector.body, %vector.ph
2624 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2625 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
2626 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
2627 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2628 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
2629 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
2630 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
2631 %0 = icmp ule <8 x i32> %induction, %broadcast.splat23
2632 %1 = bitcast i8* %next.gep to <8 x i8>*
2633 %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %1, i32 1, <8 x i1> %0, <8 x i8> undef)
2634 %2 = sext <8 x i8> %wide.masked.load to <8 x i16>
2635 %3 = bitcast i8* %next.gep20 to <8 x i8>*
2636 %wide.masked.load24 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %3, i32 1, <8 x i1> %0, <8 x i8> undef)
2637 %4 = sext <8 x i8> %wide.masked.load24 to <8 x i16>
2638 %5 = mul nsw <8 x i16> %4, %2
2639 %6 = ashr <8 x i16> %5, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2640 %7 = icmp sgt <8 x i16> %6, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2641 %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2642 %9 = icmp slt <8 x i16> %8, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2643 %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2644 %11 = trunc <8 x i16> %10 to <8 x i8>
2645 %12 = bitcast i8* %next.gep21 to <8 x i8>*
2646 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %11, <8 x i8>* %12, i32 1, <8 x i1> %0)
2647 %index.next = add i32 %index, 8
2648 %13 = icmp eq i32 %index.next, %n.vec
2649 br i1 %13, label %for.cond.cleanup, label %vector.body
2651 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_16t_q7: tail-folded 16-lane saturating Q7 multiply (there is no scalar remainder loop).
; The trip count is N rounded up to a multiple of 16. Each iteration builds a lane mask
; (index | <0..15>) ule (N - 1) and uses it for both masked loads (undef passthru, align 1)
; and the masked store, so lanes at or beyond N are neither read nor written.
; Per active lane: sext i8 to i16, multiply, ashr by 7, clamp to [-128, 127], trunc to i8.
; N == 0 branches straight to for.cond.cleanup.
; The generated assertions expect the <16 x i1> mask to be assembled from four vcmp.u32
; compares that are narrowed lane by lane through 16-bit and then 8-bit vectors. Three of
; the four lane-index constant vectors are spilled to a 48-byte stack area before the loop
; and reloaded inside it, while the fourth stays in q6. The loads sit in a vptt.i8 block
; (vldrbt.u8), the arithmetic is vmullb.s8/vmullt.s8 + vqshrnb.s16/vqshrnt.s16 #7, and the
; store is the predicated vstrbt.8.
2655 define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
2656 ; CHECK-LABEL: ssatmul_16t_q7:
2657 ; CHECK: @ %bb.0: @ %entry
2658 ; CHECK-NEXT: .save {r4, r5, r7, lr}
2659 ; CHECK-NEXT: push {r4, r5, r7, lr}
2660 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2661 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2662 ; CHECK-NEXT: .pad #48
2663 ; CHECK-NEXT: sub sp, #48
2664 ; CHECK-NEXT: cmp r3, #0
2665 ; CHECK-NEXT: beq.w .LBB18_3
2666 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2667 ; CHECK-NEXT: add.w r12, r3, #15
2668 ; CHECK-NEXT: adr r4, .LCPI18_0
2669 ; CHECK-NEXT: bic r12, r12, #15
2670 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2671 ; CHECK-NEXT: sub.w r12, r12, #16
2672 ; CHECK-NEXT: mov.w lr, #1
2673 ; CHECK-NEXT: adr r4, .LCPI18_1
2674 ; CHECK-NEXT: movs r5, #0
2675 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4
2676 ; CHECK-NEXT: sub.w r12, r3, #1
2677 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
2678 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2679 ; CHECK-NEXT: adr r4, .LCPI18_2
2680 ; CHECK-NEXT: vdup.32 q1, r12
2681 ; CHECK-NEXT: vmov.i8 q2, #0x0
2682 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
2683 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2684 ; CHECK-NEXT: adr r4, .LCPI18_3
2685 ; CHECK-NEXT: vmov.i8 q3, #0xff
2686 ; CHECK-NEXT: vldrw.u32 q6, [r4]
2687 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
2688 ; CHECK-NEXT: .LBB18_2: @ %vector.body
2689 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2690 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
2691 ; CHECK-NEXT: vdup.32 q0, r5
2692 ; CHECK-NEXT: adds r5, #16
2693 ; CHECK-NEXT: vorr q4, q0, q4
2694 ; CHECK-NEXT: vcmp.u32 cs, q1, q4
2695 ; CHECK-NEXT: vpsel q4, q3, q2
2696 ; CHECK-NEXT: vmov r4, r3, d8
2697 ; CHECK-NEXT: vmov.16 q7[0], r4
2698 ; CHECK-NEXT: vmov.16 q7[1], r3
2699 ; CHECK-NEXT: vmov r3, r4, d9
2700 ; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload
2701 ; CHECK-NEXT: vmov.16 q7[2], r3
2702 ; CHECK-NEXT: vmov.16 q7[3], r4
2703 ; CHECK-NEXT: vorr q4, q0, q4
2704 ; CHECK-NEXT: vcmp.u32 cs, q1, q4
2705 ; CHECK-NEXT: vpsel q4, q3, q2
2706 ; CHECK-NEXT: vmov r3, r4, d8
2707 ; CHECK-NEXT: vmov.16 q7[4], r3
2708 ; CHECK-NEXT: vmov.16 q7[5], r4
2709 ; CHECK-NEXT: vmov r3, r4, d9
2710 ; CHECK-NEXT: vmov.16 q7[6], r3
2711 ; CHECK-NEXT: vmov.16 q7[7], r4
2712 ; CHECK-NEXT: vcmp.i16 ne, q7, zr
2713 ; CHECK-NEXT: vpsel q4, q3, q2
2714 ; CHECK-NEXT: vmov.u16 r3, q4[0]
2715 ; CHECK-NEXT: vmov.8 q7[0], r3
2716 ; CHECK-NEXT: vmov.u16 r3, q4[1]
2717 ; CHECK-NEXT: vmov.8 q7[1], r3
2718 ; CHECK-NEXT: vmov.u16 r3, q4[2]
2719 ; CHECK-NEXT: vmov.8 q7[2], r3
2720 ; CHECK-NEXT: vmov.u16 r3, q4[3]
2721 ; CHECK-NEXT: vmov.8 q7[3], r3
2722 ; CHECK-NEXT: vmov.u16 r3, q4[4]
2723 ; CHECK-NEXT: vmov.8 q7[4], r3
2724 ; CHECK-NEXT: vmov.u16 r3, q4[5]
2725 ; CHECK-NEXT: vmov.8 q7[5], r3
2726 ; CHECK-NEXT: vmov.u16 r3, q4[6]
2727 ; CHECK-NEXT: vmov.8 q7[6], r3
2728 ; CHECK-NEXT: vmov.u16 r3, q4[7]
2729 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
2730 ; CHECK-NEXT: vmov.8 q7[7], r3
2731 ; CHECK-NEXT: vorr q4, q0, q4
2732 ; CHECK-NEXT: vorr q0, q0, q6
2733 ; CHECK-NEXT: vcmp.u32 cs, q1, q4
2734 ; CHECK-NEXT: vpsel q5, q3, q2
2735 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2736 ; CHECK-NEXT: vmov r3, r4, d10
2737 ; CHECK-NEXT: vpsel q0, q3, q2
2738 ; CHECK-NEXT: vmov.16 q4[0], r3
2739 ; CHECK-NEXT: vmov.16 q4[1], r4
2740 ; CHECK-NEXT: vmov r3, r4, d11
2741 ; CHECK-NEXT: vmov.16 q4[2], r3
2742 ; CHECK-NEXT: vmov.16 q4[3], r4
2743 ; CHECK-NEXT: vmov r3, r4, d0
2744 ; CHECK-NEXT: vmov.16 q4[4], r3
2745 ; CHECK-NEXT: vmov.16 q4[5], r4
2746 ; CHECK-NEXT: vmov r3, r4, d1
2747 ; CHECK-NEXT: vmov.16 q4[6], r3
2748 ; CHECK-NEXT: vmov.16 q4[7], r4
2749 ; CHECK-NEXT: vcmp.i16 ne, q4, zr
2750 ; CHECK-NEXT: vpsel q0, q3, q2
2751 ; CHECK-NEXT: vmov.u16 r3, q0[0]
2752 ; CHECK-NEXT: vmov.8 q7[8], r3
2753 ; CHECK-NEXT: vmov.u16 r3, q0[1]
2754 ; CHECK-NEXT: vmov.8 q7[9], r3
2755 ; CHECK-NEXT: vmov.u16 r3, q0[2]
2756 ; CHECK-NEXT: vmov.8 q7[10], r3
2757 ; CHECK-NEXT: vmov.u16 r3, q0[3]
2758 ; CHECK-NEXT: vmov.8 q7[11], r3
2759 ; CHECK-NEXT: vmov.u16 r3, q0[4]
2760 ; CHECK-NEXT: vmov.8 q7[12], r3
2761 ; CHECK-NEXT: vmov.u16 r3, q0[5]
2762 ; CHECK-NEXT: vmov.8 q7[13], r3
2763 ; CHECK-NEXT: vmov.u16 r3, q0[6]
2764 ; CHECK-NEXT: vmov.8 q7[14], r3
2765 ; CHECK-NEXT: vmov.u16 r3, q0[7]
2766 ; CHECK-NEXT: vmov.8 q7[15], r3
2767 ; CHECK-NEXT: vptt.i8 ne, q7, zr
2768 ; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
2769 ; CHECK-NEXT: vldrbt.u8 q4, [r1], #16
2770 ; CHECK-NEXT: vmullt.s8 q5, q4, q0
2771 ; CHECK-NEXT: vmullb.s8 q0, q4, q0
2772 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2773 ; CHECK-NEXT: vqshrnt.s16 q0, q5, #7
2775 ; CHECK-NEXT: vstrbt.8 q0, [r2], #16
2776 ; CHECK-NEXT: le lr, .LBB18_2
2777 ; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup
2778 ; CHECK-NEXT: add sp, #48
2779 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2780 ; CHECK-NEXT: pop {r4, r5, r7, pc}
2781 ; CHECK-NEXT: .p2align 4
2782 ; CHECK-NEXT: @ %bb.4:
2783 ; CHECK-NEXT: .LCPI18_0:
2784 ; CHECK-NEXT: .long 0 @ 0x0
2785 ; CHECK-NEXT: .long 1 @ 0x1
2786 ; CHECK-NEXT: .long 2 @ 0x2
2787 ; CHECK-NEXT: .long 3 @ 0x3
2788 ; CHECK-NEXT: .LCPI18_1:
2789 ; CHECK-NEXT: .long 4 @ 0x4
2790 ; CHECK-NEXT: .long 5 @ 0x5
2791 ; CHECK-NEXT: .long 6 @ 0x6
2792 ; CHECK-NEXT: .long 7 @ 0x7
2793 ; CHECK-NEXT: .LCPI18_2:
2794 ; CHECK-NEXT: .long 8 @ 0x8
2795 ; CHECK-NEXT: .long 9 @ 0x9
2796 ; CHECK-NEXT: .long 10 @ 0xa
2797 ; CHECK-NEXT: .long 11 @ 0xb
2798 ; CHECK-NEXT: .LCPI18_3:
2799 ; CHECK-NEXT: .long 12 @ 0xc
2800 ; CHECK-NEXT: .long 13 @ 0xd
2801 ; CHECK-NEXT: .long 14 @ 0xe
2802 ; CHECK-NEXT: .long 15 @ 0xf
2804 %cmp10 = icmp eq i32 %N, 0
2805 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; %n.vec is N rounded up to a multiple of 16; %broadcast.splat23 is N-1 in every lane and
; serves as the upper bound for the per-lane index compare.
2807 vector.ph: ; preds = %entry
2808 %n.rnd.up = add i32 %N, 15
2809 %n.vec = and i32 %n.rnd.up, -16
2810 %trip.count.minus.1 = add i32 %N, -1
2811 %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
2812 %broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
2813 br label %vector.body
; Predicated vector loop: %0 is the active-lane mask shared by both loads and the store.
2815 vector.body: ; preds = %vector.body, %vector.ph
2816 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2817 %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
2818 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
2819 %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2820 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
2821 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
2822 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
2823 %0 = icmp ule <16 x i32> %induction, %broadcast.splat23
2824 %1 = bitcast i8* %next.gep to <16 x i8>*
2825 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %0, <16 x i8> undef)
2826 %2 = sext <16 x i8> %wide.masked.load to <16 x i16>
2827 %3 = bitcast i8* %next.gep20 to <16 x i8>*
2828 %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %0, <16 x i8> undef)
2829 %4 = sext <16 x i8> %wide.masked.load24 to <16 x i16>
2830 %5 = mul nsw <16 x i16> %4, %2
2831 %6 = ashr <16 x i16> %5, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
2832 %7 = icmp sgt <16 x i16> %6, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2833 %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
2834 %9 = icmp slt <16 x i16> %8, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2835 %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
2836 %11 = trunc <16 x i16> %10 to <16 x i8>
2837 %12 = bitcast i8* %next.gep21 to <16 x i8>*
2838 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %11, <16 x i8>* %12, i32 1, <16 x i1> %0)
2839 %index.next = add i32 %index, 16
2840 %13 = icmp eq i32 %index.next, %n.vec
2841 br i1 %13, label %for.cond.cleanup, label %vector.body
2843 for.cond.cleanup: ; preds = %vector.body, %entry
; ssatmul_16ti_q7: tail-predicated, interleaved 16 x i8 multiply loop.
;
; Per iteration the IR below:
;   * builds a 16-lane mask that is true for lanes whose index is <= N-1,
;   * masked-loads 16 bytes from %pSrcA and %pSrcB,
;   * splits each vector into even and odd lanes, sign-extends them to i16,
;   * multiplies, arithmetic-shifts right by 7, clamps to [-128, 127],
;   * re-interleaves the two halves, truncates to i8 and masked-stores the
;     result to %pDst.
; The loop advances by 16 elements and runs until the index reaches N rounded
; up to a multiple of 16; there is no scalar remainder loop.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of editing by
; hand. In the expected output the 16-lane predicate is assembled from four
; 4-lane unsigned compares against the lane offsets held in .LCPI19_0 to
; .LCPI19_3, and the arithmetic is done with predicated vmullb/vmullt.s8
; followed by vqshrnb/vqshrnt.s16 #7.
2847 define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
2848 ; CHECK-LABEL: ssatmul_16ti_q7:
2849 ; CHECK: @ %bb.0: @ %entry
2850 ; CHECK-NEXT: .save {r4, r5, r7, lr}
2851 ; CHECK-NEXT: push {r4, r5, r7, lr}
2852 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2853 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2854 ; CHECK-NEXT: .pad #48
2855 ; CHECK-NEXT: sub sp, #48
2856 ; CHECK-NEXT: cmp r3, #0
2857 ; CHECK-NEXT: beq.w .LBB19_3
2858 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2859 ; CHECK-NEXT: add.w r12, r3, #15
2860 ; CHECK-NEXT: adr r4, .LCPI19_0
2861 ; CHECK-NEXT: bic r12, r12, #15
2862 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2863 ; CHECK-NEXT: sub.w r12, r12, #16
2864 ; CHECK-NEXT: mov.w lr, #1
2865 ; CHECK-NEXT: adr r4, .LCPI19_1
2866 ; CHECK-NEXT: movs r5, #0
2867 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4
2868 ; CHECK-NEXT: sub.w r12, r3, #1
2869 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
2870 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2871 ; CHECK-NEXT: adr r4, .LCPI19_2
2872 ; CHECK-NEXT: vdup.32 q1, r12
2873 ; CHECK-NEXT: vmov.i8 q2, #0x0
2874 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
2875 ; CHECK-NEXT: vldrw.u32 q0, [r4]
2876 ; CHECK-NEXT: adr r4, .LCPI19_3
2877 ; CHECK-NEXT: vmov.i8 q3, #0xff
2878 ; CHECK-NEXT: vldrw.u32 q6, [r4]
2879 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
2880 ; CHECK-NEXT: .LBB19_2: @ %vector.body
2881 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2882 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
2883 ; CHECK-NEXT: vdup.32 q0, r5
2884 ; CHECK-NEXT: adds r5, #16
2885 ; CHECK-NEXT: vorr q4, q0, q4
2886 ; CHECK-NEXT: vcmp.u32 cs, q1, q4
2887 ; CHECK-NEXT: vpsel q4, q3, q2
2888 ; CHECK-NEXT: vmov r4, r3, d8
2889 ; CHECK-NEXT: vmov.16 q7[0], r4
2890 ; CHECK-NEXT: vmov.16 q7[1], r3
2891 ; CHECK-NEXT: vmov r3, r4, d9
2892 ; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload
2893 ; CHECK-NEXT: vmov.16 q7[2], r3
2894 ; CHECK-NEXT: vmov.16 q7[3], r4
2895 ; CHECK-NEXT: vorr q4, q0, q4
2896 ; CHECK-NEXT: vcmp.u32 cs, q1, q4
2897 ; CHECK-NEXT: vpsel q4, q3, q2
2898 ; CHECK-NEXT: vmov r3, r4, d8
2899 ; CHECK-NEXT: vmov.16 q7[4], r3
2900 ; CHECK-NEXT: vmov.16 q7[5], r4
2901 ; CHECK-NEXT: vmov r3, r4, d9
2902 ; CHECK-NEXT: vmov.16 q7[6], r3
2903 ; CHECK-NEXT: vmov.16 q7[7], r4
2904 ; CHECK-NEXT: vcmp.i16 ne, q7, zr
2905 ; CHECK-NEXT: vpsel q4, q3, q2
2906 ; CHECK-NEXT: vmov.u16 r3, q4[0]
2907 ; CHECK-NEXT: vmov.8 q7[0], r3
2908 ; CHECK-NEXT: vmov.u16 r3, q4[1]
2909 ; CHECK-NEXT: vmov.8 q7[1], r3
2910 ; CHECK-NEXT: vmov.u16 r3, q4[2]
2911 ; CHECK-NEXT: vmov.8 q7[2], r3
2912 ; CHECK-NEXT: vmov.u16 r3, q4[3]
2913 ; CHECK-NEXT: vmov.8 q7[3], r3
2914 ; CHECK-NEXT: vmov.u16 r3, q4[4]
2915 ; CHECK-NEXT: vmov.8 q7[4], r3
2916 ; CHECK-NEXT: vmov.u16 r3, q4[5]
2917 ; CHECK-NEXT: vmov.8 q7[5], r3
2918 ; CHECK-NEXT: vmov.u16 r3, q4[6]
2919 ; CHECK-NEXT: vmov.8 q7[6], r3
2920 ; CHECK-NEXT: vmov.u16 r3, q4[7]
2921 ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
2922 ; CHECK-NEXT: vmov.8 q7[7], r3
2923 ; CHECK-NEXT: vorr q4, q0, q4
2924 ; CHECK-NEXT: vorr q0, q0, q6
2925 ; CHECK-NEXT: vcmp.u32 cs, q1, q4
2926 ; CHECK-NEXT: vpsel q5, q3, q2
2927 ; CHECK-NEXT: vcmp.u32 cs, q1, q0
2928 ; CHECK-NEXT: vmov r3, r4, d10
2929 ; CHECK-NEXT: vpsel q0, q3, q2
2930 ; CHECK-NEXT: vmov.16 q4[0], r3
2931 ; CHECK-NEXT: vmov.16 q4[1], r4
2932 ; CHECK-NEXT: vmov r3, r4, d11
2933 ; CHECK-NEXT: vmov.16 q4[2], r3
2934 ; CHECK-NEXT: vmov.16 q4[3], r4
2935 ; CHECK-NEXT: vmov r3, r4, d0
2936 ; CHECK-NEXT: vmov.16 q4[4], r3
2937 ; CHECK-NEXT: vmov.16 q4[5], r4
2938 ; CHECK-NEXT: vmov r3, r4, d1
2939 ; CHECK-NEXT: vmov.16 q4[6], r3
2940 ; CHECK-NEXT: vmov.16 q4[7], r4
2941 ; CHECK-NEXT: vcmp.i16 ne, q4, zr
2942 ; CHECK-NEXT: vpsel q0, q3, q2
2943 ; CHECK-NEXT: vmov.u16 r3, q0[0]
2944 ; CHECK-NEXT: vmov.8 q7[8], r3
2945 ; CHECK-NEXT: vmov.u16 r3, q0[1]
2946 ; CHECK-NEXT: vmov.8 q7[9], r3
2947 ; CHECK-NEXT: vmov.u16 r3, q0[2]
2948 ; CHECK-NEXT: vmov.8 q7[10], r3
2949 ; CHECK-NEXT: vmov.u16 r3, q0[3]
2950 ; CHECK-NEXT: vmov.8 q7[11], r3
2951 ; CHECK-NEXT: vmov.u16 r3, q0[4]
2952 ; CHECK-NEXT: vmov.8 q7[12], r3
2953 ; CHECK-NEXT: vmov.u16 r3, q0[5]
2954 ; CHECK-NEXT: vmov.8 q7[13], r3
2955 ; CHECK-NEXT: vmov.u16 r3, q0[6]
2956 ; CHECK-NEXT: vmov.8 q7[14], r3
2957 ; CHECK-NEXT: vmov.u16 r3, q0[7]
2958 ; CHECK-NEXT: vmov.8 q7[15], r3
2959 ; CHECK-NEXT: vptt.i8 ne, q7, zr
2960 ; CHECK-NEXT: vldrbt.u8 q0, [r0], #16
2961 ; CHECK-NEXT: vldrbt.u8 q4, [r1], #16
2962 ; CHECK-NEXT: vmullt.s8 q5, q4, q0
2963 ; CHECK-NEXT: vmullb.s8 q0, q4, q0
2964 ; CHECK-NEXT: vqshrnb.s16 q0, q0, #7
2965 ; CHECK-NEXT: vqshrnt.s16 q0, q5, #7
2967 ; CHECK-NEXT: vstrbt.8 q0, [r2], #16
2968 ; CHECK-NEXT: le lr, .LBB19_2
2969 ; CHECK-NEXT: .LBB19_3: @ %for.cond.cleanup
2970 ; CHECK-NEXT: add sp, #48
2971 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2972 ; CHECK-NEXT: pop {r4, r5, r7, pc}
2973 ; CHECK-NEXT: .p2align 4
2974 ; CHECK-NEXT: @ %bb.4:
2975 ; CHECK-NEXT: .LCPI19_0:
2976 ; CHECK-NEXT: .long 0 @ 0x0
2977 ; CHECK-NEXT: .long 1 @ 0x1
2978 ; CHECK-NEXT: .long 2 @ 0x2
2979 ; CHECK-NEXT: .long 3 @ 0x3
2980 ; CHECK-NEXT: .LCPI19_1:
2981 ; CHECK-NEXT: .long 4 @ 0x4
2982 ; CHECK-NEXT: .long 5 @ 0x5
2983 ; CHECK-NEXT: .long 6 @ 0x6
2984 ; CHECK-NEXT: .long 7 @ 0x7
2985 ; CHECK-NEXT: .LCPI19_2:
2986 ; CHECK-NEXT: .long 8 @ 0x8
2987 ; CHECK-NEXT: .long 9 @ 0x9
2988 ; CHECK-NEXT: .long 10 @ 0xa
2989 ; CHECK-NEXT: .long 11 @ 0xb
2990 ; CHECK-NEXT: .LCPI19_3:
2991 ; CHECK-NEXT: .long 12 @ 0xc
2992 ; CHECK-NEXT: .long 13 @ 0xd
2993 ; CHECK-NEXT: .long 14 @ 0xe
2994 ; CHECK-NEXT: .long 15 @ 0xf
; Nothing to do when N == 0: branch straight to the exit block.
2996 %cmp10 = icmp eq i32 %N, 0
2997 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; Preheader: %n.vec is N rounded up to a multiple of 16 (loop bound), and
; N-1 is splatted across 16 lanes for the per-lane mask compare.
2999 vector.ph: ; preds = %entry
3000 %n.rnd.up = add i32 %N, 15
3001 %n.vec = and i32 %n.rnd.up, -16
3002 %trip.count.minus.1 = add i32 %N, -1
3003 %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
3004 %broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer
3005 br label %vector.body
; Loop body: %induction holds the 16 element indices of this iteration
; (splat of %index OR-ed with 0..15); %0 is the active-lane mask.
3007 vector.body: ; preds = %vector.body, %vector.ph
3008 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
3009 %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
3010 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
3011 %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3012 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
3013 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
3014 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
3015 %0 = icmp ule <16 x i32> %induction, %broadcast.splat23
; First operand: masked load, then split into even (%2) and odd (%3) lanes
; and sign-extend each half to i16.
3016 %1 = bitcast i8* %next.gep to <16 x i8>*
3017 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %0, <16 x i8> undef)
3018 %2 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
3019 %3 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
3020 %4 = sext <8 x i8> %2 to <8 x i16>
3021 %5 = sext <8 x i8> %3 to <8 x i16>
; Second operand: same even/odd split and sign extension.
3022 %6 = bitcast i8* %next.gep20 to <16 x i8>*
3023 %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %6, i32 1, <16 x i1> %0, <16 x i8> undef)
3024 %7 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
3025 %8 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
3026 %9 = sext <8 x i8> %7 to <8 x i16>
3027 %10 = sext <8 x i8> %8 to <8 x i16>
; Multiply the even and odd halves and shift the i16 products right by 7.
3028 %11 = mul <8 x i16> %9, %4
3029 %12 = mul <8 x i16> %10, %5
3030 %13 = ashr <8 x i16> %11, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
3031 %14 = ashr <8 x i16> %12, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; Clamp: first max(x, -128), then min(x, 127), written as icmp + select.
3032 %15 = icmp sgt <8 x i16> %13, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
3033 %16 = icmp sgt <8 x i16> %14, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
3034 %17 = select <8 x i1> %15, <8 x i16> %13, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
3035 %18 = select <8 x i1> %16, <8 x i16> %14, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
3036 %19 = icmp slt <8 x i16> %17, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
3037 %20 = icmp slt <8 x i16> %18, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
3038 %21 = select <8 x i1> %19, <8 x i16> %17, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
3039 %22 = select <8 x i1> %20, <8 x i16> %18, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
; Re-interleave even/odd results into original lane order, narrow to i8 and
; store under the same mask used for the loads.
3040 %23 = shufflevector <8 x i16> %21, <8 x i16> %22, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
3041 %24 = trunc <16 x i16> %23 to <16 x i8>
3042 %25 = bitcast i8* %next.gep21 to <16 x i8>*
3043 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %24, <16 x i8>* %25, i32 1, <16 x i1> %0)
3044 %index.next = add i32 %index, 16
3045 %26 = icmp eq i32 %index.next, %n.vec
3046 br i1 %26, label %for.cond.cleanup, label %vector.body
3048 for.cond.cleanup: ; preds = %vector.body, %entry
; usatmul_8_q7: 8-wide unsigned multiply loop with a scalar remainder.
;
; For each element the IR zero-extends one byte from %pSrcA and one from
; %pSrcB to i16, multiplies them, logical-shifts the product right by 7,
; clamps the result to at most 255 and stores the low byte to %pDst.
; Control flow:
;   * N == 0           -> exit immediately,
;   * N < 8            -> scalar loop only,
;   * otherwise        -> vector loop over N & -8 elements (8 per iteration),
;                         then the scalar loop for any leftover elements.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of editing by
; hand. The expected vector body is a widening byte load (vldrb.u16), a
; vmul.i16, a vqshrnb.u16 #7 and a narrowing byte store (vstrb.16).
3052 define arm_aapcs_vfpcc void @usatmul_8_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
3053 ; CHECK-LABEL: usatmul_8_q7:
3054 ; CHECK: @ %bb.0: @ %entry
3055 ; CHECK-NEXT: .save {r4, r5, r6, lr}
3056 ; CHECK-NEXT: push {r4, r5, r6, lr}
3057 ; CHECK-NEXT: cbz r3, .LBB20_8
3058 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
3059 ; CHECK-NEXT: cmp r3, #7
3060 ; CHECK-NEXT: bhi .LBB20_3
3061 ; CHECK-NEXT: @ %bb.2:
3062 ; CHECK-NEXT: movs r5, #0
3063 ; CHECK-NEXT: mov r12, r0
3064 ; CHECK-NEXT: mov r6, r1
3065 ; CHECK-NEXT: mov r4, r2
3066 ; CHECK-NEXT: b .LBB20_6
3067 ; CHECK-NEXT: .LBB20_3: @ %vector.ph
3068 ; CHECK-NEXT: bic r5, r3, #7
3069 ; CHECK-NEXT: movs r4, #1
3070 ; CHECK-NEXT: sub.w r6, r5, #8
3071 ; CHECK-NEXT: add.w r12, r0, r5
3072 ; CHECK-NEXT: add.w lr, r4, r6, lsr #3
3073 ; CHECK-NEXT: adds r4, r2, r5
3074 ; CHECK-NEXT: adds r6, r1, r5
3075 ; CHECK-NEXT: .LBB20_4: @ %vector.body
3076 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
3077 ; CHECK-NEXT: vldrb.u16 q0, [r0], #8
3078 ; CHECK-NEXT: vldrb.u16 q1, [r1], #8
3079 ; CHECK-NEXT: vmul.i16 q0, q1, q0
3080 ; CHECK-NEXT: vqshrnb.u16 q0, q0, #7
3081 ; CHECK-NEXT: vstrb.16 q0, [r2], #8
3082 ; CHECK-NEXT: le lr, .LBB20_4
3083 ; CHECK-NEXT: @ %bb.5: @ %middle.block
3084 ; CHECK-NEXT: cmp r5, r3
3086 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
3087 ; CHECK-NEXT: .LBB20_6: @ %for.body.preheader23
3088 ; CHECK-NEXT: sub.w lr, r3, r5
3089 ; CHECK-NEXT: .LBB20_7: @ %for.body
3090 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
3091 ; CHECK-NEXT: ldrb r0, [r12], #1
3092 ; CHECK-NEXT: ldrb r1, [r6], #1
3093 ; CHECK-NEXT: muls r0, r1, r0
3094 ; CHECK-NEXT: lsrs r1, r0, #7
3095 ; CHECK-NEXT: cmp r1, #255
3096 ; CHECK-NEXT: mov.w r1, #255
3098 ; CHECK-NEXT: lsrlo r1, r0, #7
3099 ; CHECK-NEXT: strb r1, [r4], #1
3100 ; CHECK-NEXT: le lr, .LBB20_7
3101 ; CHECK-NEXT: .LBB20_8: @ %for.cond.cleanup
3102 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do when N == 0.
3104 %cmp10 = icmp eq i32 %N, 0
3105 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 8 elements: skip the vector loop and go straight to scalar code.
3107 for.body.preheader: ; preds = %entry
3108 %min.iters.check = icmp ult i32 %N, 8
3109 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
; Scalar-loop preheader: start index and pointers are either the original
; arguments (no vector loop ran) or the positions where the vector loop ended.
3111 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
3112 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
3113 %pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
3114 %pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
3115 %pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
; Vector preheader: %n.vec is N rounded down to a multiple of 8; the
; %ind.end* pointers are where the scalar remainder will resume.
3118 vector.ph: ; preds = %for.body.preheader
3119 %n.vec = and i32 %N, -8
3120 %ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
3121 %ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
3122 %ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
3123 br label %vector.body
; Vector body: 8 elements per iteration, unmasked loads and store.
3125 vector.body: ; preds = %vector.body, %vector.ph
3126 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
3127 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
3128 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
3129 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
3130 %0 = bitcast i8* %next.gep to <8 x i8>*
3131 %wide.load = load <8 x i8>, <8 x i8>* %0, align 1
3132 %1 = zext <8 x i8> %wide.load to <8 x i16>
3133 %2 = bitcast i8* %next.gep20 to <8 x i8>*
3134 %wide.load22 = load <8 x i8>, <8 x i8>* %2, align 1
3135 %3 = zext <8 x i8> %wide.load22 to <8 x i16>
; Product of two zero-extended bytes is at most 255*255, which fits in i16,
; so the nuw flag holds. After the shift the value is clamped to 255.
3136 %4 = mul nuw <8 x i16> %3, %1
3137 %5 = lshr <8 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
3138 %6 = icmp ult <8 x i16> %5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
3139 %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
3140 %8 = trunc <8 x i16> %7 to <8 x i8>
3141 %9 = bitcast i8* %next.gep21 to <8 x i8>*
3142 store <8 x i8> %8, <8 x i8>* %9, align 1
3143 %index.next = add i32 %index, 8
3144 %10 = icmp eq i32 %index.next, %n.vec
3145 br i1 %10, label %middle.block, label %vector.body
; If the vector loop covered all N elements we are done; otherwise run the
; scalar loop on the remainder.
3147 middle.block: ; preds = %vector.body
3148 %cmp.n = icmp eq i32 %n.vec, %N
3149 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
3151 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, same arithmetic as the vector body.
3154 for.body: ; preds = %for.body.preheader23, %for.body
3155 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
3156 %pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
3157 %pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
3158 %pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
3159 %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
3160 %11 = load i8, i8* %pSrcA.addr.013, align 1
3161 %conv1 = zext i8 %11 to i16
3162 %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
3163 %12 = load i8, i8* %pSrcB.addr.012, align 1
3164 %conv3 = zext i8 %12 to i16
3165 %mul = mul nuw i16 %conv3, %conv1
3166 %13 = lshr i16 %mul, 7
3167 %14 = icmp ult i16 %13, 255
3168 %retval.0.i = select i1 %14, i16 %13, i16 255
3169 %conv5 = trunc i16 %retval.0.i to i8
3170 %incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
3171 store i8 %conv5, i8* %pDst.addr.011, align 1
3172 %inc = add nuw i32 %i.014, 1
3173 %exitcond = icmp eq i32 %inc, %N
3174 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; usatmul_16_q7: 16-wide unsigned multiply loop with a scalar remainder.
;
; Same per-element arithmetic as the 8-wide variant: zero-extend one byte from
; each source to i16, multiply, logical-shift right by 7, clamp to at most
; 255, and store the low byte to %pDst. Here the vector loop handles
; N & -16 elements, 16 per iteration, on <16 x i16> intermediates.
; Control flow:
;   * N == 0           -> exit immediately,
;   * N < 16           -> scalar loop only,
;   * otherwise        -> vector loop, then the scalar loop for any leftover.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of editing by
; hand. The expected vector body multiplies with vmullt.u8/vmullb.u8,
; narrows each half with vqshrnb.u16 #7, then recombines the halves via
; vmovlb.u8 and vmovnt.i16 before the vstrb.8 store.
3177 define arm_aapcs_vfpcc void @usatmul_16_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) {
3178 ; CHECK-LABEL: usatmul_16_q7:
3179 ; CHECK: @ %bb.0: @ %entry
3180 ; CHECK-NEXT: .save {r4, r5, r6, lr}
3181 ; CHECK-NEXT: push {r4, r5, r6, lr}
3182 ; CHECK-NEXT: cmp r3, #0
3183 ; CHECK-NEXT: beq .LBB21_8
3184 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
3185 ; CHECK-NEXT: cmp r3, #15
3186 ; CHECK-NEXT: bhi .LBB21_3
3187 ; CHECK-NEXT: @ %bb.2:
3188 ; CHECK-NEXT: movs r5, #0
3189 ; CHECK-NEXT: mov r12, r0
3190 ; CHECK-NEXT: mov r6, r1
3191 ; CHECK-NEXT: mov r4, r2
3192 ; CHECK-NEXT: b .LBB21_6
3193 ; CHECK-NEXT: .LBB21_3: @ %vector.ph
3194 ; CHECK-NEXT: bic r5, r3, #15
3195 ; CHECK-NEXT: movs r4, #1
3196 ; CHECK-NEXT: sub.w r6, r5, #16
3197 ; CHECK-NEXT: add.w r12, r0, r5
3198 ; CHECK-NEXT: add.w lr, r4, r6, lsr #4
3199 ; CHECK-NEXT: adds r4, r2, r5
3200 ; CHECK-NEXT: adds r6, r1, r5
3201 ; CHECK-NEXT: .LBB21_4: @ %vector.body
3202 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
3203 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
3204 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
3205 ; CHECK-NEXT: vmullt.u8 q2, q1, q0
3206 ; CHECK-NEXT: vmullb.u8 q0, q1, q0
3207 ; CHECK-NEXT: vqshrnb.u16 q2, q2, #7
3208 ; CHECK-NEXT: vqshrnb.u16 q0, q0, #7
3209 ; CHECK-NEXT: vmovlb.u8 q2, q2
3210 ; CHECK-NEXT: vmovlb.u8 q0, q0
3211 ; CHECK-NEXT: vmovnt.i16 q0, q2
3212 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
3213 ; CHECK-NEXT: le lr, .LBB21_4
3214 ; CHECK-NEXT: @ %bb.5: @ %middle.block
3215 ; CHECK-NEXT: cmp r5, r3
3217 ; CHECK-NEXT: popeq {r4, r5, r6, pc}
3218 ; CHECK-NEXT: .LBB21_6: @ %for.body.preheader23
3219 ; CHECK-NEXT: sub.w lr, r3, r5
3220 ; CHECK-NEXT: .LBB21_7: @ %for.body
3221 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
3222 ; CHECK-NEXT: ldrb r0, [r12], #1
3223 ; CHECK-NEXT: ldrb r1, [r6], #1
3224 ; CHECK-NEXT: muls r0, r1, r0
3225 ; CHECK-NEXT: lsrs r1, r0, #7
3226 ; CHECK-NEXT: cmp r1, #255
3227 ; CHECK-NEXT: mov.w r1, #255
3229 ; CHECK-NEXT: lsrlo r1, r0, #7
3230 ; CHECK-NEXT: strb r1, [r4], #1
3231 ; CHECK-NEXT: le lr, .LBB21_7
3232 ; CHECK-NEXT: .LBB21_8: @ %for.cond.cleanup
3233 ; CHECK-NEXT: pop {r4, r5, r6, pc}
; Nothing to do when N == 0.
3235 %cmp10 = icmp eq i32 %N, 0
3236 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
; Fewer than 16 elements: skip the vector loop and go straight to scalar code.
3238 for.body.preheader: ; preds = %entry
3239 %min.iters.check = icmp ult i32 %N, 16
3240 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph
; Scalar-loop preheader: start index and pointers are either the original
; arguments (no vector loop ran) or the positions where the vector loop ended.
3242 for.body.preheader23: ; preds = %middle.block, %for.body.preheader
3243 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
3244 %pSrcA.addr.013.ph = phi i8* [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
3245 %pSrcB.addr.012.ph = phi i8* [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
3246 %pDst.addr.011.ph = phi i8* [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
; Vector preheader: %n.vec is N rounded down to a multiple of 16; the
; %ind.end* pointers are where the scalar remainder will resume.
3249 vector.ph: ; preds = %for.body.preheader
3250 %n.vec = and i32 %N, -16
3251 %ind.end = getelementptr i8, i8* %pSrcA, i32 %n.vec
3252 %ind.end17 = getelementptr i8, i8* %pSrcB, i32 %n.vec
3253 %ind.end19 = getelementptr i8, i8* %pDst, i32 %n.vec
3254 br label %vector.body
; Vector body: 16 elements per iteration, unmasked loads and store.
3256 vector.body: ; preds = %vector.body, %vector.ph
3257 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
3258 %next.gep = getelementptr i8, i8* %pSrcA, i32 %index
3259 %next.gep20 = getelementptr i8, i8* %pSrcB, i32 %index
3260 %next.gep21 = getelementptr i8, i8* %pDst, i32 %index
3261 %0 = bitcast i8* %next.gep to <16 x i8>*
3262 %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
3263 %1 = zext <16 x i8> %wide.load to <16 x i16>
3264 %2 = bitcast i8* %next.gep20 to <16 x i8>*
3265 %wide.load22 = load <16 x i8>, <16 x i8>* %2, align 1
3266 %3 = zext <16 x i8> %wide.load22 to <16 x i16>
; Product of two zero-extended bytes is at most 255*255, which fits in i16,
; so the nuw flag holds. After the shift the value is clamped to 255.
3267 %4 = mul nuw <16 x i16> %3, %1
3268 %5 = lshr <16 x i16> %4, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
3269 %6 = icmp ult <16 x i16> %5, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
3270 %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
3271 %8 = trunc <16 x i16> %7 to <16 x i8>
3272 %9 = bitcast i8* %next.gep21 to <16 x i8>*
3273 store <16 x i8> %8, <16 x i8>* %9, align 1
3274 %index.next = add i32 %index, 16
3275 %10 = icmp eq i32 %index.next, %n.vec
3276 br i1 %10, label %middle.block, label %vector.body
; If the vector loop covered all N elements we are done; otherwise run the
; scalar loop on the remainder.
3278 middle.block: ; preds = %vector.body
3279 %cmp.n = icmp eq i32 %n.vec, %N
3280 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23
3282 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
; Scalar loop: one element per iteration, same arithmetic as the vector body.
3285 for.body: ; preds = %for.body.preheader23, %for.body
3286 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
3287 %pSrcA.addr.013 = phi i8* [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
3288 %pSrcB.addr.012 = phi i8* [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
3289 %pDst.addr.011 = phi i8* [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
3290 %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.013, i32 1
3291 %11 = load i8, i8* %pSrcA.addr.013, align 1
3292 %conv1 = zext i8 %11 to i16
3293 %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.012, i32 1
3294 %12 = load i8, i8* %pSrcB.addr.012, align 1
3295 %conv3 = zext i8 %12 to i16
3296 %mul = mul nuw i16 %conv3, %conv1
3297 %13 = lshr i16 %mul, 7
3298 %14 = icmp ult i16 %13, 255
3299 %retval.0.i = select i1 %14, i16 %13, i16 255
3300 %conv5 = trunc i16 %retval.0.i to i8
3301 %incdec.ptr6 = getelementptr inbounds i8, i8* %pDst.addr.011, i32 1
3302 store i8 %conv5, i8* %pDst.addr.011, align 1
3303 %inc = add nuw i32 %i.014, 1
3304 %exitcond = icmp eq i32 %inc, %N
3305 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Masked load intrinsics used by the tail-predicated tests in this file.
; Operands: pointer, alignment, per-lane mask, pass-through value.
3308 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
3309 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
3310 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
3311 declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
3312 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
; Matching masked store intrinsics.
; Operands: value to store, pointer, alignment, per-lane mask.
3313 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
3314 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
3315 declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
3316 declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
3317 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)