1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s
; Test: multiply by a splat of a zero-extended i16 scalar.
; %val is zero-extended to i32 and broadcast into two <4 x i32> splats. Each
; iteration of %vector.body loads two <4 x i16> vectors from %A, zero-extends
; them to <4 x i32>, multiplies each by a splat and stores both products to
; %C. %index advances by 8 until it equals %n.vec (the zero-extended trip
; count masked with 4294967288).
; The assertions expect a `dup v0.4h` before the loop and each multiply as a
; widening `umull` from .4h to .4s.
; NOTE(review): %4 is used to form the load/store index, but its definition
; is not among the lines visible in this view.
4 define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
5 ; CHECK-LABEL: matrix_mul_unsigned:
6 ; CHECK: // %bb.0: // %vector.header
7 ; CHECK-NEXT: and w8, w3, #0xffff
8 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
9 ; CHECK-NEXT: dup v0.4h, w8
10 ; CHECK-NEXT: and x8, x0, #0xfffffff8
11 ; CHECK-NEXT: .LBB0_1: // %vector.body
12 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
13 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
14 ; CHECK-NEXT: subs x8, x8, #8
15 ; CHECK-NEXT: ldp d1, d2, [x9]
16 ; CHECK-NEXT: add x9, x1, w0, uxtw #2
17 ; CHECK-NEXT: add w0, w0, #8
18 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
19 ; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
20 ; CHECK-NEXT: stp q1, q2, [x9]
21 ; CHECK-NEXT: b.ne .LBB0_1
22 ; CHECK-NEXT: // %bb.2: // %for.end12
25 %conv4 = zext i16 %val to i32
26 %wide.trip.count = zext i32 %N to i64
27 %0 = add nsw i64 %wide.trip.count, -1
28 %min.iters.check = icmp ult i32 %N, 8
29 %1 = trunc i64 %0 to i32
30 %2 = icmp ugt i64 %0, 4294967295
31 %n.vec = and i64 %wide.trip.count, 4294967288
32 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
33 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
34 %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
35 %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
36 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
39 vector.body: ; preds = %vector.header, %vector.body
40 %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
41 %3 = trunc i64 %index to i32
43 %5 = zext i32 %4 to i64
44 %6 = getelementptr inbounds i16, i16* %A, i64 %5
45 %7 = bitcast i16* %6 to <4 x i16>*
46 %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
47 %8 = getelementptr inbounds i16, i16* %6, i64 4
48 %9 = bitcast i16* %8 to <4 x i16>*
49 %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
50 %10 = zext <4 x i16> %wide.load to <4 x i32>
51 %11 = zext <4 x i16> %wide.load30 to <4 x i32>
52 %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
53 %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
54 %14 = getelementptr inbounds i32, i32* %C, i64 %5
55 %15 = bitcast i32* %14 to <4 x i32>*
56 store <4 x i32> %12, <4 x i32>* %15, align 4
57 %16 = getelementptr inbounds i32, i32* %14, i64 4
58 %17 = bitcast i32* %16 to <4 x i32>*
59 store <4 x i32> %13, <4 x i32>* %17, align 4
60 %index.next = add i64 %index, 8
61 %18 = icmp eq i64 %index.next, %n.vec
62 br i1 %18, label %for.end12, label %vector.body
64 for.end12: ; preds = %vector.body
; Test: signed counterpart of the splat-multiply pattern.
; %val is sign-extended to i32 and broadcast into two <4 x i32> splats; the
; trip count and the per-iteration index (%5) are sign-extended as well. Each
; iteration loads two <4 x i16> vectors from %A, sign-extends them, multiplies
; each by a splat and stores both products to %C, advancing %index by 8.
; The assertions expect a `dup v0.4h` before the loop, `sxtw` address
; arithmetic, and each multiply as a widening `smull` from .4h to .4s.
; NOTE(review): %4 is used to form the load/store index, but its definition
; is not among the lines visible in this view.
68 define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
69 ; CHECK-LABEL: matrix_mul_signed:
70 ; CHECK: // %bb.0: // %vector.header
71 ; CHECK-NEXT: sxth w8, w3
72 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
73 ; CHECK-NEXT: dup v0.4h, w8
74 ; CHECK-NEXT: and x8, x0, #0xfffffff8
75 ; CHECK-NEXT: .LBB1_1: // %vector.body
76 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
77 ; CHECK-NEXT: add x9, x2, w0, sxtw #1
78 ; CHECK-NEXT: subs x8, x8, #8
79 ; CHECK-NEXT: ldp d1, d2, [x9]
80 ; CHECK-NEXT: add x9, x1, w0, sxtw #2
81 ; CHECK-NEXT: add w0, w0, #8
82 ; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
83 ; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h
84 ; CHECK-NEXT: stp q1, q2, [x9]
85 ; CHECK-NEXT: b.ne .LBB1_1
86 ; CHECK-NEXT: // %bb.2: // %for.end12
89 %conv4 = sext i16 %val to i32
90 %wide.trip.count = sext i32 %N to i64
91 %0 = add nsw i64 %wide.trip.count, -1
92 %min.iters.check = icmp ult i32 %N, 8
93 %1 = trunc i64 %0 to i32
94 %2 = icmp ugt i64 %0, 4294967295
95 %n.vec = and i64 %wide.trip.count, 4294967288
96 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
97 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
98 %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
99 %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
100 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
101 br label %vector.body
103 vector.body: ; preds = %vector.header, %vector.body
104 %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
105 %3 = trunc i64 %index to i32
107 %5 = sext i32 %4 to i64
108 %6 = getelementptr inbounds i16, i16* %A, i64 %5
109 %7 = bitcast i16* %6 to <4 x i16>*
110 %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
111 %8 = getelementptr inbounds i16, i16* %6, i64 4
112 %9 = bitcast i16* %8 to <4 x i16>*
113 %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
114 %10 = sext <4 x i16> %wide.load to <4 x i32>
115 %11 = sext <4 x i16> %wide.load30 to <4 x i32>
116 %12 = mul nsw <4 x i32> %broadcast.splat, %10
117 %13 = mul nsw <4 x i32> %broadcast.splat32, %11
118 %14 = getelementptr inbounds i32, i32* %C, i64 %5
119 %15 = bitcast i32* %14 to <4 x i32>*
120 store <4 x i32> %12, <4 x i32>* %15, align 4
121 %16 = getelementptr inbounds i32, i32* %14, i64 4
122 %17 = bitcast i32* %16 to <4 x i32>*
123 store <4 x i32> %13, <4 x i32>* %17, align 4
124 %index.next = add i64 %index, 8
125 %18 = icmp eq i64 %index.next, %n.vec
126 br i1 %18, label %for.end12, label %vector.body
128 for.end12: ; preds = %vector.body
; Test: multiply of a loop-invariant splat by a vector built inside the loop.
; %broadcast.splat is a splat of zext(%val), created before the loop. In the
; loop an i16 is loaded from %A, zero-extended and inserted into lane 0 of an
; undef vector; the following shufflevector uses mask <0, 1, 0, 1>, so every
; selected lane comes from that first operand. The product of the two vectors
; is stored to %C and %index advances by 8.
; The assertions expect a `dup v1.4h` of the loaded halfword followed by a
; single widening `umull` from .4h to .4s.
; NOTE(review): %4 is used to form the store index, but its definition is not
; among the lines visible in this view.
133 define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
134 ; CHECK-LABEL: matrix_mul_double_shuffle:
135 ; CHECK: // %bb.0: // %vector.header
136 ; CHECK-NEXT: and w8, w3, #0xffff
137 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
138 ; CHECK-NEXT: dup v0.4h, w8
139 ; CHECK-NEXT: and x8, x0, #0xfffffff8
140 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 def $x0
141 ; CHECK-NEXT: .LBB2_1: // %vector.body
142 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
143 ; CHECK-NEXT: ldrh w9, [x2], #16
144 ; CHECK-NEXT: subs x8, x8, #8
145 ; CHECK-NEXT: dup v1.4h, w9
146 ; CHECK-NEXT: ubfiz x9, x0, #2, #32
147 ; CHECK-NEXT: add w0, w0, #8
148 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
149 ; CHECK-NEXT: str q1, [x1, x9]
150 ; CHECK-NEXT: b.ne .LBB2_1
151 ; CHECK-NEXT: // %bb.2: // %for.end12
154 %conv4 = zext i16 %val to i32
155 %wide.trip.count = zext i32 %N to i64
156 %0 = add nsw i64 %wide.trip.count, -1
157 %min.iters.check = icmp ult i32 %N, 8
158 %1 = trunc i64 %0 to i32
159 %2 = icmp ugt i64 %0, 4294967295
160 %n.vec = and i64 %wide.trip.count, 4294967288
161 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
162 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
163 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
164 br label %vector.body
166 vector.body: ; preds = %vector.header, %vector.body
167 %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
168 %g = getelementptr inbounds i16, i16* %A, i64 %index
169 %val1 = load i16, i16* %g
170 %splat.input.ext = zext i16 %val1 to i32
171 %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
172 %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
173 %3 = trunc i64 %index to i32
175 %5 = zext i32 %4 to i64
176 %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
177 %7 = getelementptr inbounds i32, i32* %C, i64 %5
178 %8 = bitcast i32* %7 to <4 x i32>*
179 store <4 x i32> %6, <4 x i32>* %8, align 4
180 %index.next = add i64 %index, 8
181 %9 = icmp eq i64 %index.next, %n.vec
182 br i1 %9, label %for.end12, label %vector.body
184 for.end12: ; preds = %vector.body
; Test: <8 x i32> multiplies of sign-extended <8 x i16> loads by a splat of
; sext(%y).
; The function returns early when %n is not greater than 0. %vector.body
; handles 16 elements per iteration (two <8 x i16> loads from %x, two
; <8 x i32> stores to %s) and is entered only when %n is at least 16; the
; scalar %for.body loop processes the elements from %indvars.iv.ph up to
; %wide.trip.count.
; The assertions expect each <8 x i32> multiply as an `smull` / `smull2` pair
; and the scalar loop as `ldrsh` + `mul`.
189 define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) {
190 ; CHECK-LABEL: larger_smull:
191 ; CHECK: // %bb.0: // %entry
192 ; CHECK-NEXT: cmp w3, #1
193 ; CHECK-NEXT: b.lt .LBB3_8
194 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
195 ; CHECK-NEXT: sxth w8, w1
196 ; CHECK-NEXT: cmp w3, #15
197 ; CHECK-NEXT: mov w9, w3
198 ; CHECK-NEXT: b.hi .LBB3_3
199 ; CHECK-NEXT: // %bb.2:
200 ; CHECK-NEXT: mov x10, xzr
201 ; CHECK-NEXT: b .LBB3_6
202 ; CHECK-NEXT: .LBB3_3: // %vector.ph
203 ; CHECK-NEXT: dup v0.8h, w8
204 ; CHECK-NEXT: and x10, x9, #0xfffffff0
205 ; CHECK-NEXT: add x11, x2, #32
206 ; CHECK-NEXT: add x12, x0, #16
207 ; CHECK-NEXT: mov x13, x10
208 ; CHECK-NEXT: .LBB3_4: // %vector.body
209 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
210 ; CHECK-NEXT: ldp q1, q2, [x12, #-16]
211 ; CHECK-NEXT: subs x13, x13, #16
212 ; CHECK-NEXT: add x12, x12, #32
213 ; CHECK-NEXT: smull2 v3.4s, v0.8h, v1.8h
214 ; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
215 ; CHECK-NEXT: smull2 v4.4s, v0.8h, v2.8h
216 ; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h
217 ; CHECK-NEXT: stp q1, q3, [x11, #-32]
218 ; CHECK-NEXT: stp q2, q4, [x11], #64
219 ; CHECK-NEXT: b.ne .LBB3_4
220 ; CHECK-NEXT: // %bb.5: // %middle.block
221 ; CHECK-NEXT: cmp x10, x9
222 ; CHECK-NEXT: b.eq .LBB3_8
223 ; CHECK-NEXT: .LBB3_6: // %for.body.preheader1
224 ; CHECK-NEXT: add x11, x2, x10, lsl #2
225 ; CHECK-NEXT: add x12, x0, x10, lsl #1
226 ; CHECK-NEXT: sub x9, x9, x10
227 ; CHECK-NEXT: .LBB3_7: // %for.body
228 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
229 ; CHECK-NEXT: ldrsh w10, [x12], #2
230 ; CHECK-NEXT: subs x9, x9, #1
231 ; CHECK-NEXT: mul w10, w10, w8
232 ; CHECK-NEXT: str w10, [x11], #4
233 ; CHECK-NEXT: b.ne .LBB3_7
234 ; CHECK-NEXT: .LBB3_8: // %for.cond.cleanup
237 %conv1 = sext i16 %y to i32
238 %cmp8 = icmp sgt i32 %n, 0
239 br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
241 for.body.preheader: ; preds = %entry
242 %wide.trip.count = zext i32 %n to i64
243 %min.iters.check = icmp ult i32 %n, 16
244 br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
246 vector.ph: ; preds = %for.body.preheader
247 %n.vec = and i64 %wide.trip.count, 4294967280
248 %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
249 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
250 %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
251 %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
252 br label %vector.body
254 vector.body: ; preds = %vector.body, %vector.ph
255 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
256 %0 = getelementptr inbounds i16, i16* %x, i64 %index
257 %1 = bitcast i16* %0 to <8 x i16>*
258 %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
259 %2 = getelementptr inbounds i16, i16* %0, i64 8
260 %3 = bitcast i16* %2 to <8 x i16>*
261 %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2
262 %4 = sext <8 x i16> %wide.load to <8 x i32>
263 %5 = sext <8 x i16> %wide.load11 to <8 x i32>
264 %6 = mul nsw <8 x i32> %broadcast.splat, %4
265 %7 = mul nsw <8 x i32> %broadcast.splat13, %5
266 %8 = getelementptr inbounds i32, i32* %s, i64 %index
267 %9 = bitcast i32* %8 to <8 x i32>*
268 store <8 x i32> %6, <8 x i32>* %9, align 4
269 %10 = getelementptr inbounds i32, i32* %8, i64 8
270 %11 = bitcast i32* %10 to <8 x i32>*
271 store <8 x i32> %7, <8 x i32>* %11, align 4
272 %index.next = add nuw i64 %index, 16
273 %12 = icmp eq i64 %index.next, %n.vec
274 br i1 %12, label %middle.block, label %vector.body
276 middle.block: ; preds = %vector.body
277 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
278 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
280 for.body.preheader14: ; preds = %for.body.preheader, %middle.block
281 %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
284 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
287 for.body: ; preds = %for.body.preheader14, %for.body
288 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
289 %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
290 %13 = load i16, i16* %arrayidx, align 2
291 %conv = sext i16 %13 to i32
292 %mul = mul nsw i32 %conv, %conv1
293 %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv
294 store i32 %mul, i32* %arrayidx3, align 4
295 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
296 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
297 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; Test: <8 x i32> multiplies of zero-extended <8 x i16> loads by a splat of
; zext(%y).
; The function returns early when %n is not greater than 0. %vector.body
; handles 16 elements per iteration (two <8 x i16> loads from %x, two
; <8 x i32> stores to %s) and is entered only when %n is at least 16; the
; scalar %for.body loop processes the elements from %indvars.iv.ph up to
; %wide.trip.count.
; The assertions expect each <8 x i32> multiply as a `umull` / `umull2` pair
; and the scalar loop as `ldrh` + `mul`.
301 define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) {
302 ; CHECK-LABEL: larger_umull:
303 ; CHECK: // %bb.0: // %entry
304 ; CHECK-NEXT: cmp w3, #1
305 ; CHECK-NEXT: b.lt .LBB4_8
306 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
307 ; CHECK-NEXT: cmp w3, #15
308 ; CHECK-NEXT: and w8, w1, #0xffff
309 ; CHECK-NEXT: mov w9, w3
310 ; CHECK-NEXT: b.hi .LBB4_3
311 ; CHECK-NEXT: // %bb.2:
312 ; CHECK-NEXT: mov x10, xzr
313 ; CHECK-NEXT: b .LBB4_6
314 ; CHECK-NEXT: .LBB4_3: // %vector.ph
315 ; CHECK-NEXT: dup v0.8h, w8
316 ; CHECK-NEXT: and x10, x9, #0xfffffff0
317 ; CHECK-NEXT: add x11, x2, #32
318 ; CHECK-NEXT: add x12, x0, #16
319 ; CHECK-NEXT: mov x13, x10
320 ; CHECK-NEXT: .LBB4_4: // %vector.body
321 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
322 ; CHECK-NEXT: ldp q1, q2, [x12, #-16]
323 ; CHECK-NEXT: subs x13, x13, #16
324 ; CHECK-NEXT: add x12, x12, #32
325 ; CHECK-NEXT: umull2 v3.4s, v0.8h, v1.8h
326 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
327 ; CHECK-NEXT: umull2 v4.4s, v0.8h, v2.8h
328 ; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
329 ; CHECK-NEXT: stp q1, q3, [x11, #-32]
330 ; CHECK-NEXT: stp q2, q4, [x11], #64
331 ; CHECK-NEXT: b.ne .LBB4_4
332 ; CHECK-NEXT: // %bb.5: // %middle.block
333 ; CHECK-NEXT: cmp x10, x9
334 ; CHECK-NEXT: b.eq .LBB4_8
335 ; CHECK-NEXT: .LBB4_6: // %for.body.preheader1
336 ; CHECK-NEXT: add x11, x2, x10, lsl #2
337 ; CHECK-NEXT: add x12, x0, x10, lsl #1
338 ; CHECK-NEXT: sub x9, x9, x10
339 ; CHECK-NEXT: .LBB4_7: // %for.body
340 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
341 ; CHECK-NEXT: ldrh w10, [x12], #2
342 ; CHECK-NEXT: subs x9, x9, #1
343 ; CHECK-NEXT: mul w10, w10, w8
344 ; CHECK-NEXT: str w10, [x11], #4
345 ; CHECK-NEXT: b.ne .LBB4_7
346 ; CHECK-NEXT: .LBB4_8: // %for.cond.cleanup
349 %conv1 = zext i16 %y to i32
350 %cmp8 = icmp sgt i32 %n, 0
351 br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
353 for.body.preheader: ; preds = %entry
354 %wide.trip.count = zext i32 %n to i64
355 %min.iters.check = icmp ult i32 %n, 16
356 br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
358 vector.ph: ; preds = %for.body.preheader
359 %n.vec = and i64 %wide.trip.count, 4294967280
360 %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
361 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
362 %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
363 %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
364 br label %vector.body
366 vector.body: ; preds = %vector.body, %vector.ph
367 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
368 %0 = getelementptr inbounds i16, i16* %x, i64 %index
369 %1 = bitcast i16* %0 to <8 x i16>*
370 %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
371 %2 = getelementptr inbounds i16, i16* %0, i64 8
372 %3 = bitcast i16* %2 to <8 x i16>*
373 %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2
374 %4 = zext <8 x i16> %wide.load to <8 x i32>
375 %5 = zext <8 x i16> %wide.load11 to <8 x i32>
376 %6 = mul nuw <8 x i32> %broadcast.splat, %4
377 %7 = mul nuw <8 x i32> %broadcast.splat13, %5
378 %8 = getelementptr inbounds i32, i32* %s, i64 %index
379 %9 = bitcast i32* %8 to <8 x i32>*
380 store <8 x i32> %6, <8 x i32>* %9, align 4
381 %10 = getelementptr inbounds i32, i32* %8, i64 8
382 %11 = bitcast i32* %10 to <8 x i32>*
383 store <8 x i32> %7, <8 x i32>* %11, align 4
384 %index.next = add nuw i64 %index, 16
385 %12 = icmp eq i64 %index.next, %n.vec
386 br i1 %12, label %middle.block, label %vector.body
388 middle.block: ; preds = %vector.body
389 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
390 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
392 for.body.preheader14: ; preds = %for.body.preheader, %middle.block
393 %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
396 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
399 for.body: ; preds = %for.body.preheader14, %for.body
400 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
401 %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
402 %13 = load i16, i16* %arrayidx, align 2
403 %conv = zext i16 %13 to i32
404 %mul = mul nuw i32 %conv, %conv1
405 %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv
406 store i32 %mul, i32* %arrayidx3, align 4
407 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
408 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
409 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; Test: i16 add-reduction of zext(<8 x i8> load from %A) * splat(sext(%B)).
; The two multiply operands use different extensions: the loaded bytes are
; zero-extended, while %B is sign-extended before being splatted.
; %vector.body keeps two <8 x i16> accumulators (%vec.phi, %vec.phi13) and
; consumes 16 bytes per iteration; %middle.block adds the accumulators and
; reduces them with llvm.vector.reduce.add.v8i16. The scalar %for.body loop
; continues the sum from %indvars.iv.ph up to %wide.trip.count.
; The assertions expect `ushll` followed by `mla` on .8h vectors in the
; vector loop (no widening multiply appears) and `madd` in the scalar loop.
413 define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, i8 noundef %B, i32 noundef %n) {
414 ; CHECK-LABEL: red_mla_dup_ext_u8_s8_s16:
415 ; CHECK: // %bb.0: // %entry
416 ; CHECK-NEXT: cbz w2, .LBB5_3
417 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
418 ; CHECK-NEXT: sxtb w9, w1
419 ; CHECK-NEXT: cmp w2, #15
420 ; CHECK-NEXT: mov w10, w2
421 ; CHECK-NEXT: b.hi .LBB5_4
422 ; CHECK-NEXT: // %bb.2:
423 ; CHECK-NEXT: mov x11, xzr
424 ; CHECK-NEXT: mov w8, wzr
425 ; CHECK-NEXT: b .LBB5_7
426 ; CHECK-NEXT: .LBB5_3:
427 ; CHECK-NEXT: mov w8, wzr
428 ; CHECK-NEXT: mov w0, w8
430 ; CHECK-NEXT: .LBB5_4: // %vector.ph
431 ; CHECK-NEXT: movi v0.2d, #0000000000000000
432 ; CHECK-NEXT: movi v1.2d, #0000000000000000
433 ; CHECK-NEXT: and x11, x10, #0xfffffff0
434 ; CHECK-NEXT: dup v2.8h, w9
435 ; CHECK-NEXT: add x8, x0, #8
436 ; CHECK-NEXT: mov x12, x11
437 ; CHECK-NEXT: .LBB5_5: // %vector.body
438 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
439 ; CHECK-NEXT: ldp d3, d4, [x8, #-8]
440 ; CHECK-NEXT: subs x12, x12, #16
441 ; CHECK-NEXT: add x8, x8, #16
442 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
443 ; CHECK-NEXT: ushll v4.8h, v4.8b, #0
444 ; CHECK-NEXT: mla v0.8h, v2.8h, v3.8h
445 ; CHECK-NEXT: mla v1.8h, v2.8h, v4.8h
446 ; CHECK-NEXT: b.ne .LBB5_5
447 ; CHECK-NEXT: // %bb.6: // %middle.block
448 ; CHECK-NEXT: add v0.8h, v1.8h, v0.8h
449 ; CHECK-NEXT: cmp x11, x10
450 ; CHECK-NEXT: addv h0, v0.8h
451 ; CHECK-NEXT: fmov w8, s0
452 ; CHECK-NEXT: b.eq .LBB5_9
453 ; CHECK-NEXT: .LBB5_7: // %for.body.preheader1
454 ; CHECK-NEXT: sub x10, x10, x11
455 ; CHECK-NEXT: add x11, x0, x11
456 ; CHECK-NEXT: .LBB5_8: // %for.body
457 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
458 ; CHECK-NEXT: ldrb w12, [x11], #1
459 ; CHECK-NEXT: subs x10, x10, #1
460 ; CHECK-NEXT: madd w8, w12, w9, w8
461 ; CHECK-NEXT: b.ne .LBB5_8
462 ; CHECK-NEXT: .LBB5_9: // %for.cond.cleanup
463 ; CHECK-NEXT: mov w0, w8
466 %conv2 = sext i8 %B to i16
467 %cmp10.not = icmp eq i32 %n, 0
468 br i1 %cmp10.not, label %for.cond.cleanup, label %for.body.preheader
470 for.body.preheader: ; preds = %entry
471 %wide.trip.count = zext i32 %n to i64
472 %min.iters.check = icmp ult i32 %n, 16
473 br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
475 vector.ph: ; preds = %for.body.preheader
476 %n.vec = and i64 %wide.trip.count, 4294967280
477 %broadcast.splatinsert = insertelement <8 x i16> poison, i16 %conv2, i64 0
478 %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
479 %broadcast.splatinsert15 = insertelement <8 x i16> poison, i16 %conv2, i64 0
480 %broadcast.splat16 = shufflevector <8 x i16> %broadcast.splatinsert15, <8 x i16> poison, <8 x i32> zeroinitializer
481 br label %vector.body
483 vector.body: ; preds = %vector.body, %vector.ph
484 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
485 %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ]
486 %vec.phi13 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
487 %0 = getelementptr inbounds i8, i8* %A, i64 %index
488 %1 = bitcast i8* %0 to <8 x i8>*
489 %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
490 %2 = getelementptr inbounds i8, i8* %0, i64 8
491 %3 = bitcast i8* %2 to <8 x i8>*
492 %wide.load14 = load <8 x i8>, <8 x i8>* %3, align 1
493 %4 = zext <8 x i8> %wide.load to <8 x i16>
494 %5 = zext <8 x i8> %wide.load14 to <8 x i16>
495 %6 = mul nsw <8 x i16> %broadcast.splat, %4
496 %7 = mul nsw <8 x i16> %broadcast.splat16, %5
497 %8 = add <8 x i16> %6, %vec.phi
498 %9 = add <8 x i16> %7, %vec.phi13
499 %index.next = add nuw i64 %index, 16
500 %10 = icmp eq i64 %index.next, %n.vec
501 br i1 %10, label %middle.block, label %vector.body
503 middle.block: ; preds = %vector.body
504 %bin.rdx = add <8 x i16> %9, %8
505 %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
506 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
507 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
509 for.body.preheader17: ; preds = %for.body.preheader, %middle.block
510 %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
511 %s.011.ph = phi i16 [ 0, %for.body.preheader ], [ %11, %middle.block ]
514 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
515 %s.0.lcssa = phi i16 [ 0, %entry ], [ %11, %middle.block ], [ %add, %for.body ]
518 for.body: ; preds = %for.body.preheader17, %for.body
519 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
520 %s.011 = phi i16 [ %add, %for.body ], [ %s.011.ph, %for.body.preheader17 ]
521 %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
522 %12 = load i8, i8* %arrayidx, align 1
523 %13 = zext i8 %12 to i16
524 %mul = mul nsw i16 %13, %conv2
525 %add = add i16 %mul, %s.011
526 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
527 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
528 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; Test: a splat of lane 1 of zext(<2 x i32> %a), defined before the loop, used
; as a multiply operand inside the loop.
; Each iteration loads <2 x i32>, zero-extends it to <2 x i64>, multiplies by
; the splat, shifts right arithmetically by 15, truncates back to <2 x i32>
; and stores the result; %index advances by 8 until it equals %n.
; The assertions expect a by-element `umull v1.2d, v1.2s, v0.s[1]` followed
; by `shrn ... #15`.
; NOTE(review): %hb is cast from %g rather than %h, so %h has no use in the
; visible lines and the store goes through the %p-based pointer. The
; assertions were generated from the IR as written; confirm intent and
; regenerate the checks if this is ever changed.
531 define void @sink_v2z64_1(i32 *%p, i32 *%d, i64 %n, <2 x i32> %a) {
532 ; CHECK-LABEL: sink_v2z64_1:
533 ; CHECK: // %bb.0: // %entry
534 ; CHECK-NEXT: mov x8, xzr
535 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
536 ; CHECK-NEXT: .LBB6_1: // %loop
537 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
538 ; CHECK-NEXT: ldr d1, [x0]
539 ; CHECK-NEXT: subs x2, x2, #8
540 ; CHECK-NEXT: add x8, x8, #8
541 ; CHECK-NEXT: umull v1.2d, v1.2s, v0.s[1]
542 ; CHECK-NEXT: shrn v1.2s, v1.2d, #15
543 ; CHECK-NEXT: str d1, [x0], #32
544 ; CHECK-NEXT: b.ne .LBB6_1
545 ; CHECK-NEXT: // %bb.2: // %exit
548 %ext = zext <2 x i32> %a to <2 x i64>
549 %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
553 %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
554 %g = getelementptr inbounds i32, i32 *%p, i64 %index
555 %gb = bitcast i32* %g to <2 x i32>*
556 %l = load <2 x i32>, <2 x i32> *%gb, align 4
557 %e = zext <2 x i32> %l to <2 x i64>
558 %m = mul <2 x i64> %e, %broadcast.splat
559 %s = ashr <2 x i64> %m, <i64 15, i64 15>
560 %t = trunc <2 x i64> %s to <2 x i32>
561 %h = getelementptr inbounds i32, i32 *%d, i64 %index
562 %hb = bitcast i32* %g to <2 x i32>*
563 store <2 x i32> %t, <2 x i32> *%hb, align 4
564 %index.next = add nuw i64 %index, 8
565 %c = icmp eq i64 %index.next, %n
566 br i1 %c, label %exit, label %loop
; Test: a 4-lane splat of lane 1 of sext(<2 x i32> %a), defined before the
; loop, used as a multiply operand inside the loop.
; Each iteration loads <4 x i32>, sign-extends it to <4 x i64>, multiplies by
; the splat, shifts right arithmetically by 15, truncates back to <4 x i32>
; and stores the result; %index advances by 8 until it equals %n.
; The assertions expect by-element `smull` / `smull2` using v0.s[1], followed
; by `shrn` / `shrn2 ... #15`.
; NOTE(review): %hb is cast from %g rather than %h, so %h has no use in the
; visible lines and the store goes through the %p-based pointer. The
; assertions were generated from the IR as written; confirm intent and
; regenerate the checks if this is ever changed.
572 define void @sink_v4i64_1(i32 *%p, i32 *%d, i64 %n, <2 x i32> %a) {
573 ; CHECK-LABEL: sink_v4i64_1:
574 ; CHECK: // %bb.0: // %entry
575 ; CHECK-NEXT: mov x8, xzr
576 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
577 ; CHECK-NEXT: .LBB7_1: // %loop
578 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
579 ; CHECK-NEXT: ldr q1, [x0]
580 ; CHECK-NEXT: subs x2, x2, #8
581 ; CHECK-NEXT: add x8, x8, #8
582 ; CHECK-NEXT: smull v2.2d, v1.2s, v0.s[1]
583 ; CHECK-NEXT: smull2 v1.2d, v1.4s, v0.s[1]
584 ; CHECK-NEXT: shrn v2.2s, v2.2d, #15
585 ; CHECK-NEXT: shrn2 v2.4s, v1.2d, #15
586 ; CHECK-NEXT: str q2, [x0], #32
587 ; CHECK-NEXT: b.ne .LBB7_1
588 ; CHECK-NEXT: // %bb.2: // %exit
591 %ext = sext <2 x i32> %a to <2 x i64>
592 %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
596 %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
597 %g = getelementptr inbounds i32, i32 *%p, i64 %index
598 %gb = bitcast i32* %g to <4 x i32>*
599 %l = load <4 x i32>, <4 x i32> *%gb, align 4
600 %e = sext <4 x i32> %l to <4 x i64>
601 %m = mul <4 x i64> %e, %broadcast.splat
602 %s = ashr <4 x i64> %m, <i64 15, i64 15, i64 15, i64 15>
603 %t = trunc <4 x i64> %s to <4 x i32>
604 %h = getelementptr inbounds i32, i32 *%d, i64 %index
605 %hb = bitcast i32* %g to <4 x i32>*
606 store <4 x i32> %t, <4 x i32> *%hb, align 4
607 %index.next = add nuw i64 %index, 8
608 %c = icmp eq i64 %index.next, %n
609 br i1 %c, label %exit, label %loop
; Test: an 8-lane splat of lane 0 of zext(<16 x i8> %a), defined before the
; loop, used as a multiply operand inside the loop.
; Each iteration loads <8 x i8>, zero-extends it to <8 x i16>, multiplies by
; the splat, shifts right arithmetically by 15, truncates back to <8 x i8>
; and stores the result; %index advances by 8 until it equals %n.
; The assertions expect `dup v0.8b, v0.b[0]` before the loop and, inside it,
; `umull v1.8h, v1.8b, v0.8b` followed by `cmlt ... #0` and `xtn`.
; NOTE(review): %hb is cast from %g rather than %h, so %h has no use in the
; visible lines and the store goes through the %p-based pointer. The
; assertions were generated from the IR as written; confirm intent and
; regenerate the checks if this is ever changed.
615 define void @sink_v8z16_0(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
616 ; CHECK-LABEL: sink_v8z16_0:
617 ; CHECK: // %bb.0: // %entry
618 ; CHECK-NEXT: dup v0.8b, v0.b[0]
619 ; CHECK-NEXT: mov x8, xzr
620 ; CHECK-NEXT: .LBB8_1: // %loop
621 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
622 ; CHECK-NEXT: ldr d1, [x0]
623 ; CHECK-NEXT: subs x2, x2, #8
624 ; CHECK-NEXT: add x8, x8, #8
625 ; CHECK-NEXT: umull v1.8h, v1.8b, v0.8b
626 ; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
627 ; CHECK-NEXT: xtn v1.8b, v1.8h
628 ; CHECK-NEXT: str d1, [x0], #32
629 ; CHECK-NEXT: b.ne .LBB8_1
630 ; CHECK-NEXT: // %bb.2: // %exit
633 %ext = zext <16 x i8> %a to <16 x i16>
634 %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
638 %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
639 %g = getelementptr inbounds i32, i32 *%p, i64 %index
640 %gb = bitcast i32* %g to <8 x i8>*
641 %l = load <8 x i8>, <8 x i8> *%gb, align 4
642 %e = zext <8 x i8> %l to <8 x i16>
643 %m = mul <8 x i16> %e, %broadcast.splat
644 %s = ashr <8 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
645 %t = trunc <8 x i16> %s to <8 x i8>
646 %h = getelementptr inbounds i32, i32 *%d, i64 %index
647 %hb = bitcast i32* %g to <8 x i8>*
648 store <8 x i8> %t, <8 x i8> *%hb, align 4
649 %index.next = add nuw i64 %index, 8
650 %c = icmp eq i64 %index.next, %n
651 br i1 %c, label %exit, label %loop
; Test: a 16-lane splat of one lane of sext(<16 x i8> %a), defined before the
; loop, used as a multiply operand inside the loop.
; Each iteration loads <16 x i8>, sign-extends it to <16 x i16>, multiplies by
; the splat, shifts right arithmetically by 15, truncates back to <16 x i8>
; and stores the result; %index advances by 8 until it equals %n.
; The assertions expect `dup v0.16b, v0.b[10]` before the loop and, inside
; it, `smull` / `smull2` followed by `cmlt ... #0` and `uzp1`.
; Note that the shuffle mask (and the expected `dup`) selects lane 10, even
; though the function name ends in `_8`.
; NOTE(review): %hb is cast from %g rather than %h, so %h has no use in the
; visible lines and the store goes through the %p-based pointer. The
; assertions were generated from the IR as written; confirm intent and
; regenerate the checks if this is ever changed.
657 define void @sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
658 ; CHECK-LABEL: sink_v16s16_8:
659 ; CHECK: // %bb.0: // %entry
660 ; CHECK-NEXT: dup v0.16b, v0.b[10]
661 ; CHECK-NEXT: mov x8, xzr
662 ; CHECK-NEXT: .LBB9_1: // %loop
663 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
664 ; CHECK-NEXT: ldr q1, [x0]
665 ; CHECK-NEXT: subs x2, x2, #8
666 ; CHECK-NEXT: add x8, x8, #8
667 ; CHECK-NEXT: smull v2.8h, v1.8b, v0.8b
668 ; CHECK-NEXT: smull2 v1.8h, v1.16b, v0.16b
669 ; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
670 ; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
671 ; CHECK-NEXT: uzp1 v1.16b, v2.16b, v1.16b
672 ; CHECK-NEXT: str q1, [x0], #32
673 ; CHECK-NEXT: b.ne .LBB9_1
674 ; CHECK-NEXT: // %bb.2: // %exit
677 %ext = sext <16 x i8> %a to <16 x i16>
678 %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <16 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
682 %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
683 %g = getelementptr inbounds i32, i32 *%p, i64 %index
684 %gb = bitcast i32* %g to <16 x i8>*
685 %l = load <16 x i8>, <16 x i8> *%gb, align 4
686 %e = sext <16 x i8> %l to <16 x i16>
687 %m = mul <16 x i16> %e, %broadcast.splat
688 %s = ashr <16 x i16> %m, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
689 %t = trunc <16 x i16> %s to <16 x i8>
690 %h = getelementptr inbounds i32, i32 *%d, i64 %index
691 %hb = bitcast i32* %g to <16 x i8>*
692 store <16 x i8> %t, <16 x i8> *%hb, align 4
693 %index.next = add nuw i64 %index, 8
694 %c = icmp eq i64 %index.next, %n
695 br i1 %c, label %exit, label %loop
; Test: splat-multiply where the scalar is an i32 masked with `and 65535`
; rather than a zero-extended i16 argument.
; The masked %val is broadcast into two <4 x i32> splats. Each iteration of
; %vector.body loads two <4 x i16> vectors from %A, zero-extends them to
; <4 x i32>, multiplies each by a splat and stores both products to %C;
; %index advances by 8 until it equals %n.vec.
; The assertions expect a `dup v0.4h` before the loop and each multiply as a
; widening `umull` from .4h to .4s.
; NOTE(review): %4 is used to form the load/store index, but its definition
; is not among the lines visible in this view.
701 define void @matrix_mul_unsigned_and(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
702 ; CHECK-LABEL: matrix_mul_unsigned_and:
703 ; CHECK: // %bb.0: // %vector.header
704 ; CHECK-NEXT: and w8, w3, #0xffff
705 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
706 ; CHECK-NEXT: dup v0.4h, w8
707 ; CHECK-NEXT: and x8, x0, #0xfffffff8
708 ; CHECK-NEXT: .LBB10_1: // %vector.body
709 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
710 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
711 ; CHECK-NEXT: subs x8, x8, #8
712 ; CHECK-NEXT: ldp d1, d2, [x9]
713 ; CHECK-NEXT: add x9, x1, w0, uxtw #2
714 ; CHECK-NEXT: add w0, w0, #8
715 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
716 ; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
717 ; CHECK-NEXT: stp q1, q2, [x9]
718 ; CHECK-NEXT: b.ne .LBB10_1
719 ; CHECK-NEXT: // %bb.2: // %for.end12
722 %conv4 = and i32 %val, 65535
723 %wide.trip.count = zext i32 %N to i64
724 %0 = add nsw i64 %wide.trip.count, -1
725 %min.iters.check = icmp ult i32 %N, 8
726 %1 = trunc i64 %0 to i32
727 %2 = icmp ugt i64 %0, 4294967295
728 %n.vec = and i64 %wide.trip.count, 4294967288
729 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
730 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
731 %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
732 %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
733 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
734 br label %vector.body
736 vector.body: ; preds = %vector.header, %vector.body
737 %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
738 %3 = trunc i64 %index to i32
740 %5 = zext i32 %4 to i64
741 %6 = getelementptr inbounds i16, i16* %A, i64 %5
742 %7 = bitcast i16* %6 to <4 x i16>*
743 %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
744 %8 = getelementptr inbounds i16, i16* %6, i64 4
745 %9 = bitcast i16* %8 to <4 x i16>*
746 %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
747 %10 = zext <4 x i16> %wide.load to <4 x i32>
748 %11 = zext <4 x i16> %wide.load30 to <4 x i32>
749 %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
750 %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
751 %14 = getelementptr inbounds i32, i32* %C, i64 %5
752 %15 = bitcast i32* %14 to <4 x i32>*
753 store <4 x i32> %12, <4 x i32>* %15, align 4
754 %16 = getelementptr inbounds i32, i32* %14, i64 4
755 %17 = bitcast i32* %16 to <4 x i32>*
756 store <4 x i32> %13, <4 x i32>* %17, align 4
757 %index.next = add i64 %index, 8
758 %18 = icmp eq i64 %index.next, %n.vec
759 br i1 %18, label %for.end12, label %vector.body
761 for.end12: ; preds = %vector.body
; matrix_mul_unsigned_and_double: %val is masked to its low 16 bits and
; splatted across <8 x i32>. Each loop iteration loads two <8 x i16> vectors
; from %A, zero-extends them to <8 x i32>, multiplies them by the splat and
; stores the two products to %C. The autogenerated FileCheck assertions below
; expect the masked scalar to be duplicated as v0.8h and every multiply to be
; selected as a umull/umull2 pair.
765 define void @matrix_mul_unsigned_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
766 ; CHECK-LABEL: matrix_mul_unsigned_and_double:
767 ; CHECK: // %bb.0: // %vector.header
768 ; CHECK-NEXT: and w8, w3, #0xffff
769 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
770 ; CHECK-NEXT: dup v0.8h, w8
771 ; CHECK-NEXT: and x8, x0, #0xfffffff0
772 ; CHECK-NEXT: .LBB11_1: // %vector.body
773 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
774 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
775 ; CHECK-NEXT: subs x8, x8, #16
776 ; CHECK-NEXT: ldr q1, [x9]
777 ; CHECK-NEXT: ldur q2, [x9, #8]
778 ; CHECK-NEXT: add x9, x1, w0, uxtw #2
779 ; CHECK-NEXT: add w0, w0, #16
780 ; CHECK-NEXT: umull2 v3.4s, v0.8h, v1.8h
781 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
782 ; CHECK-NEXT: umull2 v4.4s, v0.8h, v2.8h
783 ; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
784 ; CHECK-NEXT: stp q1, q3, [x9]
785 ; CHECK-NEXT: stp q2, q4, [x9, #32]
786 ; CHECK-NEXT: b.ne .LBB11_1
787 ; CHECK-NEXT: // %bb.2: // %for.end12
; Keep only the low 16 bits of %val; this is the scalar multiplicand.
790 %conv4 = and i32 %val, 65535
791 %wide.trip.count = zext i32 %N to i64
; %min.iters.check, %1 and %2 have no users in the lines shown here, and %0
; only feeds %1 and %2.
792 %0 = add nsw i64 %wide.trip.count, -1
793 %min.iters.check = icmp ult i32 %N, 16
794 %1 = trunc i64 %0 to i32
795 %2 = icmp ugt i64 %0, 4294967295
; Loop bound: zero-extended %N with its low four bits cleared
; (4294967280 == 0xfffffff0), i.e. rounded down to a multiple of 16.
796 %n.vec = and i64 %wide.trip.count, 4294967280
; Two identical splats of %conv4 across <8 x i32>, one per multiply.
797 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
798 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
799 %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
800 %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
; %cmp.n has no users in the lines shown here.
801 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
802 br label %vector.body
804 vector.body: ; preds = %vector.header, %vector.body
805 %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
806 %3 = trunc i64 %index to i32
; %5 is the element index used for both the %A load and the %C store.
; NOTE(review): the definition of %4 is not among the lines shown here -
; confirm against the full file.
808 %5 = zext i32 %4 to i64
809 %6 = getelementptr inbounds i16, i16* %A, i64 %5
810 %7 = bitcast i16* %6 to <8 x i16>*
811 %wide.load = load <8 x i16>, <8 x i16>* %7, align 2
; The second load starts 4 i16 elements (8 bytes) after the first, so the two
; 8-element loads overlap by 4 elements; the assertions above show this as
; "ldur q2, [x9, #8]".
; NOTE(review): the stores below are 8 elements apart, so an offset of 4 here
; may be unintentional - confirm before changing it, since the assertions
; would have to be regenerated.
812 %8 = getelementptr inbounds i16, i16* %6, i64 4
813 %9 = bitcast i16* %8 to <8 x i16>*
814 %wide.load30 = load <8 x i16>, <8 x i16>* %9, align 2
; Zero-extend both loaded vectors to i32 lanes and multiply by the splat.
815 %10 = zext <8 x i16> %wide.load to <8 x i32>
816 %11 = zext <8 x i16> %wide.load30 to <8 x i32>
817 %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
818 %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
819 %14 = getelementptr inbounds i32, i32* %C, i64 %5
820 %15 = bitcast i32* %14 to <8 x i32>*
821 store <8 x i32> %12, <8 x i32>* %15, align 4
; The second product is stored 8 i32 elements (32 bytes) after the first.
822 %16 = getelementptr inbounds i32, i32* %14, i64 8
823 %17 = bitcast i32* %16 to <8 x i32>*
824 store <8 x i32> %13, <8 x i32>* %17, align 4
; Advance the induction variable by 16 and leave once it equals %n.vec.
825 %index.next = add i64 %index, 16
826 %18 = icmp eq i64 %index.next, %n.vec
827 br i1 %18, label %for.end12, label %vector.body
829 for.end12: ; preds = %vector.body
; matrix_mul_signed_and: %val is masked to its low 16 bits and splatted across
; <4 x i32>. Each loop iteration loads two adjacent <4 x i16> vectors from %A,
; sign-extends them to <4 x i32>, multiplies them by the splat and stores the
; two products to %C. The autogenerated FileCheck assertions below expect the
; splat as a v0.4s dup and each product as an sshll followed by a full-width
; mul on .4s lanes, i.e. no widening-multiply instruction is expected here
; (presumably because the masked splat is zero-extended while the loaded
; operand is sign-extended - confirm).
833 define void @matrix_mul_signed_and(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
834 ; CHECK-LABEL: matrix_mul_signed_and:
835 ; CHECK: // %bb.0: // %vector.header
836 ; CHECK-NEXT: and w8, w3, #0xffff
837 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
838 ; CHECK-NEXT: dup v0.4s, w8
839 ; CHECK-NEXT: and x8, x0, #0xfffffff8
840 ; CHECK-NEXT: .LBB12_1: // %vector.body
841 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
842 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
843 ; CHECK-NEXT: subs x8, x8, #8
844 ; CHECK-NEXT: ldp d1, d2, [x9]
845 ; CHECK-NEXT: add x9, x1, w0, uxtw #2
846 ; CHECK-NEXT: add w0, w0, #8
847 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
848 ; CHECK-NEXT: sshll v2.4s, v2.4h, #0
849 ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
850 ; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s
851 ; CHECK-NEXT: stp q1, q2, [x9]
852 ; CHECK-NEXT: b.ne .LBB12_1
853 ; CHECK-NEXT: // %bb.2: // %for.end12
; Keep only the low 16 bits of %val; this is the scalar multiplicand.
856 %conv4 = and i32 %val, 65535
857 %wide.trip.count = zext i32 %N to i64
; %min.iters.check, %1 and %2 have no users in the lines shown here, and %0
; only feeds %1 and %2.
858 %0 = add nsw i64 %wide.trip.count, -1
859 %min.iters.check = icmp ult i32 %N, 8
860 %1 = trunc i64 %0 to i32
861 %2 = icmp ugt i64 %0, 4294967295
; Loop bound: zero-extended %N with its low three bits cleared
; (4294967288 == 0xfffffff8), i.e. rounded down to a multiple of 8.
862 %n.vec = and i64 %wide.trip.count, 4294967288
; Two identical splats of %conv4 across <4 x i32>, one per multiply.
863 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
864 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
865 %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
866 %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
; %cmp.n has no users in the lines shown here.
867 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
868 br label %vector.body
870 vector.body: ; preds = %vector.header, %vector.body
871 %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
872 %3 = trunc i64 %index to i32
; %5 is the element index used for both the %A load and the %C store.
; NOTE(review): the definition of %4 is not among the lines shown here -
; confirm against the full file.
874 %5 = zext i32 %4 to i64
875 %6 = getelementptr inbounds i16, i16* %A, i64 %5
876 %7 = bitcast i16* %6 to <4 x i16>*
877 %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
; The second 4-element load starts 4 i16 elements after the first, so the two
; loads are adjacent; the assertions above show them as one "ldp d1, d2".
878 %8 = getelementptr inbounds i16, i16* %6, i64 4
879 %9 = bitcast i16* %8 to <4 x i16>*
880 %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
; Sign-extend both loaded vectors to i32 lanes and multiply by the splat.
881 %10 = sext <4 x i16> %wide.load to <4 x i32>
882 %11 = sext <4 x i16> %wide.load30 to <4 x i32>
883 %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
884 %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
885 %14 = getelementptr inbounds i32, i32* %C, i64 %5
886 %15 = bitcast i32* %14 to <4 x i32>*
887 store <4 x i32> %12, <4 x i32>* %15, align 4
; The second product is stored 4 i32 elements (16 bytes) after the first.
888 %16 = getelementptr inbounds i32, i32* %14, i64 4
889 %17 = bitcast i32* %16 to <4 x i32>*
890 store <4 x i32> %13, <4 x i32>* %17, align 4
; Advance the induction variable by 8 and leave once it equals %n.vec.
891 %index.next = add i64 %index, 8
892 %18 = icmp eq i64 %index.next, %n.vec
893 br i1 %18, label %for.end12, label %vector.body
895 for.end12: ; preds = %vector.body
; matrix_mul_signed_and_double: %val is masked to its low 16 bits and splatted
; across <8 x i32>. Each loop iteration loads two <8 x i16> vectors from %A,
; sign-extends them to <8 x i32>, multiplies them by the splat and stores the
; two products to %C. The autogenerated FileCheck assertions below expect the
; splat as a v0.4s dup and each product half as an sshll/sshll2 followed by a
; full-width mul on .4s lanes, i.e. no widening-multiply instruction is
; expected here (presumably because the masked splat is zero-extended while
; the loaded operand is sign-extended - confirm).
899 define void @matrix_mul_signed_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
900 ; CHECK-LABEL: matrix_mul_signed_and_double:
901 ; CHECK: // %bb.0: // %vector.header
902 ; CHECK-NEXT: and w8, w3, #0xffff
903 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
904 ; CHECK-NEXT: dup v0.4s, w8
905 ; CHECK-NEXT: and x8, x0, #0xfffffff0
906 ; CHECK-NEXT: .LBB13_1: // %vector.body
907 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
908 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
909 ; CHECK-NEXT: subs x8, x8, #16
910 ; CHECK-NEXT: ldr q1, [x9]
911 ; CHECK-NEXT: ldur q2, [x9, #8]
912 ; CHECK-NEXT: add x9, x1, w0, uxtw #2
913 ; CHECK-NEXT: add w0, w0, #16
914 ; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0
915 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
916 ; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0
917 ; CHECK-NEXT: sshll v2.4s, v2.4h, #0
918 ; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s
919 ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
920 ; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s
921 ; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s
922 ; CHECK-NEXT: stp q1, q3, [x9]
923 ; CHECK-NEXT: stp q2, q4, [x9, #32]
924 ; CHECK-NEXT: b.ne .LBB13_1
925 ; CHECK-NEXT: // %bb.2: // %for.end12
; Keep only the low 16 bits of %val; this is the scalar multiplicand.
928 %conv4 = and i32 %val, 65535
929 %wide.trip.count = zext i32 %N to i64
; %min.iters.check, %1 and %2 have no users in the lines shown here, and %0
; only feeds %1 and %2.
930 %0 = add nsw i64 %wide.trip.count, -1
931 %min.iters.check = icmp ult i32 %N, 16
932 %1 = trunc i64 %0 to i32
933 %2 = icmp ugt i64 %0, 4294967295
; Loop bound: zero-extended %N with its low four bits cleared
; (4294967280 == 0xfffffff0), i.e. rounded down to a multiple of 16.
934 %n.vec = and i64 %wide.trip.count, 4294967280
; Two identical splats of %conv4 across <8 x i32>, one per multiply.
935 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
936 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
937 %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
938 %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
; %cmp.n has no users in the lines shown here.
939 %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
940 br label %vector.body
942 vector.body: ; preds = %vector.header, %vector.body
943 %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
944 %3 = trunc i64 %index to i32
; %5 is the element index used for both the %A load and the %C store.
; NOTE(review): the definition of %4 is not among the lines shown here -
; confirm against the full file.
946 %5 = zext i32 %4 to i64
947 %6 = getelementptr inbounds i16, i16* %A, i64 %5
948 %7 = bitcast i16* %6 to <8 x i16>*
949 %wide.load = load <8 x i16>, <8 x i16>* %7, align 2
; The second load starts 4 i16 elements (8 bytes) after the first, so the two
; 8-element loads overlap by 4 elements; the assertions above show this as
; "ldur q2, [x9, #8]".
; NOTE(review): the stores below are 8 elements apart, so an offset of 4 here
; may be unintentional - confirm before changing it, since the assertions
; would have to be regenerated.
950 %8 = getelementptr inbounds i16, i16* %6, i64 4
951 %9 = bitcast i16* %8 to <8 x i16>*
952 %wide.load30 = load <8 x i16>, <8 x i16>* %9, align 2
; Sign-extend both loaded vectors to i32 lanes and multiply by the splat.
953 %10 = sext <8 x i16> %wide.load to <8 x i32>
954 %11 = sext <8 x i16> %wide.load30 to <8 x i32>
955 %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
956 %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
957 %14 = getelementptr inbounds i32, i32* %C, i64 %5
958 %15 = bitcast i32* %14 to <8 x i32>*
959 store <8 x i32> %12, <8 x i32>* %15, align 4
; The second product is stored 8 i32 elements (32 bytes) after the first.
960 %16 = getelementptr inbounds i32, i32* %14, i64 8
961 %17 = bitcast i32* %16 to <8 x i32>*
962 store <8 x i32> %13, <8 x i32>* %17, align 4
; Advance the induction variable by 16 and leave once it equals %n.vec.
963 %index.next = add i64 %index, 16
964 %18 = icmp eq i64 %index.next, %n.vec
965 br i1 %18, label %for.end12, label %vector.body
967 for.end12: ; preds = %vector.body
; Declaration of the llvm.vector.reduce.add intrinsic taking <8 x i16> and
; returning i16. No call to it appears in the lines shown here; presumably it
; is used by functions later in the file - confirm.
971 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)