1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s
; Unsigned case: a loop-invariant i16 scalar (%val) is zero-extended, splatted
; to <4 x i32>, and multiplied by two zero-extended <4 x i16> loads per
; iteration. The assertions below expect the splat to be narrowed to a
; 4 x i16 dup and each multiply to be selected as a widening umull.
define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
; CHECK-LABEL: matrix_mul_unsigned:
; CHECK:       // %bb.0: // %vector.header
; CHECK-NEXT:    and w9, w3, #0xffff
; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT:    and x8, x0, #0xfffffff8
; CHECK-NEXT:    dup v0.4h, w9
; CHECK-NEXT:  .LBB0_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-NEXT:    ldp d1, d2, [x9]
; CHECK-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-NEXT:    subs x8, x8, #8
; CHECK-NEXT:    add w0, w0, #8
; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
; CHECK-NEXT:    stp q1, q2, [x9]
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %for.end12
; CHECK-NEXT:    ret
vector.header:
  %conv4 = zext i16 %val to i32
  %wide.trip.count = zext i32 %N to i64
  ; %0, %1, %2, %min.iters.check and %cmp.n are not used by any later
  ; instruction in this function.
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  ; Loop bound: the zero-extended %N rounded down to a multiple of 8.
  %n.vec = and i64 %wide.trip.count, 4294967288
  ; Two identical splats of the zero-extended scalar, one per multiply.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  ; Element offset for this iteration: (%N + index) computed in i32, then
  ; zero-extended. This is what the "w0, uxtw" address operands encode.
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  ; Load eight consecutive i16 elements of %A as two <4 x i16> halves.
  %6 = getelementptr inbounds i16, i16* %A, i64 %5
  %7 = bitcast i16* %6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
  %8 = getelementptr inbounds i16, i16* %6, i64 4
  %9 = bitcast i16* %8 to <4 x i16>*
  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
  ; zext + mul against a zext'ed splat: the pattern expected to become umull.
  %10 = zext <4 x i16> %wide.load to <4 x i32>
  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
  ; Store the eight i32 products to %C at the same element offset.
  %14 = getelementptr inbounds i32, i32* %C, i64 %5
  %15 = bitcast i32* %14 to <4 x i32>*
  store <4 x i32> %12, <4 x i32>* %15, align 4
  %16 = getelementptr inbounds i32, i32* %14, i64 4
  %17 = bitcast i32* %16 to <4 x i32>*
  store <4 x i32> %13, <4 x i32>* %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}
; Signed case: a loop-invariant i16 scalar (%val) is sign-extended, splatted
; to <4 x i32>, and multiplied by two sign-extended <4 x i16> loads per
; iteration. The assertions below expect the splat to be narrowed to a
; 4 x i16 dup and each multiply to be selected as a widening smull.
define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
; CHECK-LABEL: matrix_mul_signed:
; CHECK:       // %bb.0: // %vector.header
; CHECK-NEXT:    sxth w9, w3
; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT:    and x8, x0, #0xfffffff8
; CHECK-NEXT:    dup v0.4h, w9
; CHECK-NEXT:  .LBB1_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x2, w0, sxtw #1
; CHECK-NEXT:    ldp d1, d2, [x9]
; CHECK-NEXT:    add x9, x1, w0, sxtw #2
; CHECK-NEXT:    subs x8, x8, #8
; CHECK-NEXT:    add w0, w0, #8
; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
; CHECK-NEXT:    stp q1, q2, [x9]
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.2: // %for.end12
; CHECK-NEXT:    ret
vector.header:
  %conv4 = sext i16 %val to i32
  %wide.trip.count = sext i32 %N to i64
  ; %0, %1, %2, %min.iters.check and %cmp.n are not used by any later
  ; instruction in this function.
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  ; Loop bound: the low 32 bits of the sign-extended %N, rounded down to a
  ; multiple of 8 (the mask clears bits 0-2 and bits 32-63).
  %n.vec = and i64 %wide.trip.count, 4294967288
  ; Two identical splats of the sign-extended scalar, one per multiply.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  ; Element offset for this iteration: (%N + index) computed in i32, then
  ; sign-extended. This is what the "w0, sxtw" address operands encode.
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = sext i32 %4 to i64
  ; Load eight consecutive i16 elements of %A as two <4 x i16> halves.
  %6 = getelementptr inbounds i16, i16* %A, i64 %5
  %7 = bitcast i16* %6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
  %8 = getelementptr inbounds i16, i16* %6, i64 4
  %9 = bitcast i16* %8 to <4 x i16>*
  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
  ; sext + mul against a sext'ed splat: the pattern expected to become smull.
  %10 = sext <4 x i16> %wide.load to <4 x i32>
  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nsw <4 x i32> %broadcast.splat32, %11
  ; Store the eight i32 products to %C at the same element offset.
  %14 = getelementptr inbounds i32, i32* %C, i64 %5
  %15 = bitcast i32* %14 to <4 x i32>*
  store <4 x i32> %12, <4 x i32>* %15, align 4
  %16 = getelementptr inbounds i32, i32* %14, i64 4
  %17 = bitcast i32* %16 to <4 x i32>*
  store <4 x i32> %13, <4 x i32>* %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}
; Both multiply operands are shuffles: a loop-invariant splat of the
; zero-extended %val, and a per-iteration shuffle built from a zero-extended
; i16 loaded from %A. The assertions below expect both to be narrowed to
; 4 x i16 dups and the multiply to be selected as a widening umull.
define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
; CHECK-LABEL: matrix_mul_double_shuffle:
; CHECK:       // %bb.0: // %vector.header
; CHECK-NEXT:    and w9, w3, #0xffff
; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT:    and x8, x0, #0xfffffff8
; CHECK-NEXT:    dup v0.4h, w9
; CHECK-NEXT:  .LBB2_1: // %vector.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrh w9, [x2], #16
; CHECK-NEXT:    mov w10, w0
; CHECK-NEXT:    subs x8, x8, #8
; CHECK-NEXT:    lsl x10, x10, #2
; CHECK-NEXT:    dup v1.4h, w9
; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT:    add w0, w0, #8
; CHECK-NEXT:    str q1, [x1, x10]
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.2: // %for.end12
; CHECK-NEXT:    ret
vector.header:
  %conv4 = zext i16 %val to i32
  %wide.trip.count = zext i32 %N to i64
  ; %0, %1, %2, %min.iters.check and %cmp.n are not used by any later
  ; instruction in this function.
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  ; Loop bound: the zero-extended %N rounded down to a multiple of 8.
  %n.vec = and i64 %wide.trip.count, 4294967288
  ; Loop-invariant splat of the zero-extended scalar.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  ; One i16 is loaded from %A[index]; index advances by 8 per iteration.
  %g = getelementptr inbounds i16, i16* %A, i64 %index
  %val1 = load i16, i16* %g
  %splat.input.ext = zext i16 %val1 to i32
  ; The mask <0, 1, 0, 1> selects only lanes 0 and 1 of the first operand.
  ; Lane 1 of that operand is undef, so the only defined value in the result
  ; is the loaded scalar.
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ; Store offset: (%N + index) computed in i32, then zero-extended.
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
  %7 = getelementptr inbounds i32, i32* %C, i64 %5
  %8 = bitcast i32* %7 to <4 x i32>*
  store <4 x i32> %6, <4 x i32>* %8, align 4
  %index.next = add i64 %index, 8
  %9 = icmp eq i64 %index.next, %n.vec
  br i1 %9, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}