1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
; mlai16_trunc — widening multiply-accumulate on <4 x i16>, result truncated
; back to <4 x i16>: trunc(sext(vec1) * sext(vec0) + sext(vec2)).
; Expected codegen: smull + uaddw + xtn. The trailing trunc makes the
; extension kind of %v2 irrelevant, so the unsigned widening add is legal.
4 define <4 x i16> @mlai16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
5 ; CHECK-LABEL: mlai16_trunc:
6 ; CHECK: // %bb.0: // %entry
7 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
8 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
9 ; CHECK-NEXT: xtn v0.4h, v0.4s
; Widen all three operands to i32 so the mul/add are performed in 32 bits.
12 %v0 = sext <4 x i16> %vec0 to <4 x i32>
13 %v1 = sext <4 x i16> %vec1 to <4 x i32>
14 %v2 = sext <4 x i16> %vec2 to <4 x i32>
15 %v3 = mul <4 x i32> %v1, %v0
16 %v4 = add <4 x i32> %v3, %v2
17 %v5 = trunc <4 x i32> %v4 to <4 x i16>
; mlai16_and — same MLA pattern as mlai16_trunc, but the narrowing is
; expressed as a mask with 0xffff and the result stays <4 x i32>.
; Expected codegen: smull + uaddw, then movi/and to apply the lane mask.
21 define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
22 ; CHECK-LABEL: mlai16_and:
23 ; CHECK: // %bb.0: // %entry
24 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
25 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
26 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
27 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
30 %v0 = sext <4 x i16> %vec0 to <4 x i32>
31 %v1 = sext <4 x i16> %vec1 to <4 x i32>
32 %v2 = sext <4 x i16> %vec2 to <4 x i32>
33 %v3 = mul <4 x i32> %v1, %v0
34 %v4 = add <4 x i32> %v3, %v2
; Masking each i32 lane to its low 16 bits plays the role of the trunc above.
35 %v5 = and <4 x i32> %v4, <i32 65535, i32 65535, i32 65535, i32 65535>
; mlai16_loadstore — memory form of mlai16_trunc: a[8..11] = trunc(
; sext(b[8..11]) * sext(a[8..11]) + sext(c[8..11])).
; The element offset of 8 x i16 shows up as the #16 byte offset in the
; expected ldr/str addressing below.
39 define void @mlai16_loadstore(ptr %a, ptr %b, ptr %c) {
40 ; CHECK-LABEL: mlai16_loadstore:
41 ; CHECK: // %bb.0: // %entry
42 ; CHECK-NEXT: ldr d0, [x0, #16]
43 ; CHECK-NEXT: ldr d1, [x1, #16]
44 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
45 ; CHECK-NEXT: ldr d1, [x2, #16]
46 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
47 ; CHECK-NEXT: xtn v0.4h, v0.4s
48 ; CHECK-NEXT: str d0, [x0, #16]
51 %scevgep0 = getelementptr i16, ptr %a, i32 8
52 %vec0 = load <4 x i16>, ptr %scevgep0, align 8
53 %v0 = sext <4 x i16> %vec0 to <4 x i32>
54 %scevgep1 = getelementptr i16, ptr %b, i32 8
55 %vec1 = load <4 x i16>, ptr %scevgep1, align 8
56 %v1 = sext <4 x i16> %vec1 to <4 x i32>
57 %scevgep2 = getelementptr i16, ptr %c, i32 8
58 %vec2 = load <4 x i16>, ptr %scevgep2, align 8
59 %v2 = sext <4 x i16> %vec2 to <4 x i32>
60 %v3 = mul <4 x i32> %v1, %v0
61 %v4 = add <4 x i32> %v3, %v2
62 %v5 = trunc <4 x i32> %v4 to <4 x i16>
; %scevgep3 recomputes the same address as %scevgep0 (a + 8 elements).
63 %scevgep3 = getelementptr i16, ptr %a, i32 8
64 store <4 x i16> %v5, ptr %scevgep3, align 8
; addmuli16_trunc — add-then-multiply variant: trunc((sext(vec1) +
; sext(vec0)) * sext(vec2)). The distributed form (v1*v2 + v0*v2) lets the
; backend use smull followed by smlal instead of widening the add first.
68 define <4 x i16> @addmuli16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
69 ; CHECK-LABEL: addmuli16_trunc:
70 ; CHECK: // %bb.0: // %entry
71 ; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
72 ; CHECK-NEXT: smlal v1.4s, v0.4h, v2.4h
73 ; CHECK-NEXT: xtn v0.4h, v1.4s
76 %v0 = sext <4 x i16> %vec0 to <4 x i32>
77 %v1 = sext <4 x i16> %vec1 to <4 x i32>
78 %v2 = sext <4 x i16> %vec2 to <4 x i32>
79 %v3 = add <4 x i32> %v1, %v0
80 %v4 = mul <4 x i32> %v3, %v2
81 %v5 = trunc <4 x i32> %v4 to <4 x i16>
; addmuli16_and — same add-then-multiply pattern as addmuli16_trunc, but
; narrowed by masking each lane with 0xffff; result stays <4 x i32>.
; Expected codegen: smull + smlal, then movi/and for the mask.
85 define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
86 ; CHECK-LABEL: addmuli16_and:
87 ; CHECK: // %bb.0: // %entry
88 ; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
89 ; CHECK-NEXT: smlal v1.4s, v0.4h, v2.4h
90 ; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
91 ; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
94 %v0 = sext <4 x i16> %vec0 to <4 x i32>
95 %v1 = sext <4 x i16> %vec1 to <4 x i32>
96 %v2 = sext <4 x i16> %vec2 to <4 x i32>
97 %v3 = add <4 x i32> %v1, %v0
98 %v4 = mul <4 x i32> %v3, %v2
99 %v5 = and <4 x i32> %v4, <i32 65535, i32 65535, i32 65535, i32 65535>
; addmuli16_loadstore — memory form of addmuli16_trunc:
; a[8..11] = trunc((sext(b[8..11]) + sext(a[8..11])) * sext(c[8..11])).
; Expected codegen distributes the multiply into smull + smlal; the 8-element
; i16 offset appears as #16 in the addressing.
103 define void @addmuli16_loadstore(ptr %a, ptr %b, ptr %c) {
104 ; CHECK-LABEL: addmuli16_loadstore:
105 ; CHECK: // %bb.0: // %entry
106 ; CHECK-NEXT: ldr d0, [x1, #16]
107 ; CHECK-NEXT: ldr d1, [x2, #16]
108 ; CHECK-NEXT: ldr d2, [x0, #16]
109 ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
110 ; CHECK-NEXT: smlal v0.4s, v2.4h, v1.4h
111 ; CHECK-NEXT: xtn v0.4h, v0.4s
112 ; CHECK-NEXT: str d0, [x0, #16]
115 %scevgep0 = getelementptr i16, ptr %a, i32 8
116 %vec0 = load <4 x i16>, ptr %scevgep0, align 8
117 %v0 = sext <4 x i16> %vec0 to <4 x i32>
118 %scevgep1 = getelementptr i16, ptr %b, i32 8
119 %vec1 = load <4 x i16>, ptr %scevgep1, align 8
120 %v1 = sext <4 x i16> %vec1 to <4 x i32>
121 %scevgep2 = getelementptr i16, ptr %c, i32 8
122 %vec2 = load <4 x i16>, ptr %scevgep2, align 8
123 %v2 = sext <4 x i16> %vec2 to <4 x i32>
124 %v3 = add <4 x i32> %v1, %v0
125 %v4 = mul <4 x i32> %v3, %v2
126 %v5 = trunc <4 x i32> %v4 to <4 x i16>
; %scevgep3 recomputes the same address as %scevgep0 (a + 8 elements).
127 %scevgep3 = getelementptr i16, ptr %a, i32 8
128 store <4 x i16> %v5, ptr %scevgep3, align 8
; mlai32_trunc — i32->i64 version of mlai16_trunc:
; trunc(sext(vec1) * sext(vec0) + sext(vec2)) on <2 x i32> lanes.
; Expected codegen: smull.2d + uaddw.2d + xtn.2s.
132 define <2 x i32> @mlai32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
133 ; CHECK-LABEL: mlai32_trunc:
134 ; CHECK: // %bb.0: // %entry
135 ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
136 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
137 ; CHECK-NEXT: xtn v0.2s, v0.2d
140 %v0 = sext <2 x i32> %vec0 to <2 x i64>
141 %v1 = sext <2 x i32> %vec1 to <2 x i64>
142 %v2 = sext <2 x i32> %vec2 to <2 x i64>
143 %v3 = mul <2 x i64> %v1, %v0
144 %v4 = add <2 x i64> %v3, %v2
145 %v5 = trunc <2 x i64> %v4 to <2 x i32>
; mlai32_and — i32->i64 version of mlai16_and: MLA narrowed by masking each
; i64 lane with 0xffffffff; result stays <2 x i64>.
; Expected codegen: smull.2d + uaddw.2d, then movi/and for the mask.
149 define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
150 ; CHECK-LABEL: mlai32_and:
151 ; CHECK: // %bb.0: // %entry
152 ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
153 ; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
154 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
155 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
158 %v0 = sext <2 x i32> %vec0 to <2 x i64>
159 %v1 = sext <2 x i32> %vec1 to <2 x i64>
160 %v2 = sext <2 x i32> %vec2 to <2 x i64>
161 %v3 = mul <2 x i64> %v1, %v0
162 %v4 = add <2 x i64> %v3, %v2
; 4294967295 = 0xffffffff: keep the low 32 bits of each i64 lane.
163 %v5 = and <2 x i64> %v4, <i64 4294967295, i64 4294967295>
; mlai32_loadstore — memory form of mlai32_trunc:
; a[8..9] = trunc(sext(b[8..9]) * sext(a[8..9]) + sext(c[8..9])).
; The 8-element i32 offset shows up as the #32 byte offset below.
167 define void @mlai32_loadstore(ptr %a, ptr %b, ptr %c) {
168 ; CHECK-LABEL: mlai32_loadstore:
169 ; CHECK: // %bb.0: // %entry
170 ; CHECK-NEXT: ldr d0, [x0, #32]
171 ; CHECK-NEXT: ldr d1, [x1, #32]
172 ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
173 ; CHECK-NEXT: ldr d1, [x2, #32]
174 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s
175 ; CHECK-NEXT: xtn v0.2s, v0.2d
176 ; CHECK-NEXT: str d0, [x0, #32]
179 %scevgep0 = getelementptr i32, ptr %a, i32 8
180 %vec0 = load <2 x i32>, ptr %scevgep0, align 8
181 %v0 = sext <2 x i32> %vec0 to <2 x i64>
182 %scevgep1 = getelementptr i32, ptr %b, i32 8
183 %vec1 = load <2 x i32>, ptr %scevgep1, align 8
184 %v1 = sext <2 x i32> %vec1 to <2 x i64>
185 %scevgep2 = getelementptr i32, ptr %c, i32 8
186 %vec2 = load <2 x i32>, ptr %scevgep2, align 8
187 %v2 = sext <2 x i32> %vec2 to <2 x i64>
188 %v3 = mul <2 x i64> %v1, %v0
189 %v4 = add <2 x i64> %v3, %v2
190 %v5 = trunc <2 x i64> %v4 to <2 x i32>
; %scevgep3 recomputes the same address as %scevgep0 (a + 8 elements).
191 %scevgep3 = getelementptr i32, ptr %a, i32 8
192 store <2 x i32> %v5, ptr %scevgep3, align 8
; addmuli32_trunc — i32->i64 version of addmuli16_trunc:
; trunc((sext(vec1) + sext(vec0)) * sext(vec2)).
; Expected codegen distributes the multiply: smull.2d + smlal.2d + xtn.2s.
196 define <2 x i32> @addmuli32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
197 ; CHECK-LABEL: addmuli32_trunc:
198 ; CHECK: // %bb.0: // %entry
199 ; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
200 ; CHECK-NEXT: smlal v1.2d, v0.2s, v2.2s
201 ; CHECK-NEXT: xtn v0.2s, v1.2d
204 %v0 = sext <2 x i32> %vec0 to <2 x i64>
205 %v1 = sext <2 x i32> %vec1 to <2 x i64>
206 %v2 = sext <2 x i32> %vec2 to <2 x i64>
207 %v3 = add <2 x i64> %v1, %v0
208 %v4 = mul <2 x i64> %v3, %v2
209 %v5 = trunc <2 x i64> %v4 to <2 x i32>
; addmuli32_and — add-then-multiply on <2 x i32>, narrowed by masking each
; i64 lane with 0xffffffff; result stays <2 x i64>.
; Expected codegen: smull.2d + smlal.2d, then movi/and for the mask.
213 define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
214 ; CHECK-LABEL: addmuli32_and:
215 ; CHECK: // %bb.0: // %entry
216 ; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
217 ; CHECK-NEXT: smlal v1.2d, v0.2s, v2.2s
218 ; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
219 ; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
222 %v0 = sext <2 x i32> %vec0 to <2 x i64>
223 %v1 = sext <2 x i32> %vec1 to <2 x i64>
224 %v2 = sext <2 x i32> %vec2 to <2 x i64>
225 %v3 = add <2 x i64> %v1, %v0
226 %v4 = mul <2 x i64> %v3, %v2
; 4294967295 = 0xffffffff: keep the low 32 bits of each i64 lane.
227 %v5 = and <2 x i64> %v4, <i64 4294967295, i64 4294967295>
; addmuli32_loadstore — memory form of addmuli32_trunc:
; a[8..9] = trunc((sext(b[8..9]) + sext(a[8..9])) * sext(c[8..9])).
; Expected codegen: smull.2d + smlal.2d + xtn.2s; #32 byte offsets.
231 define void @addmuli32_loadstore(ptr %a, ptr %b, ptr %c) {
232 ; CHECK-LABEL: addmuli32_loadstore:
233 ; CHECK: // %bb.0: // %entry
234 ; CHECK-NEXT: ldr d0, [x1, #32]
235 ; CHECK-NEXT: ldr d1, [x2, #32]
236 ; CHECK-NEXT: ldr d2, [x0, #32]
237 ; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
238 ; CHECK-NEXT: smlal v0.2d, v2.2s, v1.2s
239 ; CHECK-NEXT: xtn v0.2s, v0.2d
240 ; CHECK-NEXT: str d0, [x0, #32]
243 %scevgep0 = getelementptr i32, ptr %a, i32 8
244 %vec0 = load <2 x i32>, ptr %scevgep0, align 8
245 %v0 = sext <2 x i32> %vec0 to <2 x i64>
246 %scevgep1 = getelementptr i32, ptr %b, i32 8
247 %vec1 = load <2 x i32>, ptr %scevgep1, align 8
248 %v1 = sext <2 x i32> %vec1 to <2 x i64>
249 %scevgep2 = getelementptr i32, ptr %c, i32 8
250 %vec2 = load <2 x i32>, ptr %scevgep2, align 8
251 %v2 = sext <2 x i32> %vec2 to <2 x i64>
252 %v3 = add <2 x i64> %v1, %v0
253 %v4 = mul <2 x i64> %v3, %v2
254 %v5 = trunc <2 x i64> %v4 to <2 x i32>
; %scevgep3 recomputes the same address as %scevgep0 (a + 8 elements).
255 %scevgep3 = getelementptr i32, ptr %a, i32 8
256 store <2 x i32> %v5, ptr %scevgep3, align 8
; func1 — chain-correctness test for lowerMUL (see the in-function comment
; block below). The key ordering property is visible in the CHECK lines:
; "ldr d2, [x0, #16]" (the load of a[i]) is issued before the first
; "str d1, [x0, #16]", and no later load of a[i] appears between the stores.
260 define void @func1(ptr %a, ptr %b, ptr %c) {
261 ; CHECK-LABEL: func1:
262 ; CHECK: // %bb.0: // %entry
263 ; CHECK-NEXT: ldr d0, [x2, #16]
264 ; CHECK-NEXT: ldr d1, [x1, #16]
265 ; CHECK-NEXT: ldr d2, [x0, #16]
266 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
267 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
268 ; CHECK-NEXT: xtn v1.4h, v0.4s
269 ; CHECK-NEXT: str d1, [x0, #16]
270 ; CHECK-NEXT: ldr d1, [x2, #16]
271 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
272 ; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
273 ; CHECK-NEXT: xtn v1.4h, v0.4s
274 ; CHECK-NEXT: str d1, [x1, #16]
275 ; CHECK-NEXT: ldr d1, [x2, #16]
276 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
277 ; CHECK-NEXT: xtn v0.4h, v0.4s
278 ; CHECK-NEXT: str d0, [x0, #16]
281 ; The test case trying to vectorize the pseudo code below.
282 ; a[i] = b[i] + c[i];
; b[i] = a[i] * c[i];
284 ; a[i] = b[i] + a[i] * c[i];
285 ; Checking that vector load a[i] for "a[i] = b[i] + a[i] * c[i]" is
286 ; scheduled before the first vector store to "a[i] = b[i] + c[i]".
287 ; Checking that there is no vector load a[i] scheduled between the vector
288 ; stores to a[i], otherwise the load of a[i] will be polluted by the first
289 ; vector store to a[i].
290 ; This test case checks that the chain information is updated during
291 ; lowerMUL for the newly created Load SDNode.
; a[i] = b[i] + c[i]; — c is sext, b is zext; the trunc makes both legal.
294 %scevgep0 = getelementptr i16, ptr %a, i32 8
295 %vec0 = load <4 x i16>, ptr %scevgep0, align 8
296 %scevgep1 = getelementptr i16, ptr %b, i32 8
297 %vec1 = load <4 x i16>, ptr %scevgep1, align 8
298 %0 = zext <4 x i16> %vec1 to <4 x i32>
299 %scevgep2 = getelementptr i16, ptr %c, i32 8
300 %vec2 = load <4 x i16>, ptr %scevgep2, align 8
301 %1 = sext <4 x i16> %vec2 to <4 x i32>
302 %vec3 = add <4 x i32> %1, %0
303 %2 = trunc <4 x i32> %vec3 to <4 x i16>
304 %scevgep3 = getelementptr i16, ptr %a, i32 8
305 store <4 x i16> %2, ptr %scevgep3, align 8
; b[i] = a[i] * c[i]; — c reloaded after the store to a aliases %scevgep2.
306 %vec4 = load <4 x i16>, ptr %scevgep2, align 8
307 %3 = sext <4 x i16> %vec4 to <4 x i32>
308 %vec5 = mul <4 x i32> %3, %vec3
309 %4 = trunc <4 x i32> %vec5 to <4 x i16>
310 store <4 x i16> %4, ptr %scevgep1, align 8
; a[i] = b[i] + a[i] * c[i]; — uses %vec0, the value of a loaded up front.
311 %5 = sext <4 x i16> %vec0 to <4 x i32>
312 %vec6 = load <4 x i16>, ptr %scevgep2, align 8
313 %6 = sext <4 x i16> %vec6 to <4 x i32>
314 %vec7 = mul <4 x i32> %6, %5
315 %vec8 = add <4 x i32> %vec7, %vec5
316 %7 = trunc <4 x i32> %vec8 to <4 x i16>
317 store <4 x i16> %7, ptr %scevgep3, align 8
; func2 — like func1, but the final statement also adds a[i] again, so
; lowerMUL must update both the chain AND the value of the newly created
; Load SDNode (see the in-function comment block below). Codegen-wise this
; adds the extra "uaddw v0.4s, v0.4s, v2.4h" before the final narrow/store.
321 define void @func2(ptr %a, ptr %b, ptr %c) {
322 ; CHECK-LABEL: func2:
323 ; CHECK: // %bb.0: // %entry
324 ; CHECK-NEXT: ldr d0, [x2, #16]
325 ; CHECK-NEXT: ldr d1, [x1, #16]
326 ; CHECK-NEXT: ldr d2, [x0, #16]
327 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
328 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
329 ; CHECK-NEXT: xtn v1.4h, v0.4s
330 ; CHECK-NEXT: str d1, [x0, #16]
331 ; CHECK-NEXT: ldr d1, [x2, #16]
332 ; CHECK-NEXT: sshll v1.4s, v1.4h, #0
333 ; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
334 ; CHECK-NEXT: xtn v1.4h, v0.4s
335 ; CHECK-NEXT: str d1, [x1, #16]
336 ; CHECK-NEXT: ldr d1, [x2, #16]
337 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
338 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
339 ; CHECK-NEXT: xtn v0.4h, v0.4s
340 ; CHECK-NEXT: str d0, [x0, #16]
343 ; The test case trying to vectorize the pseudo code below.
344 ; a[i] = b[i] + c[i];
; b[i] = a[i] * c[i];
346 ; a[i] = b[i] + a[i] * c[i] + a[i];
347 ; Checking that vector load a[i] for "a[i] = b[i] + a[i] * c[i] + a[i]"
348 ; is scheduled before the first vector store to "a[i] = b[i] + c[i]".
349 ; Checking that there is no vector load a[i] scheduled between the first
350 ; vector store to a[i] and the vector add of a[i], otherwise the load of
351 ; a[i] will be polluted by the first vector store to a[i].
352 ; This test case checks that both the chain and value of the newly created
353 ; Load SDNode are updated during lowerMUL.
; a[i] = b[i] + c[i]; — c is sext, b is zext; the trunc makes both legal.
356 %scevgep0 = getelementptr i16, ptr %a, i32 8
357 %vec0 = load <4 x i16>, ptr %scevgep0, align 8
358 %scevgep1 = getelementptr i16, ptr %b, i32 8
359 %vec1 = load <4 x i16>, ptr %scevgep1, align 8
360 %0 = zext <4 x i16> %vec1 to <4 x i32>
361 %scevgep2 = getelementptr i16, ptr %c, i32 8
362 %vec2 = load <4 x i16>, ptr %scevgep2, align 8
363 %1 = sext <4 x i16> %vec2 to <4 x i32>
364 %vec3 = add <4 x i32> %1, %0
365 %2 = trunc <4 x i32> %vec3 to <4 x i16>
366 %scevgep3 = getelementptr i16, ptr %a, i32 8
367 store <4 x i16> %2, ptr %scevgep3, align 8
; b[i] = a[i] * c[i]; — c reloaded after the store to a aliases %scevgep2.
368 %vec4 = load <4 x i16>, ptr %scevgep2, align 8
369 %3 = sext <4 x i16> %vec4 to <4 x i32>
370 %vec5 = mul <4 x i32> %3, %vec3
371 %4 = trunc <4 x i32> %vec5 to <4 x i16>
372 store <4 x i16> %4, ptr %scevgep1, align 8
; a[i] = b[i] + a[i] * c[i] + a[i]; — %5 is the up-front value of a, used
; both as multiplicand (%vec7) and as the extra addend (%vec9).
373 %5 = sext <4 x i16> %vec0 to <4 x i32>
374 %vec6 = load <4 x i16>, ptr %scevgep2, align 8
375 %6 = sext <4 x i16> %vec6 to <4 x i32>
376 %vec7 = mul <4 x i32> %6, %5
377 %vec8 = add <4 x i32> %vec7, %vec5
378 %vec9 = add <4 x i32> %vec8, %5
379 %7 = trunc <4 x i32> %vec9 to <4 x i16>
380 store <4 x i16> %7, ptr %scevgep3, align 8