1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
12 ; Don't use SVE for 64-bit vectors.
13 define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
14 ; CHECK-LABEL: ashr_v8i8:
16 ; CHECK-NEXT: neg v1.8b, v1.8b
17 ; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b
19 %res = ashr <8 x i8> %op1, %op2
23 ; Don't use SVE for 128-bit vectors.
24 define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
25 ; CHECK-LABEL: ashr_v16i8:
27 ; CHECK-NEXT: neg v1.16b, v1.16b
28 ; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b
30 %res = ashr <16 x i8> %op1, %op2
34 define void @ashr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
35 ; CHECK-LABEL: ashr_v32i8:
37 ; CHECK-NEXT: ptrue p0.b, vl32
38 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
39 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
40 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b
41 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
43 %op1 = load <32 x i8>, ptr %a
44 %op2 = load <32 x i8>, ptr %b
45 %res = ashr <32 x i8> %op1, %op2
46 store <32 x i8> %res, ptr %a
50 define void @ashr_v64i8(ptr %a, ptr %b) #0 {
51 ; VBITS_GE_256-LABEL: ashr_v64i8:
52 ; VBITS_GE_256: // %bb.0:
53 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
54 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
55 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
56 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
57 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
58 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
59 ; VBITS_GE_256-NEXT: asr z0.b, p0/m, z0.b, z1.b
60 ; VBITS_GE_256-NEXT: movprfx z1, z2
61 ; VBITS_GE_256-NEXT: asr z1.b, p0/m, z1.b, z3.b
62 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
63 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
64 ; VBITS_GE_256-NEXT: ret
66 ; VBITS_GE_512-LABEL: ashr_v64i8:
67 ; VBITS_GE_512: // %bb.0:
68 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
69 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
70 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
71 ; VBITS_GE_512-NEXT: asr z0.b, p0/m, z0.b, z1.b
72 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
73 ; VBITS_GE_512-NEXT: ret
74 %op1 = load <64 x i8>, ptr %a
75 %op2 = load <64 x i8>, ptr %b
76 %res = ashr <64 x i8> %op1, %op2
77 store <64 x i8> %res, ptr %a
81 define void @ashr_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
82 ; CHECK-LABEL: ashr_v128i8:
84 ; CHECK-NEXT: ptrue p0.b, vl128
85 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
86 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
87 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b
88 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
90 %op1 = load <128 x i8>, ptr %a
91 %op2 = load <128 x i8>, ptr %b
92 %res = ashr <128 x i8> %op1, %op2
93 store <128 x i8> %res, ptr %a
97 define void @ashr_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
98 ; CHECK-LABEL: ashr_v256i8:
100 ; CHECK-NEXT: ptrue p0.b, vl256
101 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
102 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
103 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b
104 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
106 %op1 = load <256 x i8>, ptr %a
107 %op2 = load <256 x i8>, ptr %b
108 %res = ashr <256 x i8> %op1, %op2
109 store <256 x i8> %res, ptr %a
113 ; Don't use SVE for 64-bit vectors.
114 define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
115 ; CHECK-LABEL: ashr_v4i16:
117 ; CHECK-NEXT: neg v1.4h, v1.4h
118 ; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h
120 %res = ashr <4 x i16> %op1, %op2
124 ; Don't use SVE for 128-bit vectors.
125 define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
126 ; CHECK-LABEL: ashr_v8i16:
128 ; CHECK-NEXT: neg v1.8h, v1.8h
129 ; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h
131 %res = ashr <8 x i16> %op1, %op2
135 define void @ashr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
136 ; CHECK-LABEL: ashr_v16i16:
138 ; CHECK-NEXT: ptrue p0.h, vl16
139 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
140 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
141 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h
142 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
144 %op1 = load <16 x i16>, ptr %a
145 %op2 = load <16 x i16>, ptr %b
146 %res = ashr <16 x i16> %op1, %op2
147 store <16 x i16> %res, ptr %a
151 define void @ashr_v32i16(ptr %a, ptr %b) #0 {
152 ; VBITS_GE_256-LABEL: ashr_v32i16:
153 ; VBITS_GE_256: // %bb.0:
154 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
155 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
156 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
157 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
158 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
159 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
160 ; VBITS_GE_256-NEXT: asr z0.h, p0/m, z0.h, z1.h
161 ; VBITS_GE_256-NEXT: movprfx z1, z2
162 ; VBITS_GE_256-NEXT: asr z1.h, p0/m, z1.h, z3.h
163 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
164 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
165 ; VBITS_GE_256-NEXT: ret
167 ; VBITS_GE_512-LABEL: ashr_v32i16:
168 ; VBITS_GE_512: // %bb.0:
169 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
170 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
171 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
172 ; VBITS_GE_512-NEXT: asr z0.h, p0/m, z0.h, z1.h
173 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
174 ; VBITS_GE_512-NEXT: ret
175 %op1 = load <32 x i16>, ptr %a
176 %op2 = load <32 x i16>, ptr %b
177 %res = ashr <32 x i16> %op1, %op2
178 store <32 x i16> %res, ptr %a
182 define void @ashr_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
183 ; CHECK-LABEL: ashr_v64i16:
185 ; CHECK-NEXT: ptrue p0.h, vl64
186 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
187 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
188 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h
189 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
191 %op1 = load <64 x i16>, ptr %a
192 %op2 = load <64 x i16>, ptr %b
193 %res = ashr <64 x i16> %op1, %op2
194 store <64 x i16> %res, ptr %a
198 define void @ashr_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
199 ; CHECK-LABEL: ashr_v128i16:
201 ; CHECK-NEXT: ptrue p0.h, vl128
202 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
203 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
204 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h
205 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
207 %op1 = load <128 x i16>, ptr %a
208 %op2 = load <128 x i16>, ptr %b
209 %res = ashr <128 x i16> %op1, %op2
210 store <128 x i16> %res, ptr %a
214 ; Don't use SVE for 64-bit vectors.
215 define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
216 ; CHECK-LABEL: ashr_v2i32:
218 ; CHECK-NEXT: neg v1.2s, v1.2s
219 ; CHECK-NEXT: sshl v0.2s, v0.2s, v1.2s
221 %res = ashr <2 x i32> %op1, %op2
225 ; Don't use SVE for 128-bit vectors.
226 define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
227 ; CHECK-LABEL: ashr_v4i32:
229 ; CHECK-NEXT: neg v1.4s, v1.4s
230 ; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s
232 %res = ashr <4 x i32> %op1, %op2
236 define void @ashr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
237 ; CHECK-LABEL: ashr_v8i32:
239 ; CHECK-NEXT: ptrue p0.s, vl8
240 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
241 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
242 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s
243 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
245 %op1 = load <8 x i32>, ptr %a
246 %op2 = load <8 x i32>, ptr %b
247 %res = ashr <8 x i32> %op1, %op2
248 store <8 x i32> %res, ptr %a
252 define void @ashr_v16i32(ptr %a, ptr %b) #0 {
253 ; VBITS_GE_256-LABEL: ashr_v16i32:
254 ; VBITS_GE_256: // %bb.0:
255 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
256 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
257 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
258 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
259 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
260 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
261 ; VBITS_GE_256-NEXT: asr z0.s, p0/m, z0.s, z1.s
262 ; VBITS_GE_256-NEXT: movprfx z1, z2
263 ; VBITS_GE_256-NEXT: asr z1.s, p0/m, z1.s, z3.s
264 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
265 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
266 ; VBITS_GE_256-NEXT: ret
268 ; VBITS_GE_512-LABEL: ashr_v16i32:
269 ; VBITS_GE_512: // %bb.0:
270 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
271 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
272 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
273 ; VBITS_GE_512-NEXT: asr z0.s, p0/m, z0.s, z1.s
274 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
275 ; VBITS_GE_512-NEXT: ret
276 %op1 = load <16 x i32>, ptr %a
277 %op2 = load <16 x i32>, ptr %b
278 %res = ashr <16 x i32> %op1, %op2
279 store <16 x i32> %res, ptr %a
283 define void @ashr_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
284 ; CHECK-LABEL: ashr_v32i32:
286 ; CHECK-NEXT: ptrue p0.s, vl32
287 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
288 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
289 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s
290 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
292 %op1 = load <32 x i32>, ptr %a
293 %op2 = load <32 x i32>, ptr %b
294 %res = ashr <32 x i32> %op1, %op2
295 store <32 x i32> %res, ptr %a
299 define void @ashr_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
300 ; CHECK-LABEL: ashr_v64i32:
302 ; CHECK-NEXT: ptrue p0.s, vl64
303 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
304 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
305 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s
306 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
308 %op1 = load <64 x i32>, ptr %a
309 %op2 = load <64 x i32>, ptr %b
310 %res = ashr <64 x i32> %op1, %op2
311 store <64 x i32> %res, ptr %a
315 ; Don't use SVE for 64-bit vectors.
316 define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
317 ; CHECK-LABEL: ashr_v1i64:
319 ; CHECK-NEXT: neg d1, d1
320 ; CHECK-NEXT: sshl d0, d0, d1
322 %res = ashr <1 x i64> %op1, %op2
326 ; Don't use SVE for 128-bit vectors.
327 define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
328 ; CHECK-LABEL: ashr_v2i64:
330 ; CHECK-NEXT: neg v1.2d, v1.2d
331 ; CHECK-NEXT: sshl v0.2d, v0.2d, v1.2d
333 %res = ashr <2 x i64> %op1, %op2
337 define void @ashr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
338 ; CHECK-LABEL: ashr_v4i64:
340 ; CHECK-NEXT: ptrue p0.d, vl4
341 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
342 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
343 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d
344 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
346 %op1 = load <4 x i64>, ptr %a
347 %op2 = load <4 x i64>, ptr %b
348 %res = ashr <4 x i64> %op1, %op2
349 store <4 x i64> %res, ptr %a
353 define void @ashr_v8i64(ptr %a, ptr %b) #0 {
354 ; VBITS_GE_256-LABEL: ashr_v8i64:
355 ; VBITS_GE_256: // %bb.0:
356 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
357 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
358 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
359 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
360 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
361 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
362 ; VBITS_GE_256-NEXT: asr z0.d, p0/m, z0.d, z1.d
363 ; VBITS_GE_256-NEXT: movprfx z1, z2
364 ; VBITS_GE_256-NEXT: asr z1.d, p0/m, z1.d, z3.d
365 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
366 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
367 ; VBITS_GE_256-NEXT: ret
369 ; VBITS_GE_512-LABEL: ashr_v8i64:
370 ; VBITS_GE_512: // %bb.0:
371 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
372 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
373 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
374 ; VBITS_GE_512-NEXT: asr z0.d, p0/m, z0.d, z1.d
375 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
376 ; VBITS_GE_512-NEXT: ret
377 %op1 = load <8 x i64>, ptr %a
378 %op2 = load <8 x i64>, ptr %b
379 %res = ashr <8 x i64> %op1, %op2
380 store <8 x i64> %res, ptr %a
384 define void @ashr_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
385 ; CHECK-LABEL: ashr_v16i64:
387 ; CHECK-NEXT: ptrue p0.d, vl16
388 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
389 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
390 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d
391 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
393 %op1 = load <16 x i64>, ptr %a
394 %op2 = load <16 x i64>, ptr %b
395 %res = ashr <16 x i64> %op1, %op2
396 store <16 x i64> %res, ptr %a
400 define void @ashr_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
401 ; CHECK-LABEL: ashr_v32i64:
403 ; CHECK-NEXT: ptrue p0.d, vl32
404 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
405 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
406 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d
407 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
409 %op1 = load <32 x i64>, ptr %a
410 %op2 = load <32 x i64>, ptr %b
411 %res = ashr <32 x i64> %op1, %op2
412 store <32 x i64> %res, ptr %a
420 ; Don't use SVE for 64-bit vectors.
421 define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
422 ; CHECK-LABEL: lshr_v8i8:
424 ; CHECK-NEXT: neg v1.8b, v1.8b
425 ; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
427 %res = lshr <8 x i8> %op1, %op2
431 ; Don't use SVE for 128-bit vectors.
432 define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
433 ; CHECK-LABEL: lshr_v16i8:
435 ; CHECK-NEXT: neg v1.16b, v1.16b
436 ; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b
438 %res = lshr <16 x i8> %op1, %op2
442 define void @lshr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
443 ; CHECK-LABEL: lshr_v32i8:
445 ; CHECK-NEXT: ptrue p0.b, vl32
446 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
447 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
448 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b
449 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
451 %op1 = load <32 x i8>, ptr %a
452 %op2 = load <32 x i8>, ptr %b
453 %res = lshr <32 x i8> %op1, %op2
454 store <32 x i8> %res, ptr %a
458 define void @lshr_v64i8(ptr %a, ptr %b) #0 {
459 ; VBITS_GE_256-LABEL: lshr_v64i8:
460 ; VBITS_GE_256: // %bb.0:
461 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
462 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
463 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
464 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
465 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
466 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
467 ; VBITS_GE_256-NEXT: lsr z0.b, p0/m, z0.b, z1.b
468 ; VBITS_GE_256-NEXT: movprfx z1, z2
469 ; VBITS_GE_256-NEXT: lsr z1.b, p0/m, z1.b, z3.b
470 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
471 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
472 ; VBITS_GE_256-NEXT: ret
474 ; VBITS_GE_512-LABEL: lshr_v64i8:
475 ; VBITS_GE_512: // %bb.0:
476 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
477 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
478 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
479 ; VBITS_GE_512-NEXT: lsr z0.b, p0/m, z0.b, z1.b
480 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
481 ; VBITS_GE_512-NEXT: ret
482 %op1 = load <64 x i8>, ptr %a
483 %op2 = load <64 x i8>, ptr %b
484 %res = lshr <64 x i8> %op1, %op2
485 store <64 x i8> %res, ptr %a
489 define void @lshr_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
490 ; CHECK-LABEL: lshr_v128i8:
492 ; CHECK-NEXT: ptrue p0.b, vl128
493 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
494 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
495 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b
496 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
498 %op1 = load <128 x i8>, ptr %a
499 %op2 = load <128 x i8>, ptr %b
500 %res = lshr <128 x i8> %op1, %op2
501 store <128 x i8> %res, ptr %a
505 define void @lshr_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
506 ; CHECK-LABEL: lshr_v256i8:
508 ; CHECK-NEXT: ptrue p0.b, vl256
509 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
510 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
511 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b
512 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
514 %op1 = load <256 x i8>, ptr %a
515 %op2 = load <256 x i8>, ptr %b
516 %res = lshr <256 x i8> %op1, %op2
517 store <256 x i8> %res, ptr %a
521 ; Don't use SVE for 64-bit vectors.
522 define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
523 ; CHECK-LABEL: lshr_v4i16:
525 ; CHECK-NEXT: neg v1.4h, v1.4h
526 ; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
528 %res = lshr <4 x i16> %op1, %op2
532 ; Don't use SVE for 128-bit vectors.
533 define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
534 ; CHECK-LABEL: lshr_v8i16:
536 ; CHECK-NEXT: neg v1.8h, v1.8h
537 ; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
539 %res = lshr <8 x i16> %op1, %op2
543 define void @lshr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
544 ; CHECK-LABEL: lshr_v16i16:
546 ; CHECK-NEXT: ptrue p0.h, vl16
547 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
548 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
549 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h
550 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
552 %op1 = load <16 x i16>, ptr %a
553 %op2 = load <16 x i16>, ptr %b
554 %res = lshr <16 x i16> %op1, %op2
555 store <16 x i16> %res, ptr %a
559 define void @lshr_v32i16(ptr %a, ptr %b) #0 {
560 ; VBITS_GE_256-LABEL: lshr_v32i16:
561 ; VBITS_GE_256: // %bb.0:
562 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
563 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
564 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
565 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
566 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
567 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
568 ; VBITS_GE_256-NEXT: lsr z0.h, p0/m, z0.h, z1.h
569 ; VBITS_GE_256-NEXT: movprfx z1, z2
570 ; VBITS_GE_256-NEXT: lsr z1.h, p0/m, z1.h, z3.h
571 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
572 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
573 ; VBITS_GE_256-NEXT: ret
575 ; VBITS_GE_512-LABEL: lshr_v32i16:
576 ; VBITS_GE_512: // %bb.0:
577 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
578 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
579 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
580 ; VBITS_GE_512-NEXT: lsr z0.h, p0/m, z0.h, z1.h
581 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
582 ; VBITS_GE_512-NEXT: ret
583 %op1 = load <32 x i16>, ptr %a
584 %op2 = load <32 x i16>, ptr %b
585 %res = lshr <32 x i16> %op1, %op2
586 store <32 x i16> %res, ptr %a
590 define void @lshr_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
591 ; CHECK-LABEL: lshr_v64i16:
593 ; CHECK-NEXT: ptrue p0.h, vl64
594 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
595 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
596 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h
597 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
599 %op1 = load <64 x i16>, ptr %a
600 %op2 = load <64 x i16>, ptr %b
601 %res = lshr <64 x i16> %op1, %op2
602 store <64 x i16> %res, ptr %a
606 define void @lshr_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
607 ; CHECK-LABEL: lshr_v128i16:
609 ; CHECK-NEXT: ptrue p0.h, vl128
610 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
611 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
612 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h
613 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
615 %op1 = load <128 x i16>, ptr %a
616 %op2 = load <128 x i16>, ptr %b
617 %res = lshr <128 x i16> %op1, %op2
618 store <128 x i16> %res, ptr %a
622 ; Don't use SVE for 64-bit vectors.
623 define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
624 ; CHECK-LABEL: lshr_v2i32:
626 ; CHECK-NEXT: neg v1.2s, v1.2s
627 ; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
629 %res = lshr <2 x i32> %op1, %op2
633 ; Don't use SVE for 128-bit vectors.
634 define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
635 ; CHECK-LABEL: lshr_v4i32:
637 ; CHECK-NEXT: neg v1.4s, v1.4s
638 ; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
640 %res = lshr <4 x i32> %op1, %op2
644 define void @lshr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
645 ; CHECK-LABEL: lshr_v8i32:
647 ; CHECK-NEXT: ptrue p0.s, vl8
648 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
649 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
650 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s
651 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
653 %op1 = load <8 x i32>, ptr %a
654 %op2 = load <8 x i32>, ptr %b
655 %res = lshr <8 x i32> %op1, %op2
656 store <8 x i32> %res, ptr %a
660 define void @lshr_v16i32(ptr %a, ptr %b) #0 {
661 ; VBITS_GE_256-LABEL: lshr_v16i32:
662 ; VBITS_GE_256: // %bb.0:
663 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
664 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
665 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
666 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
667 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
668 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
669 ; VBITS_GE_256-NEXT: lsr z0.s, p0/m, z0.s, z1.s
670 ; VBITS_GE_256-NEXT: movprfx z1, z2
671 ; VBITS_GE_256-NEXT: lsr z1.s, p0/m, z1.s, z3.s
672 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
673 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
674 ; VBITS_GE_256-NEXT: ret
676 ; VBITS_GE_512-LABEL: lshr_v16i32:
677 ; VBITS_GE_512: // %bb.0:
678 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
679 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
680 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
681 ; VBITS_GE_512-NEXT: lsr z0.s, p0/m, z0.s, z1.s
682 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
683 ; VBITS_GE_512-NEXT: ret
684 %op1 = load <16 x i32>, ptr %a
685 %op2 = load <16 x i32>, ptr %b
686 %res = lshr <16 x i32> %op1, %op2
687 store <16 x i32> %res, ptr %a
691 define void @lshr_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
692 ; CHECK-LABEL: lshr_v32i32:
694 ; CHECK-NEXT: ptrue p0.s, vl32
695 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
696 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
697 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s
698 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
700 %op1 = load <32 x i32>, ptr %a
701 %op2 = load <32 x i32>, ptr %b
702 %res = lshr <32 x i32> %op1, %op2
703 store <32 x i32> %res, ptr %a
707 define void @lshr_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
708 ; CHECK-LABEL: lshr_v64i32:
710 ; CHECK-NEXT: ptrue p0.s, vl64
711 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
712 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
713 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s
714 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
716 %op1 = load <64 x i32>, ptr %a
717 %op2 = load <64 x i32>, ptr %b
718 %res = lshr <64 x i32> %op1, %op2
719 store <64 x i32> %res, ptr %a
723 ; Don't use SVE for 64-bit vectors.
724 define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
725 ; CHECK-LABEL: lshr_v1i64:
727 ; CHECK-NEXT: neg d1, d1
728 ; CHECK-NEXT: ushl d0, d0, d1
730 %res = lshr <1 x i64> %op1, %op2
734 ; Don't use SVE for 128-bit vectors.
735 define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
736 ; CHECK-LABEL: lshr_v2i64:
738 ; CHECK-NEXT: neg v1.2d, v1.2d
739 ; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d
741 %res = lshr <2 x i64> %op1, %op2
745 define void @lshr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
746 ; CHECK-LABEL: lshr_v4i64:
748 ; CHECK-NEXT: ptrue p0.d, vl4
749 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
750 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
751 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
752 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
754 %op1 = load <4 x i64>, ptr %a
755 %op2 = load <4 x i64>, ptr %b
756 %res = lshr <4 x i64> %op1, %op2
757 store <4 x i64> %res, ptr %a
761 define void @lshr_v8i64(ptr %a, ptr %b) #0 {
762 ; VBITS_GE_256-LABEL: lshr_v8i64:
763 ; VBITS_GE_256: // %bb.0:
764 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
765 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
766 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
767 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
768 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
769 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
770 ; VBITS_GE_256-NEXT: lsr z0.d, p0/m, z0.d, z1.d
771 ; VBITS_GE_256-NEXT: movprfx z1, z2
772 ; VBITS_GE_256-NEXT: lsr z1.d, p0/m, z1.d, z3.d
773 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
774 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
775 ; VBITS_GE_256-NEXT: ret
777 ; VBITS_GE_512-LABEL: lshr_v8i64:
778 ; VBITS_GE_512: // %bb.0:
779 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
780 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
781 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
782 ; VBITS_GE_512-NEXT: lsr z0.d, p0/m, z0.d, z1.d
783 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
784 ; VBITS_GE_512-NEXT: ret
785 %op1 = load <8 x i64>, ptr %a
786 %op2 = load <8 x i64>, ptr %b
787 %res = lshr <8 x i64> %op1, %op2
788 store <8 x i64> %res, ptr %a
792 define void @lshr_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
793 ; CHECK-LABEL: lshr_v16i64:
795 ; CHECK-NEXT: ptrue p0.d, vl16
796 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
797 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
798 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
799 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
801 %op1 = load <16 x i64>, ptr %a
802 %op2 = load <16 x i64>, ptr %b
803 %res = lshr <16 x i64> %op1, %op2
804 store <16 x i64> %res, ptr %a
808 define void @lshr_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
809 ; CHECK-LABEL: lshr_v32i64:
811 ; CHECK-NEXT: ptrue p0.d, vl32
812 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
813 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
814 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
815 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
817 %op1 = load <32 x i64>, ptr %a
818 %op2 = load <32 x i64>, ptr %b
819 %res = lshr <32 x i64> %op1, %op2
820 store <32 x i64> %res, ptr %a
828 ; Don't use SVE for 64-bit vectors.
829 define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
830 ; CHECK-LABEL: shl_v8i8:
832 ; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b
834 %res = shl <8 x i8> %op1, %op2
838 ; Don't use SVE for 128-bit vectors.
839 define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
840 ; CHECK-LABEL: shl_v16i8:
842 ; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b
844 %res = shl <16 x i8> %op1, %op2
848 define void @shl_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
849 ; CHECK-LABEL: shl_v32i8:
851 ; CHECK-NEXT: ptrue p0.b, vl32
852 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
853 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
854 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b
855 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
857 %op1 = load <32 x i8>, ptr %a
858 %op2 = load <32 x i8>, ptr %b
859 %res = shl <32 x i8> %op1, %op2
860 store <32 x i8> %res, ptr %a
864 define void @shl_v64i8(ptr %a, ptr %b) #0 {
865 ; VBITS_GE_256-LABEL: shl_v64i8:
866 ; VBITS_GE_256: // %bb.0:
867 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
868 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
869 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
870 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
871 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
872 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
873 ; VBITS_GE_256-NEXT: lsl z0.b, p0/m, z0.b, z1.b
874 ; VBITS_GE_256-NEXT: movprfx z1, z2
875 ; VBITS_GE_256-NEXT: lsl z1.b, p0/m, z1.b, z3.b
876 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
877 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
878 ; VBITS_GE_256-NEXT: ret
880 ; VBITS_GE_512-LABEL: shl_v64i8:
881 ; VBITS_GE_512: // %bb.0:
882 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
883 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
884 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
885 ; VBITS_GE_512-NEXT: lsl z0.b, p0/m, z0.b, z1.b
886 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
887 ; VBITS_GE_512-NEXT: ret
888 %op1 = load <64 x i8>, ptr %a
889 %op2 = load <64 x i8>, ptr %b
890 %res = shl <64 x i8> %op1, %op2
891 store <64 x i8> %res, ptr %a
895 define void @shl_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
896 ; CHECK-LABEL: shl_v128i8:
898 ; CHECK-NEXT: ptrue p0.b, vl128
899 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
900 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
901 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b
902 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
904 %op1 = load <128 x i8>, ptr %a
905 %op2 = load <128 x i8>, ptr %b
906 %res = shl <128 x i8> %op1, %op2
907 store <128 x i8> %res, ptr %a
911 define void @shl_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
912 ; CHECK-LABEL: shl_v256i8:
914 ; CHECK-NEXT: ptrue p0.b, vl256
915 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
916 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
917 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b
918 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
920 %op1 = load <256 x i8>, ptr %a
921 %op2 = load <256 x i8>, ptr %b
922 %res = shl <256 x i8> %op1, %op2
923 store <256 x i8> %res, ptr %a
927 ; Don't use SVE for 64-bit vectors.
928 define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
929 ; CHECK-LABEL: shl_v4i16:
931 ; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
933 %res = shl <4 x i16> %op1, %op2
937 ; Don't use SVE for 128-bit vectors.
938 define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
939 ; CHECK-LABEL: shl_v8i16:
941 ; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
943 %res = shl <8 x i16> %op1, %op2
947 define void @shl_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
948 ; CHECK-LABEL: shl_v16i16:
950 ; CHECK-NEXT: ptrue p0.h, vl16
951 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
952 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
953 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h
954 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
956 %op1 = load <16 x i16>, ptr %a
957 %op2 = load <16 x i16>, ptr %b
958 %res = shl <16 x i16> %op1, %op2
959 store <16 x i16> %res, ptr %a
963 define void @shl_v32i16(ptr %a, ptr %b) #0 {
964 ; VBITS_GE_256-LABEL: shl_v32i16:
965 ; VBITS_GE_256: // %bb.0:
966 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
967 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
968 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
969 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
970 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
971 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
972 ; VBITS_GE_256-NEXT: lsl z0.h, p0/m, z0.h, z1.h
973 ; VBITS_GE_256-NEXT: movprfx z1, z2
974 ; VBITS_GE_256-NEXT: lsl z1.h, p0/m, z1.h, z3.h
975 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
976 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
977 ; VBITS_GE_256-NEXT: ret
979 ; VBITS_GE_512-LABEL: shl_v32i16:
980 ; VBITS_GE_512: // %bb.0:
981 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
982 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
983 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
984 ; VBITS_GE_512-NEXT: lsl z0.h, p0/m, z0.h, z1.h
985 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
986 ; VBITS_GE_512-NEXT: ret
987 %op1 = load <32 x i16>, ptr %a
988 %op2 = load <32 x i16>, ptr %b
989 %res = shl <32 x i16> %op1, %op2
990 store <32 x i16> %res, ptr %a
994 define void @shl_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
995 ; CHECK-LABEL: shl_v64i16:
997 ; CHECK-NEXT: ptrue p0.h, vl64
998 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
999 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1000 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h
1001 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1003 %op1 = load <64 x i16>, ptr %a
1004 %op2 = load <64 x i16>, ptr %b
1005 %res = shl <64 x i16> %op1, %op2
1006 store <64 x i16> %res, ptr %a
1010 define void @shl_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1011 ; CHECK-LABEL: shl_v128i16:
1013 ; CHECK-NEXT: ptrue p0.h, vl128
1014 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1015 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1016 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h
1017 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1019 %op1 = load <128 x i16>, ptr %a
1020 %op2 = load <128 x i16>, ptr %b
1021 %res = shl <128 x i16> %op1, %op2
1022 store <128 x i16> %res, ptr %a
1026 ; Don't use SVE for 64-bit vectors.
1027 define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
1028 ; CHECK-LABEL: shl_v2i32:
1030 ; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
1032 %res = shl <2 x i32> %op1, %op2
1036 ; Don't use SVE for 128-bit vectors.
1037 define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
1038 ; CHECK-LABEL: shl_v4i32:
1040 ; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
1042 %res = shl <4 x i32> %op1, %op2
; shl_v8i32: first 32-bit-element size that exceeds NEON; vscale_range(2,0)
; (>= 256-bit registers) lets one vl8-predicated SVE LSL cover the vector.
1046 define void @shl_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1047 ; CHECK-LABEL: shl_v8i32:
1049 ; CHECK-NEXT: ptrue p0.s, vl8
1050 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1051 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1052 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
1053 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1055 %op1 = load <8 x i32>, ptr %a
1056 %op2 = load <8 x i32>, ptr %b
1057 %res = shl <8 x i32> %op1, %op2
1058 store <8 x i32> %res, ptr %a
; shl_v16i32: no vscale_range, so codegen depends on -aarch64-sve-vector-bits-min.
; At 256-bit registers the <16 x i32> is split into two vl8 halves (x8 = 8 is
; the element offset of the high half; MOVPRFX feeds the second destructive
; LSL); at >= 512 bits a single vl16 LSL handles it.
1062 define void @shl_v16i32(ptr %a, ptr %b) #0 {
1063 ; VBITS_GE_256-LABEL: shl_v16i32:
1064 ; VBITS_GE_256: // %bb.0:
1065 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1066 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1067 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1068 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
1069 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
1070 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
1071 ; VBITS_GE_256-NEXT: lsl z0.s, p0/m, z0.s, z1.s
1072 ; VBITS_GE_256-NEXT: movprfx z1, z2
1073 ; VBITS_GE_256-NEXT: lsl z1.s, p0/m, z1.s, z3.s
1074 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1075 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1076 ; VBITS_GE_256-NEXT: ret
1078 ; VBITS_GE_512-LABEL: shl_v16i32:
1079 ; VBITS_GE_512: // %bb.0:
1080 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1081 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1082 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
1083 ; VBITS_GE_512-NEXT: lsl z0.s, p0/m, z0.s, z1.s
1084 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1085 ; VBITS_GE_512-NEXT: ret
1086 %op1 = load <16 x i32>, ptr %a
1087 %op2 = load <16 x i32>, ptr %b
1088 %res = shl <16 x i32> %op1, %op2
1089 store <16 x i32> %res, ptr %a
; shl_v32i32: vscale_range(8,0) (>= 1024-bit registers) => one vl32 LSL.
1093 define void @shl_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1094 ; CHECK-LABEL: shl_v32i32:
1096 ; CHECK-NEXT: ptrue p0.s, vl32
1097 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1098 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1099 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
1100 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1102 %op1 = load <32 x i32>, ptr %a
1103 %op2 = load <32 x i32>, ptr %b
1104 %res = shl <32 x i32> %op1, %op2
1105 store <32 x i32> %res, ptr %a
; shl_v64i32: vscale_range(16,0) (>= 2048-bit registers) => one vl64 LSL.
1109 define void @shl_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1110 ; CHECK-LABEL: shl_v64i32:
1112 ; CHECK-NEXT: ptrue p0.s, vl64
1113 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1114 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1115 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s
1116 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1118 %op1 = load <64 x i32>, ptr %a
1119 %op2 = load <64 x i32>, ptr %b
1120 %res = shl <64 x i32> %op1, %op2
1121 store <64 x i32> %res, ptr %a
1125 ; Don't use SVE for 64-bit vectors.
; shl_v1i64: single-element 64-bit vector lowers to the scalar-form NEON
; USHL on the D register.
1126 define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
1127 ; CHECK-LABEL: shl_v1i64:
1129 ; CHECK-NEXT: ushl d0, d0, d1
1131 %res = shl <1 x i64> %op1, %op2
1135 ; Don't use SVE for 128-bit vectors.
; shl_v2i64: 128-bit vector stays on the NEON USHL path.
1136 define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
1137 ; CHECK-LABEL: shl_v2i64:
1139 ; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d
1141 %res = shl <2 x i64> %op1, %op2
; shl_v4i64: smallest 64-bit-element case needing SVE; vscale_range(2,0)
; (>= 256-bit registers) => one vl4-predicated LSL.
1145 define void @shl_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1146 ; CHECK-LABEL: shl_v4i64:
1148 ; CHECK-NEXT: ptrue p0.d, vl4
1149 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1150 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1151 ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
1152 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1154 %op1 = load <4 x i64>, ptr %a
1155 %op2 = load <4 x i64>, ptr %b
1156 %res = shl <4 x i64> %op1, %op2
1157 store <4 x i64> %res, ptr %a
; shl_v8i64: no vscale_range, so the split depends on the RUN line's
; minimum vector width. 256-bit registers: two vl4 halves (x8 = 4 is the
; element offset of the high half, scaled by `lsl #3` for 8-byte elements;
; MOVPRFX feeds the second destructive LSL). >= 512 bits: one vl8 LSL.
1161 define void @shl_v8i64(ptr %a, ptr %b) #0 {
1162 ; VBITS_GE_256-LABEL: shl_v8i64:
1163 ; VBITS_GE_256: // %bb.0:
1164 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1165 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1166 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1167 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
1168 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
1169 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
1170 ; VBITS_GE_256-NEXT: lsl z0.d, p0/m, z0.d, z1.d
1171 ; VBITS_GE_256-NEXT: movprfx z1, z2
1172 ; VBITS_GE_256-NEXT: lsl z1.d, p0/m, z1.d, z3.d
1173 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1174 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1175 ; VBITS_GE_256-NEXT: ret
1177 ; VBITS_GE_512-LABEL: shl_v8i64:
1178 ; VBITS_GE_512: // %bb.0:
1179 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1180 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1181 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
1182 ; VBITS_GE_512-NEXT: lsl z0.d, p0/m, z0.d, z1.d
1183 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1184 ; VBITS_GE_512-NEXT: ret
1185 %op1 = load <8 x i64>, ptr %a
1186 %op2 = load <8 x i64>, ptr %b
1187 %res = shl <8 x i64> %op1, %op2
1188 store <8 x i64> %res, ptr %a
; shl_v16i64: vscale_range(8,0) (>= 1024-bit registers) => one vl16 LSL.
1192 define void @shl_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1193 ; CHECK-LABEL: shl_v16i64:
1195 ; CHECK-NEXT: ptrue p0.d, vl16
1196 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1197 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1198 ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
1199 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1201 %op1 = load <16 x i64>, ptr %a
1202 %op2 = load <16 x i64>, ptr %b
1203 %res = shl <16 x i64> %op1, %op2
1204 store <16 x i64> %res, ptr %a
; shl_v32i64: vscale_range(16,0) (>= 2048-bit registers) => one vl32 LSL.
1208 define void @shl_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1209 ; CHECK-LABEL: shl_v32i64:
1211 ; CHECK-NEXT: ptrue p0.d, vl32
1212 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1213 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1214 ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d
1215 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1217 %op1 = load <32 x i64>, ptr %a
1218 %op2 = load <32 x i64>, ptr %b
1219 %res = shl <32 x i64> %op1, %op2
1220 store <32 x i64> %res, ptr %a
; Shared attribute group: enables SVE for every test function; the
; per-function vscale_range attributes then pin the minimum register width.
1224 attributes #0 = { "target-features"="+sve" }