1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
3 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
4 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
7 target triple = "aarch64-unknown-linux-gnu"
13 ; Vector vXi8 sdiv operations are not legal for NEON so use SVE when available.
14 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
15 define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
16 ; VBITS_GE_128-LABEL: srem_v8i8:
17 ; VBITS_GE_128: // %bb.0:
18 ; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0
19 ; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0
20 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
21 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
22 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
23 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
24 ; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
25 ; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
26 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
27 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
28 ; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
29 ; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
30 ; VBITS_GE_128-NEXT: ret
32 ; VBITS_GE_256-LABEL: srem_v8i8:
33 ; VBITS_GE_256: // %bb.0:
34 ; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
35 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
36 ; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b
37 ; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b
38 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
39 ; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
40 ; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
41 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
42 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
43 ; VBITS_GE_256-NEXT: umov w8, v2.h[0]
44 ; VBITS_GE_256-NEXT: umov w9, v2.h[1]
45 ; VBITS_GE_256-NEXT: fmov s3, w8
46 ; VBITS_GE_256-NEXT: umov w8, v2.h[2]
47 ; VBITS_GE_256-NEXT: mov v3.b[1], w9
48 ; VBITS_GE_256-NEXT: mov v3.b[2], w8
49 ; VBITS_GE_256-NEXT: umov w8, v2.h[3]
50 ; VBITS_GE_256-NEXT: mov v3.b[3], w8
51 ; VBITS_GE_256-NEXT: umov w8, v2.h[4]
52 ; VBITS_GE_256-NEXT: mov v3.b[4], w8
53 ; VBITS_GE_256-NEXT: umov w8, v2.h[5]
54 ; VBITS_GE_256-NEXT: mov v3.b[5], w8
55 ; VBITS_GE_256-NEXT: umov w8, v2.h[6]
56 ; VBITS_GE_256-NEXT: mov v3.b[6], w8
57 ; VBITS_GE_256-NEXT: umov w8, v2.h[7]
58 ; VBITS_GE_256-NEXT: mov v3.b[7], w8
59 ; VBITS_GE_256-NEXT: mls v0.8b, v3.8b, v1.8b
60 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
61 ; VBITS_GE_256-NEXT: ret
63 ; VBITS_GE_512-LABEL: srem_v8i8:
64 ; VBITS_GE_512: // %bb.0:
65 ; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
66 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
67 ; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b
68 ; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b
69 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
70 ; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h
71 ; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h
72 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
73 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
74 ; VBITS_GE_512-NEXT: umov w8, v2.h[0]
75 ; VBITS_GE_512-NEXT: umov w9, v2.h[1]
76 ; VBITS_GE_512-NEXT: fmov s3, w8
77 ; VBITS_GE_512-NEXT: umov w8, v2.h[2]
78 ; VBITS_GE_512-NEXT: mov v3.b[1], w9
79 ; VBITS_GE_512-NEXT: mov v3.b[2], w8
80 ; VBITS_GE_512-NEXT: umov w8, v2.h[3]
81 ; VBITS_GE_512-NEXT: mov v3.b[3], w8
82 ; VBITS_GE_512-NEXT: umov w8, v2.h[4]
83 ; VBITS_GE_512-NEXT: mov v3.b[4], w8
84 ; VBITS_GE_512-NEXT: umov w8, v2.h[5]
85 ; VBITS_GE_512-NEXT: mov v3.b[5], w8
86 ; VBITS_GE_512-NEXT: umov w8, v2.h[6]
87 ; VBITS_GE_512-NEXT: mov v3.b[6], w8
88 ; VBITS_GE_512-NEXT: umov w8, v2.h[7]
89 ; VBITS_GE_512-NEXT: mov v3.b[7], w8
90 ; VBITS_GE_512-NEXT: mls v0.8b, v3.8b, v1.8b
91 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
92 ; VBITS_GE_512-NEXT: ret
93 %res = srem <8 x i8> %op1, %op2
97 define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
98 ; VBITS_GE_128-LABEL: srem_v16i8:
99 ; VBITS_GE_128: // %bb.0:
100 ; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0
101 ; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0
102 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
103 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
104 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
105 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
106 ; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
107 ; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
108 ; VBITS_GE_128-NEXT: sshll v5.8h, v0.8b, #0
109 ; VBITS_GE_128-NEXT: sshll2 v7.4s, v5.8h, #0
110 ; VBITS_GE_128-NEXT: sshll v5.4s, v5.4h, #0
111 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
112 ; VBITS_GE_128-NEXT: sshll v3.8h, v1.8b, #0
113 ; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0
114 ; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
115 ; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
116 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
117 ; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
118 ; VBITS_GE_128-NEXT: uzp1 v3.8h, v3.8h, v6.8h
119 ; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b
120 ; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
121 ; VBITS_GE_128-NEXT: ret
123 ; VBITS_GE_256-LABEL: srem_v16i8:
124 ; VBITS_GE_256: // %bb.0:
125 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
126 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
127 ; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b
128 ; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b
129 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
130 ; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h
131 ; VBITS_GE_256-NEXT: sunpklo z5.s, z3.h
132 ; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
133 ; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
134 ; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
135 ; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
136 ; VBITS_GE_256-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
137 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
138 ; VBITS_GE_256-NEXT: ptrue p0.h, vl8
139 ; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
140 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
141 ; VBITS_GE_256-NEXT: splice z3.h, p0, z3.h, z2.h
142 ; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b
143 ; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
144 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
145 ; VBITS_GE_256-NEXT: ret
147 ; VBITS_GE_512-LABEL: srem_v16i8:
148 ; VBITS_GE_512: // %bb.0:
149 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
150 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
151 ; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b
152 ; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b
153 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
154 ; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h
155 ; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h
156 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
157 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
158 ; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b
159 ; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
160 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
161 ; VBITS_GE_512-NEXT: ret
162 %res = srem <16 x i8> %op1, %op2
166 define void @srem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
167 ; CHECK-LABEL: srem_v32i8:
169 ; CHECK-NEXT: ptrue p0.b, vl32
170 ; CHECK-NEXT: ptrue p1.s, vl32
171 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
172 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
173 ; CHECK-NEXT: sunpklo z2.h, z1.b
174 ; CHECK-NEXT: sunpklo z3.h, z0.b
175 ; CHECK-NEXT: sunpklo z2.s, z2.h
176 ; CHECK-NEXT: sunpklo z3.s, z3.h
177 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
178 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
179 ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
180 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
181 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
183 %op1 = load <32 x i8>, ptr %a
184 %op2 = load <32 x i8>, ptr %b
185 %res = srem <32 x i8> %op1, %op2
186 store <32 x i8> %res, ptr %a
190 define void @srem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
191 ; CHECK-LABEL: srem_v64i8:
193 ; CHECK-NEXT: ptrue p0.b, vl64
194 ; CHECK-NEXT: ptrue p1.s, vl64
195 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
196 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
197 ; CHECK-NEXT: sunpklo z2.h, z1.b
198 ; CHECK-NEXT: sunpklo z3.h, z0.b
199 ; CHECK-NEXT: sunpklo z2.s, z2.h
200 ; CHECK-NEXT: sunpklo z3.s, z3.h
201 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
202 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
203 ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
204 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
205 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
207 %op1 = load <64 x i8>, ptr %a
208 %op2 = load <64 x i8>, ptr %b
209 %res = srem <64 x i8> %op1, %op2
210 store <64 x i8> %res, ptr %a
214 define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
215 ; CHECK-LABEL: srem_v128i8:
217 ; CHECK-NEXT: ptrue p0.b, vl128
218 ; CHECK-NEXT: ptrue p1.s, vl64
219 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
220 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
221 ; CHECK-NEXT: sunpklo z2.h, z1.b
222 ; CHECK-NEXT: sunpklo z3.h, z0.b
223 ; CHECK-NEXT: sunpklo z4.s, z2.h
224 ; CHECK-NEXT: sunpklo z5.s, z3.h
225 ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
226 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
227 ; CHECK-NEXT: sunpklo z2.s, z2.h
228 ; CHECK-NEXT: sunpklo z3.s, z3.h
229 ; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
230 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
231 ; CHECK-NEXT: ptrue p1.h, vl64
232 ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
233 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
234 ; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h
235 ; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b
236 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
237 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
239 %op1 = load <128 x i8>, ptr %a
240 %op2 = load <128 x i8>, ptr %b
241 %res = srem <128 x i8> %op1, %op2
242 store <128 x i8> %res, ptr %a
246 define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
247 ; CHECK-LABEL: srem_v256i8:
249 ; CHECK-NEXT: ptrue p0.b, vl256
250 ; CHECK-NEXT: ptrue p1.s, vl64
251 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
252 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
253 ; CHECK-NEXT: sunpklo z2.h, z1.b
254 ; CHECK-NEXT: sunpklo z3.h, z0.b
255 ; CHECK-NEXT: sunpklo z4.s, z2.h
256 ; CHECK-NEXT: sunpklo z5.s, z3.h
257 ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
258 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
259 ; CHECK-NEXT: sunpklo z2.s, z2.h
260 ; CHECK-NEXT: sunpklo z3.s, z3.h
261 ; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
262 ; CHECK-NEXT: mov z5.d, z0.d
263 ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128
264 ; CHECK-NEXT: sunpklo z5.h, z5.b
265 ; CHECK-NEXT: sunpklo z7.s, z5.h
266 ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128
267 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
268 ; CHECK-NEXT: mov z3.d, z1.d
269 ; CHECK-NEXT: sunpklo z5.s, z5.h
270 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
271 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
272 ; CHECK-NEXT: sunpklo z3.h, z3.b
273 ; CHECK-NEXT: sunpklo z6.s, z3.h
274 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
275 ; CHECK-NEXT: sunpklo z3.s, z3.h
276 ; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s
277 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
278 ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s
279 ; CHECK-NEXT: ptrue p1.h, vl64
280 ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
281 ; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h
282 ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
283 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
284 ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h
285 ; CHECK-NEXT: ptrue p1.b, vl128
286 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b
287 ; CHECK-NEXT: splice z2.b, p1, z2.b, z3.b
288 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
289 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
291 %op1 = load <256 x i8>, ptr %a
292 %op2 = load <256 x i8>, ptr %b
293 %res = srem <256 x i8> %op1, %op2
294 store <256 x i8> %res, ptr %a
298 ; Vector vXi16 sdiv operations are not legal for NEON so use SVE when available.
299 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
300 define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
301 ; VBITS_GE_128-LABEL: srem_v4i16:
302 ; VBITS_GE_128: // %bb.0:
303 ; VBITS_GE_128-NEXT: sshll v2.4s, v1.4h, #0
304 ; VBITS_GE_128-NEXT: sshll v3.4s, v0.4h, #0
305 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
306 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
307 ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s
308 ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h
309 ; VBITS_GE_128-NEXT: ret
311 ; VBITS_GE_256-LABEL: srem_v4i16:
312 ; VBITS_GE_256: // %bb.0:
313 ; VBITS_GE_256-NEXT: sshll v2.4s, v1.4h, #0
314 ; VBITS_GE_256-NEXT: sshll v3.4s, v0.4h, #0
315 ; VBITS_GE_256-NEXT: ptrue p0.s, vl4
316 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
317 ; VBITS_GE_256-NEXT: mov w8, v2.s[1]
318 ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b
319 ; VBITS_GE_256-NEXT: mov w9, v2.s[2]
320 ; VBITS_GE_256-NEXT: mov v3.h[1], w8
321 ; VBITS_GE_256-NEXT: mov w8, v2.s[3]
322 ; VBITS_GE_256-NEXT: mov v3.h[2], w9
323 ; VBITS_GE_256-NEXT: mov v3.h[3], w8
324 ; VBITS_GE_256-NEXT: mls v0.4h, v3.4h, v1.4h
325 ; VBITS_GE_256-NEXT: ret
327 ; VBITS_GE_512-LABEL: srem_v4i16:
328 ; VBITS_GE_512: // %bb.0:
329 ; VBITS_GE_512-NEXT: sshll v2.4s, v1.4h, #0
330 ; VBITS_GE_512-NEXT: sshll v3.4s, v0.4h, #0
331 ; VBITS_GE_512-NEXT: ptrue p0.s, vl4
332 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
333 ; VBITS_GE_512-NEXT: mov w8, v2.s[1]
334 ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b
335 ; VBITS_GE_512-NEXT: mov w9, v2.s[2]
336 ; VBITS_GE_512-NEXT: mov v3.h[1], w8
337 ; VBITS_GE_512-NEXT: mov w8, v2.s[3]
338 ; VBITS_GE_512-NEXT: mov v3.h[2], w9
339 ; VBITS_GE_512-NEXT: mov v3.h[3], w8
340 ; VBITS_GE_512-NEXT: mls v0.4h, v3.4h, v1.4h
341 ; VBITS_GE_512-NEXT: ret
342 %res = srem <4 x i16> %op1, %op2
346 define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
347 ; VBITS_GE_128-LABEL: srem_v8i16:
348 ; VBITS_GE_128: // %bb.0:
349 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
350 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
351 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
352 ; VBITS_GE_128-NEXT: sshll v4.4s, v0.4h, #0
353 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
354 ; VBITS_GE_128-NEXT: sshll v3.4s, v1.4h, #0
355 ; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z4.s
356 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h
357 ; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
358 ; VBITS_GE_128-NEXT: ret
360 ; VBITS_GE_256-LABEL: srem_v8i16:
361 ; VBITS_GE_256: // %bb.0:
362 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
363 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
364 ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
365 ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
366 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
367 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
368 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
369 ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h
370 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
371 ; VBITS_GE_256-NEXT: ret
373 ; VBITS_GE_512-LABEL: srem_v8i16:
374 ; VBITS_GE_512: // %bb.0:
375 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
376 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
377 ; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h
378 ; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h
379 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
380 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
381 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
382 ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h
383 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
384 ; VBITS_GE_512-NEXT: ret
385 %res = srem <8 x i16> %op1, %op2
389 define void @srem_v16i16(ptr %a, ptr %b) #0 {
390 ; VBITS_GE_128-LABEL: srem_v16i16:
391 ; VBITS_GE_128: // %bb.0:
392 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
393 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
394 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
395 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
396 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
397 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v4.8h, #0
398 ; VBITS_GE_128-NEXT: sshll v16.4s, v0.4h, #0
399 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
400 ; VBITS_GE_128-NEXT: ldr q3, [x0]
401 ; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0
402 ; VBITS_GE_128-NEXT: sshll v7.4s, v3.4h, #0
403 ; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
404 ; VBITS_GE_128-NEXT: sshll v6.4s, v4.4h, #0
405 ; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
406 ; VBITS_GE_128-NEXT: sshll v7.4s, v1.4h, #0
407 ; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
408 ; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h
409 ; VBITS_GE_128-NEXT: mls v3.8h, v5.8h, v4.8h
410 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v7.8h, v2.8h
411 ; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
412 ; VBITS_GE_128-NEXT: stp q3, q0, [x0]
413 ; VBITS_GE_128-NEXT: ret
415 ; VBITS_GE_256-LABEL: srem_v16i16:
416 ; VBITS_GE_256: // %bb.0:
417 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
418 ; VBITS_GE_256-NEXT: ptrue p1.s, vl8
419 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
420 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
421 ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
422 ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
423 ; VBITS_GE_256-NEXT: mov z4.d, z0.d
424 ; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16
425 ; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
426 ; VBITS_GE_256-NEXT: mov z3.d, z1.d
427 ; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
428 ; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16
429 ; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
430 ; VBITS_GE_256-NEXT: sdivr z3.s, p1/m, z3.s, z4.s
431 ; VBITS_GE_256-NEXT: ptrue p1.h, vl8
432 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
433 ; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
434 ; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h
435 ; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
436 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
437 ; VBITS_GE_256-NEXT: ret
439 ; VBITS_GE_512-LABEL: srem_v16i16:
440 ; VBITS_GE_512: // %bb.0:
441 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
442 ; VBITS_GE_512-NEXT: ptrue p1.s, vl16
443 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
444 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
445 ; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h
446 ; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h
447 ; VBITS_GE_512-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
448 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
449 ; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h
450 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
451 ; VBITS_GE_512-NEXT: ret
452 %op1 = load <16 x i16>, ptr %a
453 %op2 = load <16 x i16>, ptr %b
454 %res = srem <16 x i16> %op1, %op2
455 store <16 x i16> %res, ptr %a
459 define void @srem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
460 ; CHECK-LABEL: srem_v32i16:
462 ; CHECK-NEXT: ptrue p0.h, vl32
463 ; CHECK-NEXT: ptrue p1.s, vl32
464 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
465 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
466 ; CHECK-NEXT: sunpklo z2.s, z1.h
467 ; CHECK-NEXT: sunpklo z3.s, z0.h
468 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
469 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
470 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
471 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
473 %op1 = load <32 x i16>, ptr %a
474 %op2 = load <32 x i16>, ptr %b
475 %res = srem <32 x i16> %op1, %op2
476 store <32 x i16> %res, ptr %a
480 define void @srem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
481 ; CHECK-LABEL: srem_v64i16:
483 ; CHECK-NEXT: ptrue p0.h, vl64
484 ; CHECK-NEXT: ptrue p1.s, vl64
485 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
486 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
487 ; CHECK-NEXT: sunpklo z2.s, z1.h
488 ; CHECK-NEXT: sunpklo z3.s, z0.h
489 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
490 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
491 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
492 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
494 %op1 = load <64 x i16>, ptr %a
495 %op2 = load <64 x i16>, ptr %b
496 %res = srem <64 x i16> %op1, %op2
497 store <64 x i16> %res, ptr %a
501 define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
502 ; CHECK-LABEL: srem_v128i16:
504 ; CHECK-NEXT: ptrue p0.h, vl128
505 ; CHECK-NEXT: ptrue p1.s, vl64
506 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
507 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
508 ; CHECK-NEXT: sunpklo z2.s, z1.h
509 ; CHECK-NEXT: sunpklo z3.s, z0.h
510 ; CHECK-NEXT: mov z4.d, z0.d
511 ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128
512 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
513 ; CHECK-NEXT: mov z3.d, z1.d
514 ; CHECK-NEXT: sunpklo z4.s, z4.h
515 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
516 ; CHECK-NEXT: sunpklo z3.s, z3.h
517 ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s
518 ; CHECK-NEXT: ptrue p1.h, vl64
519 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
520 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
521 ; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
522 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
523 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
525 %op1 = load <128 x i16>, ptr %a
526 %op2 = load <128 x i16>, ptr %b
527 %res = srem <128 x i16> %op1, %op2
528 store <128 x i16> %res, ptr %a
532 ; Vector v2i32 sdiv is not legal for NEON so use SVE when available.
533 define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
534 ; CHECK-LABEL: srem_v2i32:
536 ; CHECK-NEXT: ptrue p0.s, vl2
537 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
538 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
539 ; CHECK-NEXT: movprfx z2, z0
540 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
541 ; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
542 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
544 %res = srem <2 x i32> %op1, %op2
548 ; Vector v4i32 sdiv is not legal for NEON so use SVE when available.
549 define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
550 ; CHECK-LABEL: srem_v4i32:
552 ; CHECK-NEXT: ptrue p0.s, vl4
553 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
554 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
555 ; CHECK-NEXT: movprfx z2, z0
556 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
557 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
558 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
560 %res = srem <4 x i32> %op1, %op2
564 define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
565 ; CHECK-LABEL: srem_v8i32:
567 ; CHECK-NEXT: ptrue p0.s, vl8
568 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
569 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
570 ; CHECK-NEXT: movprfx z2, z0
571 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
572 ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
573 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
575 %op1 = load <8 x i32>, ptr %a
576 %op2 = load <8 x i32>, ptr %b
577 %res = srem <8 x i32> %op1, %op2
578 store <8 x i32> %res, ptr %a
582 define void @srem_v16i32(ptr %a, ptr %b) #0 {
583 ; VBITS_GE_128-LABEL: srem_v16i32:
584 ; VBITS_GE_128: // %bb.0:
585 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
586 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
587 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
588 ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
589 ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
590 ; VBITS_GE_128-NEXT: movprfx z4, z1
591 ; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z0.s
592 ; VBITS_GE_128-NEXT: movprfx z19, z2
593 ; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s
594 ; VBITS_GE_128-NEXT: movprfx z7, z5
595 ; VBITS_GE_128-NEXT: sdiv z7.s, p0/m, z7.s, z6.s
596 ; VBITS_GE_128-NEXT: movprfx z18, z16
597 ; VBITS_GE_128-NEXT: sdiv z18.s, p0/m, z18.s, z17.s
598 ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s
599 ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s
600 ; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s
601 ; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s
602 ; VBITS_GE_128-NEXT: stp q1, q2, [x0]
603 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
604 ; VBITS_GE_128-NEXT: ret
606 ; VBITS_GE_256-LABEL: srem_v16i32:
607 ; VBITS_GE_256: // %bb.0:
608 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
609 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
610 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
611 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
612 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
613 ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
614 ; VBITS_GE_256-NEXT: movprfx z2, z0
615 ; VBITS_GE_256-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
616 ; VBITS_GE_256-NEXT: movprfx z5, z3
617 ; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z4.s
618 ; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s
619 ; VBITS_GE_256-NEXT: movprfx z1, z3
620 ; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s
621 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
622 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
623 ; VBITS_GE_256-NEXT: ret
625 ; VBITS_GE_512-LABEL: srem_v16i32:
626 ; VBITS_GE_512: // %bb.0:
627 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
628 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
629 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
630 ; VBITS_GE_512-NEXT: movprfx z2, z0
631 ; VBITS_GE_512-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
632 ; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
633 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
634 ; VBITS_GE_512-NEXT: ret
635 %op1 = load <16 x i32>, ptr %a
636 %op2 = load <16 x i32>, ptr %b
637 %res = srem <16 x i32> %op1, %op2
638 store <16 x i32> %res, ptr %a
642 define void @srem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
643 ; CHECK-LABEL: srem_v32i32:
645 ; CHECK-NEXT: ptrue p0.s, vl32
646 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
647 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
648 ; CHECK-NEXT: movprfx z2, z0
649 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
650 ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
651 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
653 %op1 = load <32 x i32>, ptr %a
654 %op2 = load <32 x i32>, ptr %b
655 %res = srem <32 x i32> %op1, %op2
656 store <32 x i32> %res, ptr %a
660 define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
661 ; CHECK-LABEL: srem_v64i32:
663 ; CHECK-NEXT: ptrue p0.s, vl64
664 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
665 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
666 ; CHECK-NEXT: movprfx z2, z0
667 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
668 ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
669 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
671 %op1 = load <64 x i32>, ptr %a
672 %op2 = load <64 x i32>, ptr %b
673 %res = srem <64 x i32> %op1, %op2
674 store <64 x i32> %res, ptr %a
678 ; Vector i64 sdiv is not legal for NEON so use SVE when available.
679 ; FIXME: We should be able to improve the codegen for the 128-bit case here.
680 define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
681 ; CHECK-LABEL: srem_v1i64:
683 ; CHECK-NEXT: ptrue p0.d, vl1
684 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
685 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
686 ; CHECK-NEXT: movprfx z2, z0
687 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
688 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
689 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
691 %res = srem <1 x i64> %op1, %op2
695 ; Vector i64 sdiv is not legal for NEON so use SVE when available.
696 ; FIXME: We should be able to improve the codegen for the 128-bit case here.
697 define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
698 ; CHECK-LABEL: srem_v2i64:
700 ; CHECK-NEXT: ptrue p0.d, vl2
701 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
702 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
703 ; CHECK-NEXT: movprfx z2, z0
704 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
705 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
706 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
708 %res = srem <2 x i64> %op1, %op2
712 define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
713 ; CHECK-LABEL: srem_v4i64:
715 ; CHECK-NEXT: ptrue p0.d, vl4
716 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
717 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
718 ; CHECK-NEXT: movprfx z2, z0
719 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
720 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
721 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
723 %op1 = load <4 x i64>, ptr %a
724 %op2 = load <4 x i64>, ptr %b
725 %res = srem <4 x i64> %op1, %op2
726 store <4 x i64> %res, ptr %a
730 define void @srem_v8i64(ptr %a, ptr %b) #0 {
731 ; VBITS_GE_128-LABEL: srem_v8i64:
732 ; VBITS_GE_128: // %bb.0:
733 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
734 ; VBITS_GE_128-NEXT: ptrue p0.d, vl2
735 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
736 ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
737 ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
738 ; VBITS_GE_128-NEXT: movprfx z4, z1
739 ; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z0.d
740 ; VBITS_GE_128-NEXT: movprfx z19, z2
741 ; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d
742 ; VBITS_GE_128-NEXT: movprfx z7, z5
743 ; VBITS_GE_128-NEXT: sdiv z7.d, p0/m, z7.d, z6.d
744 ; VBITS_GE_128-NEXT: movprfx z18, z16
745 ; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d
746 ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
747 ; VBITS_GE_128-NEXT: movprfx z1, z2
748 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
749 ; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
750 ; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
751 ; VBITS_GE_128-NEXT: stp q0, q1, [x0]
752 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
753 ; VBITS_GE_128-NEXT: ret
755 ; VBITS_GE_256-LABEL: srem_v8i64:
756 ; VBITS_GE_256: // %bb.0:
757 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
758 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
759 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
760 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
761 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
762 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
763 ; VBITS_GE_256-NEXT: movprfx z2, z0
764 ; VBITS_GE_256-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
765 ; VBITS_GE_256-NEXT: movprfx z5, z3
766 ; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z4.d
767 ; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
768 ; VBITS_GE_256-NEXT: movprfx z1, z3
769 ; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
770 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
771 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
772 ; VBITS_GE_256-NEXT: ret
774 ; VBITS_GE_512-LABEL: srem_v8i64:
775 ; VBITS_GE_512: // %bb.0:
776 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
777 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
778 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
779 ; VBITS_GE_512-NEXT: movprfx z2, z0
780 ; VBITS_GE_512-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
781 ; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
782 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
783 ; VBITS_GE_512-NEXT: ret
784 %op1 = load <8 x i64>, ptr %a
785 %op2 = load <8 x i64>, ptr %b
786 %res = srem <8 x i64> %op1, %op2
787 store <8 x i64> %res, ptr %a
791 define void @srem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
792 ; CHECK-LABEL: srem_v16i64:
794 ; CHECK-NEXT: ptrue p0.d, vl16
795 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
796 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
797 ; CHECK-NEXT: movprfx z2, z0
798 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
799 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
800 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
802 %op1 = load <16 x i64>, ptr %a
803 %op2 = load <16 x i64>, ptr %b
804 %res = srem <16 x i64> %op1, %op2
805 store <16 x i64> %res, ptr %a
809 define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
810 ; CHECK-LABEL: srem_v32i64:
812 ; CHECK-NEXT: ptrue p0.d, vl32
813 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
814 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
815 ; CHECK-NEXT: movprfx z2, z0
816 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
817 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
818 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
820 %op1 = load <32 x i64>, ptr %a
821 %op2 = load <32 x i64>, ptr %b
822 %res = srem <32 x i64> %op1, %op2
823 store <32 x i64> %res, ptr %a
831 ; Vector vXi8 udiv operations are not legal for NEON so use SVE when available.
832 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
833 define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
834 ; VBITS_GE_128-LABEL: urem_v8i8:
835 ; VBITS_GE_128: // %bb.0:
836 ; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0
837 ; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0
838 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
839 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
840 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
841 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
842 ; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
843 ; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
844 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
845 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
846 ; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
847 ; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
848 ; VBITS_GE_128-NEXT: ret
850 ; VBITS_GE_256-LABEL: urem_v8i8:
851 ; VBITS_GE_256: // %bb.0:
852 ; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
853 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
854 ; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b
855 ; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b
856 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
857 ; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
858 ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
859 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
860 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
861 ; VBITS_GE_256-NEXT: umov w8, v2.h[0]
862 ; VBITS_GE_256-NEXT: umov w9, v2.h[1]
863 ; VBITS_GE_256-NEXT: fmov s3, w8
864 ; VBITS_GE_256-NEXT: umov w8, v2.h[2]
865 ; VBITS_GE_256-NEXT: mov v3.b[1], w9
866 ; VBITS_GE_256-NEXT: mov v3.b[2], w8
867 ; VBITS_GE_256-NEXT: umov w8, v2.h[3]
868 ; VBITS_GE_256-NEXT: mov v3.b[3], w8
869 ; VBITS_GE_256-NEXT: umov w8, v2.h[4]
870 ; VBITS_GE_256-NEXT: mov v3.b[4], w8
871 ; VBITS_GE_256-NEXT: umov w8, v2.h[5]
872 ; VBITS_GE_256-NEXT: mov v3.b[5], w8
873 ; VBITS_GE_256-NEXT: umov w8, v2.h[6]
874 ; VBITS_GE_256-NEXT: mov v3.b[6], w8
875 ; VBITS_GE_256-NEXT: umov w8, v2.h[7]
876 ; VBITS_GE_256-NEXT: mov v3.b[7], w8
877 ; VBITS_GE_256-NEXT: mls v0.8b, v3.8b, v1.8b
878 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
879 ; VBITS_GE_256-NEXT: ret
881 ; VBITS_GE_512-LABEL: urem_v8i8:
882 ; VBITS_GE_512: // %bb.0:
883 ; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
884 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
885 ; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b
886 ; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b
887 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
888 ; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
889 ; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h
890 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
891 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
892 ; VBITS_GE_512-NEXT: umov w8, v2.h[0]
893 ; VBITS_GE_512-NEXT: umov w9, v2.h[1]
894 ; VBITS_GE_512-NEXT: fmov s3, w8
895 ; VBITS_GE_512-NEXT: umov w8, v2.h[2]
896 ; VBITS_GE_512-NEXT: mov v3.b[1], w9
897 ; VBITS_GE_512-NEXT: mov v3.b[2], w8
898 ; VBITS_GE_512-NEXT: umov w8, v2.h[3]
899 ; VBITS_GE_512-NEXT: mov v3.b[3], w8
900 ; VBITS_GE_512-NEXT: umov w8, v2.h[4]
901 ; VBITS_GE_512-NEXT: mov v3.b[4], w8
902 ; VBITS_GE_512-NEXT: umov w8, v2.h[5]
903 ; VBITS_GE_512-NEXT: mov v3.b[5], w8
904 ; VBITS_GE_512-NEXT: umov w8, v2.h[6]
905 ; VBITS_GE_512-NEXT: mov v3.b[6], w8
906 ; VBITS_GE_512-NEXT: umov w8, v2.h[7]
907 ; VBITS_GE_512-NEXT: mov v3.b[7], w8
908 ; VBITS_GE_512-NEXT: mls v0.8b, v3.8b, v1.8b
909 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
910 ; VBITS_GE_512-NEXT: ret
911 %res = urem <8 x i8> %op1, %op2
; urem of 16 x i8: 128-bit config needs four i32 udivr ops; wider configs unpack
; to .s once (GE_512) or twice with an ext/splice repack (GE_256), then mls.
915 define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
916 ; VBITS_GE_128-LABEL: urem_v16i8:
917 ; VBITS_GE_128: // %bb.0:
918 ; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0
919 ; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0
920 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
921 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
922 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
923 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
924 ; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
925 ; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
926 ; VBITS_GE_128-NEXT: ushll v5.8h, v0.8b, #0
927 ; VBITS_GE_128-NEXT: ushll2 v7.4s, v5.8h, #0
928 ; VBITS_GE_128-NEXT: ushll v5.4s, v5.4h, #0
929 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
930 ; VBITS_GE_128-NEXT: ushll v3.8h, v1.8b, #0
931 ; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0
932 ; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
933 ; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
934 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
935 ; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
936 ; VBITS_GE_128-NEXT: uzp1 v3.8h, v3.8h, v6.8h
937 ; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b
938 ; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
939 ; VBITS_GE_128-NEXT: ret
941 ; VBITS_GE_256-LABEL: urem_v16i8:
942 ; VBITS_GE_256: // %bb.0:
943 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
944 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
945 ; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b
946 ; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b
947 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
948 ; VBITS_GE_256-NEXT: uunpklo z4.s, z2.h
949 ; VBITS_GE_256-NEXT: uunpklo z5.s, z3.h
950 ; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
951 ; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
952 ; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
953 ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
954 ; VBITS_GE_256-NEXT: udivr z4.s, p0/m, z4.s, z5.s
955 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
956 ; VBITS_GE_256-NEXT: ptrue p0.h, vl8
957 ; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
958 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
959 ; VBITS_GE_256-NEXT: splice z3.h, p0, z3.h, z2.h
960 ; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b
961 ; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
962 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
963 ; VBITS_GE_256-NEXT: ret
965 ; VBITS_GE_512-LABEL: urem_v16i8:
966 ; VBITS_GE_512: // %bb.0:
967 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
968 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
969 ; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b
970 ; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b
971 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
972 ; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
973 ; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h
974 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
975 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
976 ; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b
977 ; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
978 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
979 ; VBITS_GE_512-NEXT: ret
980 %res = urem <16 x i8> %op1, %op2
; urem of 32 x i8 in memory: vscale_range(8,0) guarantees the widened i32 vector
; fits one register, so a single udivr + uzp1 narrowing chain + mls suffices.
984 define void @urem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
985 ; CHECK-LABEL: urem_v32i8:
987 ; CHECK-NEXT: ptrue p0.b, vl32
988 ; CHECK-NEXT: ptrue p1.s, vl32
989 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
990 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
991 ; CHECK-NEXT: uunpklo z2.h, z1.b
992 ; CHECK-NEXT: uunpklo z3.h, z0.b
993 ; CHECK-NEXT: uunpklo z2.s, z2.h
994 ; CHECK-NEXT: uunpklo z3.s, z3.h
995 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
996 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
997 ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
998 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
999 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1001 %op1 = load <32 x i8>, ptr %a
1002 %op2 = load <32 x i8>, ptr %b
1003 %res = urem <32 x i8> %op1, %op2
1004 store <32 x i8> %res, ptr %a
; urem of 64 x i8 in memory: same single-udivr shape as v32i8, with vl64
; predicates under vscale_range(16,0).
1008 define void @urem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
1009 ; CHECK-LABEL: urem_v64i8:
1011 ; CHECK-NEXT: ptrue p0.b, vl64
1012 ; CHECK-NEXT: ptrue p1.s, vl64
1013 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
1014 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
1015 ; CHECK-NEXT: uunpklo z2.h, z1.b
1016 ; CHECK-NEXT: uunpklo z3.h, z0.b
1017 ; CHECK-NEXT: uunpklo z2.s, z2.h
1018 ; CHECK-NEXT: uunpklo z3.s, z3.h
1019 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1020 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
1021 ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
1022 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
1023 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1025 %op1 = load <64 x i8>, ptr %a
1026 %op2 = load <64 x i8>, ptr %b
1027 %res = urem <64 x i8> %op1, %op2
1028 store <64 x i8> %res, ptr %a
; urem of 128 x i8 in memory: the widened i32 data needs two halves (ext + two
; udivr ops) which are re-joined with splice before the mls.
1032 define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
1033 ; CHECK-LABEL: urem_v128i8:
1035 ; CHECK-NEXT: ptrue p0.b, vl128
1036 ; CHECK-NEXT: ptrue p1.s, vl64
1037 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
1038 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
1039 ; CHECK-NEXT: uunpklo z2.h, z1.b
1040 ; CHECK-NEXT: uunpklo z3.h, z0.b
1041 ; CHECK-NEXT: uunpklo z4.s, z2.h
1042 ; CHECK-NEXT: uunpklo z5.s, z3.h
1043 ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
1044 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
1045 ; CHECK-NEXT: uunpklo z2.s, z2.h
1046 ; CHECK-NEXT: uunpklo z3.s, z3.h
1047 ; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
1048 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1049 ; CHECK-NEXT: ptrue p1.h, vl64
1050 ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
1051 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
1052 ; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h
1053 ; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b
1054 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
1055 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1057 %op1 = load <128 x i8>, ptr %a
1058 %op2 = load <128 x i8>, ptr %b
1059 %res = urem <128 x i8> %op1, %op2
1060 store <128 x i8> %res, ptr %a
; urem of 256 x i8 in memory: four udivr quarters with ext/splice repacking in
; two stages (.h then .b) before the final mls.
1064 define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
1065 ; CHECK-LABEL: urem_v256i8:
1067 ; CHECK-NEXT: ptrue p0.b, vl256
1068 ; CHECK-NEXT: ptrue p1.s, vl64
1069 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
1070 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
1071 ; CHECK-NEXT: uunpklo z2.h, z1.b
1072 ; CHECK-NEXT: uunpklo z3.h, z0.b
1073 ; CHECK-NEXT: uunpklo z4.s, z2.h
1074 ; CHECK-NEXT: uunpklo z5.s, z3.h
1075 ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
1076 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
1077 ; CHECK-NEXT: uunpklo z2.s, z2.h
1078 ; CHECK-NEXT: uunpklo z3.s, z3.h
1079 ; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
1080 ; CHECK-NEXT: mov z5.d, z0.d
1081 ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128
1082 ; CHECK-NEXT: uunpklo z5.h, z5.b
1083 ; CHECK-NEXT: uunpklo z7.s, z5.h
1084 ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128
1085 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1086 ; CHECK-NEXT: mov z3.d, z1.d
1087 ; CHECK-NEXT: uunpklo z5.s, z5.h
1088 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
1089 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
1090 ; CHECK-NEXT: uunpklo z3.h, z3.b
1091 ; CHECK-NEXT: uunpklo z6.s, z3.h
1092 ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
1093 ; CHECK-NEXT: uunpklo z3.s, z3.h
1094 ; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s
1095 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
1096 ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s
1097 ; CHECK-NEXT: ptrue p1.h, vl64
1098 ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
1099 ; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h
1100 ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
1101 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
1102 ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h
1103 ; CHECK-NEXT: ptrue p1.b, vl128
1104 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b
1105 ; CHECK-NEXT: splice z2.b, p1, z2.b, z3.b
1106 ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
1107 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1109 %op1 = load <256 x i8>, ptr %a
1110 %op2 = load <256 x i8>, ptr %b
1111 %res = urem <256 x i8> %op1, %op2
1112 store <256 x i8> %res, ptr %a
1116 ; Vector vXi16 udiv are not legal for NEON so use SVE when available.
1117 ; FIXME: We should be able to improve the codegen for >= 256 bits here.
; urem of 4 x i16: widened to i32 for one udivr; the >= 256-bit variants rebuild
; the narrowed result lane-by-lane via GPR moves (see FIXME above).
1118 define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
1119 ; VBITS_GE_128-LABEL: urem_v4i16:
1120 ; VBITS_GE_128: // %bb.0:
1121 ; VBITS_GE_128-NEXT: ushll v2.4s, v1.4h, #0
1122 ; VBITS_GE_128-NEXT: ushll v3.4s, v0.4h, #0
1123 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
1124 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
1125 ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s
1126 ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h
1127 ; VBITS_GE_128-NEXT: ret
1129 ; VBITS_GE_256-LABEL: urem_v4i16:
1130 ; VBITS_GE_256: // %bb.0:
1131 ; VBITS_GE_256-NEXT: ushll v2.4s, v1.4h, #0
1132 ; VBITS_GE_256-NEXT: ushll v3.4s, v0.4h, #0
1133 ; VBITS_GE_256-NEXT: ptrue p0.s, vl4
1134 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
1135 ; VBITS_GE_256-NEXT: mov w8, v2.s[1]
1136 ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b
1137 ; VBITS_GE_256-NEXT: mov w9, v2.s[2]
1138 ; VBITS_GE_256-NEXT: mov v3.h[1], w8
1139 ; VBITS_GE_256-NEXT: mov w8, v2.s[3]
1140 ; VBITS_GE_256-NEXT: mov v3.h[2], w9
1141 ; VBITS_GE_256-NEXT: mov v3.h[3], w8
1142 ; VBITS_GE_256-NEXT: mls v0.4h, v3.4h, v1.4h
1143 ; VBITS_GE_256-NEXT: ret
1145 ; VBITS_GE_512-LABEL: urem_v4i16:
1146 ; VBITS_GE_512: // %bb.0:
1147 ; VBITS_GE_512-NEXT: ushll v2.4s, v1.4h, #0
1148 ; VBITS_GE_512-NEXT: ushll v3.4s, v0.4h, #0
1149 ; VBITS_GE_512-NEXT: ptrue p0.s, vl4
1150 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
1151 ; VBITS_GE_512-NEXT: mov w8, v2.s[1]
1152 ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b
1153 ; VBITS_GE_512-NEXT: mov w9, v2.s[2]
1154 ; VBITS_GE_512-NEXT: mov v3.h[1], w8
1155 ; VBITS_GE_512-NEXT: mov w8, v2.s[3]
1156 ; VBITS_GE_512-NEXT: mov v3.h[2], w9
1157 ; VBITS_GE_512-NEXT: mov v3.h[3], w8
1158 ; VBITS_GE_512-NEXT: mls v0.4h, v3.4h, v1.4h
1159 ; VBITS_GE_512-NEXT: ret
1160 %res = urem <4 x i16> %op1, %op2
; urem of 8 x i16: 128-bit config splits into two 4 x i32 udivr ops; wider
; configs use one uunpklo/udivr/uzp1 round trip followed by NEON mls.
1164 define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
1165 ; VBITS_GE_128-LABEL: urem_v8i16:
1166 ; VBITS_GE_128: // %bb.0:
1167 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
1168 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
1169 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
1170 ; VBITS_GE_128-NEXT: ushll v4.4s, v0.4h, #0
1171 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
1172 ; VBITS_GE_128-NEXT: ushll v3.4s, v1.4h, #0
1173 ; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z4.s
1174 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h
1175 ; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
1176 ; VBITS_GE_128-NEXT: ret
1178 ; VBITS_GE_256-LABEL: urem_v8i16:
1179 ; VBITS_GE_256: // %bb.0:
1180 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
1181 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
1182 ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
1183 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
1184 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1185 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
1186 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
1187 ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h
1188 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
1189 ; VBITS_GE_256-NEXT: ret
1191 ; VBITS_GE_512-LABEL: urem_v8i16:
1192 ; VBITS_GE_512: // %bb.0:
1193 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
1194 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
1195 ; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h
1196 ; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h
1197 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
1198 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
1199 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
1200 ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h
1201 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
1202 ; VBITS_GE_512-NEXT: ret
1203 %res = urem <8 x i16> %op1, %op2
; urem of 16 x i16 in memory: 128-bit config does four NEON-widened udivr ops;
; GE_256 splits into two halves joined by splice; GE_512 needs only one udivr.
1207 define void @urem_v16i16(ptr %a, ptr %b) #0 {
1208 ; VBITS_GE_128-LABEL: urem_v16i16:
1209 ; VBITS_GE_128: // %bb.0:
1210 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
1211 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
1212 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
1213 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
1214 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
1215 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v4.8h, #0
1216 ; VBITS_GE_128-NEXT: ushll v16.4s, v0.4h, #0
1217 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
1218 ; VBITS_GE_128-NEXT: ldr q3, [x0]
1219 ; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0
1220 ; VBITS_GE_128-NEXT: ushll v7.4s, v3.4h, #0
1221 ; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
1222 ; VBITS_GE_128-NEXT: ushll v6.4s, v4.4h, #0
1223 ; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
1224 ; VBITS_GE_128-NEXT: ushll v7.4s, v1.4h, #0
1225 ; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s
1226 ; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h
1227 ; VBITS_GE_128-NEXT: mls v3.8h, v5.8h, v4.8h
1228 ; VBITS_GE_128-NEXT: uzp1 v2.8h, v7.8h, v2.8h
1229 ; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
1230 ; VBITS_GE_128-NEXT: stp q3, q0, [x0]
1231 ; VBITS_GE_128-NEXT: ret
1233 ; VBITS_GE_256-LABEL: urem_v16i16:
1234 ; VBITS_GE_256: // %bb.0:
1235 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1236 ; VBITS_GE_256-NEXT: ptrue p1.s, vl8
1237 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
1238 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
1239 ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
1240 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
1241 ; VBITS_GE_256-NEXT: mov z4.d, z0.d
1242 ; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16
1243 ; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1244 ; VBITS_GE_256-NEXT: mov z3.d, z1.d
1245 ; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h
1246 ; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16
1247 ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
1248 ; VBITS_GE_256-NEXT: udivr z3.s, p1/m, z3.s, z4.s
1249 ; VBITS_GE_256-NEXT: ptrue p1.h, vl8
1250 ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
1251 ; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
1252 ; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h
1253 ; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
1254 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
1255 ; VBITS_GE_256-NEXT: ret
1257 ; VBITS_GE_512-LABEL: urem_v16i16:
1258 ; VBITS_GE_512: // %bb.0:
1259 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
1260 ; VBITS_GE_512-NEXT: ptrue p1.s, vl16
1261 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1262 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
1263 ; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h
1264 ; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h
1265 ; VBITS_GE_512-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1266 ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
1267 ; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h
1268 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
1269 ; VBITS_GE_512-NEXT: ret
1270 %op1 = load <16 x i16>, ptr %a
1271 %op2 = load <16 x i16>, ptr %b
1272 %res = urem <16 x i16> %op1, %op2
1273 store <16 x i16> %res, ptr %a
; urem of 32 x i16 in memory: single widened udivr + uzp1 + mls under
; vscale_range(8,0).
1277 define void @urem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1278 ; CHECK-LABEL: urem_v32i16:
1280 ; CHECK-NEXT: ptrue p0.h, vl32
1281 ; CHECK-NEXT: ptrue p1.s, vl32
1282 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1283 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1284 ; CHECK-NEXT: uunpklo z2.s, z1.h
1285 ; CHECK-NEXT: uunpklo z3.s, z0.h
1286 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1287 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
1288 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
1289 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1291 %op1 = load <32 x i16>, ptr %a
1292 %op2 = load <32 x i16>, ptr %b
1293 %res = urem <32 x i16> %op1, %op2
1294 store <32 x i16> %res, ptr %a
; urem of 64 x i16 in memory: same single-udivr shape with vl64 predicates
; under vscale_range(16,0).
1298 define void @urem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1299 ; CHECK-LABEL: urem_v64i16:
1301 ; CHECK-NEXT: ptrue p0.h, vl64
1302 ; CHECK-NEXT: ptrue p1.s, vl64
1303 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1304 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1305 ; CHECK-NEXT: uunpklo z2.s, z1.h
1306 ; CHECK-NEXT: uunpklo z3.s, z0.h
1307 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1308 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
1309 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
1310 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1312 %op1 = load <64 x i16>, ptr %a
1313 %op2 = load <64 x i16>, ptr %b
1314 %res = urem <64 x i16> %op1, %op2
1315 store <64 x i16> %res, ptr %a
; urem of 128 x i16 in memory: widened data spans two registers, so two udivr
; halves are re-joined with splice before the mls.
1319 define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1320 ; CHECK-LABEL: urem_v128i16:
1322 ; CHECK-NEXT: ptrue p0.h, vl128
1323 ; CHECK-NEXT: ptrue p1.s, vl64
1324 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1325 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1326 ; CHECK-NEXT: uunpklo z2.s, z1.h
1327 ; CHECK-NEXT: uunpklo z3.s, z0.h
1328 ; CHECK-NEXT: mov z4.d, z0.d
1329 ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128
1330 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
1331 ; CHECK-NEXT: mov z3.d, z1.d
1332 ; CHECK-NEXT: uunpklo z4.s, z4.h
1333 ; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
1334 ; CHECK-NEXT: uunpklo z3.s, z3.h
1335 ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s
1336 ; CHECK-NEXT: ptrue p1.h, vl64
1337 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
1338 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
1339 ; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
1340 ; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
1341 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1343 %op1 = load <128 x i16>, ptr %a
1344 %op2 = load <128 x i16>, ptr %b
1345 %res = urem <128 x i16> %op1, %op2
1346 store <128 x i16> %res, ptr %a
1350 ; Vector v2i32 udiv are not legal for NEON so use SVE when available.
; urem of 2 x i32: native element width, so no widening — one SVE udiv plus a
; NEON mls to form the remainder.
1351 define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
1352 ; CHECK-LABEL: urem_v2i32:
1354 ; CHECK-NEXT: ptrue p0.s, vl2
1355 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
1356 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1357 ; CHECK-NEXT: movprfx z2, z0
1358 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
1359 ; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
1360 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1362 %res = urem <2 x i32> %op1, %op2
1366 ; Vector v4i32 udiv are not legal for NEON so use SVE when available.
; urem of 4 x i32: same udiv + NEON mls shape as v2i32, with a vl4 predicate.
1367 define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
1368 ; CHECK-LABEL: urem_v4i32:
1370 ; CHECK-NEXT: ptrue p0.s, vl4
1371 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
1372 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1373 ; CHECK-NEXT: movprfx z2, z0
1374 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
1375 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
1376 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1378 %res = urem <4 x i32> %op1, %op2
; urem of 8 x i32 in memory: single predicated udiv feeding a predicated mls
; under vscale_range(2,0).
1382 define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1383 ; CHECK-LABEL: urem_v8i32:
1385 ; CHECK-NEXT: ptrue p0.s, vl8
1386 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1387 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1388 ; CHECK-NEXT: movprfx z2, z0
1389 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
1390 ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
1391 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1393 %op1 = load <8 x i32>, ptr %a
1394 %op2 = load <8 x i32>, ptr %b
1395 %res = urem <8 x i32> %op1, %op2
1396 store <8 x i32> %res, ptr %a
; urem of 16 x i32 in memory: four 128-bit chunks at VBITS=128, two chunks at
; VBITS>=256, and a single udiv/mls pair once VBITS >= 512.
1400 define void @urem_v16i32(ptr %a, ptr %b) #0 {
1401 ; VBITS_GE_128-LABEL: urem_v16i32:
1402 ; VBITS_GE_128: // %bb.0:
1403 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
1404 ; VBITS_GE_128-NEXT: ptrue p0.s, vl4
1405 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
1406 ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
1407 ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
1408 ; VBITS_GE_128-NEXT: movprfx z4, z1
1409 ; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z0.s
1410 ; VBITS_GE_128-NEXT: movprfx z19, z2
1411 ; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s
1412 ; VBITS_GE_128-NEXT: movprfx z7, z5
1413 ; VBITS_GE_128-NEXT: udiv z7.s, p0/m, z7.s, z6.s
1414 ; VBITS_GE_128-NEXT: movprfx z18, z16
1415 ; VBITS_GE_128-NEXT: udiv z18.s, p0/m, z18.s, z17.s
1416 ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s
1417 ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s
1418 ; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s
1419 ; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s
1420 ; VBITS_GE_128-NEXT: stp q1, q2, [x0]
1421 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
1422 ; VBITS_GE_128-NEXT: ret
1424 ; VBITS_GE_256-LABEL: urem_v16i32:
1425 ; VBITS_GE_256: // %bb.0:
1426 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1427 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1428 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1429 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
1430 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
1431 ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
1432 ; VBITS_GE_256-NEXT: movprfx z2, z0
1433 ; VBITS_GE_256-NEXT: udiv z2.s, p0/m, z2.s, z1.s
1434 ; VBITS_GE_256-NEXT: movprfx z5, z3
1435 ; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z4.s
1436 ; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s
1437 ; VBITS_GE_256-NEXT: movprfx z1, z3
1438 ; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s
1439 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1440 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1441 ; VBITS_GE_256-NEXT: ret
1443 ; VBITS_GE_512-LABEL: urem_v16i32:
1444 ; VBITS_GE_512: // %bb.0:
1445 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1446 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1447 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
1448 ; VBITS_GE_512-NEXT: movprfx z2, z0
1449 ; VBITS_GE_512-NEXT: udiv z2.s, p0/m, z2.s, z1.s
1450 ; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
1451 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1452 ; VBITS_GE_512-NEXT: ret
1453 %op1 = load <16 x i32>, ptr %a
1454 %op2 = load <16 x i32>, ptr %b
1455 %res = urem <16 x i32> %op1, %op2
1456 store <16 x i32> %res, ptr %a
; urem of 32 x i32 in memory: single udiv + mls pair with a vl32 predicate
; under vscale_range(8,0).
1460 define void @urem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1461 ; CHECK-LABEL: urem_v32i32:
1463 ; CHECK-NEXT: ptrue p0.s, vl32
1464 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1465 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1466 ; CHECK-NEXT: movprfx z2, z0
1467 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
1468 ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
1469 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1471 %op1 = load <32 x i32>, ptr %a
1472 %op2 = load <32 x i32>, ptr %b
1473 %res = urem <32 x i32> %op1, %op2
1474 store <32 x i32> %res, ptr %a
; urem of 64 x i32 in memory: single udiv + mls pair with a vl64 predicate
; under vscale_range(16,0).
1478 define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1479 ; CHECK-LABEL: urem_v64i32:
1481 ; CHECK-NEXT: ptrue p0.s, vl64
1482 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1483 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1484 ; CHECK-NEXT: movprfx z2, z0
1485 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
1486 ; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
1487 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1489 %op1 = load <64 x i32>, ptr %a
1490 %op2 = load <64 x i32>, ptr %b
1491 %res = urem <64 x i32> %op1, %op2
1492 store <64 x i32> %res, ptr %a
1496 ; Vector i64 udiv are not legal for NEON so use SVE when available.
1497 ; FIXME: We should be able to improve the codegen for the 128 bits case here.
; urem of 1 x i64: both udiv and mls stay in SVE (predicated, vl1) since NEON
; has no 64-bit integer divide or mls at this width.
1498 define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
1499 ; CHECK-LABEL: urem_v1i64:
1501 ; CHECK-NEXT: ptrue p0.d, vl1
1502 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
1503 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1504 ; CHECK-NEXT: movprfx z2, z0
1505 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1506 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
1507 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1509 %res = urem <1 x i64> %op1, %op2
1513 ; Vector i64 udiv are not legal for NEON so use SVE when available.
1514 ; FIXME: We should be able to improve the codegen for the 128 bits case here.
; urem of 2 x i64: same SVE-only udiv + mls shape as v1i64, with a vl2 predicate.
1515 define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
1516 ; CHECK-LABEL: urem_v2i64:
1518 ; CHECK-NEXT: ptrue p0.d, vl2
1519 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
1520 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1521 ; CHECK-NEXT: movprfx z2, z0
1522 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1523 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
1524 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1526 %res = urem <2 x i64> %op1, %op2
; urem of 4 x i64 in memory: single udiv + mls pair with a vl4 predicate
; under vscale_range(2,0).
1530 define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1531 ; CHECK-LABEL: urem_v4i64:
1533 ; CHECK-NEXT: ptrue p0.d, vl4
1534 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1535 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1536 ; CHECK-NEXT: movprfx z2, z0
1537 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1538 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
1539 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1541 %op1 = load <4 x i64>, ptr %a
1542 %op2 = load <4 x i64>, ptr %b
1543 %res = urem <4 x i64> %op1, %op2
1544 store <4 x i64> %res, ptr %a
; urem of 8 x i64 in memory: four 128-bit chunks at VBITS=128 (note one remainder
; is formed with msb instead of movprfx+mls), two chunks at VBITS>=256, and a
; single udiv/mls pair once VBITS >= 512.
1548 define void @urem_v8i64(ptr %a, ptr %b) #0 {
1549 ; VBITS_GE_128-LABEL: urem_v8i64:
1550 ; VBITS_GE_128: // %bb.0:
1551 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
1552 ; VBITS_GE_128-NEXT: ptrue p0.d, vl2
1553 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
1554 ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
1555 ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
1556 ; VBITS_GE_128-NEXT: movprfx z4, z1
1557 ; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z0.d
1558 ; VBITS_GE_128-NEXT: movprfx z19, z2
1559 ; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d
1560 ; VBITS_GE_128-NEXT: movprfx z7, z5
1561 ; VBITS_GE_128-NEXT: udiv z7.d, p0/m, z7.d, z6.d
1562 ; VBITS_GE_128-NEXT: movprfx z18, z16
1563 ; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d
1564 ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
1565 ; VBITS_GE_128-NEXT: movprfx z1, z2
1566 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
1567 ; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
1568 ; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
1569 ; VBITS_GE_128-NEXT: stp q0, q1, [x0]
1570 ; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
1571 ; VBITS_GE_128-NEXT: ret
1573 ; VBITS_GE_256-LABEL: urem_v8i64:
1574 ; VBITS_GE_256: // %bb.0:
1575 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1576 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1577 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1578 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
1579 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
1580 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
1581 ; VBITS_GE_256-NEXT: movprfx z2, z0
1582 ; VBITS_GE_256-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1583 ; VBITS_GE_256-NEXT: movprfx z5, z3
1584 ; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z4.d
1585 ; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
1586 ; VBITS_GE_256-NEXT: movprfx z1, z3
1587 ; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
1588 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1589 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1590 ; VBITS_GE_256-NEXT: ret
1592 ; VBITS_GE_512-LABEL: urem_v8i64:
1593 ; VBITS_GE_512: // %bb.0:
1594 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1595 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1596 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
1597 ; VBITS_GE_512-NEXT: movprfx z2, z0
1598 ; VBITS_GE_512-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1599 ; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
1600 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1601 ; VBITS_GE_512-NEXT: ret
1602 %op1 = load <8 x i64>, ptr %a
1603 %op2 = load <8 x i64>, ptr %b
1604 %res = urem <8 x i64> %op1, %op2
1605 store <8 x i64> %res, ptr %a
; urem of 16 x i64 in memory: single udiv + mls pair with a vl16 predicate
; under vscale_range(8,0).
1609 define void @urem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1610 ; CHECK-LABEL: urem_v16i64:
1612 ; CHECK-NEXT: ptrue p0.d, vl16
1613 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1614 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1615 ; CHECK-NEXT: movprfx z2, z0
1616 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1617 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
1618 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1620 %op1 = load <16 x i64>, ptr %a
1621 %op2 = load <16 x i64>, ptr %b
1622 %res = urem <16 x i64> %op1, %op2
1623 store <16 x i64> %res, ptr %a
; urem of 32 x i64 in memory: single udiv + mls pair with a vl32 predicate
; under vscale_range(16,0).
1627 define void @urem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1628 ; CHECK-LABEL: urem_v32i64:
1630 ; CHECK-NEXT: ptrue p0.d, vl32
1631 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1632 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1633 ; CHECK-NEXT: movprfx z2, z0
1634 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1635 ; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
1636 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1638 %op1 = load <32 x i64>, ptr %a
1639 %op2 = load <32 x i64>, ptr %b
1640 %res = urem <32 x i64> %op1, %op2
1641 store <32 x i64> %res, ptr %a
1645 attributes #0 = { "target-features"="+sve" }