; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
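; Note: the 2048-bit RUN line deliberately reuses the VBITS_GE_512 check
; prefixes; for the fixed-width types exercised below the expected codegen
; should not change once the minimum SVE vector length reaches 512 bits.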
target triple = "aarch64-unknown-linux-gnu"

;
; SDIV
;

; Vector vXi8 sdiv ops are not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT: umov w8, v1.h[0]
; VBITS_GE_256-NEXT: umov w9, v1.h[1]
; VBITS_GE_256-NEXT: fmov s0, w8
; VBITS_GE_256-NEXT: umov w8, v1.h[2]
; VBITS_GE_256-NEXT: mov v0.b[1], w9
; VBITS_GE_256-NEXT: mov v0.b[2], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[3]
; VBITS_GE_256-NEXT: mov v0.b[3], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[4]
; VBITS_GE_256-NEXT: mov v0.b[4], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[5]
; VBITS_GE_256-NEXT: mov v0.b[5], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[6]
; VBITS_GE_256-NEXT: mov v0.b[6], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[7]
; VBITS_GE_256-NEXT: mov v0.b[7], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT: umov w8, v1.h[0]
; VBITS_GE_512-NEXT: umov w9, v1.h[1]
; VBITS_GE_512-NEXT: fmov s0, w8
; VBITS_GE_512-NEXT: umov w8, v1.h[2]
; VBITS_GE_512-NEXT: mov v0.b[1], w9
; VBITS_GE_512-NEXT: mov v0.b[2], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[3]
; VBITS_GE_512-NEXT: mov v0.b[3], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[4]
; VBITS_GE_512-NEXT: mov v0.b[4], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[5]
; VBITS_GE_512-NEXT: mov v0.b[5], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[6]
; VBITS_GE_512-NEXT: mov v0.b[6], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[7]
; VBITS_GE_512-NEXT: mov v0.b[7], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
  %res = sdiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sshll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
  %res = sdiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

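; With a wide enough SVE register the whole vector is sign-extended to
; 32-bit elements (sunpklo), divided once, and narrowed by the store, since
; SVE only provides 32- and 64-bit integer divides.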
define void @sdiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = sdiv <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @sdiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = sdiv <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: st1b { z1.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = sdiv <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

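; For 256 x i8 a single widening pass is not enough: the input is split
; with ext, each piece is unpacked to 32-bit elements and divided, and the
; partial quotients are narrowed with uzp1 and rejoined with splice.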
define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: sunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: sunpklo z5.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: sunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
; CHECK-NEXT: ptrue p1.b, vl128
; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = sdiv <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 sdiv ops are not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v4i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v4i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT: mov w8, v1.s[1]
; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
; VBITS_GE_256-NEXT: mov w9, v1.s[2]
; VBITS_GE_256-NEXT: mov v0.h[1], w8
; VBITS_GE_256-NEXT: mov w8, v1.s[3]
; VBITS_GE_256-NEXT: mov v0.h[2], w9
; VBITS_GE_256-NEXT: mov v0.h[3], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v4i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT: ptrue p0.s, vl4
; VBITS_GE_512-NEXT: sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT: mov w8, v1.s[1]
; VBITS_GE_512-NEXT: mov v0.16b, v1.16b
; VBITS_GE_512-NEXT: mov w9, v1.s[2]
; VBITS_GE_512-NEXT: mov v0.h[1], w8
; VBITS_GE_512-NEXT: mov w8, v1.s[3]
; VBITS_GE_512-NEXT: mov v0.h[2], w9
; VBITS_GE_512-NEXT: mov v0.h[3], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
  %res = sdiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
  %res = sdiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT: sshll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ldr q3, [x0]
; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: stp q1, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v16i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = sdiv <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @sdiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = sdiv <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @sdiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = sdiv <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT: st1h { z1.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = sdiv <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 sdiv is not legal for NEON, so use SVE when available.
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = sdiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 sdiv is not legal for NEON, so use SVE when available.
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = sdiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = sdiv <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @sdiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i32:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: sdiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = sdiv <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @sdiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = sdiv <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @sdiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = sdiv <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 sdiv is not legal for NEON, so use SVE when available.
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = sdiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 sdiv is not legal for NEON, so use SVE when available.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = sdiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = sdiv <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @sdiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i64:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT: sdiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: sdiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = sdiv <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @sdiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = sdiv <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = sdiv <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UDIV
;

; Vector vXi8 udiv ops are not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT: umov w8, v1.h[0]
; VBITS_GE_256-NEXT: umov w9, v1.h[1]
; VBITS_GE_256-NEXT: fmov s0, w8
; VBITS_GE_256-NEXT: umov w8, v1.h[2]
; VBITS_GE_256-NEXT: mov v0.b[1], w9
; VBITS_GE_256-NEXT: mov v0.b[2], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[3]
; VBITS_GE_256-NEXT: mov v0.b[3], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[4]
; VBITS_GE_256-NEXT: mov v0.b[4], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[5]
; VBITS_GE_256-NEXT: mov v0.b[5], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[6]
; VBITS_GE_256-NEXT: mov v0.b[6], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[7]
; VBITS_GE_256-NEXT: mov v0.b[7], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT: umov w8, v1.h[0]
; VBITS_GE_512-NEXT: umov w9, v1.h[1]
; VBITS_GE_512-NEXT: fmov s0, w8
; VBITS_GE_512-NEXT: umov w8, v1.h[2]
; VBITS_GE_512-NEXT: mov v0.b[1], w9
; VBITS_GE_512-NEXT: mov v0.b[2], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[3]
; VBITS_GE_512-NEXT: mov v0.b[3], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[4]
; VBITS_GE_512-NEXT: mov v0.b[4], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[5]
; VBITS_GE_512-NEXT: mov v0.b[5], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[6]
; VBITS_GE_512-NEXT: mov v0.b[6], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[7]
; VBITS_GE_512-NEXT: mov v0.b[7], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
  %res = udiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v16i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ushll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
  %res = udiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

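; Unlike the signed cases above, the zero-extends here can be folded into
; the loads (ld1b straight into .s elements), so no unpacking is required.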
define void @udiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = udiv <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @udiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = udiv <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT: st1b { z1.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = udiv <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: uunpklo z5.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
; CHECK-NEXT: ptrue p1.b, vl128
; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = udiv <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 udiv ops are not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v4i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v4i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT: mov w8, v1.s[1]
; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
; VBITS_GE_256-NEXT: mov w9, v1.s[2]
; VBITS_GE_256-NEXT: mov v0.h[1], w8
; VBITS_GE_256-NEXT: mov w8, v1.s[3]
; VBITS_GE_256-NEXT: mov v0.h[2], w9
; VBITS_GE_256-NEXT: mov v0.h[3], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v4i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT: ptrue p0.s, vl4
; VBITS_GE_512-NEXT: udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT: mov w8, v1.s[1]
; VBITS_GE_512-NEXT: mov v0.16b, v1.16b
; VBITS_GE_512-NEXT: mov w9, v1.s[2]
; VBITS_GE_512-NEXT: mov v0.h[1], w8
; VBITS_GE_512-NEXT: mov w8, v1.s[3]
; VBITS_GE_512-NEXT: mov v0.h[2], w9
; VBITS_GE_512-NEXT: mov v0.h[3], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
  %res = udiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
  %res = udiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @udiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT: ushll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ldr q3, [x0]
; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: stp q1, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v16i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1h { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = udiv <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @udiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = udiv <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @udiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = udiv <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpklo z3.s, z0.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT: st1h { z1.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = udiv <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 udiv is not legal for NEON, so use SVE when available.
define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = udiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 udiv is not legal for NEON, so use SVE when available.
define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = udiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = udiv <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @udiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i32:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: udiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: udiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = udiv <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @udiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = udiv <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @udiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = udiv <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 udiv is not legal for NEON, so use SVE when available.
define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = udiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 udiv is not legal for NEON, so use SVE when available.
define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = udiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = udiv <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @udiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v8i64:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: udivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT: udiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: udiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = udiv <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @udiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = udiv <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @udiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = udiv <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

; This used to crash because isUnaryPredicate and BuildUDIV didn't know how
; to handle a SPLAT_VECTOR of fixed vector type.
define void @udiv_constantsplat_v8i32(ptr %a) vscale_range(2,0) #1 {
; CHECK-LABEL: udiv_constantsplat_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: mov z1.s, #95 // =0x5f
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, ptr %a
  %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
  store <8 x i32> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }
attributes #1 = { "target-features"="+sve" minsize }