1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
; Scatter of <2 x i8>: values come from %a, pointers from %b, and a lane is
; stored only when its value compares equal to zero. The expected output is a
; single predicated st1b over .d elements.
12 define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
13 ; CHECK-LABEL: masked_scatter_v2i8:
15 ; CHECK-NEXT: ldrb w8, [x0]
16 ; CHECK-NEXT: ldrb w9, [x0, #1]
17 ; CHECK-NEXT: ptrue p0.d, vl2
18 ; CHECK-NEXT: fmov s0, w8
19 ; CHECK-NEXT: mov v0.s[1], w9
20 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
21 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
22 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0
23 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
24 ; CHECK-NEXT: ldr q1, [x1]
25 ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d]
27 %vals = load <2 x i8>, ptr %a
28 %ptrs = load <2 x ptr>, ptr %b
29 %mask = icmp eq <2 x i8> %vals, zeroinitializer
30 call void @llvm.masked.scatter.v2i8(<2 x i8> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
; Scatter of <4 x i8> under an "equal to zero" mask. The expected output widens
; both the values and the mask to .d elements and issues one st1b.
34 define void @masked_scatter_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
35 ; CHECK-LABEL: masked_scatter_v4i8:
37 ; CHECK-NEXT: ldr s0, [x0]
38 ; CHECK-NEXT: ptrue p0.d, vl4
39 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
40 ; CHECK-NEXT: cmeq v1.4h, v0.4h, #0
41 ; CHECK-NEXT: uunpklo z0.s, z0.h
42 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1]
43 ; CHECK-NEXT: sunpklo z1.s, z1.h
44 ; CHECK-NEXT: uunpklo z0.d, z0.s
45 ; CHECK-NEXT: sunpklo z1.d, z1.s
46 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
47 ; CHECK-NEXT: st1b { z0.d }, p0, [z2.d]
49 %vals = load <4 x i8>, ptr %a
50 %ptrs = load <4 x ptr>, ptr %b
51 %mask = icmp eq <4 x i8> %vals, zeroinitializer
52 call void @llvm.masked.scatter.v4i8(<4 x i8> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
; Scatter of <8 x i8> under an "equal to zero" mask. This function carries no
; vscale_range attribute, and its expected output is listed per run: the
; 256-bit run uses two vl4 st1b scatters (second pointer half loaded at element
; offset 4), while the wider runs use a single vl8 st1b.
56 define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 {
57 ; VBITS_GE_256-LABEL: masked_scatter_v8i8:
58 ; VBITS_GE_256: // %bb.0:
59 ; VBITS_GE_256-NEXT: ldr d0, [x0]
60 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
61 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
62 ; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0
63 ; VBITS_GE_256-NEXT: zip1 v3.8b, v0.8b, v0.8b
64 ; VBITS_GE_256-NEXT: zip1 v2.8b, v1.8b, v0.8b
65 ; VBITS_GE_256-NEXT: zip2 v1.8b, v1.8b, v0.8b
66 ; VBITS_GE_256-NEXT: zip2 v0.8b, v0.8b, v0.8b
67 ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
68 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
69 ; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8
70 ; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8
71 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
72 ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
73 ; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8
74 ; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8
75 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
76 ; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
77 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
78 ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
79 ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
80 ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
81 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
82 ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
83 ; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z2.d]
84 ; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z4.d]
85 ; VBITS_GE_256-NEXT: ret
87 ; VBITS_GE_512-LABEL: masked_scatter_v8i8:
88 ; VBITS_GE_512: // %bb.0:
89 ; VBITS_GE_512-NEXT: ldr d0, [x0]
90 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
91 ; VBITS_GE_512-NEXT: cmeq v1.8b, v0.8b, #0
92 ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
93 ; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b
94 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
95 ; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1]
96 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
97 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
98 ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s
99 ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0
100 ; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [z2.d]
101 ; VBITS_GE_512-NEXT: ret
102 %vals = load <8 x i8>, ptr %a
103 %ptrs = load <8 x ptr>, ptr %b
104 %mask = icmp eq <8 x i8> %vals, zeroinitializer
105 call void @llvm.masked.scatter.v8i8(<8 x i8> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Scatter of <16 x i8> under an "equal to zero" mask, with vscale_range(8,0).
; The expected output is one vl16 st1b after unpacking values and mask to .d.
109 define void @masked_scatter_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
110 ; CHECK-LABEL: masked_scatter_v16i8:
112 ; CHECK-NEXT: ldr q0, [x0]
113 ; CHECK-NEXT: ptrue p0.d, vl16
114 ; CHECK-NEXT: cmeq v1.16b, v0.16b, #0
115 ; CHECK-NEXT: uunpklo z0.h, z0.b
116 ; CHECK-NEXT: sunpklo z1.h, z1.b
117 ; CHECK-NEXT: uunpklo z0.s, z0.h
118 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1]
119 ; CHECK-NEXT: sunpklo z1.s, z1.h
120 ; CHECK-NEXT: uunpklo z0.d, z0.s
121 ; CHECK-NEXT: sunpklo z1.d, z1.s
122 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
123 ; CHECK-NEXT: st1b { z0.d }, p0, [z2.d]
125 %vals = load <16 x i8>, ptr %a
126 %ptrs = load <16 x ptr>, ptr %b
127 %mask = icmp eq <16 x i8> %vals, zeroinitializer
128 call void @llvm.masked.scatter.v16i8(<16 x i8> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
; Scatter of <32 x i8> under an "equal to zero" mask, with vscale_range(16,0).
; The expected output compares in predicate form (cmpeq on .b) and widens the
; predicate with three punpklo steps before a single st1b.
132 define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
133 ; CHECK-LABEL: masked_scatter_v32i8:
135 ; CHECK-NEXT: ptrue p0.b, vl32
136 ; CHECK-NEXT: ptrue p1.d, vl32
137 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
138 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
139 ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
140 ; CHECK-NEXT: uunpklo z0.h, z0.b
141 ; CHECK-NEXT: punpklo p0.h, p0.b
142 ; CHECK-NEXT: uunpklo z0.s, z0.h
143 ; CHECK-NEXT: uunpklo z0.d, z0.s
144 ; CHECK-NEXT: punpklo p0.h, p0.b
145 ; CHECK-NEXT: punpklo p0.h, p0.b
146 ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d]
148 %vals = load <32 x i8>, ptr %a
149 %ptrs = load <32 x ptr>, ptr %b
150 %mask = icmp eq <32 x i8> %vals, zeroinitializer
151 call void @llvm.masked.scatter.v32i8(<32 x i8> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <2 x i16>: values come from %a, pointers from %b, and a lane is
; stored only when its value compares equal to zero. The expected output is a
; single predicated st1h over .d elements.
159 define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
160 ; CHECK-LABEL: masked_scatter_v2i16:
162 ; CHECK-NEXT: ldrh w8, [x0]
163 ; CHECK-NEXT: ldrh w9, [x0, #2]
164 ; CHECK-NEXT: ptrue p0.d, vl2
165 ; CHECK-NEXT: fmov s0, w8
166 ; CHECK-NEXT: mov v0.s[1], w9
167 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
168 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
169 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0
170 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
171 ; CHECK-NEXT: ldr q1, [x1]
172 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
174 %vals = load <2 x i16>, ptr %a
175 %ptrs = load <2 x ptr>, ptr %b
176 %mask = icmp eq <2 x i16> %vals, zeroinitializer
177 call void @llvm.masked.scatter.v2i16(<2 x i16> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
; Scatter of <4 x i16> under an "equal to zero" mask. The expected output
; unpacks values and mask to .d elements and issues one vl4 st1h.
181 define void @masked_scatter_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
182 ; CHECK-LABEL: masked_scatter_v4i16:
184 ; CHECK-NEXT: ldr d0, [x0]
185 ; CHECK-NEXT: ptrue p0.d, vl4
186 ; CHECK-NEXT: cmeq v1.4h, v0.4h, #0
187 ; CHECK-NEXT: uunpklo z0.s, z0.h
188 ; CHECK-NEXT: sunpklo z1.s, z1.h
189 ; CHECK-NEXT: uunpklo z0.d, z0.s
190 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1]
191 ; CHECK-NEXT: sunpklo z1.d, z1.s
192 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
193 ; CHECK-NEXT: st1h { z0.d }, p0, [z2.d]
195 %vals = load <4 x i16>, ptr %a
196 %ptrs = load <4 x ptr>, ptr %b
197 %mask = icmp eq <4 x i16> %vals, zeroinitializer
198 call void @llvm.masked.scatter.v4i16(<4 x i16> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
; Scatter of <8 x i16> under an "equal to zero" mask. This function carries no
; vscale_range attribute, and its expected output is listed per run: the
; 256-bit run uses two vl4 st1h scatters (the upper half is extracted with
; ext #8), while the wider runs use a single vl8 st1h.
202 define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 {
203 ; VBITS_GE_256-LABEL: masked_scatter_v8i16:
204 ; VBITS_GE_256: // %bb.0:
205 ; VBITS_GE_256-NEXT: ldr q0, [x0]
206 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
207 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
208 ; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0
209 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
210 ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
211 ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
212 ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
213 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
214 ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
215 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
216 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
217 ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
218 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
219 ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
220 ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
221 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
222 ; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d]
223 ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
224 ; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d]
225 ; VBITS_GE_256-NEXT: ret
227 ; VBITS_GE_512-LABEL: masked_scatter_v8i16:
228 ; VBITS_GE_512: // %bb.0:
229 ; VBITS_GE_512-NEXT: ldr q0, [x0]
230 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
231 ; VBITS_GE_512-NEXT: cmeq v1.8h, v0.8h, #0
232 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
233 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
234 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
235 ; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1]
236 ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s
237 ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0
238 ; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d]
239 ; VBITS_GE_512-NEXT: ret
240 %vals = load <8 x i16>, ptr %a
241 %ptrs = load <8 x ptr>, ptr %b
242 %mask = icmp eq <8 x i16> %vals, zeroinitializer
243 call void @llvm.masked.scatter.v8i16(<8 x i16> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Scatter of <16 x i16> under an "equal to zero" mask, with vscale_range(8,0).
; The expected output compares in predicate form (cmpeq on .h) and widens the
; predicate with two punpklo steps before a single st1h.
247 define void @masked_scatter_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
248 ; CHECK-LABEL: masked_scatter_v16i16:
250 ; CHECK-NEXT: ptrue p0.h, vl16
251 ; CHECK-NEXT: ptrue p1.d, vl16
252 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
253 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
254 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
255 ; CHECK-NEXT: uunpklo z0.s, z0.h
256 ; CHECK-NEXT: punpklo p0.h, p0.b
257 ; CHECK-NEXT: uunpklo z0.d, z0.s
258 ; CHECK-NEXT: punpklo p0.h, p0.b
259 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
261 %vals = load <16 x i16>, ptr %a
262 %ptrs = load <16 x ptr>, ptr %b
263 %mask = icmp eq <16 x i16> %vals, zeroinitializer
264 call void @llvm.masked.scatter.v16i16(<16 x i16> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
; Scatter of <32 x i16> under an "equal to zero" mask, with vscale_range(16,0).
; The expected output has the same shape as the 16-element case, using vl32.
268 define void @masked_scatter_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
269 ; CHECK-LABEL: masked_scatter_v32i16:
271 ; CHECK-NEXT: ptrue p0.h, vl32
272 ; CHECK-NEXT: ptrue p1.d, vl32
273 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
274 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
275 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
276 ; CHECK-NEXT: uunpklo z0.s, z0.h
277 ; CHECK-NEXT: punpklo p0.h, p0.b
278 ; CHECK-NEXT: uunpklo z0.d, z0.s
279 ; CHECK-NEXT: punpklo p0.h, p0.b
280 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
282 %vals = load <32 x i16>, ptr %a
283 %ptrs = load <32 x ptr>, ptr %b
284 %mask = icmp eq <32 x i16> %vals, zeroinitializer
285 call void @llvm.masked.scatter.v32i16(<32 x i16> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <2 x i32>: values come from %a, pointers from %b, and a lane is
; stored only when its value compares equal to zero. The expected output is a
; single predicated st1w over .d elements.
293 define void @masked_scatter_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
294 ; CHECK-LABEL: masked_scatter_v2i32:
296 ; CHECK-NEXT: ldr d0, [x0]
297 ; CHECK-NEXT: ptrue p0.d, vl2
298 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
299 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
300 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0
301 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
302 ; CHECK-NEXT: ldr q1, [x1]
303 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
305 %vals = load <2 x i32>, ptr %a
306 %ptrs = load <2 x ptr>, ptr %b
307 %mask = icmp eq <2 x i32> %vals, zeroinitializer
308 call void @llvm.masked.scatter.v2i32(<2 x i32> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
; Scatter of <4 x i32> under an "equal to zero" mask. The expected output
; unpacks values and mask once (.s to .d) and issues one vl4 st1w.
312 define void @masked_scatter_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
313 ; CHECK-LABEL: masked_scatter_v4i32:
315 ; CHECK-NEXT: ldr q0, [x0]
316 ; CHECK-NEXT: ptrue p0.d, vl4
317 ; CHECK-NEXT: cmeq v1.4s, v0.4s, #0
318 ; CHECK-NEXT: uunpklo z0.d, z0.s
319 ; CHECK-NEXT: sunpklo z1.d, z1.s
320 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1]
321 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
322 ; CHECK-NEXT: st1w { z0.d }, p0, [z2.d]
324 %vals = load <4 x i32>, ptr %a
325 %ptrs = load <4 x ptr>, ptr %b
326 %mask = icmp eq <4 x i32> %vals, zeroinitializer
327 call void @llvm.masked.scatter.v4i32(<4 x i32> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
; Scatter of <8 x i32> under an "equal to zero" mask. This function carries no
; vscale_range attribute, and its expected output is listed per run: the
; 256-bit run uses two vl4 st1w scatters (upper halves of the values and of the
; materialised mask are extracted with ext #16), while the wider runs use a
; single vl8 st1w.
331 define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 {
332 ; VBITS_GE_256-LABEL: masked_scatter_v8i32:
333 ; VBITS_GE_256: // %bb.0:
334 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
335 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
336 ; VBITS_GE_256-NEXT: ptrue p1.d, vl4
337 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
338 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
339 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1]
340 ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
341 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s
342 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
343 ; VBITS_GE_256-NEXT: punpklo p2.h, p0.b
344 ; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
345 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
346 ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
347 ; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b
348 ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
349 ; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d]
350 ; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0
351 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d]
352 ; VBITS_GE_256-NEXT: ret
354 ; VBITS_GE_512-LABEL: masked_scatter_v8i32:
355 ; VBITS_GE_512: // %bb.0:
356 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
357 ; VBITS_GE_512-NEXT: ptrue p1.d, vl8
358 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
359 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1]
360 ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
361 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
362 ; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
363 ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d]
364 ; VBITS_GE_512-NEXT: ret
365 %vals = load <8 x i32>, ptr %a
366 %ptrs = load <8 x ptr>, ptr %b
367 %mask = icmp eq <8 x i32> %vals, zeroinitializer
368 call void @llvm.masked.scatter.v8i32(<8 x i32> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Scatter of <16 x i32> under an "equal to zero" mask, with vscale_range(8,0).
; The expected output compares in predicate form (cmpeq on .s), widens the
; predicate with one punpklo, and issues a single st1w.
372 define void @masked_scatter_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
373 ; CHECK-LABEL: masked_scatter_v16i32:
375 ; CHECK-NEXT: ptrue p0.s, vl16
376 ; CHECK-NEXT: ptrue p1.d, vl16
377 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
378 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
379 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
380 ; CHECK-NEXT: uunpklo z0.d, z0.s
381 ; CHECK-NEXT: punpklo p0.h, p0.b
382 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
384 %vals = load <16 x i32>, ptr %a
385 %ptrs = load <16 x ptr>, ptr %b
386 %mask = icmp eq <16 x i32> %vals, zeroinitializer
387 call void @llvm.masked.scatter.v16i32(<16 x i32> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
; Scatter of <32 x i32> under an "equal to zero" mask, with vscale_range(16,0).
; The expected output has the same shape as the 16-element case, using vl32.
391 define void @masked_scatter_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
392 ; CHECK-LABEL: masked_scatter_v32i32:
394 ; CHECK-NEXT: ptrue p0.s, vl32
395 ; CHECK-NEXT: ptrue p1.d, vl32
396 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
397 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
398 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
399 ; CHECK-NEXT: uunpklo z0.d, z0.s
400 ; CHECK-NEXT: punpklo p0.h, p0.b
401 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
403 %vals = load <32 x i32>, ptr %a
404 %ptrs = load <32 x ptr>, ptr %b
405 %mask = icmp eq <32 x i32> %vals, zeroinitializer
406 call void @llvm.masked.scatter.v32i32(<32 x i32> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
414 ; Scalarize 1 x i64 scatters: the expected output contains no SVE scatter
; instruction, only a cbnz on the loaded value guarding a plain scalar str
; to the loaded pointer.
415 define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
416 ; CHECK-LABEL: masked_scatter_v1i64:
418 ; CHECK-NEXT: ldr d0, [x0]
419 ; CHECK-NEXT: fmov x8, d0
420 ; CHECK-NEXT: cbnz x8, .LBB15_2
421 ; CHECK-NEXT: // %bb.1: // %cond.store
422 ; CHECK-NEXT: ldr d1, [x1]
423 ; CHECK-NEXT: fmov x8, d1
424 ; CHECK-NEXT: str d0, [x8]
425 ; CHECK-NEXT: .LBB15_2: // %else
427 %vals = load <1 x i64>, ptr %a
428 %ptrs = load <1 x ptr>, ptr %b
429 %mask = icmp eq <1 x i64> %vals, zeroinitializer
430 call void @llvm.masked.scatter.v1i64(<1 x i64> %vals, <1 x ptr> %ptrs, i32 8, <1 x i1> %mask)
; Scatter of <2 x i64> under an "equal to zero" mask. The expected output
; compares with NEON cmeq, converts the result to a predicate with cmpne, and
; issues one vl2 st1d with no unpacking.
434 define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
435 ; CHECK-LABEL: masked_scatter_v2i64:
437 ; CHECK-NEXT: ptrue p0.d, vl2
438 ; CHECK-NEXT: ldr q0, [x0]
439 ; CHECK-NEXT: cmeq v1.2d, v0.2d, #0
440 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
441 ; CHECK-NEXT: ldr q1, [x1]
442 ; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
444 %vals = load <2 x i64>, ptr %a
445 %ptrs = load <2 x ptr>, ptr %b
446 %mask = icmp eq <2 x i64> %vals, zeroinitializer
447 call void @llvm.masked.scatter.v2i64(<2 x i64> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
; Scatter of <4 x i64> under an "equal to zero" mask. The expected output is
; two predicated loads, a cmpeq producing the store predicate, and one st1d.
451 define void @masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
452 ; CHECK-LABEL: masked_scatter_v4i64:
454 ; CHECK-NEXT: ptrue p0.d, vl4
455 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
456 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
457 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
458 ; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
460 %vals = load <4 x i64>, ptr %a
461 %ptrs = load <4 x ptr>, ptr %b
462 %mask = icmp eq <4 x i64> %vals, zeroinitializer
463 call void @llvm.masked.scatter.v4i64(<4 x i64> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
; Scatter of <8 x i64> under an "equal to zero" mask. This function carries no
; vscale_range attribute, and its expected output is listed per run: the
; 256-bit run loads values and pointers as two vl4 halves (second half at
; element offset 4) and issues two st1d scatters, while the wider runs use a
; single vl8 st1d.
467 define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 {
468 ; VBITS_GE_256-LABEL: masked_scatter_v8i64:
469 ; VBITS_GE_256: // %bb.0:
470 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
471 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
472 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
473 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
474 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
475 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
476 ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0
477 ; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z0.d, #0
478 ; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [z3.d]
479 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [z2.d]
480 ; VBITS_GE_256-NEXT: ret
482 ; VBITS_GE_512-LABEL: masked_scatter_v8i64:
483 ; VBITS_GE_512: // %bb.0:
484 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
485 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
486 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
487 ; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, #0
488 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [z1.d]
489 ; VBITS_GE_512-NEXT: ret
490 %vals = load <8 x i64>, ptr %a
491 %ptrs = load <8 x ptr>, ptr %b
492 %mask = icmp eq <8 x i64> %vals, zeroinitializer
493 call void @llvm.masked.scatter.v8i64(<8 x i64> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Scatter of <16 x i64> under an "equal to zero" mask, with vscale_range(8,0).
; The expected output is two vl16 loads, a cmpeq, and one st1d.
497 define void @masked_scatter_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
498 ; CHECK-LABEL: masked_scatter_v16i64:
500 ; CHECK-NEXT: ptrue p0.d, vl16
501 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
502 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
503 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
504 ; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
506 %vals = load <16 x i64>, ptr %a
507 %ptrs = load <16 x ptr>, ptr %b
508 %mask = icmp eq <16 x i64> %vals, zeroinitializer
509 call void @llvm.masked.scatter.v16i64(<16 x i64> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
; Scatter of <32 x i64> under an "equal to zero" mask, with vscale_range(16,0).
; The expected output is two vl32 loads, a cmpeq, and one st1d.
513 define void @masked_scatter_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
514 ; CHECK-LABEL: masked_scatter_v32i64:
516 ; CHECK-NEXT: ptrue p0.d, vl32
517 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
518 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
519 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
520 ; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
522 %vals = load <32 x i64>, ptr %a
523 %ptrs = load <32 x ptr>, ptr %b
524 %mask = icmp eq <32 x i64> %vals, zeroinitializer
525 call void @llvm.masked.scatter.v32i64(<32 x i64> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <2 x half>: a lane is stored only when its value compares
; ordered-equal to zero (fcmp oeq). The expected output compares as a 4h
; vector, rebuilds a two-lane mask through element moves, and issues one st1h
; under a vl4 predicate.
533 define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
534 ; CHECK-LABEL: masked_scatter_v2f16:
536 ; CHECK-NEXT: ldr s1, [x0]
537 ; CHECK-NEXT: movi v0.2d, #0000000000000000
538 ; CHECK-NEXT: ptrue p0.d, vl4
539 ; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0
540 ; CHECK-NEXT: uunpklo z1.s, z1.h
541 ; CHECK-NEXT: sshll v2.4s, v2.4h, #0
542 ; CHECK-NEXT: uunpklo z1.d, z1.s
543 ; CHECK-NEXT: mov v0.h[0], v2.h[0]
544 ; CHECK-NEXT: mov w8, v2.s[1]
545 ; CHECK-NEXT: mov v0.h[1], w8
546 ; CHECK-NEXT: sunpklo z0.s, z0.h
547 ; CHECK-NEXT: sunpklo z0.d, z0.s
548 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
549 ; CHECK-NEXT: ldr q0, [x1]
550 ; CHECK-NEXT: st1h { z1.d }, p0, [z0.d]
552 %vals = load <2 x half>, ptr %a
553 %ptrs = load <2 x ptr>, ptr %b
554 %mask = fcmp oeq <2 x half> %vals, zeroinitializer
555 call void @llvm.masked.scatter.v2f16(<2 x half> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
; Scatter of <4 x half> under an fcmp oeq-with-zero mask. The expected output
; unpacks values and mask to .d elements and issues one vl4 st1h.
559 define void @masked_scatter_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
560 ; CHECK-LABEL: masked_scatter_v4f16:
562 ; CHECK-NEXT: ldr d0, [x0]
563 ; CHECK-NEXT: ptrue p0.d, vl4
564 ; CHECK-NEXT: fcmeq v1.4h, v0.4h, #0.0
565 ; CHECK-NEXT: uunpklo z0.s, z0.h
566 ; CHECK-NEXT: sunpklo z1.s, z1.h
567 ; CHECK-NEXT: uunpklo z0.d, z0.s
568 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1]
569 ; CHECK-NEXT: sunpklo z1.d, z1.s
570 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
571 ; CHECK-NEXT: st1h { z0.d }, p0, [z2.d]
573 %vals = load <4 x half>, ptr %a
574 %ptrs = load <4 x ptr>, ptr %b
575 %mask = fcmp oeq <4 x half> %vals, zeroinitializer
576 call void @llvm.masked.scatter.v4f16(<4 x half> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
; Scatter of <8 x half> under an fcmp oeq-with-zero mask. This function
; carries no vscale_range attribute, and its expected output is listed per
; run: the 256-bit run uses two vl4 st1h scatters (the upper half is extracted
; with ext #8), while the wider runs use a single vl8 st1h.
580 define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 {
581 ; VBITS_GE_256-LABEL: masked_scatter_v8f16:
582 ; VBITS_GE_256: // %bb.0:
583 ; VBITS_GE_256-NEXT: ldr q0, [x0]
584 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
585 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
586 ; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0
587 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
588 ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
589 ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
590 ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
591 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
592 ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s
593 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
594 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
595 ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s
596 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
597 ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
598 ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
599 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
600 ; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d]
601 ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
602 ; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d]
603 ; VBITS_GE_256-NEXT: ret
605 ; VBITS_GE_512-LABEL: masked_scatter_v8f16:
606 ; VBITS_GE_512: // %bb.0:
607 ; VBITS_GE_512-NEXT: ldr q0, [x0]
608 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
609 ; VBITS_GE_512-NEXT: fcmeq v1.8h, v0.8h, #0.0
610 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
611 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
612 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
613 ; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1]
614 ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s
615 ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0
616 ; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d]
617 ; VBITS_GE_512-NEXT: ret
618 %vals = load <8 x half>, ptr %a
619 %ptrs = load <8 x ptr>, ptr %b
620 %mask = fcmp oeq <8 x half> %vals, zeroinitializer
621 call void @llvm.masked.scatter.v8f16(<8 x half> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Scatter of <16 x half> under an fcmp oeq-with-zero mask, with
; vscale_range(8,0). The expected output compares in predicate form (fcmeq on
; .h) and widens the predicate with two punpklo steps before a single st1h.
625 define void @masked_scatter_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
626 ; CHECK-LABEL: masked_scatter_v16f16:
628 ; CHECK-NEXT: ptrue p0.h, vl16
629 ; CHECK-NEXT: ptrue p1.d, vl16
630 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
631 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
632 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
633 ; CHECK-NEXT: uunpklo z0.s, z0.h
634 ; CHECK-NEXT: punpklo p0.h, p0.b
635 ; CHECK-NEXT: uunpklo z0.d, z0.s
636 ; CHECK-NEXT: punpklo p0.h, p0.b
637 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
639 %vals = load <16 x half>, ptr %a
640 %ptrs = load <16 x ptr>, ptr %b
641 %mask = fcmp oeq <16 x half> %vals, zeroinitializer
642 call void @llvm.masked.scatter.v16f16(<16 x half> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
; Scatter of <32 x half> under an fcmp oeq-with-zero mask, with
; vscale_range(16,0). The expected output has the same shape as the
; 16-element case, using vl32.
646 define void @masked_scatter_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
647 ; CHECK-LABEL: masked_scatter_v32f16:
649 ; CHECK-NEXT: ptrue p0.h, vl32
650 ; CHECK-NEXT: ptrue p1.d, vl32
651 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
652 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
653 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
654 ; CHECK-NEXT: uunpklo z0.s, z0.h
655 ; CHECK-NEXT: punpklo p0.h, p0.b
656 ; CHECK-NEXT: uunpklo z0.d, z0.s
657 ; CHECK-NEXT: punpklo p0.h, p0.b
658 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
660 %vals = load <32 x half>, ptr %a
661 %ptrs = load <32 x ptr>, ptr %b
662 %mask = fcmp oeq <32 x half> %vals, zeroinitializer
663 call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <2 x float>: a lane is stored only when its value compares
; ordered-equal to zero (fcmp oeq). The expected output is a single predicated
; st1w over .d elements.
671 define void @masked_scatter_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
672 ; CHECK-LABEL: masked_scatter_v2f32:
674 ; CHECK-NEXT: ldr d0, [x0]
675 ; CHECK-NEXT: ptrue p0.d, vl2
676 ; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0
677 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
678 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0
679 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
680 ; CHECK-NEXT: ldr q1, [x1]
681 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
683 %vals = load <2 x float>, ptr %a
684 %ptrs = load <2 x ptr>, ptr %b
685 %mask = fcmp oeq <2 x float> %vals, zeroinitializer
686 call void @llvm.masked.scatter.v2f32(<2 x float> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
; Scatter of <4 x float> under an fcmp oeq-with-zero mask. The expected output
; unpacks values and mask once (.s to .d) and issues one vl4 st1w.
690 define void @masked_scatter_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
691 ; CHECK-LABEL: masked_scatter_v4f32:
693 ; CHECK-NEXT: ldr q0, [x0]
694 ; CHECK-NEXT: ptrue p0.d, vl4
695 ; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0
696 ; CHECK-NEXT: uunpklo z0.d, z0.s
697 ; CHECK-NEXT: sunpklo z1.d, z1.s
698 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1]
699 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
700 ; CHECK-NEXT: st1w { z0.d }, p0, [z2.d]
702 %vals = load <4 x float>, ptr %a
703 %ptrs = load <4 x ptr>, ptr %b
704 %mask = fcmp oeq <4 x float> %vals, zeroinitializer
705 call void @llvm.masked.scatter.v4f32(<4 x float> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
; Scatter of <8 x float> under an fcmp oeq-with-zero mask. This function
; carries no vscale_range attribute, and its expected output is listed per
; run: the 256-bit run uses two vl4 st1w scatters (upper halves of the values
; and of the materialised mask are extracted with ext #16), while the wider
; runs use a single vl8 st1w.
709 define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 {
710 ; VBITS_GE_256-LABEL: masked_scatter_v8f32:
711 ; VBITS_GE_256: // %bb.0:
712 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
713 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
714 ; VBITS_GE_256-NEXT: ptrue p1.d, vl4
715 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
716 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
717 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1]
718 ; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
719 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s
720 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
721 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
722 ; VBITS_GE_256-NEXT: punpklo p2.h, p0.b
723 ; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
724 ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
725 ; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b
726 ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
727 ; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d]
728 ; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0
729 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d]
730 ; VBITS_GE_256-NEXT: ret
732 ; VBITS_GE_512-LABEL: masked_scatter_v8f32:
733 ; VBITS_GE_512: // %bb.0:
734 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
735 ; VBITS_GE_512-NEXT: ptrue p1.d, vl8
736 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
737 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1]
738 ; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
739 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
740 ; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
741 ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d]
742 ; VBITS_GE_512-NEXT: ret
743 %vals = load <8 x float>, ptr %a
744 %ptrs = load <8 x ptr>, ptr %b
745 %mask = fcmp oeq <8 x float> %vals, zeroinitializer
746 call void @llvm.masked.scatter.v8f32(<8 x float> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Scatter of <16 x float> under an fcmp oeq-with-zero mask, with
; vscale_range(8,0). The expected output compares in predicate form (fcmeq on
; .s), widens the predicate with one punpklo, and issues a single st1w.
750 define void @masked_scatter_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
751 ; CHECK-LABEL: masked_scatter_v16f32:
753 ; CHECK-NEXT: ptrue p0.s, vl16
754 ; CHECK-NEXT: ptrue p1.d, vl16
755 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
756 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
757 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
758 ; CHECK-NEXT: uunpklo z0.d, z0.s
759 ; CHECK-NEXT: punpklo p0.h, p0.b
760 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
762 %vals = load <16 x float>, ptr %a
763 %ptrs = load <16 x ptr>, ptr %b
764 %mask = fcmp oeq <16 x float> %vals, zeroinitializer
765 call void @llvm.masked.scatter.v16f32(<16 x float> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
; Scatter of <32 x float> under an fcmp oeq-with-zero mask, with
; vscale_range(16,0). The expected output has the same shape as the
; 16-element case, using vl32.
769 define void @masked_scatter_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
770 ; CHECK-LABEL: masked_scatter_v32f32:
772 ; CHECK-NEXT: ptrue p0.s, vl32
773 ; CHECK-NEXT: ptrue p1.d, vl32
774 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
775 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
776 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
777 ; CHECK-NEXT: uunpklo z0.d, z0.s
778 ; CHECK-NEXT: punpklo p0.h, p0.b
779 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
781 %vals = load <32 x float>, ptr %a
782 %ptrs = load <32 x ptr>, ptr %b
783 %mask = fcmp oeq <32 x float> %vals, zeroinitializer
784 call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
792 ; Scalarize 1 x double scatters: the expected output contains no SVE scatter
; instruction, only a scalar fcmp against #0.0 and b.ne guarding a plain str
; to the loaded pointer.
793 define void @masked_scatter_v1f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
794 ; CHECK-LABEL: masked_scatter_v1f64:
796 ; CHECK-NEXT: ldr d0, [x0]
797 ; CHECK-NEXT: fcmp d0, #0.0
798 ; CHECK-NEXT: b.ne .LBB31_2
799 ; CHECK-NEXT: // %bb.1: // %cond.store
800 ; CHECK-NEXT: ldr d1, [x1]
801 ; CHECK-NEXT: fmov x8, d1
802 ; CHECK-NEXT: str d0, [x8]
803 ; CHECK-NEXT: .LBB31_2: // %else
805 %vals = load <1 x double>, ptr %a
806 %ptrs = load <1 x ptr>, ptr %b
807 %mask = fcmp oeq <1 x double> %vals, zeroinitializer
808 call void @llvm.masked.scatter.v1f64(<1 x double> %vals, <1 x ptr> %ptrs, i32 8, <1 x i1> %mask)
; Scatter of <2 x double> under an fcmp oeq-with-zero mask. The expected
; output compares with NEON fcmeq, converts the result to a predicate with
; cmpne, and issues one vl2 st1d with no unpacking.
812 define void @masked_scatter_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
813 ; CHECK-LABEL: masked_scatter_v2f64:
815 ; CHECK-NEXT: ptrue p0.d, vl2
816 ; CHECK-NEXT: ldr q0, [x0]
817 ; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0
818 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
819 ; CHECK-NEXT: ldr q1, [x1]
820 ; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
822 %vals = load <2 x double>, ptr %a
823 %ptrs = load <2 x ptr>, ptr %b
824 %mask = fcmp oeq <2 x double> %vals, zeroinitializer
825 call void @llvm.masked.scatter.v2f64(<2 x double> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask)
; Scatter of <4 x double> under an fcmp oeq-with-zero mask. The expected
; output is two predicated loads, an fcmeq producing the store predicate in
; p1, and one st1d.
829 define void @masked_scatter_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
830 ; CHECK-LABEL: masked_scatter_v4f64:
832 ; CHECK-NEXT: ptrue p0.d, vl4
833 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
834 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
835 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
836 ; CHECK-NEXT: st1d { z0.d }, p1, [z1.d]
838 %vals = load <4 x double>, ptr %a
839 %ptrs = load <4 x ptr>, ptr %b
840 %mask = fcmp oeq <4 x double> %vals, zeroinitializer
841 call void @llvm.masked.scatter.v4f64(<4 x double> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask)
; Scatter of <8 x double> under an fcmp oeq-with-zero mask. This function
; carries no vscale_range attribute, and its expected output is listed per
; run: the 256-bit run loads values and pointers as two vl4 halves (second
; half at element offset 4) and issues two st1d scatters, while the wider runs
; use a single vl8 st1d.
845 define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 {
846 ; VBITS_GE_256-LABEL: masked_scatter_v8f64:
847 ; VBITS_GE_256: // %bb.0:
848 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
849 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
850 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
851 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
852 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
853 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
854 ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z1.d, #0.0
855 ; VBITS_GE_256-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
856 ; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [z3.d]
857 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [z2.d]
858 ; VBITS_GE_256-NEXT: ret
860 ; VBITS_GE_512-LABEL: masked_scatter_v8f64:
861 ; VBITS_GE_512: // %bb.0:
862 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
863 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
864 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
865 ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
866 ; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d]
867 ; VBITS_GE_512-NEXT: ret
868 %vals = load <8 x double>, ptr %a
869 %ptrs = load <8 x ptr>, ptr %b
870 %mask = fcmp oeq <8 x double> %vals, zeroinitializer
871 call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Scatter of <16 x double> under an fcmp oeq-with-zero mask, with
; vscale_range(8,0). The expected output is two vl16 loads, an fcmeq, and one
; st1d.
875 define void @masked_scatter_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
876 ; CHECK-LABEL: masked_scatter_v16f64:
878 ; CHECK-NEXT: ptrue p0.d, vl16
879 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
880 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
881 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
882 ; CHECK-NEXT: st1d { z0.d }, p1, [z1.d]
884 %vals = load <16 x double>, ptr %a
885 %ptrs = load <16 x ptr>, ptr %b
886 %mask = fcmp oeq <16 x double> %vals, zeroinitializer
887 call void @llvm.masked.scatter.v16f64(<16 x double> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask)
; Masked scatter of <32 x double> under vscale_range(16,0). The mask selects the
; lanes whose value is ordered and equal to zero. The checks expect one st1d
; scatter governed by a vl32 predicate, shared by all run configurations.
891 define void @masked_scatter_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
892 ; CHECK-LABEL: masked_scatter_v32f64:
894 ; CHECK-NEXT: ptrue p0.d, vl32
895 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
896 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
897 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
898 ; CHECK-NEXT: st1d { z0.d }, p1, [z1.d]
900 %vals = load <32 x double>, ptr %a
901 %ptrs = load <32 x ptr>, ptr %b
902 %mask = fcmp oeq <32 x double> %vals, zeroinitializer ; lane active when its value is ordered-equal to zero
903 call void @llvm.masked.scatter.v32f64(<32 x double> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
907 ; The tests above cover the element types; the tests below check that the
908 ; addressing modes still function.
; Scatter of <32 x half> to %base indexed by sign-extended 32-bit indices. The
; getelementptr uses the half element type, so the indices are scaled by the
; element size. The checks expect the [x2, z1.s, sxtw #1] addressing form, with
; the values and the predicate unpacked to .s lanes (uunpklo and punpklo) before
; the st1h.
910 define void @masked_scatter_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
911 ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16:
913 ; CHECK-NEXT: ptrue p0.h, vl32
914 ; CHECK-NEXT: ptrue p1.s, vl32
915 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
916 ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
917 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
918 ; CHECK-NEXT: uunpklo z0.s, z0.h
919 ; CHECK-NEXT: punpklo p0.h, p0.b
920 ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
922 %vals = load <32 x half>, ptr %a
923 %idxs = load <32 x i32>, ptr %b
924 %ext = sext <32 x i32> %idxs to <32 x i64> ; sign-extend the 32-bit indices
925 %ptrs = getelementptr half, ptr %base, <32 x i64> %ext ; element-scaled offsets from %base
926 %mask = fcmp oeq <32 x half> %vals, zeroinitializer
927 call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x float> to %base indexed by sign-extended 32-bit indices that
; are scaled by the float element size (getelementptr float). The checks expect
; the [x2, z1.s, sxtw #2] addressing form with no unpacking of values or
; predicate.
931 define void @masked_scatter_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
932 ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32:
934 ; CHECK-NEXT: ptrue p0.s, vl32
935 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
936 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
937 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
938 ; CHECK-NEXT: st1w { z0.s }, p1, [x2, z1.s, sxtw #2]
940 %vals = load <32 x float>, ptr %a
941 %idxs = load <32 x i32>, ptr %b
942 %ext = sext <32 x i32> %idxs to <32 x i64> ; sign-extend the 32-bit indices
943 %ptrs = getelementptr float, ptr %base, <32 x i64> %ext ; element-scaled offsets from %base
944 %mask = fcmp oeq <32 x float> %vals, zeroinitializer
945 call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x double> to %base indexed by sign-extended 32-bit indices that
; are scaled by the double element size (getelementptr double). The checks
; expect the indices to be loaded with ld1sw into .d lanes and the store to use
; the [x2, z1.d, lsl #3] addressing form.
949 define void @masked_scatter_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
950 ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64:
952 ; CHECK-NEXT: ptrue p0.d, vl32
953 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
954 ; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x1]
955 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
956 ; CHECK-NEXT: st1d { z0.d }, p1, [x2, z1.d, lsl #3]
958 %vals = load <32 x double>, ptr %a
959 %idxs = load <32 x i32>, ptr %b
960 %ext = sext <32 x i32> %idxs to <32 x i64> ; sign-extend the 32-bit indices
961 %ptrs = getelementptr double, ptr %base, <32 x i64> %ext ; element-scaled offsets from %base
962 %mask = fcmp oeq <32 x double> %vals, zeroinitializer
963 call void @llvm.masked.scatter.v32f64(<32 x double> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x half> to %base indexed by zero-extended 32-bit indices that
; are scaled by the half element size (getelementptr half). The checks expect
; the [x2, z1.s, uxtw #1] addressing form, with the values and the predicate
; unpacked to .s lanes before the st1h.
967 define void @masked_scatter_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
968 ; CHECK-LABEL: masked_scatter_32b_scaled_zext:
970 ; CHECK-NEXT: ptrue p0.h, vl32
971 ; CHECK-NEXT: ptrue p1.s, vl32
972 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
973 ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
974 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
975 ; CHECK-NEXT: uunpklo z0.s, z0.h
976 ; CHECK-NEXT: punpklo p0.h, p0.b
977 ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
979 %vals = load <32 x half>, ptr %a
980 %idxs = load <32 x i32>, ptr %b
981 %ext = zext <32 x i32> %idxs to <32 x i64> ; zero-extend the 32-bit indices
982 %ptrs = getelementptr half, ptr %base, <32 x i64> %ext ; element-scaled offsets from %base
983 %mask = fcmp oeq <32 x half> %vals, zeroinitializer
984 call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x half> to %base plus sign-extended 32-bit byte offsets. The
; getelementptr uses i8, so the offsets are not scaled by the element size. The
; checks expect the unscaled [x2, z1.s, sxtw] addressing form.
988 define void @masked_scatter_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
989 ; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
991 ; CHECK-NEXT: ptrue p0.h, vl32
992 ; CHECK-NEXT: ptrue p1.s, vl32
993 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
994 ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
995 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
996 ; CHECK-NEXT: uunpklo z0.s, z0.h
997 ; CHECK-NEXT: punpklo p0.h, p0.b
998 ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw]
1000 %vals = load <32 x half>, ptr %a
1001 %idxs = load <32 x i32>, ptr %b
1002 %ext = sext <32 x i32> %idxs to <32 x i64> ; sign-extend the 32-bit offsets
1003 %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext ; byte-granular offsets from %base
1004 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> ; source and destination types are identical
1005 %mask = fcmp oeq <32 x half> %vals, zeroinitializer
1006 call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x half> to %base plus zero-extended 32-bit byte offsets. The
; getelementptr uses i8, so the offsets are not scaled by the element size. The
; checks expect the unscaled [x2, z1.s, uxtw] addressing form.
1010 define void @masked_scatter_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1011 ; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
1013 ; CHECK-NEXT: ptrue p0.h, vl32
1014 ; CHECK-NEXT: ptrue p1.s, vl32
1015 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1016 ; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
1017 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
1018 ; CHECK-NEXT: uunpklo z0.s, z0.h
1019 ; CHECK-NEXT: punpklo p0.h, p0.b
1020 ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw]
1022 %vals = load <32 x half>, ptr %a
1023 %idxs = load <32 x i32>, ptr %b
1024 %ext = zext <32 x i32> %idxs to <32 x i64> ; zero-extend the 32-bit offsets
1025 %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext ; byte-granular offsets from %base
1026 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> ; source and destination types are identical
1027 %mask = fcmp oeq <32 x half> %vals, zeroinitializer
1028 call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x float> to %base indexed by 64-bit indices that are scaled by
; the float element size (getelementptr float). The checks expect the
; [x2, z1.d, lsl #2] addressing form, with the values and the predicate unpacked
; to .d lanes before the st1w.
1032 define void @masked_scatter_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1033 ; CHECK-LABEL: masked_scatter_64b_scaled:
1035 ; CHECK-NEXT: ptrue p0.s, vl32
1036 ; CHECK-NEXT: ptrue p1.d, vl32
1037 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1038 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
1039 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
1040 ; CHECK-NEXT: uunpklo z0.d, z0.s
1041 ; CHECK-NEXT: punpklo p0.h, p0.b
1042 ; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2]
1044 %vals = load <32 x float>, ptr %a
1045 %idxs = load <32 x i64>, ptr %b
1046 %ptrs = getelementptr float, ptr %base, <32 x i64> %idxs ; element-scaled offsets from %base
1047 %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1048 call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x float> to %base plus 64-bit byte offsets. The getelementptr
; uses i8, so the offsets are not scaled by the element size. The checks expect
; the unscaled [x2, z1.d] addressing form.
1052 define void @masked_scatter_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
1053 ; CHECK-LABEL: masked_scatter_64b_unscaled:
1055 ; CHECK-NEXT: ptrue p0.s, vl32
1056 ; CHECK-NEXT: ptrue p1.d, vl32
1057 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1058 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
1059 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
1060 ; CHECK-NEXT: uunpklo z0.d, z0.s
1061 ; CHECK-NEXT: punpklo p0.h, p0.b
1062 ; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d]
1064 %vals = load <32 x float>, ptr %a
1065 %idxs = load <32 x i64>, ptr %b
1066 %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %idxs ; byte-granular offsets from %base
1067 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> ; source and destination types are identical
1068 %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1069 call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x float> through a vector of base pointers, each advanced by the
; same scalar byte offset %off. The checks expect the scalar offset to take the
; base-register slot of the address, giving the [x2, z1.d] form with the loaded
; pointer vector in z1.
1073 define void @masked_scatter_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range(16,0) #0 {
1074 ; CHECK-LABEL: masked_scatter_vec_plus_reg:
1076 ; CHECK-NEXT: ptrue p0.s, vl32
1077 ; CHECK-NEXT: ptrue p1.d, vl32
1078 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1079 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
1080 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
1081 ; CHECK-NEXT: uunpklo z0.d, z0.s
1082 ; CHECK-NEXT: punpklo p0.h, p0.b
1083 ; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d]
1085 %vals = load <32 x float>, ptr %a
1086 %bases = load <32 x ptr>, ptr %b
1087 %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 %off ; add the same byte offset to every base pointer
1088 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> ; source and destination types are identical
1089 %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1090 call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
; Scatter of <32 x float> through a vector of base pointers, each advanced by a
; constant 4 bytes. The checks expect the constant to be folded into the store
; as an immediate, giving the [z1.d, #4] addressing form.
1094 define void @masked_scatter_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 {
1095 ; CHECK-LABEL: masked_scatter_vec_plus_imm:
1097 ; CHECK-NEXT: ptrue p0.s, vl32
1098 ; CHECK-NEXT: ptrue p1.d, vl32
1099 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1100 ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1]
1101 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
1102 ; CHECK-NEXT: uunpklo z0.d, z0.s
1103 ; CHECK-NEXT: punpklo p0.h, p0.b
1104 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d, #4]
1106 %vals = load <32 x float>, ptr %a
1107 %bases = load <32 x ptr>, ptr %b
1108 %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 4 ; add 4 bytes to every base pointer
1109 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> ; source and destination types are identical
1110 %mask = fcmp oeq <32 x float> %vals, zeroinitializer
1111 call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
1115 ; extract_subvec(...(insert_subvec(a,b,c))) -> extract_subvec(bitcast(b),d) like
1116 ; combines can effectively unlegalise bitcast operations. This test ensures such
1117 ; combines do not happen after operation legalisation. When not prevented, the
1118 ; test triggers an infinite combine->legalise->combine->... cycle.
1120 ; NOTE: For this test to function correctly it's critical for %vals to be in a
1121 ; different block to the scatter store. If not, the problematic bitcast will be
1122 ; removed before operation legalisation and thus not exercise the combine.
; Masked scatter of <8 x double> under vscale_range(4,0) where the values are
; loaded before a conditional branch and the scatter is performed in %bb.1. The
; checks expect the value load ahead of the tbz, and the compare, pointer load
; and st1d scatter inside %bb.1.
1123 define void @masked_scatter_bitcast_infinite_loop(ptr %a, ptr %b, i1 %cond) vscale_range(4,0) #0 {
1124 ; CHECK-LABEL: masked_scatter_bitcast_infinite_loop:
1126 ; CHECK-NEXT: ptrue p0.d, vl8
1127 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1128 ; CHECK-NEXT: tbz w2, #0, .LBB47_2
1129 ; CHECK-NEXT: // %bb.1: // %bb.1
1130 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0
1131 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1132 ; CHECK-NEXT: st1d { z0.d }, p1, [z1.d]
1133 ; CHECK-NEXT: .LBB47_2: // %bb.2
1135 %vals = load volatile <8 x double>, ptr %a
1136 br i1 %cond, label %bb.1, label %bb.2 ; keeps the %vals load and the scatter in separate blocks
1139 %ptrs = load <8 x ptr>, ptr %b
1140 %mask = fcmp oeq <8 x double> %vals, zeroinitializer
1141 call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask)
; Declarations of the llvm.masked.scatter intrinsic, one per fixed-length vector
; type, grouped by element type (i8, i16, i32, i64, half, float, double). The
; operands are, in order, the values to store, the vector of destination
; pointers, an i32 operand (the alignment, per the LLVM LangRef) and the
; per-lane i1 mask.
1148 declare void @llvm.masked.scatter.v2i8(<2 x i8>, <2 x ptr>, i32, <2 x i1>)
1149 declare void @llvm.masked.scatter.v4i8(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
1150 declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
1151 declare void @llvm.masked.scatter.v16i8(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
1152 declare void @llvm.masked.scatter.v32i8(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
1154 declare void @llvm.masked.scatter.v2i16(<2 x i16>, <2 x ptr>, i32, <2 x i1>)
1155 declare void @llvm.masked.scatter.v4i16(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
1156 declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
1157 declare void @llvm.masked.scatter.v16i16(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
1158 declare void @llvm.masked.scatter.v32i16(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
1160 declare void @llvm.masked.scatter.v2i32(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
1161 declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
1162 declare void @llvm.masked.scatter.v8i32(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
1163 declare void @llvm.masked.scatter.v16i32(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
1164 declare void @llvm.masked.scatter.v32i32(<32 x i32>, <32 x ptr>, i32, <32 x i1>)
1166 declare void @llvm.masked.scatter.v1i64(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
1167 declare void @llvm.masked.scatter.v2i64(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
1168 declare void @llvm.masked.scatter.v4i64(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
1169 declare void @llvm.masked.scatter.v8i64(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
1170 declare void @llvm.masked.scatter.v16i64(<16 x i64>, <16 x ptr>, i32, <16 x i1>)
1171 declare void @llvm.masked.scatter.v32i64(<32 x i64>, <32 x ptr>, i32, <32 x i1>)
1173 declare void @llvm.masked.scatter.v2f16(<2 x half>, <2 x ptr>, i32, <2 x i1>)
1174 declare void @llvm.masked.scatter.v4f16(<4 x half>, <4 x ptr>, i32, <4 x i1>)
1175 declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x ptr>, i32, <8 x i1>)
1176 declare void @llvm.masked.scatter.v16f16(<16 x half>, <16 x ptr>, i32, <16 x i1>)
1177 declare void @llvm.masked.scatter.v32f16(<32 x half>, <32 x ptr>, i32, <32 x i1>)
1179 declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x ptr>, i32, <2 x i1>)
1180 declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x ptr>, i32, <4 x i1>)
1181 declare void @llvm.masked.scatter.v8f32(<8 x float>, <8 x ptr>, i32, <8 x i1>)
1182 declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x ptr>, i32, <16 x i1>)
1183 declare void @llvm.masked.scatter.v32f32(<32 x float>, <32 x ptr>, i32, <32 x i1>)
1185 declare void @llvm.masked.scatter.v1f64(<1 x double>, <1 x ptr>, i32, <1 x i1>)
1186 declare void @llvm.masked.scatter.v2f64(<2 x double>, <2 x ptr>, i32, <2 x i1>)
1187 declare void @llvm.masked.scatter.v4f64(<4 x double>, <4 x ptr>, i32, <4 x i1>)
1188 declare void @llvm.masked.scatter.v8f64(<8 x double>, <8 x ptr>, i32, <8 x i1>)
1189 declare void @llvm.masked.scatter.v16f64(<16 x double>, <16 x ptr>, i32, <16 x i1>)
1190 declare void @llvm.masked.scatter.v32f64(<32 x double>, <32 x ptr>, i32, <32 x i1>)
; Attribute group #0, attached to the test functions above, enables the SVE
; target feature.
1192 attributes #0 = { "target-features"="+sve" }