; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"
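
;
; Masked Loads
;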
define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    ldr s2, [x1]
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    fcmeq v1.4h, v1.4h, v2.4h
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mov v0.h[0], v1.h[0]
; CHECK-NEXT:    mov w8, v1.s[1]
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <2 x half>, ptr %ap
  %b = load <2 x half>, ptr %bp
  %mask = fcmp oeq <2 x half> %a, %b
  %load = call <2 x half> @llvm.masked.load.v2f16(ptr %ap, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
  ret <2 x half> %load
}

define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %mask = fcmp oeq <2 x float> %a, %b
  %load = call <2 x float> @llvm.masked.load.v2f32(ptr %ap, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
  ret <2 x float> %load
}

define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %mask = fcmp oeq <4 x float> %a, %b
  %load = call <4 x float> @llvm.masked.load.v4f32(ptr %ap, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %load
}
define void @masked_load_v8f32(ptr %ap, ptr %bp, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %mask = fcmp oeq <8 x float> %a, %b
  %load = call <8 x float> @llvm.masked.load.v8f32(ptr %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
  store <8 x float> %load, ptr %c
  ret void
}
define void @masked_load_v16f32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %mask = fcmp oeq <16 x float> %a, %b
  %load = call <16 x float> @llvm.masked.load.v16f32(ptr %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer)
  store <16 x float> %load, ptr %c
  ret void
}
define void @masked_load_v32f32(ptr %ap, ptr %bp, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_load_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %mask = fcmp oeq <32 x float> %a, %b
  %load = call <32 x float> @llvm.masked.load.v32f32(ptr %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
  store <32 x float> %load, ptr %c
  ret void
}

define void @masked_load_v64f32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %mask = fcmp oeq <64 x float> %a, %b
  %load = call <64 x float> @llvm.masked.load.v64f32(ptr %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer)
  store <64 x float> %load, ptr %c
  ret void
}
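; Masked loads of integer element types.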
define void @masked_load_v64i8(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_256-NEXT:    cmpeq p2.b, p0/z, z2.b, z3.b
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x2, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <64 x i8>, ptr %ap
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %a, %b
  %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  store <64 x i8> %load, ptr %c
  ret void
}
define void @masked_load_v32i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z2.h, z3.h
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %ap
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %a, %b
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  store <32 x i16> %load, ptr %c
  ret void
}
define void @masked_load_v16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %load = call <16 x i32> @llvm.masked.load.v16i32(ptr %ap, i32 8, <16 x i1> %mask, <16 x i32> undef)
  store <16 x i32> %load, ptr %c
  ret void
}
define void @masked_load_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  store <8 x i64> %load, ptr %c
  ret void
}
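; Masked loads with a non-undef passthru, which is blended in with sel.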
define void @masked_load_passthru_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> %b)
  store <8 x i64> %load, ptr %c
  ret void
}
define void @masked_load_passthru_v8f64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x double>, ptr %ap
  %b = load <8 x double>, ptr %bp
  %mask = fcmp oeq <8 x double> %a, %b
  %load = call <8 x double> @llvm.masked.load.v8f64(ptr %ap, i32 8, <8 x i1> %mask, <8 x double> %b)
  store <8 x double> %load, ptr %c
  ret void
}
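; Masked loads whose result is sign extended, where the mask is computed at
; the narrow (load) type.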
define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
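; As above, but with the result zero extended instead.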
define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
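; Sign-extending masked loads where the mask is computed at the wide
; (extended) type and must be narrowed to the load type.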
define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
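; As above, but with the result zero extended instead.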
define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
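; Extending masked loads of wide vectors, requiring vscale_range(16,0),
; i.e. a 2048-bit minimum vector length.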
1291 define void @masked_load_sext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
1292 ; CHECK-LABEL: masked_load_sext_v128i8i16:
1294 ; CHECK-NEXT: ptrue p0.h, vl128
1295 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
1296 ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
1297 ; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0]
1298 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
1300 %b = load <128 x i8>, ptr %bp
1301 %mask = icmp eq <128 x i8> %b, zeroinitializer
1302 %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
1303 %ext = sext <128 x i8> %load to <128 x i16>
1304 store <128 x i16> %ext, ptr %c
1308 define void @masked_load_sext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
1309 ; CHECK-LABEL: masked_load_sext_v64i8i32:
1311 ; CHECK-NEXT: ptrue p0.s, vl64
1312 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
1313 ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
1314 ; CHECK-NEXT: ld1sb { z0.s }, p1/z, [x0]
1315 ; CHECK-NEXT: st1w { z0.s }, p0, [x2]
1317 %b = load <64 x i8>, ptr %bp
1318 %mask = icmp eq <64 x i8> %b, zeroinitializer
1319 %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
1320 %ext = sext <64 x i8> %load to <64 x i32>
1321 store <64 x i32> %ext, ptr %c
define void @masked_load_sext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v64i16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i16>, ptr %bp
  %mask = icmp eq <64 x i16> %b, zeroinitializer
  %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
  %ext = sext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  %ext = sext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i32>, ptr %bp
  %mask = icmp eq <32 x i32> %b, zeroinitializer
  %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
  %ext = sext <32 x i32> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

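;
; Zero-extending masked loads. Same layout as the sign-extending tests above,
; but the extension folds into the unsigned load forms (ld1b/ld1h/ld1w).
;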
define void @masked_load_zext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v128i8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1b { z0.h }, p1/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <128 x i8>, ptr %bp
  %mask = icmp eq <128 x i8> %b, zeroinitializer
  %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
  %ext = zext <128 x i8> %load to <128 x i16>
  store <128 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1b { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %b, zeroinitializer
  %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  %ext = zext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1b { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i16>, ptr %bp
  %mask = icmp eq <64 x i16> %b, zeroinitializer
  %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1h { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i32>, ptr %bp
  %mask = icmp eq <32 x i32> %b, zeroinitializer
  %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
  %ext = zext <32 x i32> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

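;
; Extending masked loads whose mask comes from a non-equality compare.
; icmp ugt/sgt against zero lowers to cmpne/cmpgt rather than cmpeq.
;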
define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp ugt <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpgt p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp sgt <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

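;
; Intrinsic declarations.
;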
declare <2 x half> @llvm.masked.load.v2f16(ptr, i32, <2 x i1>, <2 x half>)
declare <2 x float> @llvm.masked.load.v2f32(ptr, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.load.v8f32(ptr, i32, <8 x i1>, <8 x float>)
declare <16 x float> @llvm.masked.load.v16f32(ptr, i32, <16 x i1>, <16 x float>)
declare <32 x float> @llvm.masked.load.v32f32(ptr, i32, <32 x i1>, <32 x float>)
declare <64 x float> @llvm.masked.load.v64f32(ptr, i32, <64 x i1>, <64 x float>)

declare <128 x i8> @llvm.masked.load.v128i8(ptr, i32, <128 x i1>, <128 x i8>)
declare <64 x i8> @llvm.masked.load.v64i8(ptr, i32, <64 x i1>, <64 x i8>)
declare <32 x i8> @llvm.masked.load.v32i8(ptr, i32, <32 x i1>, <32 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.masked.load.v16i16(ptr, i32, <16 x i1>, <16 x i16>)
declare <8 x i8> @llvm.masked.load.v8i8(ptr, i32, <8 x i1>, <8 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16(ptr, i32, <8 x i1>, <8 x i16>)
declare <8 x i32> @llvm.masked.load.v8i32(ptr, i32, <8 x i1>, <8 x i32>)
declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>)
declare <32 x i16> @llvm.masked.load.v32i16(ptr, i32, <32 x i1>, <32 x i16>)
declare <64 x i16> @llvm.masked.load.v64i16(ptr, i32, <64 x i1>, <64 x i16>)
declare <16 x i32> @llvm.masked.load.v16i32(ptr, i32, <16 x i1>, <16 x i32>)
declare <8 x i64> @llvm.masked.load.v8i64(ptr, i32, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.load.v8f64(ptr, i32, <8 x i1>, <8 x double>)

attributes #0 = { "target-features"="+sve" }