; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"
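
;
; Masked Loads
;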
define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: ldr s2, [x1]
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h
; CHECK-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-NEXT: mov v0.h[0], v1.h[0]
; CHECK-NEXT: mov w8, v1.s[1]
; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %a = load <2 x half>, ptr %ap
  %b = load <2 x half>, ptr %bp
  %mask = fcmp oeq <2 x half> %a, %b
  %load = call <2 x half> @llvm.masked.load.v2f16.p0(ptr %ap, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
  ret <2 x half> %load
}

define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %mask = fcmp oeq <2 x float> %a, %b
  %load = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %ap, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
  ret <2 x float> %load
}

define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %mask = fcmp oeq <4 x float> %a, %b
  %load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %ap, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %load
}

define <8 x float> @masked_load_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %mask = fcmp oeq <8 x float> %a, %b
  %load = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %load
}
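
; Tests without a vscale_range attribute are checked under both prefixes: the
; VBITS_GE_256 lowering splits a 512-bit operation across two registers, while
; VBITS_GE_512 handles it in a single register.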
define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_v16f32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z1.s, z3.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v16f32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %mask = fcmp oeq <16 x float> %a, %b
  %load = call <16 x float> @llvm.masked.load.v16f32.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer)
  ret <16 x float> %load
}

define <32 x float> @masked_load_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_load_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %mask = fcmp oeq <32 x float> %a, %b
  %load = call <32 x float> @llvm.masked.load.v32f32.p0(ptr %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
  ret <32 x float> %load
}

define <64 x float> @masked_load_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_v64f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %mask = fcmp oeq <64 x float> %a, %b
  %load = call <64 x float> @llvm.masked.load.v64f32.p0(ptr %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer)
  ret <64 x float> %load
}
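
; Masked loads of integer vectors, with the mask produced by an integer
; compare of two loaded vectors.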
define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w9, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x9]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b
; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z1.b, z3.b
; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x9]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p2/z, [x0]
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x8, x9]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <64 x i8>, ptr %ap
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %a, %b
  %load = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  ret <64 x i8> %load
}

define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, z3.h
; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p2/z, [x0]
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <32 x i16>, ptr %ap
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %a, %b
  %load = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  ret <32 x i16> %load
}

define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, z3.s
; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %load = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i32> undef)
  ret <16 x i32> %load
}

define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  ret <8 x i64> %load
}
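
; As above, but with a non-undef passthru: inactive lanes take their value
; from %b, merged in with a predicated sel after the masked load.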
define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> %b)
  ret <8 x i64> %load
}

define <8 x double> @masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z2.d
; VBITS_GE_256-NEXT: sel z1.d, p2, z1.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x double>, ptr %ap
  %b = load <8 x double>, ptr %bp
  %mask = fcmp oeq <8 x double> %a, %b
  %load = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x double> %b)
  ret <8 x double> %load
}
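
; Masked loads followed by a sign extension of the loaded value. When a wide
; enough register is available, the load and extend fold into a single
; ld1sb/ld1sh/ld1sw.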
define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  ret <32 x i16> %ext
}

define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl16
; VBITS_GE_256-NEXT: ldr q0, [x1]
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl8
; VBITS_GE_256-NEXT: ldr d0, [x1]
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_sext_v8i16i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: ldr q0, [x1]
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  ret <8 x i64> %ext
}
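
; As above, but zero extending, which folds into ld1b/ld1h/ld1w instead.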
define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  ret <32 x i16> %ext
}

define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl16
; VBITS_GE_256-NEXT: ldr q0, [x1]
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl8
; VBITS_GE_256-NEXT: ldr d0, [x1]
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: ldr q0, [x1]
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  ret <8 x i64> %ext
}
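
; Sign-extending masked loads whose mask is computed at the extended element
; width, so the VBITS_GE_256 lowering must first narrow it (uzp1/splice) to
; the width of the load.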
define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, #0
; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.b, vl16
; VBITS_GE_256-NEXT: ptrue p2.b, vl32
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b
; VBITS_GE_256-NEXT: cmpne p1.b, p2/z, z1.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  ret <32 x i16> %ext
}

define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.b, vl16
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p1.b, vl8
; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.h, vl16
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: sunpklo z0.h, z1.b
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: ptrue p2.s, vl8
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  ret <8 x i64> %ext
}
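
; Zero-extending masked loads whose mask is computed at the extended element
; width.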
define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x9, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z1.h, #0
; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.b, vl16
; VBITS_GE_256-NEXT: ptrue p2.b, vl32
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b
; VBITS_GE_256-NEXT: cmpne p1.b, p2/z, z1.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8, x9, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  ret <32 x i16> %ext
}

define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.b, vl16
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p1.b, vl8
; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0
; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.h, vl16
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: sunpklo z0.h, z1.b
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: ptrue p2.s, vl8
; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  ret <8 x i64> %ext
}
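
; Extending masked loads of vectors that require a 2048-bit SVE register
; (vscale_range(16,0)), sign extending first.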
1257 define <128 x i16> @masked_load_sext_v128i8i16(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
1258 ; CHECK-LABEL: masked_load_sext_v128i8i16:
1260 ; CHECK-NEXT: ptrue p0.h, vl128
1261 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
1262 ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
1263 ; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0]
1264 ; CHECK-NEXT: st1h { z0.h }, p0, [x8]
1266 %b = load <128 x i8>, ptr %bp
1267 %mask = icmp eq <128 x i8> %b, zeroinitializer
1268 %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
1269 %ext = sext <128 x i8> %load to <128 x i16>
1270 ret <128 x i16> %ext
1273 define <64 x i32> @masked_load_sext_v64i8i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
1274 ; CHECK-LABEL: masked_load_sext_v64i8i32:
1276 ; CHECK-NEXT: ptrue p0.s, vl64
1277 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
1278 ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
1279 ; CHECK-NEXT: ld1sb { z0.s }, p1/z, [x0]
1280 ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
1282 %b = load <64 x i8>, ptr %bp
1283 %mask = icmp eq <64 x i8> %b, zeroinitializer
1284 %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
1285 %ext = sext <64 x i8> %load to <64 x i32>
1289 define <32 x i64> @masked_load_sext_v32i8i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
1290 ; CHECK-LABEL: masked_load_sext_v32i8i64:
1292 ; CHECK-NEXT: ptrue p0.d, vl32
1293 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1]
1294 ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
1295 ; CHECK-NEXT: ld1sb { z0.d }, p1/z, [x0]
1296 ; CHECK-NEXT: st1d { z0.d }, p0, [x8]
1298 %b = load <32 x i8>, ptr %bp
1299 %mask = icmp eq <32 x i8> %b, zeroinitializer
1300 %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
1301 %ext = sext <32 x i8> %load to <32 x i64>
1305 define <64 x i32> @masked_load_sext_v64i16i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
1306 ; CHECK-LABEL: masked_load_sext_v64i16i32:
1308 ; CHECK-NEXT: ptrue p0.s, vl64
1309 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
1310 ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
1311 ; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0]
1312 ; CHECK-NEXT: st1w { z0.s }, p0, [x8]
1314 %b = load <64 x i16>, ptr %bp
1315 %mask = icmp eq <64 x i16> %b, zeroinitializer
1316 %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
1317 %ext = sext <64 x i16> %load to <64 x i32>
define <32 x i64> @masked_load_sext_v32i16i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1sh { z0.d }, p1/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%b = load <32 x i16>, ptr %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
%ext = sext <32 x i16> %load to <32 x i64>
ret <32 x i64> %ext
}

define <32 x i64> @masked_load_sext_v32i32i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_sext_v32i32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%b = load <32 x i32>, ptr %bp
%mask = icmp eq <32 x i32> %b, zeroinitializer
%load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
%ext = sext <32 x i32> %load to <32 x i64>
ret <32 x i64> %ext
}

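; Zero-extending variants of the tests above. Here the extend should combine
; into the unsigned ld1b/ld1h/ld1w forms of the load.
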
define <128 x i16> @masked_load_zext_v128i8i16(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v128i8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0]
; CHECK-NEXT: st1h { z0.h }, p0, [x8]
; CHECK-NEXT: ret
%b = load <128 x i8>, ptr %bp
%mask = icmp eq <128 x i8> %b, zeroinitializer
%load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
%ext = zext <128 x i8> %load to <128 x i16>
ret <128 x i16> %ext
}

define <64 x i32> @masked_load_zext_v64i8i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1b { z0.s }, p1/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%b = load <64 x i8>, ptr %bp
%mask = icmp eq <64 x i8> %b, zeroinitializer
%load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
%ext = zext <64 x i8> %load to <64 x i32>
ret <64 x i32> %ext
}

define <32 x i64> @masked_load_zext_v32i8i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1b { z0.d }, p1/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%b = load <32 x i8>, ptr %bp
%mask = icmp eq <32 x i8> %b, zeroinitializer
%load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
%ext = zext <32 x i8> %load to <32 x i64>
ret <32 x i64> %ext
}

define <64 x i32> @masked_load_zext_v64i16i32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0]
; CHECK-NEXT: st1w { z0.s }, p0, [x8]
; CHECK-NEXT: ret
%b = load <64 x i16>, ptr %bp
%mask = icmp eq <64 x i16> %b, zeroinitializer
%load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
%ext = zext <64 x i16> %load to <64 x i32>
ret <64 x i32> %ext
}

define <32 x i64> @masked_load_zext_v32i16i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1h { z0.d }, p1/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%b = load <32 x i16>, ptr %bp
%mask = icmp eq <32 x i16> %b, zeroinitializer
%load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
%ext = zext <32 x i16> %load to <32 x i64>
ret <32 x i64> %ext
}

define <32 x i64> @masked_load_zext_v32i32i64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1w { z0.d }, p1/z, [x0]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: ret
%b = load <32 x i32>, ptr %bp
%mask = icmp eq <32 x i32> %b, zeroinitializer
%load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
%ext = zext <32 x i32> %load to <32 x i64>
ret <32 x i64> %ext
}

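; Masks produced by non-equality compares: ugt 0 lowers to cmpne and sgt 0
; to cmpgt, and the extending-load combine should still apply.
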
define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, ptr %bp
%mask = icmp ugt <8 x i32> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = sext <8 x i32> %load to <8 x i64>
ret <8 x i64> %ext
}

define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x9, #4 // =0x4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, ptr %bp
%mask = icmp sgt <8 x i32> %b, zeroinitializer
%load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
%ext = zext <8 x i32> %load to <8 x i64>
ret <8 x i64> %ext
}

declare <2 x half> @llvm.masked.load.v2f16(ptr, i32, <2 x i1>, <2 x half>)
declare <2 x float> @llvm.masked.load.v2f32(ptr, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.load.v8f32(ptr, i32, <8 x i1>, <8 x float>)
declare <16 x float> @llvm.masked.load.v16f32(ptr, i32, <16 x i1>, <16 x float>)
declare <32 x float> @llvm.masked.load.v32f32(ptr, i32, <32 x i1>, <32 x float>)
declare <64 x float> @llvm.masked.load.v64f32(ptr, i32, <64 x i1>, <64 x float>)

declare <128 x i8> @llvm.masked.load.v128i8(ptr, i32, <128 x i1>, <128 x i8>)
declare <64 x i8> @llvm.masked.load.v64i8(ptr, i32, <64 x i1>, <64 x i8>)
declare <32 x i8> @llvm.masked.load.v32i8(ptr, i32, <32 x i1>, <32 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.masked.load.v16i16(ptr, i32, <16 x i1>, <16 x i16>)
declare <8 x i8> @llvm.masked.load.v8i8(ptr, i32, <8 x i1>, <8 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16(ptr, i32, <8 x i1>, <8 x i16>)
declare <8 x i32> @llvm.masked.load.v8i32(ptr, i32, <8 x i1>, <8 x i32>)
declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>)
declare <32 x i16> @llvm.masked.load.v32i16(ptr, i32, <32 x i1>, <32 x i16>)
declare <64 x i16> @llvm.masked.load.v64i16(ptr, i32, <64 x i1>, <64 x i16>)
declare <16 x i32> @llvm.masked.load.v16i32(ptr, i32, <16 x i1>, <16 x i32>)
declare <8 x i64> @llvm.masked.load.v8i64(ptr, i32, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.load.v8f64(ptr, i32, <8 x i1>, <8 x double>)

attributes #0 = { "target-features"="+sve" }