; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

; 2-lane contiguous load/stores.

define void @test_masked_ldst_sv2i8(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, ptr %base_i8, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data, ptr %base_i16, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data, ptr %base_i32, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, ptr %base, i64 %offset
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_i64, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, ptr %base_i64, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr %base_half, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data, ptr %base_half, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr %base_float, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data, ptr %base_float, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, ptr %base, i64 %offset
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %base_double, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data, ptr %base_double, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, ptr %base_i16, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, ptr %base_i32, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data, ptr %base_i8, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data, ptr %base_i16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %base_i32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data, ptr %base_i32, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr %base_f16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data, ptr %base_f16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr %base_f32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data, ptr %base_f32, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, ptr %base_i16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data, ptr %base_i8, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %base_i16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data, ptr %base_i16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr %base_f16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data, ptr %base_f16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, ptr %base, i64 %offset
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr %base_f16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data, ptr %base_f16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(ptr %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %base_i8, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data, ptr %base_i8, i32 1, <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8(<vscale x 2 x i8>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, ptr, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8(<vscale x 4 x i8>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, ptr, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8(<vscale x 8 x i8>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, ptr, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }