; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
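
; Masked loads and stores of scalable vectors should be lowered to SVE
; contiguous ld1*/st1* instructions using the reg+reg addressing mode
; ([base, index{, lsl #shift}]), including extending loads and truncating
; stores.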

; 2-lane contiguous load/stores

define void @test_masked_ldst_sv2i8(i8 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
                                      <vscale x 2 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(i16 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
                                       <vscale x 2 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(i32 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
                                       <vscale x 2 x i32>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(i64 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, i64* %base, i64 %offset
  %base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       <vscale x 2 x i64>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(half * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_half to <vscale x 2 x half>*
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 2 x i1> %mask,
                                                             <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
                                       <vscale x 2 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(float * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_float to <vscale x 2 x float>*
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_addr,
                                                              i32 1,
                                                              <vscale x 2 x i1> %mask,
                                                              <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
                                       <vscale x 2 x float>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(double * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, double* %base, i64 %offset
  %base_addr = bitcast double* %base_double to <vscale x 2 x double>*
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_addr,
                                                               i32 1,
                                                               <vscale x 2 x i1> %mask,
                                                               <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
                                       <vscale x 2 x double>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, i8 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
                                      <vscale x 2 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, i16 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
                                       <vscale x 2 x i16> *%base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, i32 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
                                       <vscale x 2 x i32> *%base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(i8 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
                                      <vscale x 4 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(i16 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
                                       <vscale x 4 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(i32 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
                                       <vscale x 4 x i32>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(half * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 4 x half>*
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 4 x i1> %mask,
                                                             <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
                                       <vscale x 4 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(float * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_f32 to <vscale x 4 x float>*
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_addr,
                                                              i32 1,
                                                              <vscale x 4 x i1> %mask,
                                                              <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
                                       <vscale x 4 x float>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, i8 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
                                      <vscale x 4 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, i16 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
                                       <vscale x 4 x i16> *%base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(i8 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
                                      <vscale x 8 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(i16 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 8 x i1> %mask,
                                                            <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
                                       <vscale x 8 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(half * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 8 x half>*
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 8 x i1> %mask,
                                                             <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
                                       <vscale x 8 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(bfloat * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, bfloat* %base, i64 %offset
  %base_addr = bitcast bfloat* %base_f16 to <vscale x 8 x bfloat>*
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_addr,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        <vscale x 8 x bfloat>* %base_addr,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, i8 *%base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
                                      <vscale x 8 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(i8 * %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_addr,
                                                            i32 1,
                                                            <vscale x 16 x i1> %mask,
                                                            <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
                                       <vscale x 16 x i8>* %base_addr,
                                       i32 1,
                                       <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }