; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; Range checks: for all the instructions tested in this file, the
; immediate must be within the range [-8, 7] (4-bit immediate). Out-of-range
; values are tested in only one case (the first function below). Valid
; values are tested throughout the rest of the file.
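
; The function below is an illustrative sketch only: it is not covered by the
; autogenerated assertions in this file, and the @imm_in_range_example name is
; hypothetical. It shows an access whose offset (7 vector registers) sits at
; the top of the [-8, 7] range and is therefore expected to fold into the
; reg+imm addressing form, e.g. ld1d { z0.d }, p0/z, [x0, #7, mul vl].
define <vscale x 2 x i64> @imm_in_range_example(ptr %base, <vscale x 2 x i1> %mask) nounwind {
  ; Offset of 7 <vscale x 2 x i64> registers from %base, still encodable as a 4-bit immediate.
  %addr = getelementptr <vscale x 2 x i64>, ptr %base, i64 7
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ret <vscale x 2 x i64> %data
}
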
define void @imm_out_of_range(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: imm_out_of_range:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #8
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT:    rdvl x8, #-9
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i64>, ptr %base, i64 8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, ptr %base, i64 -9
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane contiguous load/stores.

define void @test_masked_ldst_sv2i8(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 -8
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %base_store = getelementptr <vscale x 2 x i8>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 -8
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %base_store = getelementptr <vscale x 2 x i16>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 -8
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %base_store = getelementptr <vscale x 2 x i32>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i64>, ptr %base, i64 -8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x half>, ptr %base, i64 -8
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  %base_store = getelementptr <vscale x 2 x half>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x float>, ptr %base, i64 -8
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  %base_store = getelementptr <vscale x 2 x float>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #-6, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #-5, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x double>, ptr %base, i64 -6
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  %base_store = getelementptr <vscale x 2 x double>, ptr %base, i64 -5
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data, ptr %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 -4
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 -3
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 1
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 2
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 -2
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 -1
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 3
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, ptr %base_load, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 4
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, ptr %base_load, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 5
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, ptr %base_load, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1b { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 -1
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %base_store = getelementptr <vscale x 4 x i8>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data, ptr %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 -1
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %base_store = getelementptr <vscale x 4 x i16>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data, ptr %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i32>, ptr %base, i64 6
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  %base_store = getelementptr <vscale x 4 x i32>, ptr %base, i64 7
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data, ptr %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x half>, ptr %base, i64 -1
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  %base_store = getelementptr <vscale x 4 x half>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data, ptr %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x float>, ptr %base, i64 -1
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  %base_store = getelementptr <vscale x 4 x float>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data, ptr %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 -4
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 -3
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 1
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 2
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.s }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 3
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, ptr %base_load, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 4
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, ptr %base_load, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1b { z0.h }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 6
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %base_store = getelementptr <vscale x 8 x i8>, ptr %base, i64 7
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data, ptr %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i16>, ptr %base, i64 6
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
  %base_store = getelementptr <vscale x 8 x i16>, ptr %base, i64 7
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data, ptr %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x half>, ptr %base, i64 -1
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  %base_store = getelementptr <vscale x 8 x half>, ptr %base, i64 2
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data, ptr %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(ptr %base, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x bfloat>, ptr %base, i64 -1
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
  %base_store = getelementptr <vscale x 8 x bfloat>, ptr %base, i64 2
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data, ptr %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 -4
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 -3
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.h }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 3
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, ptr %base_load, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(ptr %base, <vscale x 16 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 16 x i8>, ptr %base, i64 6
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %base_load, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  %base_store = getelementptr <vscale x 16 x i8>, ptr %base, i64 7
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data, ptr %base_store, i32 1, <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (ptr , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (ptr , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (ptr , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , ptr , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, ptr, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , ptr , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, ptr, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , ptr , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, ptr, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }