; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

; Range checks: for all the instructions tested in this file, the
; immediate must be within the range [-8, 7] (4-bit immediate). Out of
; range values are tested only in one case (following). Valid values
; are tested all through the rest of the file.
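; Offsets outside that range cannot be encoded in the reg+imm form, so the
; address is first materialised with ADDVL and the plain [reg] addressing
; form is used instead, as the CHECK lines of @imm_out_of_range show.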

define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: imm_out_of_range:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addvl x8, x0, #8
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT:    addvl x8, x0, #-9
; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -9
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane contiguous load/stores
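; The mnemonic suffix (b/h/w/d) gives the in-memory element size, while the
; Z operand suffix gives the lane width (.d for these 2-lane cases).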

define void @test_masked_ldst_sv2i8(<vscale x 2 x i8> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -8
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %base_store = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8> * %base, i64 -7
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(<vscale x 2 x i16> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 -8
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %base_store = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16> * %base, i64 -7
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(<vscale x 2 x i32> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -8
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %base_store = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32> * %base, i64 -7
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -7
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(<vscale x 2 x half> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x half>, <vscale x 2 x half>* %base, i64 -8
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  %base_store = getelementptr <vscale x 2 x half>, <vscale x 2 x half> * %base, i64 -7
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(<vscale x 2 x float> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x float>, <vscale x 2 x float>* %base, i64 -8
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  %base_store = getelementptr <vscale x 2 x float>, <vscale x 2 x float> * %base, i64 -7
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(<vscale x 2 x double> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #-6, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #-5, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base, i64 -6
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  %base_store = getelementptr <vscale x 2 x double>, <vscale x 2 x double> * %base, i64 -5
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double>* %base_store, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.
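; The zero/sign extension of the loaded value folds into the load itself,
; selecting LD1B/LD1H/LD1W vs LD1SB/LD1SH/LD1SW; no separate extend is emitted.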

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -4
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -3
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 1
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 2
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -2
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -1
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.
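; Likewise the truncate folds into the store: ST1B/ST1H/ST1W write only the
; low part of each .d lane.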

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, <vscale x 2 x i8> *%base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 3
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, <vscale x 2 x i8> *%base_load, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, <vscale x 2 x i16> *%base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 4
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, <vscale x 2 x i16> *%base_load, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, <vscale x 2 x i32> *%base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 5
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, <vscale x 2 x i32> *%base_load, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(<vscale x 4 x i8> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1b { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -1
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %base_store = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8> * %base, i64 2
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8>* %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(<vscale x 4 x i16> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 -1
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %base_store = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16> * %base, i64 2
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16>* %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(<vscale x 4 x i32> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base, i64 6
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  %base_store = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32> * %base, i64 7
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32>* %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(<vscale x 4 x half> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x half>, <vscale x 4 x half>* %base, i64 -1
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  %base_store = getelementptr <vscale x 4 x half>, <vscale x 4 x half> * %base, i64 2
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half>* %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(<vscale x 4 x float> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base, i64 -1
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  %base_store = getelementptr <vscale x 4 x float>, <vscale x 4 x float> * %base, i64 2
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float>* %base_store, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -4
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -3
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 1
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 2
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, <vscale x 4 x i8> *%base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.s }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 3
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, <vscale x 4 x i8> *%base_load, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, <vscale x 4 x i16> *%base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 4
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, <vscale x 4 x i16> *%base_load, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(<vscale x 8 x i8> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1b { z0.h }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 6
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %base_store = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8> * %base, i64 7
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data, <vscale x 8 x i8>* %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(<vscale x 8 x i16> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base, i64 6
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
  %base_store = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16> * %base, i64 7
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i16>* %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base, i64 -1
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  %base_store = getelementptr <vscale x 8 x half>, <vscale x 8 x half> * %base, i64 2
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data, <vscale x 8 x half>* %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
  %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x bfloat>* %base_store, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -4
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -3
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, <vscale x 8 x i8> *%base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.h }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 3
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, <vscale x 8 x i8> *%base_load, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base, i64 6
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_load, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  %base_store = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8> * %base, i64 7
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %base_store, i32 1, <vscale x 16 x i1> %mask)
  ret void
}
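
; The i32 operand of the masked load/store intrinsics declared below is the
; pointer alignment (alignment 1 in the calls above).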

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }