; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sme < %s | FileCheck %s

; NOTE: invalid, upper and lower bound immediate values of the reg+imm
; addressing mode are checked only for the byte version of each
; instruction (`ld<N>b`), as the code for detecting the immediate is
; common to all instructions, and varies only in the number of
; elements of the structure load, which is <N> = 2, 3, 4.
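;
; For reference, the reg+imm form takes a signed immediate that must be
; a multiple of <N> in the range [-8*<N>, 7*<N>]:
;   ld2b: multiples of 2 in [-16, 14]
;   ld3b: multiples of 3 in [-24, 21]
;   ld4b: multiples of 4 in [-32, 28]
; Offsets that are out of range or not a multiple of <N> are expected to
; be materialized with RDVL and use the reg+reg form instead, as the
; *_not_multiple_* and *_outside_* tests below verify.
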
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 2
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 14
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_not_multiple_of_2(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_not_multiple_of_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #3
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_outside_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_outside_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #-18
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -18
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_outside_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_outside_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #16
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 16
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2.nxv16i16(<vscale x 8 x i1> %Pg, <vscale x 8 x i16>* %addr) {
; CHECK-LABEL: ld2.nxv16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 14
  %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> %Pg, i16 *%base_ptr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2.nxv16f16(<vscale x 8 x i1> %Pg, <vscale x 8 x half>* %addr) {
; CHECK-LABEL: ld2.nxv16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 8 x half>* %base to half *
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> %Pg, half *%base_ptr)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2.nxv16bf16(<vscale x 8 x i1> %Pg, <vscale x 8 x bfloat>* %addr) #0 {
; CHECK-LABEL: ld2.nxv16bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [x0, #12, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 12
  %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> %Pg, bfloat *%base_ptr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2.nxv8i32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32>* %addr) {
; CHECK-LABEL: ld2.nxv8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2w { z0.s, z1.s }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 14
  %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> %Pg, i32 *%base_ptr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2.nxv8f32(<vscale x 4 x i1> %Pg, <vscale x 4 x float>* %addr) {
; CHECK-LABEL: ld2.nxv8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2w { z0.s, z1.s }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 4 x float>* %base to float *
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> %Pg, float *%base_ptr)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2.nxv4i64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64>* %addr) {
; CHECK-LABEL: ld2.nxv4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 14
  %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> %Pg, i64 *%base_ptr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2.nxv4f64(<vscale x 2 x i1> %Pg, <vscale x 2 x double>* %addr) {
; CHECK-LABEL: ld2.nxv4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 2 x double>* %base to double *
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> %Pg, double *%base_ptr)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 21
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_not_multiple_of_3_01(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_01:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #4
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_not_multiple_of_3_02(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_02:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #5
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_outside_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_outside_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #-27
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -27
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_outside_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_outside_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #24
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 24
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3.nxv24i16(<vscale x 8 x i1> %Pg, <vscale x 8 x i16> *%addr) {
; CHECK-LABEL: ld3.nxv24i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 21
  %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3.sret.nxv8i16(<vscale x 8 x i1> %Pg, i16 *%base_ptr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3.nxv24f16(<vscale x 8 x i1> %Pg, <vscale x 8 x half> *%addr) {
; CHECK-LABEL: ld3.nxv24f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 21
  %base_ptr = bitcast <vscale x 8 x half>* %base to half *
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3.sret.nxv8f16(<vscale x 8 x i1> %Pg, half *%base_ptr)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3.nxv24bf16(<vscale x 8 x i1> %Pg, <vscale x 8 x bfloat> *%addr) #0 {
; CHECK-LABEL: ld3.nxv24bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3.sret.nxv8bf16(<vscale x 8 x i1> %Pg, bfloat *%base_ptr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3.nxv12i32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> *%addr) {
; CHECK-LABEL: ld3.nxv12i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 21
  %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> %Pg, i32 *%base_ptr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3.nxv12f32(<vscale x 4 x i1> %Pg, <vscale x 4 x float> *%addr) {
; CHECK-LABEL: ld3.nxv12f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 4 x float>* %base to float *
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> %Pg, float *%base_ptr)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3.nxv6i64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> *%addr) {
; CHECK-LABEL: ld3.nxv6i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 21
  %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> %Pg, i64 *%base_ptr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3.nxv6f64(<vscale x 2 x i1> %Pg, <vscale x 2 x double> *%addr) {
; CHECK-LABEL: ld3.nxv6f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 2 x double>* %base to double *
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3.sret.nxv2f64(<vscale x 2 x i1> %Pg, double *%base_ptr)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 28
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_not_multiple_of_4_01(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_01:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #5
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_not_multiple_of_4_02(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_02:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #6
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 6
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_not_multiple_of_4_03(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_03:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #7
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 7
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_outside_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_outside_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    mov x9, #-576
; CHECK-NEXT:    lsr x8, x8, #4
; CHECK-NEXT:    mul x8, x8, x9
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #4) #-9)
; x9 = -9 * 2^6 = -576
; x8 = RDVL * 2^-4
; xOFFSET = RDVL * 2^-4 * -9 * 2^6 = RDVL * -36
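;
; A possible lowering for that FIXME (a sketch, not the current output):
;   rdvl x8, #4      ; x8 = 4 * VL
;   mov  x9, #-9
;   mul  x8, x8, x9  ; x8 = -36 * VL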
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -36
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_outside_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_outside_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    mov w9, #512
; CHECK-NEXT:    lsr x8, x8, #4
; CHECK-NEXT:    mul x8, x8, x9
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #16) #2)
; x9 = 2^9 = 512
; x8 = RDVL * 2^-4
; xOFFSET = RDVL * 2^-4 * 2^9 = RDVL * 32
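;
; A possible lowering for that FIXME (a sketch, not the current output):
;   rdvl x8, #16     ; x8 = 16 * VL
;   lsl  x8, x8, #1  ; x8 = 32 * VL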
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 32
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4.nxv32i16(<vscale x 8 x i1> %Pg, <vscale x 8 x i16> *%addr) {
; CHECK-LABEL: ld4.nxv32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4h { z0.h - z3.h }, p0/z, [x0, #8, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 8
  %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4.sret.nxv8i16(<vscale x 8 x i1> %Pg, i16 *%base_ptr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4.nxv32f16(<vscale x 8 x i1> %Pg, <vscale x 8 x half> *%addr) {
; CHECK-LABEL: ld4.nxv32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4h { z0.h - z3.h }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 28
  %base_ptr = bitcast <vscale x 8 x half>* %base to half *
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4.sret.nxv8f16(<vscale x 8 x i1> %Pg, half *%base_ptr)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4.nxv32bf16(<vscale x 8 x i1> %Pg, <vscale x 8 x bfloat> *%addr) #0 {
; CHECK-LABEL: ld4.nxv32bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4h { z0.h - z3.h }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4.sret.nxv8bf16(<vscale x 8 x i1> %Pg, bfloat *%base_ptr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4.nxv16i32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> *%addr) {
; CHECK-LABEL: ld4.nxv16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 28
  %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> %Pg, i32 *%base_ptr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4.nxv16f32(<vscale x 4 x i1> %Pg, <vscale x 4 x float>* %addr) {
; CHECK-LABEL: ld4.nxv16f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 4 x float>* %base to float *
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4.sret.nxv4f32(<vscale x 4 x i1> %Pg, float *%base_ptr)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4.nxv8i64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> *%addr) {
; CHECK-LABEL: ld4.nxv8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4d { z0.d - z3.d }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 28
  %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> %Pg, i64 *%base_ptr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4.nxv8f64(<vscale x 2 x i1> %Pg, <vscale x 2 x double> *%addr) {
; CHECK-LABEL: ld4.nxv8f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4d { z0.d - z3.d }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 2 x double>* %base to double *
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> %Pg, double *%base_ptr)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1>, i8*)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1>, i16*)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1>, i32*)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1>, i64*)
declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1>, half*)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1>, float*)
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1>, double*)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1>, i8*)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3.sret.nxv8i16(<vscale x 8 x i1>, i16*)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1>, i32*)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1>, i64*)
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3.sret.nxv8f16(<vscale x 8 x i1>, half*)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3.sret.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1>, float*)
declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3.sret.nxv2f64(<vscale x 2 x i1>, double*)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1>, i8*)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4.sret.nxv8i16(<vscale x 8 x i1>, i16*)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1>, i32*)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1>, i64*)
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4.sret.nxv8f16(<vscale x 8 x i1>, half*)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4.sret.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4.sret.nxv4f32(<vscale x 4 x i1>, float*)
declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat versions.
attributes #0 = { "target-features"="+bf16" }