1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
; ld2q of nxv16i8: immediate-offset forms (#-16 / #14 are the ends of the
; signed multiple-of-2 immediate range), scalar-offset form (x1 scaled by 16
; bytes, lsl #4), and the plain no-offset form.
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_si_i8_off16(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_i8_off16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 -16
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_si_i8_off14(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_i8_off14:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 14
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_ss_i8(<vscale x 16 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr2)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2q_i8(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}
; ld2q of nxv8i16: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2q_si_i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, ptr %addr, i64 -16
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %base)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2q_ss_i16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr2)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2q_i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}
; ld2q of nxv4i32: immediate-offset, scalar-offset and no-offset forms.
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2q_si_i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %gep = getelementptr <vscale x 4 x i32>, ptr %addr, i64 -16
  %ld = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %gep)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2q_ss_i32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %gep = getelementptr i128, ptr %addr, i64 %a
  %ld = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %gep)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2q_i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %ld = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld
}
; ld2q of nxv2i64: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2q_si_i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, ptr %addr, i64 -16
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %base)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2q_ss_i64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr2)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2q_i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
; ld2q of nxv8f16: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2q_si_f16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, ptr %addr, i64 -16
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %base)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2q_ss_f16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr2)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2q_f16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}
; ld2q of nxv4f32: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2q_si_f32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, ptr %addr, i64 -16
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %base)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2q_ss_f32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2q_f32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}
; ld2q of nxv2f64: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2q_si_f64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, ptr %addr, i64 -16
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %base)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2q_ss_f64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr2)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2q_f64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}
; ld2q of nxv8bf16: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2q_si_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_si_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, ptr %addr, i64 -16
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %base)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2q_ss_bf16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld2q_ss_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr2)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2q_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld2q_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2q { z0.q, z1.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
; ld3q of nxv16i8: immediate-offset forms (#-24 / #21 are the ends of the
; signed multiple-of-3 immediate range), scalar-offset and no-offset forms.
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3q_si_i8_off24(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_i8_off24:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 -24
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3q_si_i8_off21(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_i8_off21:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 21
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3q_ss_i8(<vscale x 16 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr2)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3q_i8(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}
; ld3q of nxv8i16: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3q_si_i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, ptr %addr, i64 -24
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %base)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3q_ss_i16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr2)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3q_i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}
; ld3q of nxv4i32: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3q_si_i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, ptr %addr, i64 -24
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %base)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3q_ss_i32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3q_i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}
; ld3q of nxv2i64: immediate-offset, scalar-offset and no-offset forms.
; Fixed the si form to step by <vscale x 2 x i64> (it previously used
; <vscale x 4 x i32>; both are 16*vscale bytes so the emitted offset is
; unchanged, but the element type now matches the loaded type like every
; sibling test).
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3q_si_i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, ptr %addr, i64 -24
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %base)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3q_ss_i64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr2)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3q_i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
; ld3q of nxv8f16: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3q_si_f16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, ptr %addr, i64 -24
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %base)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3q_ss_f16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr2)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3q_f16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}
; ld3q of nxv4f32: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3q_si_f32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, ptr %addr, i64 -24
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %base)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3q_ss_f32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr2)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3q_f32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}
; ld3q of nxv2f64: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3q_si_f64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, ptr %addr, i64 -24
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %base)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3q_ss_f64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr2)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3q_f64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}
; ld3q of nxv8bf16: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3q_si_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_si_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, ptr %addr, i64 -24
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %base)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3q_ss_bf16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld3q_ss_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr2)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3q_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld3q_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3q { z0.q - z2.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
; ld4q of nxv16i8: immediate-offset forms (#-32 / #28 are the ends of the
; signed multiple-of-4 immediate range), scalar-offset and no-offset forms.
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4q_si_i8_off32(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_si_i8_off32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 -32
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4q_si_i8_off28(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_si_i8_off28:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 28
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %base)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4q_ss_i8(<vscale x 16 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld4q_ss_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr2)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4q_i8(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1> %pg, ptr %addr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}
; ld4q of nxv8i16: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4q_si_i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_si_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, ptr %addr, i64 -32
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %base)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4q_ss_i16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld4q_ss_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr2)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4q_i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> %pg, ptr %addr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}
; ld4q of nxv4i32: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4q_si_i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_si_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, ptr %addr, i64 -32
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %base)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4q_ss_i32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld4q_ss_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4q_i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %addr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}
; ld4q of nxv2i64: immediate-offset, scalar-offset and no-offset forms.
; The redundant ptr-to-ptr bitcast (typed-pointer leftover) is removed.
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4q_si_i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_si_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, ptr %addr, i64 -32
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %base)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4q_ss_i64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
; CHECK-LABEL: ld4q_ss_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %addr2 = getelementptr i128, ptr %addr, i64 %a
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr2)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4q_i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: ld4q_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4q { z0.q - z3.q }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1> %pg, ptr %addr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
639 define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4q_si_f16(<vscale x 8 x i1> %pg, ptr %addr ) {
640 ; CHECK-LABEL: ld4q_si_f16:
642 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
644 %base = getelementptr <vscale x 8 x half>, ptr %addr, i64 -32
645 %base_ptr = bitcast ptr %base to ptr
646 %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %base_ptr);
647 ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
; LD4Q nxv8f16, scalar-plus-scalar addressing: the i128 GEP index %a is expected
; to fold into the scaled register offset [x0, x1, lsl #4].
650 define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4q_ss_f16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
651 ; CHECK-LABEL: ld4q_ss_f16:
653 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
655 %addr2 = getelementptr i128, ptr %addr, i64 %a
656 %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr2);
657 ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
; LD4Q nxv8f16, plain base-register addressing: no offset, expect [x0].
660 define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4q_f16(<vscale x 8 x i1> %pg, ptr %addr) {
661 ; CHECK-LABEL: ld4q_f16:
663 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
665 %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1> %pg, ptr %addr);
666 ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
; LD4Q nxv4f32, scalar-plus-immediate addressing: GEP of -32 scalable vectors
; folds into [x0, #-32, mul vl].
669 define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4q_si_f32(<vscale x 4 x i1> %pg, ptr %addr ) {
670 ; CHECK-LABEL: ld4q_si_f32:
672 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
674 %base = getelementptr <vscale x 4 x float>, ptr %addr, i64 -32
; No-op bitcast retained from the typed-pointer era.
675 %base_ptr = bitcast ptr %base to ptr
676 %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %base_ptr);
677 ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
; LD4Q nxv4f32, scalar-plus-scalar addressing: i128 GEP index folds into
; the scaled register offset [x0, x1, lsl #4].
680 define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4q_ss_f32(<vscale x 4 x i1> %pg, ptr %addr, i64 %a) {
681 ; CHECK-LABEL: ld4q_ss_f32:
683 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
685 %addr2 = getelementptr i128, ptr %addr, i64 %a
686 %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr2);
687 ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
; LD4Q nxv4f32, plain base-register addressing: no offset, expect [x0].
690 define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4q_f32(<vscale x 4 x i1> %pg, ptr %addr) {
691 ; CHECK-LABEL: ld4q_f32:
693 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
695 %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1> %pg, ptr %addr);
696 ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
; LD4Q nxv2f64, scalar-plus-immediate addressing: GEP of -32 scalable vectors
; folds into [x0, #-32, mul vl].
699 define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4q_si_f64(<vscale x 2 x i1> %pg, ptr %addr ) {
700 ; CHECK-LABEL: ld4q_si_f64:
702 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
704 %base = getelementptr <vscale x 2 x double>, ptr %addr, i64 -32
; No-op bitcast retained from the typed-pointer era.
705 %base_ptr = bitcast ptr %base to ptr
706 %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %base_ptr);
707 ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
; LD4Q nxv2f64, scalar-plus-scalar addressing: i128 GEP index folds into
; the scaled register offset [x0, x1, lsl #4].
710 define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4q_ss_f64(<vscale x 2 x i1> %pg, ptr %addr, i64 %a) {
711 ; CHECK-LABEL: ld4q_ss_f64:
713 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
715 %addr2 = getelementptr i128, ptr %addr, i64 %a
716 %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr2);
717 ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
; LD4Q nxv2f64, plain base-register addressing: no offset, expect [x0].
720 define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4q_f64(<vscale x 2 x i1> %pg, ptr %addr) {
721 ; CHECK-LABEL: ld4q_f64:
723 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
725 %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1> %pg, ptr %addr);
726 ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
; LD4Q nxv8bf16 (requires +bf16), scalar-plus-immediate addressing: GEP of -32
; scalable vectors folds into [x0, #-32, mul vl].
729 define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4q_si_bf16(<vscale x 8 x i1> %pg, ptr %addr ) {
730 ; CHECK-LABEL: ld4q_si_bf16:
732 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, #-32, mul vl]
734 %base = getelementptr <vscale x 8 x bfloat>, ptr %addr, i64 -32
; No-op bitcast retained from the typed-pointer era.
735 %base_ptr = bitcast ptr %base to ptr
736 %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %base_ptr);
737 ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
; LD4Q nxv8bf16, scalar-plus-scalar addressing: i128 GEP index folds into
; the scaled register offset [x0, x1, lsl #4].
740 define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4q_ss_bf16(<vscale x 8 x i1> %pg, ptr %addr, i64 %a) {
741 ; CHECK-LABEL: ld4q_ss_bf16:
743 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0, x1, lsl #4]
745 %addr2 = getelementptr i128, ptr %addr, i64 %a
746 %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr2);
747 ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
; LD4Q nxv8bf16, plain base-register addressing: no offset, expect [x0].
750 define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4q_bf16(<vscale x 8 x i1> %pg, ptr %addr) {
751 ; CHECK-LABEL: ld4q_bf16:
753 ; CHECK-NEXT: ld4q { z0.q - z3.q }, p0/z, [x0]
755 %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1> %pg, ptr %addr);
756 ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
760 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
761 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr)
762 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr)
763 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr)
764 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr)
765 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr)
766 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
767 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr)
769 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2q.sret.nxv16i8(<vscale x 16 x i1>, ptr)
770 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1>, ptr)
771 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2q.sret.nxv4i32(<vscale x 4 x i1>, ptr)
772 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2q.sret.nxv2i64(<vscale x 2 x i1>, ptr)
774 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2q.sret.nxv8f16(<vscale x 8 x i1>, ptr)
775 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2q.sret.nxv4f32(<vscale x 4 x i1>, ptr)
776 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2q.sret.nxv2f64(<vscale x 2 x i1>, ptr)
777 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2q.sret.nxv8bf16(<vscale x 8 x i1>, ptr)
779 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3q.sret.nxv16i8(<vscale x 16 x i1>, ptr)
780 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1>, ptr)
781 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3q.sret.nxv4i32(<vscale x 4 x i1>, ptr)
782 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3q.sret.nxv2i64(<vscale x 2 x i1>, ptr)
784 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3q.sret.nxv8f16(<vscale x 8 x i1>, ptr)
785 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3q.sret.nxv4f32(<vscale x 4 x i1>, ptr)
786 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3q.sret.nxv2f64(<vscale x 2 x i1>, ptr)
787 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3q.sret.nxv8bf16(<vscale x 8 x i1>, ptr)
789 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4q.sret.nxv16i8(<vscale x 16 x i1>, ptr)
790 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1>, ptr)
791 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4q.sret.nxv4i32(<vscale x 4 x i1>, ptr)
792 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4q.sret.nxv2i64(<vscale x 2 x i1>, ptr)
794 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4q.sret.nxv8f16(<vscale x 8 x i1>, ptr)
795 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4q.sret.nxv4f32(<vscale x 4 x i1>, ptr)
796 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4q.sret.nxv2f64(<vscale x 2 x i1>, ptr)
797 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4q.sret.nxv8bf16(<vscale x 8 x i1>, ptr)