; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

;
; LD1B, LD1W, LD1H, LD1D: base + 32-bit unscaled offset, sign (sxtw) or zero
; (uxtw) extended to 64 bits.
;   e.g. ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
;
; LD1B
define <vscale x 4 x i32> @gld1b_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1b_s_uxtw:
; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gld1b_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1b_s_sxtw:
; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1b_d_uxtw:
; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1b_d_sxtw:
; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}
; LD1H
define <vscale x 4 x i32> @gld1h_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1h_s_uxtw:
; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gld1h_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1h_s_sxtw:
; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1h_d_uxtw:
; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1h_d_sxtw:
; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}
; LD1W
define <vscale x 4 x i32> @gld1w_s_uxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1w_s_uxtw:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg,
                                                                             i32* %base,
                                                                             <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
}

define <vscale x 4 x i32> @gld1w_s_sxtw(<vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1w_s_sxtw:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1> %pg,
                                                                             i32* %base,
                                                                             <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
}

define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1w_d_uxtw:
; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
                                                                             i32* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1w_d_sxtw:
; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
                                                                             i32* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x float> @gld1w_s_uxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1w_s_uxtw_float:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32(<vscale x 4 x i1> %pg,
                                                                               float* %base,
                                                                               <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
}

define <vscale x 4 x float> @gld1w_s_sxtw_float(<vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1w_s_sxtw_float:
; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32(<vscale x 4 x i1> %pg,
                                                                               float* %base,
                                                                               <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
}
; LD1D
define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_d_uxtw:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64(<vscale x 2 x i1> %pg,
                                                                             i64* %base,
                                                                             <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_d_sxtw:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64(<vscale x 2 x i1> %pg,
                                                                             i64* %base,
                                                                             <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x double> @gld1d_d_uxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_d_uxtw_double:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64(<vscale x 2 x i1> %pg,
                                                                                double* %base,
                                                                                <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
}

define <vscale x 2 x double> @gld1d_d_sxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1d_d_sxtw_double:
; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64(<vscale x 2 x i1> %pg,
                                                                                double* %base,
                                                                                <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
}

;
; LD1SB, LD1SW, LD1SH: base + 32-bit unscaled offset, sign (sxtw) or zero
; (uxtw) extended to 64 bits.
;   e.g. ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
;
; LD1SB
define <vscale x 4 x i32> @gld1sb_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1sb_s_uxtw:
; CHECK: ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gld1sb_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1sb_s_sxtw:
; CHECK: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sb_d_uxtw:
; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sb_d_sxtw:
; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
                                                                           i8* %base,
                                                                           <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}
; LD1SH
define <vscale x 4 x i32> @gld1sh_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1sh_s_uxtw:
; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @gld1sh_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %b) {
; CHECK-LABEL: gld1sh_s_sxtw:
; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sh_d_uxtw:
; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sh_d_sxtw:
; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
                                                                             i16* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}
; LD1SW
define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sw_d_uxtw:
; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
                                                                             i32* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
; CHECK-LABEL: gld1sw_d_sxtw:
; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
                                                                             i32* %base,
                                                                             <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)

declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)

declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)

declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)

declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)

declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)