; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 -force-streaming < %s | FileCheck %s

; == Normal Multi-Vector Consecutive Loads ==

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1b { z0.b, z1.b }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld1_x2_i8_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_i8_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i8, ptr %ptr, i64 %index
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h, z1.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld1_x2_i16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_i16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s, z1.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld1_x2_i32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_i32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i32, ptr %ptr, i64 %index
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d, z1.d }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld1_x2_i64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_i64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i64, ptr %ptr, i64 %index
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ld1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h, z1.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ld1_x2_f16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_f16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr half, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h, z1.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld1_x2_bf16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_bf16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr bfloat, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ld1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s, z1.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ld1_x2_f32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_f32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr float, ptr %ptr, i64 %index
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ld1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d, z1.d }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ld1_x2_f64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x2_f64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr double, ptr %ptr, i64 %index
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

; Test to ensure we load into the correct registers for the instruction
define <vscale x 16 x i8> @ld1_x2_i8_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 16 x i8> %val) {
; CHECK-LABEL: ld1_x2_i8_z0_taken:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1b { z2.b, z3.b }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    add z0.b, z0.b, z2.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %ld1 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
  %ld1_0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld1, 0
  %res = add <vscale x 16 x i8> %val, %ld1_0
  ret <vscale x 16 x i8> %res
}

; Test to ensure we load into the correct registers for the instruction
define <vscale x 16 x i8> @ld1_x2_i8_z0_taken_scalar(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 16 x i8> %val, i64 %index) {
; CHECK-LABEL: ld1_x2_i8_z0_taken_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    add z0.b, z0.b, z2.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i8, ptr %ptr, i64 %index
  %ld1 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %base);
  %ld1_0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld1, 0
  %res = add <vscale x 16 x i8> %val, %ld1_0
  ret <vscale x 16 x i8> %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1b { z0.b - z3.b }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld1_x4_i8_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_i8_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1b { z0.b - z3.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i8, ptr %ptr, i64 %index
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h - z3.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld1_x4_i16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_i16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s - z3.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld1_x4_i32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_i32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i32, ptr %ptr, i64 %index
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d - z3.d }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld1_x4_i64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_i64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i64, ptr %ptr, i64 %index
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h - z3.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld1_x4_f16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_f16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr half, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h - z3.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld1_x4_bf16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_bf16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr bfloat, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s - z3.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld1_x4_f32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_f32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr float, ptr %ptr, i64 %index
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ld1_x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d - z3.d }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld1_x4_f64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ld1_x4_f64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr double, ptr %ptr, i64 %index
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

; Test to ensure we load into the correct registers for the instruction
define <vscale x 8 x i16> @ld1_x4_i16_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 8 x i16> %val) {
; CHECK-LABEL: ld1_x4_i16_z0_taken:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z4.h - z7.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    add z0.h, z0.h, z4.h
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %ld1 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
  %ld1_0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %ld1, 0
  %res = add <vscale x 8 x i16> %val, %ld1_0
  ret <vscale x 8 x i16> %res
}

; Test to ensure we load into the correct registers for the instruction
define <vscale x 8 x i16> @ld1_x4_i16_z0_taken_scalar(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 8 x i16> %val, i64 %index) {
; CHECK-LABEL: ld1_x4_i16_z0_taken_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ld1h { z4.h - z7.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    add z0.h, z0.h, z4.h
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %ld1 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %base);
  %ld1_0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %ld1, 0
  %res = add <vscale x 8 x i16> %val, %ld1_0
  ret <vscale x 8 x i16> %res
}

; == Non-temporal Multi-Vector Consecutive Loads ==

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ldnt1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1b { z0.b, z1.b }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ldnt1_x2_i8_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_i8_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1b { z0.b, z1.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i8, ptr %ptr, i64 %index
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ldnt1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ldnt1_x2_i16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_i16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ldnt1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z0.s, z1.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ldnt1_x2_i32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_i32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i32, ptr %ptr, i64 %index
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ldnt1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1d { z0.d, z1.d }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ldnt1_x2_i64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_i64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i64, ptr %ptr, i64 %index
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ldnt1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ldnt1_x2_f16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_f16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ldnt1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ldnt1_x2_bf16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_bf16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr bfloat, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ldnt1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z0.s, z1.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ldnt1_x2_f32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_f32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr float, ptr %ptr, i64 %index
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ldnt1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1d { z0.d, z1.d }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ldnt1_x2_f64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x2_f64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr double, ptr %ptr, i64 %index
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

; Test to ensure we load into the correct registers for the instruction
define <vscale x 4 x i32> @ldnt1_x2_i32_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 4 x i32> %val) {
; CHECK-LABEL: ldnt1_x2_i32_z0_taken:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z2.s, z3.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    add z0.s, z0.s, z2.s
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %ld1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
  %ld1_0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld1, 0
  %res = add <vscale x 4 x i32> %val, %ld1_0
  ret <vscale x 4 x i32> %res
}

; Test to ensure we load into the correct registers for the instruction
define <vscale x 4 x i32> @ldnt1_x2_i32_z0_taken_scalar(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 4 x i32> %val, i64 %index) {
; CHECK-LABEL: ldnt1_x2_i32_z0_taken_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z2.s, z3.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    add z0.s, z0.s, z2.s
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i32, ptr %ptr, i64 %index
  %ld1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %base);
  %ld1_0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %ld1, 0
  %res = add <vscale x 4 x i32> %val, %ld1_0
  ret <vscale x 4 x i32> %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ldnt1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x4_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1b { z0.b - z3.b }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ldnt1_x4_i8_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_i8_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1b { z0.b - z3.b }, pn8/z, [x0, x1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i8, ptr %ptr, i64 %index
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ldnt1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x4_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h - z3.h }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ldnt1_x4_i16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_i16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ldnt1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z0.s - z3.s }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ldnt1_x4_i32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_i32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i32, ptr %ptr, i64 %index
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ldnt1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1d { z0.d - z3.d }, pn8/z, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ldnt1_x4_i64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_i64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    ldnt1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %base = getelementptr i64, ptr %ptr, i64 %index
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %base);
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
1068 define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ldnt1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
1069 ; CHECK-LABEL: ldnt1_x4_f16:
1071 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1072 ; CHECK-NEXT: addvl sp, sp, #-1
1073 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1074 ; CHECK-NEXT: mov p8.b, p0.b
1075 ; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0]
1076 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1077 ; CHECK-NEXT: addvl sp, sp, #1
1078 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1080 %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr);
1081 ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ldnt1_x4_f16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_f16_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%base = getelementptr half, ptr %ptr, i64 %index
%res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %base);
ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ldnt1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr);
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ldnt1_x4_bf16_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_bf16_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%base = getelementptr bfloat, ptr %ptr, i64 %index
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %base);
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ldnt1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x4_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr);
ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ldnt1_x4_f32_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_f32_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%base = getelementptr float, ptr %ptr, i64 %index
%res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %base);
ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ldnt1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: ldnt1_x4_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr);
ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ldnt1_x4_f64_scalar(target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
; CHECK-LABEL: ldnt1_x4_f64_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%base = getelementptr double, ptr %ptr, i64 %index
%res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %base);
ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

; Test to ensure we load into the correct registers for the instruction
define <vscale x 2 x i64> @ldnt1_x4_i64_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 2 x i64> %val) {
; CHECK-LABEL: ldnt1_x4_i64_z0_taken:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1d { z4.d - z7.d }, pn8/z, [x0]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: add z0.d, z0.d, z4.d
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ld1 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr);
%ld1_0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %ld1, 0
%res = add <vscale x 2 x i64> %val, %ld1_0
ret <vscale x 2 x i64> %res
}
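
; Editor's note (sketch, not part of the autogenerated checks): the x4 forms
; write a consecutive register tuple whose first register is a multiple of
; four, so with z0 already live holding %val the load above lands in z4-z7.
; Each element of the returned struct maps to one register of that tuple;
; reading a later element only changes the extractvalue index. The %ld1_2 and
; %res2 names below are illustrative, not part of this test:
;   %ld1_2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %ld1, 2
;   %res2 = add <vscale x 2 x i64> %val, %ld1_2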

; Test to ensure we load into the correct registers for the instruction
define <vscale x 2 x i64> @ldnt1_x4_i64_z0_taken_scalar(target("aarch64.svcount") %pn, ptr %ptr, <vscale x 2 x i64> %val, i64 %index) {
; CHECK-LABEL: ldnt1_x4_i64_z0_taken_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: mov p8.b, p0.b
; CHECK-NEXT: ldnt1d { z4.d - z7.d }, pn8/z, [x0, x1, lsl #3]
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: add z0.d, z0.d, z4.d
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%base = getelementptr i64, ptr %ptr, i64 %index
%ld1 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %base);
%ld1_0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %ld1, 0
%res = add <vscale x 2 x i64> %val, %ld1_0
ret <vscale x 2 x i64> %res
}

declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr)
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr)

declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr)
declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr)
declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr)

declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr)
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr)

declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr)
declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr)
declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr)
declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr)