1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
5 ; Move Multi-Vector From Tile (Read) x2
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
  ; Offset 14 is the maximum for .b (15 rows, vg2 reads a pair 14:15).
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
  ; Offset 6 with tile za1 exercises the maximum slice pair 6:7 for .h.
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
  ; Offset 2 with tile za3 exercises the maximum slice pair 2:3 for .s.
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}
define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  ; .d tiles have only one vg2 slice pair, so no added-offset variant here.
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}
define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}
209 ; Move Multi-Vector From Tile (Read) x4
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
  ; Offset 12 is the maximum for .b vg4 (quad 12:15).
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
  ; Offset 4 with tile za1 exercises the maximum slice quad 4:7 for .h.
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  ; .s tiles have only one vg4 slice quad, so no added-offset variant here.
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}
define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}
define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}
400 ; Move Multi-Vector From ZA (Read) x2
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
  ; Offset 7 is the maximum immediate for the vgx2 za.d form.
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
}
define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
}
428 ; Move Multi-Vector From ZA (Read) x4
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
}
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
}
456 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)
457 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32, i32)
458 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32, i32)
459 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32, i32)
460 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32, i32)
461 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32, i32)
462 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32, i32)
463 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32, i32)
465 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32, i32)
466 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32, i32)
467 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32, i32)
468 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32, i32)
469 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32, i32)
470 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32, i32)
471 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32, i32)
472 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32, i32)
474 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32, i32)
475 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32, i32)
476 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32, i32)
477 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32, i32)
478 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32, i32)
479 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32, i32)
480 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32, i32)
481 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32, i32)
483 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32, i32)
484 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32, i32)
485 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32, i32)
486 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32, i32)
487 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32, i32)
488 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32, i32)
489 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32, i32)
490 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32, i32)
492 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32)
493 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32)
495 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32)
496 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32)