; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s
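; -force-streaming runs every function below in streaming SVE mode and
; -verify-machineinstrs runs the machine verifier over the generated code.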
;
; Move Multi-Vector To Tile (Write) x 2
;

; Horizontal

define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT:    mov za0h.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}
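; From C, these tile writes correspond to SME2 ACLE builtins; a minimal sketch,
; assuming the arm_sme.h names and attributes (illustrative only, not part of
; the test input):
;   #include <arm_sme.h>
;   void write_pair(uint32_t slice, svuint8x2_t zn)
;       __arm_streaming __arm_inout("za") {
;     svwrite_hor_za8_u8_vg2(0, slice, zn); // rows slice and slice+1 of ZA0.B
;   }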
define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  ret void
}

define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  ret void
}

define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  ret void
}

define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  ret void
}

define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
  ret void
}

define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
  ret void
}
; Vertical

define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT:    mov za0v.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}

define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  ret void
}

define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  ret void
}

define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  ret void
}

define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  ret void
}

define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
  ret void
}

define void @za_write_vg2_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
  ret void
}
;
; Move Multi-Vector To Tile (Write) x 4
;

; Horizontal

define void @za_write_vg4_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT:    mov za0h.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}
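; A matching hedged sketch for the x4 tile form (again assuming the arm_sme.h
; naming): svwrite_hor_za8_u8_vg4(0, slice, zn4) with zn4 an svuint8x4_t,
; writing rows slice..slice+3 of ZA0.B.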
define void @za_write_vg4_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}

define void @za_write_vg4_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  ret void
}

define void @za_write_vg4_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  ret void
}

define void @za_write_vg4_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
  ret void
}

define void @za_write_vg4_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
  ret void
}

define void @za_write_vg4_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
  ret void
}

define void @za_write_vg4_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
  ret void
}
; Vertical

define void @za_write_vg4_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT:    mov za0v.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}

define void @za_write_vg4_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}

define void @za_write_vg4_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  ret void
}

define void @za_write_vg4_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  ret void
}

define void @za_write_vg4_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
  ret void
}

define void @za_write_vg4_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
  ret void
}

define void @za_write_vg4_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
  ret void
}

define void @za_write_vg4_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
  ret void
}
;
; Move Multi-Vector To ZA (Write) x2
;

define void @za_write_vg1x2_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2) {
; CHECK-LABEL: za_write_vg1x2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2)
  ret void
}
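; The vg1x2 forms have no tile operand: they write a 2-vector group of the ZA
; array (hence the za.d ... vgx2 addressing above). A sketch, assuming the
; ACLE vg1 naming (unverified): svwrite_za64_f64_vg1x2(slice, zn) with zn an
; svfloat64x2_t.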
define void @za_write_vg1x2_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2) {
; CHECK-LABEL: za_write_vg1x2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2)
  ret void
}

define void @za_write_vg1x2_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2) {
; CHECK-LABEL: za_write_vg1x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2)
  ret void
}

define void @za_write_vg1x2_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2) {
; CHECK-LABEL: za_write_vg1x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2)
  ret void
}

define void @za_write_vg1x2_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2) {
; CHECK-LABEL: za_write_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2)
  ret void
}

define void @za_write_vg1x2_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2) {
; CHECK-LABEL: za_write_vg1x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2)
  ret void
}

define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) {
; CHECK-LABEL: za_write_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2)
  ret void
}

define void @za_write_vg1x2_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) {
; CHECK-LABEL: za_write_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2)
  ret void
}
;
; Move Multi-Vector To ZA (Write) x4
;

define void @za_write_vg1x4_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4) {
; CHECK-LABEL: za_write_vg1x4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4)
  ret void
}
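; The corresponding 4-vector ZA array write, again assuming the ACLE vg1
; naming (unverified): svwrite_za64_f64_vg1x4(slice, zn) with zn an
; svfloat64x4_t.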
define void @za_write_vg1x4_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4) {
; CHECK-LABEL: za_write_vg1x4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4)
  ret void
}

define void @za_write_vg1x4_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4) {
; CHECK-LABEL: za_write_vg1x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4)
  ret void
}

define void @za_write_vg1x4_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4) {
; CHECK-LABEL: za_write_vg1x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4)
  ret void
}

define void @za_write_vg1x4_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4) {
; CHECK-LABEL: za_write_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4)
  ret void
}

define void @za_write_vg1x4_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4) {
; CHECK-LABEL: za_write_vg1x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4)
  ret void
}

define void @za_write_vg1x4_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4) {
; CHECK-LABEL: za_write_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4)
  ret void
}

define void @za_write_vg1x4_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4) {
; CHECK-LABEL: za_write_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4)
  ret void
}
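; Signature summary for the declarations below: the tile forms take
; (i32 tile, i32 slice, data vectors); the vg1x* ZA-array forms take only
; (i32 slice, data vectors).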
declare void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)