; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
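
; This file checks lowering of the llvm.aarch64.sme.write(q).{horiz,vert}
; intrinsics, which insert a vector register into a single row or column slice
; of a ZA tile (the MOVA vector-to-tile form). The .b tests below use tile ZA0
; and check that constant offsets 0-15 added to the shared slice index are
; folded into the MOVA slice immediate, so each function needs only a single
; "mov w12, w0".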
define void @insert_row_b(i32 %tileslice, <vscale x 16 x i1> %pg,
; CHECK-LABEL: insert_row_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.b[w12, 0], p0/m, z0.b
; CHECK-NEXT:    mov za0h.b[w12, 2], p0/m, z1.b
; CHECK-NEXT:    mov za0h.b[w12, 4], p0/m, z2.b
; CHECK-NEXT:    mov za0h.b[w12, 6], p0/m, z3.b
; CHECK-NEXT:    mov za0h.b[w12, 8], p0/m, z4.b
; CHECK-NEXT:    mov za0h.b[w12, 10], p0/m, z5.b
; CHECK-NEXT:    mov za0h.b[w12, 12], p0/m, z6.b
; CHECK-NEXT:    mov za0h.b[w12, 14], p0/m, z7.b
; CHECK-NEXT:    ret
                          <vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1,
                          <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
                          <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5,
                          <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7) {
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z0)
  %tileslice.2 = add i32 %tileslice, 2
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice.2, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z1)
  %tileslice.4 = add i32 %tileslice, 4
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice.4, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z2)
  %tileslice.6 = add i32 %tileslice, 6
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice.6, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z3)
  %tileslice.8 = add i32 %tileslice, 8
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice.8, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z4)
  %tileslice.10 = add i32 %tileslice, 10
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice.10, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z5)
  %tileslice.12 = add i32 %tileslice, 12
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice.12, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z6)
  %tileslice.14 = add i32 %tileslice, 14
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 %tileslice.14, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z7)
  ret void
}
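
; Same as above, but writing vertical (.b) slices of ZA0 with odd offsets 1-15.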
define void @insert_col_b(i32 %tileslice, <vscale x 16 x i1> %pg,
; CHECK-LABEL: insert_col_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0v.b[w12, 1], p0/m, z0.b
; CHECK-NEXT:    mov za0v.b[w12, 3], p0/m, z1.b
; CHECK-NEXT:    mov za0v.b[w12, 5], p0/m, z2.b
; CHECK-NEXT:    mov za0v.b[w12, 7], p0/m, z3.b
; CHECK-NEXT:    mov za0v.b[w12, 9], p0/m, z4.b
; CHECK-NEXT:    mov za0v.b[w12, 11], p0/m, z5.b
; CHECK-NEXT:    mov za0v.b[w12, 13], p0/m, z6.b
; CHECK-NEXT:    mov za0v.b[w12, 15], p0/m, z7.b
; CHECK-NEXT:    ret
                          <vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1,
                          <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
                          <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5,
                          <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7) {
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.1, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z0)
  %tileslice.3 = add i32 %tileslice, 3
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.3, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z1)
  %tileslice.5 = add i32 %tileslice, 5
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.5, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z2)
  %tileslice.7 = add i32 %tileslice, 7
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.7, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z3)
  %tileslice.9 = add i32 %tileslice, 9
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.9, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z4)
  %tileslice.11 = add i32 %tileslice, 11
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.11, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z5)
  %tileslice.13 = add i32 %tileslice, 13
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.13, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z6)
  %tileslice.15 = add i32 %tileslice, 15
  call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 %tileslice.15, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %z7)
  ret void
}
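
; For 16-bit elements the slice offset immediate range is 0-7. insert_row_h
; writes horizontal slices of ZA0; insert_col_h writes vertical slices of ZA1.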
define void @insert_row_h(i32 %tileslice, <vscale x 8 x i1> %pg,
; CHECK-LABEL: insert_row_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.h[w12, 0], p0/m, z0.h
; CHECK-NEXT:    mov za0h.h[w12, 2], p0/m, z2.h
; CHECK-NEXT:    mov za0h.h[w12, 4], p0/m, z4.h
; CHECK-NEXT:    mov za0h.h[w12, 6], p0/m, z6.h
; CHECK-NEXT:    ret
                          <vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1,
                          <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3,
                          <vscale x 8 x i16> %z4, <vscale x 8 x i16> %z5,
                          <vscale x 8 x i16> %z6, <vscale x 8 x i16> %z7) {
  call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 %tileslice, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z0)
  %tileslice.2 = add i32 %tileslice, 2
  call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 %tileslice.2, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z2)
  %tileslice.4 = add i32 %tileslice, 4
  call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 %tileslice.4, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z4)
  %tileslice.6 = add i32 %tileslice, 6
  call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 %tileslice.6, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z6)
  ret void
}

define void @insert_col_h(i32 %tileslice, <vscale x 8 x i1> %pg,
; CHECK-LABEL: insert_col_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za1v.h[w12, 1], p0/m, z1.h
; CHECK-NEXT:    mov za1v.h[w12, 3], p0/m, z3.h
; CHECK-NEXT:    mov za1v.h[w12, 5], p0/m, z5.h
; CHECK-NEXT:    mov za1v.h[w12, 7], p0/m, z7.h
; CHECK-NEXT:    ret
                          <vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1,
                          <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3,
                          <vscale x 8 x i16> %z4, <vscale x 8 x i16> %z5,
                          <vscale x 8 x i16> %z6, <vscale x 8 x i16> %z7) {
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 %tileslice.1, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z1)
  %tileslice.3 = add i32 %tileslice, 3
  call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 %tileslice.3, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z3)
  %tileslice.5 = add i32 %tileslice, 5
  call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 %tileslice.5, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z5)
  %tileslice.7 = add i32 %tileslice, 7
  call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 %tileslice.7, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %z7)
  ret void
}
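
; insert_f16 mixes horizontal and vertical writes of f16 vectors into ZA0,
; with offsets 0-7 folded into the slice immediates.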
define void @insert_f16(i32 %tileslice, <vscale x 8 x i1> %pg,
; CHECK-LABEL: insert_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.h[w12, 0], p0/m, z0.h
; CHECK-NEXT:    mov za0h.h[w12, 1], p0/m, z1.h
; CHECK-NEXT:    mov za0v.h[w12, 2], p0/m, z2.h
; CHECK-NEXT:    mov za0v.h[w12, 3], p0/m, z3.h
; CHECK-NEXT:    mov za0h.h[w12, 4], p0/m, z4.h
; CHECK-NEXT:    mov za0h.h[w12, 5], p0/m, z5.h
; CHECK-NEXT:    mov za0v.h[w12, 6], p0/m, z6.h
; CHECK-NEXT:    mov za0v.h[w12, 7], p0/m, z7.h
; CHECK-NEXT:    ret
                        <vscale x 8 x half> %z0, <vscale x 8 x half> %z1,
                        <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
                        <vscale x 8 x half> %z4, <vscale x 8 x half> %z5,
                        <vscale x 8 x half> %z6, <vscale x 8 x half> %z7) {
  call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 %tileslice, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z0)
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 %tileslice.1, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z1)
  %tileslice.2 = add i32 %tileslice, 2
  call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 %tileslice.2, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z2)
  %tileslice.3 = add i32 %tileslice, 3
  call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 %tileslice.3, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z3)
  %tileslice.4 = add i32 %tileslice, 4
  call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 %tileslice.4, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z4)
  %tileslice.5 = add i32 %tileslice, 5
  call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 %tileslice.5, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z5)
  %tileslice.6 = add i32 %tileslice, 6
  call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 %tileslice.6, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z6)
  %tileslice.7 = add i32 %tileslice, 7
  call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 %tileslice.7, <vscale x 8 x i1> %pg, <vscale x 8 x half> %z7)
  ret void
}
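
; Same pattern as insert_f16, but with bf16 element vectors.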
define void @insert_bf16(i32 %tileslice, <vscale x 8 x i1> %pg,
; CHECK-LABEL: insert_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.h[w12, 0], p0/m, z0.h
; CHECK-NEXT:    mov za0h.h[w12, 1], p0/m, z1.h
; CHECK-NEXT:    mov za0v.h[w12, 2], p0/m, z2.h
; CHECK-NEXT:    mov za0v.h[w12, 3], p0/m, z3.h
; CHECK-NEXT:    mov za0h.h[w12, 4], p0/m, z4.h
; CHECK-NEXT:    mov za0h.h[w12, 5], p0/m, z5.h
; CHECK-NEXT:    mov za0v.h[w12, 6], p0/m, z6.h
; CHECK-NEXT:    mov za0v.h[w12, 7], p0/m, z7.h
; CHECK-NEXT:    ret
                         <vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1,
                         <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
                         <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5,
                         <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7) {
  call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 %tileslice, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z0)
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 %tileslice.1, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z1)
  %tileslice.2 = add i32 %tileslice, 2
  call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 %tileslice.2, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z2)
  %tileslice.3 = add i32 %tileslice, 3
  call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 %tileslice.3, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z3)
  %tileslice.4 = add i32 %tileslice, 4
  call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 %tileslice.4, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z4)
  %tileslice.5 = add i32 %tileslice, 5
  call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 %tileslice.5, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z5)
  %tileslice.6 = add i32 %tileslice, 6
  call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 %tileslice.6, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z6)
  %tileslice.7 = add i32 %tileslice, 7
  call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 %tileslice.7, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %z7)
  ret void
}
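
; For 32-bit elements the slice offset immediate range is 0-3. insert_row_s
; writes horizontal slices of ZA0; insert_col_s writes vertical slices of ZA3.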
define void @insert_row_s(i32 %tileslice, <vscale x 4 x i1> %pg,
; CHECK-LABEL: insert_row_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.s[w12, 0], p0/m, z0.s
; CHECK-NEXT:    mov za0h.s[w12, 2], p0/m, z2.s
; CHECK-NEXT:    ret
                          <vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
                          <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3) {
  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 %tileslice, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %z0)
  %tileslice.2 = add i32 %tileslice, 2
  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 %tileslice.2, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %z2)
  ret void
}

define void @insert_col_s(i32 %tileslice, <vscale x 4 x i1> %pg,
; CHECK-LABEL: insert_col_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za3v.s[w12, 1], p0/m, z1.s
; CHECK-NEXT:    mov za3v.s[w12, 3], p0/m, z3.s
; CHECK-NEXT:    ret
                          <vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1,
                          <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3) {
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 %tileslice.1, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %z1)
  %tileslice.3 = add i32 %tileslice, 3
  call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 %tileslice.3, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %z3)
  ret void
}
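
; insert_f32 mixes horizontal and vertical f32 writes into ZA0.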
define void @insert_f32(i32 %tileslice, <vscale x 4 x i1> %pg,
; CHECK-LABEL: insert_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.s[w12, 0], p0/m, z0.s
; CHECK-NEXT:    mov za0h.s[w12, 1], p0/m, z1.s
; CHECK-NEXT:    mov za0v.s[w12, 2], p0/m, z2.s
; CHECK-NEXT:    mov za0v.s[w12, 3], p0/m, z3.s
; CHECK-NEXT:    ret
                        <vscale x 4 x float> %z0, <vscale x 4 x float> %z1,
                        <vscale x 4 x float> %z2, <vscale x 4 x float> %z3) {
  call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 %tileslice, <vscale x 4 x i1> %pg, <vscale x 4 x float> %z0)
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 %tileslice.1, <vscale x 4 x i1> %pg, <vscale x 4 x float> %z1)
  %tileslice.2 = add i32 %tileslice, 2
  call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 %tileslice.2, <vscale x 4 x i1> %pg, <vscale x 4 x float> %z2)
  %tileslice.3 = add i32 %tileslice, 3
  call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 %tileslice.3, <vscale x 4 x i1> %pg, <vscale x 4 x float> %z3)
  ret void
}
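
; For 64-bit elements the slice offset immediate range is 0-1. insert_row_d
; writes a horizontal slice of ZA0; insert_col_d writes a vertical slice of ZA7.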
define void @insert_row_d(i32 %tileslice, <vscale x 2 x i1> %pg,
; CHECK-LABEL: insert_row_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.d[w12, 0], p0/m, z0.d
; CHECK-NEXT:    ret
                          <vscale x 2 x i64> %z0, <vscale x 2 x i64> %z1) {
  call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 %tileslice, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %z0)
  ret void
}

define void @insert_col_d(i32 %tileslice, <vscale x 2 x i1> %pg,
; CHECK-LABEL: insert_col_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za7v.d[w12, 1], p0/m, z1.d
; CHECK-NEXT:    ret
                          <vscale x 2 x i64> %z0, <vscale x 2 x i64> %z1) {
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 %tileslice.1, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %z1)
  ret void
}

define void @insert_f64(i32 %tileslice, <vscale x 2 x i1> %pg,
; CHECK-LABEL: insert_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov za0h.d[w12, 0], p0/m, z0.d
; CHECK-NEXT:    mov za0v.d[w12, 1], p0/m, z1.d
; CHECK-NEXT:    ret
                        <vscale x 2 x double> %z0, <vscale x 2 x double> %z1) {
  call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 %tileslice, <vscale x 2 x i1> %pg, <vscale x 2 x double> %z0)
  %tileslice.1 = add i32 %tileslice, 1
  call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 %tileslice.1, <vscale x 2 x i1> %pg, <vscale x 2 x double> %z1)
  ret void
}
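
; The writeq.horiz intrinsics write 128-bit (.q) horizontal slices. A .q slice
; offset can only be 0, so the constant index 0 lowers to "mov w12, wzr".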
define void @insert_row_q_v16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %zn) {
; CHECK-LABEL: insert_row_q_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 0, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %zn)
  ret void
}

define void @insert_row_q_v8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %zn) {
; CHECK-LABEL: insert_row_q_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 0, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %zn)
  ret void
}

define void @insert_row_q_v8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %zn) {
; CHECK-LABEL: insert_row_q_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 0, <vscale x 8 x i1> %pg, <vscale x 8 x half> %zn)
  ret void
}

define void @insert_row_q_v8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %zn) {
; CHECK-LABEL: insert_row_q_v8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 0, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %zn)
  ret void
}

define void @insert_row_q_v4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %zn) {
; CHECK-LABEL: insert_row_q_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 0, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %zn)
  ret void
}

define void @insert_row_q_v4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %zn) {
; CHECK-LABEL: insert_row_q_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 0, <vscale x 4 x i1> %pg, <vscale x 4 x float> %zn)
  ret void
}

define void @insert_row_q_v2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %zn) {
; CHECK-LABEL: insert_row_q_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 0, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %zn)
  ret void
}

define void @insert_row_q_v2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %zn) {
; CHECK-LABEL: insert_row_q_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za0h.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 0, <vscale x 2 x i1> %pg, <vscale x 2 x double> %zn)
  ret void
}
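
; The writeq.vert intrinsics write 128-bit (.q) vertical slices, here into the
; last tile, ZA15.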
define void @insert_col_q_v16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %zn) {
; CHECK-LABEL: insert_col_q_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 0, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %zn)
  ret void
}

define void @insert_col_q_v8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %zn) {
; CHECK-LABEL: insert_col_q_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 0, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %zn)
  ret void
}

define void @insert_col_q_v8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %zn) {
; CHECK-LABEL: insert_col_q_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 0, <vscale x 8 x i1> %pg, <vscale x 8 x half> %zn)
  ret void
}

define void @insert_col_q_v8bf16(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %zn) {
; CHECK-LABEL: insert_col_q_v8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 0, <vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %zn)
  ret void
}

define void @insert_col_q_v4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %zn) {
; CHECK-LABEL: insert_col_q_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 0, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %zn)
  ret void
}

define void @insert_col_q_v4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %zn) {
; CHECK-LABEL: insert_col_q_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 0, <vscale x 4 x i1> %pg, <vscale x 4 x float> %zn)
  ret void
}

define void @insert_col_q_v2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %zn) {
; CHECK-LABEL: insert_col_q_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 0, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %zn)
  ret void
}

define void @insert_col_q_v2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %zn) {
; CHECK-LABEL: insert_col_q_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    mov za15v.q[w12, 0], p0/m, z0.q
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 0, <vscale x 2 x i1> %pg, <vscale x 2 x double> %zn)
  ret void
}
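
; Check that the offset adds defined in the entry block are sunk into the loop
; and folded into the slice immediates, so the loop body contains no add
; instructions for the ZA slice index.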
define void @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
; CHECK-LABEL: test_sink_offset_operand:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z0.s, #0 // =0x0
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:  .LBB28_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    mov za0h.s[w12, 0], p0/m, z0.s
; CHECK-NEXT:    subs w1, w1, #3
; CHECK-NEXT:    mov za0h.s[w12, 1], p0/m, z0.s
; CHECK-NEXT:    mov za0h.s[w12, 2], p0/m, z0.s
; CHECK-NEXT:    b.ne .LBB28_1
; CHECK-NEXT:  // %bb.2: // %exit
; CHECK-NEXT:    ret
entry:
  %add1 = add i32 %base, 1
  %add2 = add i32 %base, 2
  br label %for.body

for.body:
  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 %base, <vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 %add1, <vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 %add2, <vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
  %inc = add nuw nsw i32 %i, 3
  %exitcond.not = icmp eq i32 %inc, %N
  br i1 %exitcond.not, label %exit, label %for.body

exit:
  ret void
}

declare void @llvm.aarch64.sme.write.horiz.nxv16i8(i32, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.horiz.nxv8i16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.horiz.nxv8f16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.horiz.nxv4i32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.horiz.nxv4f32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.horiz.nxv2i64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.horiz.nxv2f64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.write.vert.nxv16i8(i32, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.vert.nxv8i16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.vert.nxv8f16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.vert.nxv8bf16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.vert.nxv4i32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.vert.nxv4f32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.vert.nxv2i64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.vert.nxv2f64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32, i32, <vscale x 2 x i1>, <vscale x 2 x double>)