1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
; st1b: byte-element stores from ZA tile 0, one horizontal and one vertical.
; The horizontal store uses %sliceidx + 15 as its slice index; the expected
; output carries that 15 as the immediate in za0h.b[w12, 15]. The vertical
; store uses a constant slice index of 0, expected in w13 (moved from wzr).
4 define void @st1b(<vscale x 16 x i1> %pg, ptr %ptr, i32 %sliceidx) {
7 ; CHECK-NEXT: mov w12, w1
8 ; CHECK-NEXT: mov w13, wzr
9 ; CHECK-NEXT: st1b {za0h.b[w12, 15]}, p0, [x0]
10 ; CHECK-NEXT: st1b {za0v.b[w13, 0]}, p0, [x0]
12 %tileslice = add i32 %sliceidx, 15
13 call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice)
14 call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 0)
; st1b with an indexed address: the pointer is %ptr advanced by %index bytes
; (getelementptr i8). The expected output uses the register-offset form
; [x0, x1] on both stores. Here the vertical store carries the +15 slice
; offset and the horizontal store uses slice index 0.
18 define void @st1b_with_addr_offset(<vscale x 16 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
19 ; CHECK-LABEL: st1b_with_addr_offset:
21 ; CHECK-NEXT: mov w13, wzr
22 ; CHECK-NEXT: mov w12, w2
23 ; CHECK-NEXT: st1b {za0h.b[w13, 0]}, p0, [x0, x1]
24 ; CHECK-NEXT: st1b {za0v.b[w12, 15]}, p0, [x0, x1]
26 %base = getelementptr i8, ptr %ptr, i64 %index
27 %tileslice = add i32 %sliceidx, 15
28 call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 0)
29 call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
; st1h: halfword-element stores covering tiles 0 and 1 in both directions.
; %sliceidx + 7 is the slice index for the za0h and za1v stores (expected as
; immediate 7 with w12); the za1h and za0v stores use slice index 0 (w13).
33 define void @st1h(<vscale x 8 x i1> %pg, ptr %ptr, i32 %sliceidx) {
36 ; CHECK-NEXT: mov w12, w1
37 ; CHECK-NEXT: mov w13, wzr
38 ; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0]
39 ; CHECK-NEXT: st1h {za1h.h[w13, 0]}, p0, [x0]
40 ; CHECK-NEXT: st1h {za0v.h[w13, 0]}, p0, [x0]
41 ; CHECK-NEXT: st1h {za1v.h[w12, 7]}, p0, [x0]
43 %tileslice = add i32 %sliceidx, 7
44 call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice)
45 call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 0)
46 call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 0)
47 call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 %tileslice)
; st1h with an indexed address: the pointer is computed with
; getelementptr i16, and the expected output folds the index into the
; scaled register-offset form [x0, x1, lsl #1].
51 define void @st1h_with_addr_offset(<vscale x 8 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
52 ; CHECK-LABEL: st1h_with_addr_offset:
54 ; CHECK-NEXT: mov w12, w2
55 ; CHECK-NEXT: mov w13, wzr
56 ; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0, x1, lsl #1]
57 ; CHECK-NEXT: st1h {za1v.h[w13, 0]}, p0, [x0, x1, lsl #1]
59 %base = getelementptr i16, ptr %ptr, i64 %index
60 %tileslice = add i32 %sliceidx, 7
61 call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
62 call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> %pg, ptr %base, i32 1, i32 0)
; st1w: word-element stores covering tiles 0-3 in both directions.
; %sliceidx + 3 is the slice index for the za3h and za2v stores (expected as
; immediate 3 with w12); every other store uses slice index 0 (w13).
66 define void @st1w(<vscale x 4 x i1> %pg, ptr %ptr, i32 %sliceidx) {
69 ; CHECK-NEXT: mov w13, wzr
70 ; CHECK-NEXT: mov w12, w1
71 ; CHECK-NEXT: st1w {za0h.s[w13, 0]}, p0, [x0]
72 ; CHECK-NEXT: st1w {za1h.s[w13, 0]}, p0, [x0]
73 ; CHECK-NEXT: st1w {za2h.s[w13, 0]}, p0, [x0]
74 ; CHECK-NEXT: st1w {za3h.s[w12, 3]}, p0, [x0]
75 ; CHECK-NEXT: st1w {za0v.s[w13, 0]}, p0, [x0]
76 ; CHECK-NEXT: st1w {za1v.s[w13, 0]}, p0, [x0]
77 ; CHECK-NEXT: st1w {za2v.s[w12, 3]}, p0, [x0]
78 ; CHECK-NEXT: st1w {za3v.s[w13, 0]}, p0, [x0]
80 %tileslice = add i32 %sliceidx, 3
81 call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0)
82 call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0)
83 call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 0)
84 call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 %tileslice)
85 call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0)
86 call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0)
87 call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 %tileslice)
88 call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 0)
; st1w with an indexed address: the pointer is computed with
; getelementptr i32, and the expected output folds the index into the
; scaled register-offset form [x0, x1, lsl #2].
92 define void @st1w_with_addr_offset(<vscale x 4 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
93 ; CHECK-LABEL: st1w_with_addr_offset:
95 ; CHECK-NEXT: mov w13, wzr
96 ; CHECK-NEXT: mov w12, w2
97 ; CHECK-NEXT: st1w {za0h.s[w13, 0]}, p0, [x0, x1, lsl #2]
98 ; CHECK-NEXT: st1w {za3v.s[w12, 3]}, p0, [x0, x1, lsl #2]
100 %base = getelementptr i32, ptr %ptr, i64 %index
101 %tileslice = add i32 %sliceidx, 3
102 call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %base, i32 0, i32 0)
103 call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %base, i32 3, i32 %tileslice)
; st1d: doubleword-element stores covering tiles 0-7 in both directions.
; %sliceidx + 1 is the slice index for the za4h and za7v stores (expected as
; immediate 1 with w12); every other store uses slice index 0 (w13).
107 define void @st1d(<vscale x 2 x i1> %pg, ptr %ptr, i32 %sliceidx) {
110 ; CHECK-NEXT: mov w13, wzr
111 ; CHECK-NEXT: mov w12, w1
112 ; CHECK-NEXT: st1d {za0h.d[w13, 0]}, p0, [x0]
113 ; CHECK-NEXT: st1d {za1h.d[w13, 0]}, p0, [x0]
114 ; CHECK-NEXT: st1d {za2h.d[w13, 0]}, p0, [x0]
115 ; CHECK-NEXT: st1d {za3h.d[w13, 0]}, p0, [x0]
116 ; CHECK-NEXT: st1d {za4h.d[w12, 1]}, p0, [x0]
117 ; CHECK-NEXT: st1d {za5h.d[w13, 0]}, p0, [x0]
118 ; CHECK-NEXT: st1d {za6h.d[w13, 0]}, p0, [x0]
119 ; CHECK-NEXT: st1d {za7h.d[w13, 0]}, p0, [x0]
120 ; CHECK-NEXT: st1d {za0v.d[w13, 0]}, p0, [x0]
121 ; CHECK-NEXT: st1d {za1v.d[w13, 0]}, p0, [x0]
122 ; CHECK-NEXT: st1d {za2v.d[w13, 0]}, p0, [x0]
123 ; CHECK-NEXT: st1d {za3v.d[w13, 0]}, p0, [x0]
124 ; CHECK-NEXT: st1d {za4v.d[w13, 0]}, p0, [x0]
125 ; CHECK-NEXT: st1d {za5v.d[w13, 0]}, p0, [x0]
126 ; CHECK-NEXT: st1d {za6v.d[w13, 0]}, p0, [x0]
127 ; CHECK-NEXT: st1d {za7v.d[w12, 1]}, p0, [x0]
129 %tileslice = add i32 %sliceidx, 1
130 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0)
131 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0)
132 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0)
133 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0)
134 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 %tileslice)
135 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0)
136 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0)
137 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 0)
138 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0)
139 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0)
140 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0)
141 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0)
142 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 0)
143 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0)
144 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0)
145 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 %tileslice)
; st1d with an indexed address: the pointer is computed with
; getelementptr i64, and the expected output folds the index into the
; scaled register-offset form [x0, x1, lsl #3].
149 define void @st1d_with_addr_offset(<vscale x 2 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
150 ; CHECK-LABEL: st1d_with_addr_offset:
152 ; CHECK-NEXT: mov w12, w2
153 ; CHECK-NEXT: mov w13, wzr
154 ; CHECK-NEXT: st1d {za0h.d[w12, 1]}, p0, [x0, x1, lsl #3]
155 ; CHECK-NEXT: st1d {za7v.d[w13, 0]}, p0, [x0, x1, lsl #3]
157 %base = getelementptr i64, ptr %ptr, i64 %index
158 %tileslice = add i32 %sliceidx, 1
159 call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
160 call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %base, i32 7, i32 0)
; st1q: quadword-element stores covering tiles 0-15 in both directions.
; Unlike the narrower variants in this file, this function takes no slice
; index argument: every call passes slice index 0, and the expected output
; uses a single w12 (moved from wzr) with immediate 0 throughout.
164 define void @st1q(<vscale x 1 x i1> %pg, ptr %ptr) {
167 ; CHECK-NEXT: mov w12, wzr
168 ; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0]
169 ; CHECK-NEXT: st1q {za1h.q[w12, 0]}, p0, [x0]
170 ; CHECK-NEXT: st1q {za2h.q[w12, 0]}, p0, [x0]
171 ; CHECK-NEXT: st1q {za3h.q[w12, 0]}, p0, [x0]
172 ; CHECK-NEXT: st1q {za4h.q[w12, 0]}, p0, [x0]
173 ; CHECK-NEXT: st1q {za5h.q[w12, 0]}, p0, [x0]
174 ; CHECK-NEXT: st1q {za6h.q[w12, 0]}, p0, [x0]
175 ; CHECK-NEXT: st1q {za7h.q[w12, 0]}, p0, [x0]
176 ; CHECK-NEXT: st1q {za8h.q[w12, 0]}, p0, [x0]
177 ; CHECK-NEXT: st1q {za9h.q[w12, 0]}, p0, [x0]
178 ; CHECK-NEXT: st1q {za10h.q[w12, 0]}, p0, [x0]
179 ; CHECK-NEXT: st1q {za11h.q[w12, 0]}, p0, [x0]
180 ; CHECK-NEXT: st1q {za12h.q[w12, 0]}, p0, [x0]
181 ; CHECK-NEXT: st1q {za13h.q[w12, 0]}, p0, [x0]
182 ; CHECK-NEXT: st1q {za14h.q[w12, 0]}, p0, [x0]
183 ; CHECK-NEXT: st1q {za15h.q[w12, 0]}, p0, [x0]
184 ; CHECK-NEXT: st1q {za0v.q[w12, 0]}, p0, [x0]
185 ; CHECK-NEXT: st1q {za1v.q[w12, 0]}, p0, [x0]
186 ; CHECK-NEXT: st1q {za2v.q[w12, 0]}, p0, [x0]
187 ; CHECK-NEXT: st1q {za3v.q[w12, 0]}, p0, [x0]
188 ; CHECK-NEXT: st1q {za4v.q[w12, 0]}, p0, [x0]
189 ; CHECK-NEXT: st1q {za5v.q[w12, 0]}, p0, [x0]
190 ; CHECK-NEXT: st1q {za6v.q[w12, 0]}, p0, [x0]
191 ; CHECK-NEXT: st1q {za7v.q[w12, 0]}, p0, [x0]
192 ; CHECK-NEXT: st1q {za8v.q[w12, 0]}, p0, [x0]
193 ; CHECK-NEXT: st1q {za9v.q[w12, 0]}, p0, [x0]
194 ; CHECK-NEXT: st1q {za10v.q[w12, 0]}, p0, [x0]
195 ; CHECK-NEXT: st1q {za11v.q[w12, 0]}, p0, [x0]
196 ; CHECK-NEXT: st1q {za12v.q[w12, 0]}, p0, [x0]
197 ; CHECK-NEXT: st1q {za13v.q[w12, 0]}, p0, [x0]
198 ; CHECK-NEXT: st1q {za14v.q[w12, 0]}, p0, [x0]
199 ; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0]
201 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0)
202 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0)
203 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0)
204 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0)
205 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0)
206 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0)
207 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0)
208 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0)
209 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0)
210 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0)
211 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0)
212 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0)
213 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0)
214 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0)
215 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0)
216 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0)
217 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0)
218 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0)
219 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0)
220 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0)
221 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0)
222 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0)
223 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0)
224 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0)
225 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0)
226 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0)
227 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0)
228 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0)
229 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0)
230 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0)
231 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0)
232 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0)
; st1q with an indexed address: the pointer is computed with
; getelementptr i128, and the expected output folds the index into the
; scaled register-offset form [x0, x1, lsl #4]. Both stores use slice index 0.
236 define void @st1q_with_addr_offset(<vscale x 1 x i1> %pg, ptr %ptr, i64 %index) {
237 ; CHECK-LABEL: st1q_with_addr_offset:
239 ; CHECK-NEXT: mov w12, wzr
240 ; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0, x1, lsl #4]
241 ; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0, x1, lsl #4]
243 %base = getelementptr i128, ptr %ptr, i64 %index
244 call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %base, i32 0, i32 0)
245 call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %base, i32 15, i32 0)
; str: simplest case of the ZA str intrinsic, with both i32 operands zero.
; The expected output is w12 = 0 (from wzr) and a store to [x0] with no
; immediate offset.
249 define void @str(ptr %ptr) {
252 ; CHECK-NEXT: mov w12, wzr
253 ; CHECK-NEXT: str za[w12, 0], [x0]
255 call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i32 0)
; str with a constant first operand of 15 and a pointer advanced by 15 bytes;
; the last operand is 0. The expected output materialises 15 in w12 and the
; byte-offset address in x8 (add x8, x0, #15); neither is folded into the
; instruction's immediate.
259 define void @str_with_off_15(ptr %ptr) {
260 ; CHECK-LABEL: str_with_off_15:
262 ; CHECK-NEXT: mov w12, #15 // =0xf
263 ; CHECK-NEXT: add x8, x0, #15
264 ; CHECK-NEXT: str za[w12, 0], [x8]
266 %base = getelementptr i8, ptr %ptr, i64 15
267 call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0)
; str where the pointer is advanced by vscale * 240 bytes (240 = 15 * 16).
; The expected output computes the address with addvl x8, x0, #15 and keeps
; the str immediate at 0.
271 define void @str_with_off_15mulvl(ptr %ptr) {
272 ; CHECK-LABEL: str_with_off_15mulvl:
274 ; CHECK-NEXT: mov w12, #15 // =0xf
275 ; CHECK-NEXT: addvl x8, x0, #15
276 ; CHECK-NEXT: str za[w12, 0], [x8]
278 %vscale = call i64 @llvm.vscale.i64()
279 %mulvl = mul i64 %vscale, 240
280 %base = getelementptr i8, ptr %ptr, i64 %mulvl
281 call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0)
; Same shape as str_with_off_15mulvl but with 16 in place of 15: the pointer
; is advanced by vscale * 256 bytes and the first operand is 16. The expected
; output uses addvl x8, x0, #16 and mov w12, #16.
285 define void @str_with_off_16mulvl(ptr %ptr) {
286 ; CHECK-LABEL: str_with_off_16mulvl:
288 ; CHECK-NEXT: mov w12, #16 // =0x10
289 ; CHECK-NEXT: addvl x8, x0, #16
290 ; CHECK-NEXT: str za[w12, 0], [x8]
292 %vscale = call i64 @llvm.vscale.i64()
293 %mulvl = mul i64 %vscale, 256
294 %base = getelementptr i8, ptr %ptr, i64 %mulvl
295 call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 0)
; str with a constant first operand (16) and a variable last operand %off.
; The expected output adds %off to the slice register (add w12, w1, #16) and
; to the address: x8 = x0 + rdsvl(#1) * sign-extended w1, computed with madd.
299 define void @str_with_off_var(ptr %base, i32 %off) {
300 ; CHECK-LABEL: str_with_off_var:
302 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
303 ; CHECK-NEXT: sxtw x8, w1
304 ; CHECK-NEXT: rdsvl x9, #1
305 ; CHECK-NEXT: add w12, w1, #16
306 ; CHECK-NEXT: madd x8, x9, x8, x0
307 ; CHECK-NEXT: str za[w12, 0], [x8]
309 call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off)
; str with a constant last operand of 15. The expected output folds that 15
; into the instruction as both the slice immediate and the "#15, mul vl"
; address offset, while the first operand (15) and the 15-byte pointer
; adjustment are still materialised in w12 and x8.
313 define void @str_with_off_15imm(ptr %ptr) {
314 ; CHECK-LABEL: str_with_off_15imm:
316 ; CHECK-NEXT: mov w12, #15 // =0xf
317 ; CHECK-NEXT: add x8, x0, #15
318 ; CHECK-NEXT: str za[w12, 15], [x8, #15, mul vl]
320 %base = getelementptr i8, ptr %ptr, i64 15
321 call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 15)
; str with a constant last operand of 16. In contrast to str_with_off_15imm,
; the expected output uses no instruction immediate: the slice register is
; 31 (15 + 16) and the address is x0 + (rdsvl(#1) << 4) + 15.
; NOTE(review): presumably 16 is outside the immediate range the instruction
; can encode - confirm against the instruction definition.
325 define void @str_with_off_16imm(ptr %ptr) {
326 ; CHECK-LABEL: str_with_off_16imm:
328 ; CHECK-NEXT: rdsvl x8, #1
329 ; CHECK-NEXT: mov w12, #31 // =0x1f
330 ; CHECK-NEXT: add x8, x0, x8, lsl #4
331 ; CHECK-NEXT: add x8, x8, #15
332 ; CHECK-NEXT: str za[w12, 0], [x8]
334 %base = getelementptr i8, ptr %ptr, i64 15
335 call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 16)
; Four str calls sharing %tile_slice and %ptr, with last operands 1..4.
; The expected output sets w12 once (mov w12, w0) and encodes 1..4 directly
; as the slice immediate and the "mul vl" address offset of each store.
339 define void @str_with_off_many_imm(i32 %tile_slice, ptr %ptr) {
340 ; CHECK-LABEL: str_with_off_many_imm:
341 ; CHECK: // %bb.0: // %entry
342 ; CHECK-NEXT: mov w12, w0
343 ; CHECK-NEXT: str za[w12, 1], [x1, #1, mul vl]
344 ; CHECK-NEXT: str za[w12, 2], [x1, #2, mul vl]
345 ; CHECK-NEXT: str za[w12, 3], [x1, #3, mul vl]
346 ; CHECK-NEXT: str za[w12, 4], [x1, #4, mul vl]
349 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 1)
350 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 2)
351 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 3)
352 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 4)
; Four str calls with last operands 15..18. In the expected output the first
; store uses immediate 15 against the original w0/x1; the remaining three are
; rebased by 16 (w12 = w0 + 16, x8 = x1 + (rdsvl(#1) << 4)) and use
; immediates 0..2.
356 define void @str_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) {
357 ; CHECK-LABEL: str_with_off_many_imm_15_18:
358 ; CHECK: // %bb.0: // %entry
359 ; CHECK-NEXT: rdsvl x8, #1
360 ; CHECK-NEXT: mov w12, w0
361 ; CHECK-NEXT: add x8, x1, x8, lsl #4
362 ; CHECK-NEXT: str za[w12, 15], [x1, #15, mul vl]
363 ; CHECK-NEXT: add w12, w0, #16
364 ; CHECK-NEXT: str za[w12, 0], [x8]
365 ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl]
366 ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl]
369 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 15)
370 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16)
371 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17)
372 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18)
; Four str calls with last operands 16..19. The expected output rebases all
; of them by 16 once (w12 = w0 + 16, x8 = x1 + (rdsvl(#1) << 4)) and uses
; immediates 0..3.
376 define void @str_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) {
377 ; CHECK-LABEL: str_with_off_many_imm_16_19:
378 ; CHECK: // %bb.0: // %entry
379 ; CHECK-NEXT: rdsvl x8, #1
380 ; CHECK-NEXT: add w12, w0, #16
381 ; CHECK-NEXT: add x8, x1, x8, lsl #4
382 ; CHECK-NEXT: str za[w12, 0], [x8]
383 ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl]
384 ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl]
385 ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl]
388 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16)
389 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17)
390 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18)
391 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 19)
; Four str calls with last operands 31..34. The expected output uses two
; rebased bases: offset 31 is emitted as base + 16 with immediate 15 (w12, x9),
; and offsets 32..34 as base + 32 with immediates 0..2 (w13, x8).
395 define void @str_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) {
396 ; CHECK-LABEL: str_with_off_many_imm_31_34:
397 ; CHECK: // %bb.0: // %entry
398 ; CHECK-NEXT: rdsvl x8, #1
399 ; CHECK-NEXT: add w12, w0, #16
400 ; CHECK-NEXT: add w13, w0, #32
401 ; CHECK-NEXT: add x9, x1, x8, lsl #4
402 ; CHECK-NEXT: add x8, x1, x8, lsl #5
403 ; CHECK-NEXT: str za[w12, 15], [x9, #15, mul vl]
404 ; CHECK-NEXT: str za[w13, 0], [x8]
405 ; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl]
406 ; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl]
409 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 31)
410 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32)
411 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33)
412 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34)
; Four str calls with last operands 32..35. The expected output rebases all
; of them by 32 once (w12 = w0 + 32, x8 = x1 + (rdsvl(#1) << 5)) and uses
; immediates 0..3.
416 define void @str_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr) {
417 ; CHECK-LABEL: str_with_off_many_imm_32_35:
418 ; CHECK: // %bb.0: // %entry
419 ; CHECK-NEXT: rdsvl x8, #1
420 ; CHECK-NEXT: add w12, w0, #32
421 ; CHECK-NEXT: add x8, x1, x8, lsl #5
422 ; CHECK-NEXT: str za[w12, 0], [x8]
423 ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl]
424 ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl]
425 ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl]
428 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32)
429 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33)
430 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34)
431 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 35)
; Four str calls whose last operand is variable, derived from %vnum truncated
; to i32 (%0). The expected output forms one base (w12 = w0 + w2,
; x8 = x1 + rdsvl(#1) * sign-extended w2) and uses immediates 0..3.
; NOTE(review): the definitions of %1..%3 are not visible in this view; the
; expected immediates suggest they are %0 + 1, %0 + 2 and %0 + 3 - confirm.
435 define void @str_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) {
436 ; CHECK-LABEL: str_with_off_many_var:
437 ; CHECK: // %bb.0: // %entry
438 ; CHECK-NEXT: sxtw x8, w2
439 ; CHECK-NEXT: rdsvl x9, #1
440 ; CHECK-NEXT: add w12, w0, w2
441 ; CHECK-NEXT: madd x8, x9, x8, x1
442 ; CHECK-NEXT: str za[w12, 0], [x8]
443 ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl]
444 ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl]
445 ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl]
448 %0 = trunc i64 %vnum to i32
449 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %0)
451 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1)
453 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2)
455 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3)
; Variant of str_with_off_many_var with larger offsets from the truncated
; %vnum. The expected output adds 32 to w2 first (add w8, w2, #32), builds
; the slice register and address from that sum, and uses immediates 1..4.
; NOTE(review): the definitions of %1..%4 are not visible in this view; the
; expected output suggests they are %0 + 33 through %0 + 36 - confirm.
459 define void @str_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) {
460 ; CHECK-LABEL: str_with_off_many_var_high:
461 ; CHECK: // %bb.0: // %entry
462 ; CHECK-NEXT: add w8, w2, #32
463 ; CHECK-NEXT: rdsvl x10, #1
464 ; CHECK-NEXT: sxtw x9, w8
465 ; CHECK-NEXT: add w12, w0, w8
466 ; CHECK-NEXT: madd x9, x10, x9, x1
467 ; CHECK-NEXT: str za[w12, 1], [x9, #1, mul vl]
468 ; CHECK-NEXT: str za[w12, 2], [x9, #2, mul vl]
469 ; CHECK-NEXT: str za[w12, 3], [x9, #3, mul vl]
470 ; CHECK-NEXT: str za[w12, 4], [x9, #4, mul vl]
473 %0 = trunc i64 %vnum to i32
475 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1)
477 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2)
479 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3)
481 tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %4)
486 ; Ensure that the tile offset is sunk, given that this is likely to be an 'add'
487 ; that's decomposed into a base + offset in ISel.
; The loop body issues three st1w.horiz stores on tile 0 with slice indices
; %base, %base + 1 and %base + 2, where the two adds are defined ahead of the
; loop's phi. The expected loop keeps a single slice register (w12, set once
; before the loop) and encodes the offsets as immediates 0, 1 and 2.
488 define void @test_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) {
489 ; CHECK-LABEL: test_sink_tile0_offset_operand:
490 ; CHECK: // %bb.0: // %entry
491 ; CHECK-NEXT: mov w12, w1
492 ; CHECK-NEXT: .LBB24_1: // %for.body
493 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
494 ; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0]
495 ; CHECK-NEXT: subs w2, w2, #1
496 ; CHECK-NEXT: st1w {za0h.s[w12, 1]}, p0, [x0]
497 ; CHECK-NEXT: st1w {za0h.s[w12, 2]}, p0, [x0]
498 ; CHECK-NEXT: b.ne .LBB24_1
499 ; CHECK-NEXT: // %bb.2: // %exit
502 %add0 = add i32 %base, 1
503 %add1 = add i32 %base, 2
507 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
508 tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %base)
509 tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add0)
510 tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add1)
511 %inc = add nuw nsw i32 %i, 1
512 %exitcond.not = icmp eq i32 %inc, %N
513 br i1 %exitcond.not, label %exit, label %for.body
; Declarations of the SME store intrinsics exercised by the functions in this
; file, plus llvm.vscale.i64. The st1* intrinsics take (predicate, pointer,
; i32, i32); in this file the two i32 operands are used as the tile number
; and the slice index. The predicate width matches the element size, from
; <vscale x 16 x i1> for st1b down to <vscale x 1 x i1> for st1q.
519 declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, ptr, i32, i32)
520 declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1>, ptr, i32, i32)
521 declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1>, ptr, i32, i32)
522 declare void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1>, ptr, i32, i32)
523 declare void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1>, ptr, i32, i32)
524 declare void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1>, ptr, i32, i32)
525 declare void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1>, ptr, i32, i32)
526 declare void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1>, ptr, i32, i32)
527 declare void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1>, ptr, i32, i32)
528 declare void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1>, ptr, i32, i32)
530 declare void @llvm.aarch64.sme.str(i32, ptr, i32)
531 declare i64 @llvm.vscale.i64()