1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
; extract_row_b: eight horizontal byte-element tile reads
; (llvm.aarch64.sme.read.horiz.nxv16i8) with tile operand 0 and slice indices
; %tileslice + 0, 2, 4, ..., 14. Only the first result (%z0) is returned; the
; other seven results are unused in the IR, yet their reads still appear in
; the expected assembly.
; Expected assembly: w12 is set once from w0 and every constant offset shows
; up as the immediate in za0h.b[w12, imm].
4 define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
5 ; CHECK-LABEL: extract_row_b:
7 ; CHECK-NEXT: mov z1.d, z0.d
8 ; CHECK-NEXT: mov z2.d, z0.d
9 ; CHECK-NEXT: mov w12, w0
10 ; CHECK-NEXT: mov z1.b, p0/m, za0h.b[w12, 0]
11 ; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 2]
12 ; CHECK-NEXT: mov z2.d, z0.d
13 ; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 4]
14 ; CHECK-NEXT: mov z2.d, z0.d
15 ; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 6]
16 ; CHECK-NEXT: mov z2.d, z0.d
17 ; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 8]
18 ; CHECK-NEXT: mov z2.d, z0.d
19 ; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 10]
20 ; CHECK-NEXT: mov z2.d, z0.d
21 ; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 12]
22 ; CHECK-NEXT: mov z0.b, p0/m, za0h.b[w12, 14]
23 ; CHECK-NEXT: mov z0.d, z1.d
; Offset 0: the slice index is %tileslice itself.
25 %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
; Remaining reads step the slice index by 2 up to +14.
26 %tileslice.2 = add i32 %tileslice, 2
27 %z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
28 %tileslice.4 = add i32 %tileslice, 4
29 %z2 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.4)
30 %tileslice.6 = add i32 %tileslice, 6
31 %z3 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.6)
32 %tileslice.8 = add i32 %tileslice, 8
33 %z4 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.8)
34 %tileslice.10 = add i32 %tileslice, 10
35 %z5 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.10)
36 %tileslice.12 = add i32 %tileslice, 12
37 %z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.12)
38 %tileslice.14 = add i32 %tileslice, 14
39 %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
; Only the first read is returned.
40 ret <vscale x 16 x i8> %z0
; extract_col_b: eight vertical byte-element tile reads
; (llvm.aarch64.sme.read.vert.nxv16i8) with tile operand 0 and slice indices
; %tileslice + 1, 3, 5, ..., 15. Only the first result (%z0) is returned.
; Expected assembly: w12 is set once from w0 and the odd offsets appear as
; the immediates in za0v.b[w12, imm].
43 define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
44 ; CHECK-LABEL: extract_col_b:
46 ; CHECK-NEXT: mov z1.d, z0.d
47 ; CHECK-NEXT: mov z2.d, z0.d
48 ; CHECK-NEXT: mov w12, w0
49 ; CHECK-NEXT: mov z1.b, p0/m, za0v.b[w12, 1]
50 ; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 3]
51 ; CHECK-NEXT: mov z2.d, z0.d
52 ; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 5]
53 ; CHECK-NEXT: mov z2.d, z0.d
54 ; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 7]
55 ; CHECK-NEXT: mov z2.d, z0.d
56 ; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 9]
57 ; CHECK-NEXT: mov z2.d, z0.d
58 ; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 11]
59 ; CHECK-NEXT: mov z2.d, z0.d
60 ; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 13]
61 ; CHECK-NEXT: mov z0.b, p0/m, za0v.b[w12, 15]
62 ; CHECK-NEXT: mov z0.d, z1.d
; Unlike the row variant, even the first read uses a non-zero offset (+1).
64 %tileslice.1 = add i32 %tileslice, 1
65 %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
66 %tileslice.3 = add i32 %tileslice, 3
67 %z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.3)
68 %tileslice.5 = add i32 %tileslice, 5
69 %z2 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.5)
70 %tileslice.7 = add i32 %tileslice, 7
71 %z3 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.7)
72 %tileslice.9 = add i32 %tileslice, 9
73 %z4 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.9)
74 %tileslice.11 = add i32 %tileslice, 11
75 %z5 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.11)
76 %tileslice.13 = add i32 %tileslice, 13
77 %z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.13)
78 %tileslice.15 = add i32 %tileslice, 15
79 %z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
; Only the first read is returned.
80 ret <vscale x 16 x i8> %z0
; extract_row_h: four horizontal halfword-element tile reads
; (llvm.aarch64.sme.read.horiz.nxv8i16) with tile operand 0 and slice indices
; %tileslice + 0, 2, 4, 6. Only the first result (%z0) is returned.
; Expected assembly: the offsets appear as immediates in za0h.h[w12, imm].
83 define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
84 ; CHECK-LABEL: extract_row_h:
86 ; CHECK-NEXT: mov z1.d, z0.d
87 ; CHECK-NEXT: mov z2.d, z0.d
88 ; CHECK-NEXT: mov w12, w0
89 ; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
90 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 2]
91 ; CHECK-NEXT: mov z2.d, z0.d
92 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
93 ; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 6]
94 ; CHECK-NEXT: mov z0.d, z1.d
96 %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
97 %tileslice.2 = add i32 %tileslice, 2
98 %z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
99 %tileslice.4 = add i32 %tileslice, 4
100 %z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.4)
101 %tileslice.6 = add i32 %tileslice, 6
102 %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
; Only the first read is returned.
103 ret <vscale x 8 x i16> %z0
; extract_col_h: four vertical halfword-element tile reads
; (llvm.aarch64.sme.read.vert.nxv8i16) with tile operand 1 and slice indices
; %tileslice + 1, 3, 5, 7. Only the first result (%z0) is returned.
; Expected assembly: tile operand 1 corresponds to za1v.h, and the offsets
; appear as immediates in za1v.h[w12, imm].
106 define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
107 ; CHECK-LABEL: extract_col_h:
109 ; CHECK-NEXT: mov z1.d, z0.d
110 ; CHECK-NEXT: mov z2.d, z0.d
111 ; CHECK-NEXT: mov w12, w0
112 ; CHECK-NEXT: mov z1.h, p0/m, za1v.h[w12, 1]
113 ; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 3]
114 ; CHECK-NEXT: mov z2.d, z0.d
115 ; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 5]
116 ; CHECK-NEXT: mov z0.h, p0/m, za1v.h[w12, 7]
117 ; CHECK-NEXT: mov z0.d, z1.d
119 %tileslice.1 = add i32 %tileslice, 1
120 %z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
121 %tileslice.3 = add i32 %tileslice, 3
122 %z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.3)
123 %tileslice.5 = add i32 %tileslice, 5
124 %z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.5)
125 %tileslice.7 = add i32 %tileslice, 7
126 %z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
; Only the first read is returned.
127 ret <vscale x 8 x i16> %z0
; extract_f16: eight half-precision tile reads on tile operand 0 with slice
; indices %tileslice + 0 .. 7, alternating in pairs between the horizontal
; intrinsic (offsets 0, 1, 4, 5) and the vertical intrinsic (offsets 2, 3,
; 6, 7). Only the first result (%z0) is returned.
; Expected assembly: horizontal reads use za0h.h and vertical reads use
; za0v.h, each with the offset as the immediate.
130 define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
131 ; CHECK-LABEL: extract_f16:
133 ; CHECK-NEXT: mov z1.d, z0.d
134 ; CHECK-NEXT: mov z2.d, z0.d
135 ; CHECK-NEXT: mov w12, w0
136 ; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
137 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1]
138 ; CHECK-NEXT: mov z2.d, z0.d
139 ; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 2]
140 ; CHECK-NEXT: mov z2.d, z0.d
141 ; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 3]
142 ; CHECK-NEXT: mov z2.d, z0.d
143 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
144 ; CHECK-NEXT: mov z2.d, z0.d
145 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 5]
146 ; CHECK-NEXT: mov z2.d, z0.d
147 ; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 6]
148 ; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7]
149 ; CHECK-NEXT: mov z0.d, z1.d
; Horizontal reads: offsets 0 and 1.
151 %z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
152 %tileslice.1 = add i32 %tileslice, 1
153 %z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
; Vertical reads: offsets 2 and 3.
154 %tileslice.2 = add i32 %tileslice, 2
155 %z2 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
156 %tileslice.3 = add i32 %tileslice, 3
157 %z3 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.3)
; Horizontal reads: offsets 4 and 5.
158 %tileslice.4 = add i32 %tileslice, 4
159 %z4 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.4)
160 %tileslice.5 = add i32 %tileslice, 5
161 %z5 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.5)
; Vertical reads: offsets 6 and 7.
162 %tileslice.6 = add i32 %tileslice, 6
163 %z6 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
164 %tileslice.7 = add i32 %tileslice, 7
165 %z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
; Only the first read is returned.
166 ret <vscale x 8 x half> %z0
; extract_bf16: bfloat counterpart of the half-precision mixed read test.
; Eight reads on tile operand 0 with slice indices %tileslice + 0 .. 7,
; alternating in pairs between the horizontal intrinsic (offsets 0, 1, 4, 5)
; and the vertical intrinsic (offsets 2, 3, 6, 7). Only the first result
; (%z0) is returned.
; NOTE(review): the trailing `ptr %ptr` parameter is not referenced anywhere
; in the visible body - confirm whether it is intentional.
169 define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
170 ; CHECK-LABEL: extract_bf16:
172 ; CHECK-NEXT: mov z1.d, z0.d
173 ; CHECK-NEXT: mov z2.d, z0.d
174 ; CHECK-NEXT: mov w12, w0
175 ; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
176 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1]
177 ; CHECK-NEXT: mov z2.d, z0.d
178 ; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 2]
179 ; CHECK-NEXT: mov z2.d, z0.d
180 ; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 3]
181 ; CHECK-NEXT: mov z2.d, z0.d
182 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
183 ; CHECK-NEXT: mov z2.d, z0.d
184 ; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 5]
185 ; CHECK-NEXT: mov z2.d, z0.d
186 ; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 6]
187 ; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7]
188 ; CHECK-NEXT: mov z0.d, z1.d
; Horizontal reads: offsets 0 and 1.
190 %z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
191 %tileslice.1 = add i32 %tileslice, 1
192 %z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
; Vertical reads: offsets 2 and 3.
193 %tileslice.2 = add i32 %tileslice, 2
194 %z2 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
195 %tileslice.3 = add i32 %tileslice, 3
196 %z3 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.3)
; Horizontal reads: offsets 4 and 5.
197 %tileslice.4 = add i32 %tileslice, 4
198 %z4 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.4)
199 %tileslice.5 = add i32 %tileslice, 5
200 %z5 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.5)
; Vertical reads: offsets 6 and 7.
201 %tileslice.6 = add i32 %tileslice, 6
202 %z6 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
203 %tileslice.7 = add i32 %tileslice, 7
204 %z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
; Only the first read is returned.
205 ret <vscale x 8 x bfloat> %z0
; extract_row_s: two horizontal word-element tile reads
; (llvm.aarch64.sme.read.horiz.nxv4i32) with tile operand 0 and slice indices
; %tileslice + 0 and %tileslice + 2. Only the first result (%z0) is returned.
; Expected assembly: the offsets appear as immediates in za0h.s[w12, imm].
208 define <vscale x 4 x i32> @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
209 ; CHECK-LABEL: extract_row_s:
211 ; CHECK-NEXT: mov z1.d, z0.d
212 ; CHECK-NEXT: mov w12, w0
213 ; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
214 ; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 2]
215 ; CHECK-NEXT: mov z0.d, z1.d
217 %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
218 %tileslice.2 = add i32 %tileslice, 2
219 %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
; Only the first read is returned.
220 ret <vscale x 4 x i32> %z0
; extract_col_s: two vertical word-element tile reads
; (llvm.aarch64.sme.read.vert.nxv4i32) with tile operand 3 and slice indices
; %tileslice + 1 and %tileslice + 3. Only the first result (%z0) is returned.
; Expected assembly: tile operand 3 corresponds to za3v.s, and the offsets
; appear as immediates in za3v.s[w12, imm].
223 define <vscale x 4 x i32> @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
224 ; CHECK-LABEL: extract_col_s:
226 ; CHECK-NEXT: mov z1.d, z0.d
227 ; CHECK-NEXT: mov w12, w0
228 ; CHECK-NEXT: mov z1.s, p0/m, za3v.s[w12, 1]
229 ; CHECK-NEXT: mov z0.s, p0/m, za3v.s[w12, 3]
230 ; CHECK-NEXT: mov z0.d, z1.d
232 %tileslice.1 = add i32 %tileslice, 1
233 %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
234 %tileslice.3 = add i32 %tileslice, 3
235 %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
; Only the first read is returned.
236 ret <vscale x 4 x i32> %z0
; extract_f32: four single-precision tile reads on tile operand 0 with slice
; indices %tileslice + 0 .. 3: the first two use the horizontal intrinsic, the
; last two the vertical intrinsic. Only the first result (%z0) is returned.
; Expected assembly: za0h.s for offsets 0 and 1, za0v.s for offsets 2 and 3.
239 define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
240 ; CHECK-LABEL: extract_f32:
242 ; CHECK-NEXT: mov z1.d, z0.d
243 ; CHECK-NEXT: mov z2.d, z0.d
244 ; CHECK-NEXT: mov w12, w0
245 ; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
246 ; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 1]
247 ; CHECK-NEXT: mov z2.d, z0.d
248 ; CHECK-NEXT: mov z2.s, p0/m, za0v.s[w12, 2]
249 ; CHECK-NEXT: mov z0.s, p0/m, za0v.s[w12, 3]
250 ; CHECK-NEXT: mov z0.d, z1.d
; Horizontal reads: offsets 0 and 1.
252 %z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
253 %tileslice.1 = add i32 %tileslice, 1
254 %z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
; Vertical reads: offsets 2 and 3.
255 %tileslice.2 = add i32 %tileslice, 2
256 %z2 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
257 %tileslice.3 = add i32 %tileslice, 3
258 %z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
; Only the first read is returned.
259 ret <vscale x 4 x float> %z0
; extract_row_d: a single horizontal doubleword-element tile read
; (llvm.aarch64.sme.read.horiz.nxv2i64) with tile operand 0 and slice index
; %tileslice; its result is returned.
; Expected assembly: one predicated move from za0h.d[w12, 0] directly into z0.
262 define <vscale x 2 x i64> @extract_row_d(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
263 ; CHECK-LABEL: extract_row_d:
265 ; CHECK-NEXT: mov w12, w0
266 ; CHECK-NEXT: mov z0.d, p0/m, za0h.d[w12, 0]
268 %z0 = call <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
269 ret <vscale x 2 x i64> %z0
; extract_col_d: a single vertical doubleword-element tile read
; (llvm.aarch64.sme.read.vert.nxv2i64) with tile operand 1 and slice index
; %tileslice + 1; its result is returned.
; Expected assembly: the +1 appears as the immediate in za1v.d[w12, 1].
272 define <vscale x 2 x i64> @extract_col_d(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
273 ; CHECK-LABEL: extract_col_d:
275 ; CHECK-NEXT: mov w12, w0
276 ; CHECK-NEXT: mov z0.d, p0/m, za1v.d[w12, 1]
278 %tileslice.1 = add i32 %tileslice, 1
279 %z0 = call <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 1, i32 %tileslice.1)
280 ret <vscale x 2 x i64> %z0
; extract_f64: two double-precision tile reads on tile operand 0: a horizontal
; read at slice index %tileslice and a vertical read at %tileslice + 1. Only
; the first result (%z0) is returned.
; Expected assembly: za0h.d[w12, 0] followed by za0v.d[w12, 1].
283 define <vscale x 2 x double> @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
284 ; CHECK-LABEL: extract_f64:
286 ; CHECK-NEXT: mov z1.d, z0.d
287 ; CHECK-NEXT: mov w12, w0
288 ; CHECK-NEXT: mov z1.d, p0/m, za0h.d[w12, 0]
289 ; CHECK-NEXT: mov z0.d, p0/m, za0v.d[w12, 1]
290 ; CHECK-NEXT: mov z0.d, z1.d
292 %z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
293 %tileslice.1 = add i32 %tileslice, 1
294 %z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
; Only the first read is returned.
295 ret <vscale x 2 x double> %z0
; extract_row_q_v16i18: a single horizontal quadword read
; (llvm.aarch64.sme.readq.horiz.nxv16i8) with tile operand 0 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 is zeroed from wzr, then one predicated move from
; za0h.q[w12, 0] into z0.
; NOTE(review): "v16i18" in the function name looks like a typo for "v16i8"
; (compare extract_col_q_v16i8) - renaming would also require regenerating
; the label assertion.
298 define <vscale x 16 x i8> @extract_row_q_v16i18(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg) {
299 ; CHECK-LABEL: extract_row_q_v16i18:
301 ; CHECK-NEXT: mov w12, wzr
302 ; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0]
304 %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 0)
305 ret <vscale x 16 x i8> %res
; extract_row_q_v8i16: a single horizontal quadword read
; (llvm.aarch64.sme.readq.horiz.nxv8i16) with tile operand 0 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za0h.q[w12, 0].
308 define <vscale x 8 x i16> @extract_row_q_v8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg) {
309 ; CHECK-LABEL: extract_row_q_v8i16:
311 ; CHECK-NEXT: mov w12, wzr
312 ; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0]
314 %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 0)
315 ret <vscale x 8 x i16> %res
; extract_row_q_v8f16: a single horizontal quadword read
; (llvm.aarch64.sme.readq.horiz.nxv8f16) with tile operand 0 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za0h.q[w12, 0].
318 define <vscale x 8 x half> @extract_row_q_v8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg) {
319 ; CHECK-LABEL: extract_row_q_v8f16:
321 ; CHECK-NEXT: mov w12, wzr
322 ; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0]
324 %res = call <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 0)
325 ret <vscale x 8 x half> %res
; extract_row_q_v4i32: a single horizontal quadword read
; (llvm.aarch64.sme.readq.horiz.nxv4i32) with tile operand 0 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za0h.q[w12, 0].
328 define <vscale x 4 x i32> @extract_row_q_v4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg) {
329 ; CHECK-LABEL: extract_row_q_v4i32:
331 ; CHECK-NEXT: mov w12, wzr
332 ; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0]
334 %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 0)
335 ret <vscale x 4 x i32> %res
; extract_row_q_v4f32: a single horizontal quadword read
; (llvm.aarch64.sme.readq.horiz.nxv4f32) with tile operand 0 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za0h.q[w12, 0].
338 define <vscale x 4 x float> @extract_row_q_v4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg) {
339 ; CHECK-LABEL: extract_row_q_v4f32:
341 ; CHECK-NEXT: mov w12, wzr
342 ; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0]
344 %res = call <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 0)
345 ret <vscale x 4 x float> %res
; extract_row_q_v2i64: a single horizontal quadword read
; (llvm.aarch64.sme.readq.horiz.nxv2i64) with tile operand 0 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za0h.q[w12, 0].
348 define <vscale x 2 x i64> @extract_row_q_v2i64(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg) {
349 ; CHECK-LABEL: extract_row_q_v2i64:
351 ; CHECK-NEXT: mov w12, wzr
352 ; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0]
354 %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 0, i32 0)
355 ret <vscale x 2 x i64> %res
; extract_row_q_v2f64: a single horizontal quadword read
; (llvm.aarch64.sme.readq.horiz.nxv2f64) with tile operand 0 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za0h.q[w12, 0].
358 define <vscale x 2 x double> @extract_row_q_v2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg) {
359 ; CHECK-LABEL: extract_row_q_v2f64:
361 ; CHECK-NEXT: mov w12, wzr
362 ; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0]
364 %res = call <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 0)
365 ret <vscale x 2 x double> %res
; extract_col_q_v16i8: a single vertical quadword read
; (llvm.aarch64.sme.readq.vert.nxv16i8) with tile operand 15 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za15v.q[w12, 0].
368 define <vscale x 16 x i8> @extract_col_q_v16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg) {
369 ; CHECK-LABEL: extract_col_q_v16i8:
371 ; CHECK-NEXT: mov w12, wzr
372 ; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0]
374 %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 15, i32 0)
375 ret <vscale x 16 x i8> %res
; extract_col_q_v8i16: a single vertical quadword read
; (llvm.aarch64.sme.readq.vert.nxv8i16) with tile operand 15 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za15v.q[w12, 0].
378 define <vscale x 8 x i16> @extract_col_q_v8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg) {
379 ; CHECK-LABEL: extract_col_q_v8i16:
381 ; CHECK-NEXT: mov w12, wzr
382 ; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0]
384 %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 15, i32 0)
385 ret <vscale x 8 x i16> %res
; extract_col_q_v8f16: a single vertical quadword read
; (llvm.aarch64.sme.readq.vert.nxv8f16) with tile operand 15 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za15v.q[w12, 0].
388 define <vscale x 8 x half> @extract_col_q_v8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg) {
389 ; CHECK-LABEL: extract_col_q_v8f16:
391 ; CHECK-NEXT: mov w12, wzr
392 ; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0]
394 %res = call <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 15, i32 0)
395 ret <vscale x 8 x half> %res
; extract_col_q_v4i32: a single vertical quadword read
; (llvm.aarch64.sme.readq.vert.nxv4i32) with tile operand 15 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za15v.q[w12, 0].
398 define <vscale x 4 x i32> @extract_col_q_v4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg) {
399 ; CHECK-LABEL: extract_col_q_v4i32:
401 ; CHECK-NEXT: mov w12, wzr
402 ; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0]
404 %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 15, i32 0)
405 ret <vscale x 4 x i32> %res
; extract_col_q_v4f32: a single vertical quadword read
; (llvm.aarch64.sme.readq.vert.nxv4f32) with tile operand 15 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za15v.q[w12, 0].
408 define <vscale x 4 x float> @extract_col_q_v4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg) {
409 ; CHECK-LABEL: extract_col_q_v4f32:
411 ; CHECK-NEXT: mov w12, wzr
412 ; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0]
414 %res = call <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 15, i32 0)
415 ret <vscale x 4 x float> %res
; extract_col_q_v2i64: a single vertical quadword read
; (llvm.aarch64.sme.readq.vert.nxv2i64) with tile operand 15 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za15v.q[w12, 0].
418 define <vscale x 2 x i64> @extract_col_q_v2i64(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg) {
419 ; CHECK-LABEL: extract_col_q_v2i64:
421 ; CHECK-NEXT: mov w12, wzr
422 ; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0]
424 %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 15, i32 0)
425 ret <vscale x 2 x i64> %res
; extract_col_q_v2f64: a single vertical quadword read
; (llvm.aarch64.sme.readq.vert.nxv2f64) with tile operand 15 and a constant
; slice index of 0; its result is returned.
; Expected assembly: w12 zeroed from wzr, then a move from za15v.q[w12, 0].
428 define <vscale x 2 x double> @extract_col_q_v2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg) {
429 ; CHECK-LABEL: extract_col_q_v2f64:
431 ; CHECK-NEXT: mov w12, wzr
432 ; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0]
434 %res = call <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 15, i32 0)
435 ret <vscale x 2 x double> %res
; test_sink_offset_operand: a loop containing three horizontal word-element
; tile reads on tile operand 0 at slice indices %base, %base + 1 and
; %base + 2, each with a zeroinitializer passthrough vector. The two adds that
; form the offsets are placed ahead of the loop phi. The induction value %i
; starts at 0 and advances by 3; the loop exits when %inc equals %N. After the
; loop the three read results are summed and returned.
; Expected assembly: w12 is set once from w0 before the loop label, and the
; reads inside the loop carry the immediates 0, 1 and 2.
; Presumably (per the function name) this exercises sinking the offset adds
; next to their users so they can be folded into the immediate - TODO confirm.
438 define <vscale x 4 x i32> @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
439 ; CHECK-LABEL: test_sink_offset_operand:
440 ; CHECK: // %bb.0: // %entry
441 ; CHECK-NEXT: mov z0.s, #0 // =0x0
442 ; CHECK-NEXT: mov w12, w0
443 ; CHECK-NEXT: .LBB26_1: // %for.body
444 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
445 ; CHECK-NEXT: mov z1.d, z0.d
446 ; CHECK-NEXT: mov z2.d, z0.d
447 ; CHECK-NEXT: subs w1, w1, #3
448 ; CHECK-NEXT: mov z3.d, z0.d
449 ; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
450 ; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 1]
451 ; CHECK-NEXT: mov z3.s, p0/m, za0h.s[w12, 2]
452 ; CHECK-NEXT: b.ne .LBB26_1
453 ; CHECK-NEXT: // %bb.2: // %exit
454 ; CHECK-NEXT: add z0.s, z1.s, z2.s
455 ; CHECK-NEXT: add z0.s, z0.s, z3.s
; Slice-index offsets, computed ahead of the loop phi.
458 %add1 = add i32 %base, 1
459 %add2 = add i32 %base, 2
; Loop body: induction value plus the three tile reads.
463 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
464 %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i32 0, i32 %base)
465 %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i32 0, i32 %add1)
466 %z2 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i32 0, i32 %add2)
; Step by 3 and leave the loop once the counter equals %N.
467 %inc = add nuw nsw i32 %i, 3
468 %exitcond.not = icmp eq i32 %inc, %N
469 br i1 %exitcond.not, label %exit, label %for.body
; Combine all three read results so that none of them is dead.
472 %tmp1 = add <vscale x 4 x i32> %z0, %z1
473 %res = add <vscale x 4 x i32> %tmp1, %z2
474 ret <vscale x 4 x i32> %res
; Declarations of the SME tile-read intrinsics. Each one takes a passthrough
; vector, a predicate of matching element count, and two i32 operands (the
; tile operand and the slice index, in the order used by the calls in this
; file), and returns a vector of the same type as the passthrough.

; read.horiz: one declaration per element type.
477 declare <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32, i32)
478 declare <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i32, i32)
479 declare <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, i32, i32)
480 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32, i32)
481 declare <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32, i32)
482 declare <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
483 declare <vscale x 2 x i64> @llvm.aarch64.sme.read.horiz.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
484 declare <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
; read.vert: same set of element types.
485 declare <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32, i32)
486 declare <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i32, i32)
487 declare <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, i32, i32)
488 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32, i32)
489 declare <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32, i32)
490 declare <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
491 declare <vscale x 2 x i64> @llvm.aarch64.sme.read.vert.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
492 declare <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
; readq.horiz: quadword reads.
; NOTE(review): the nxv8bf16 readq declarations (horizontal here, vertical
; further down) have no caller in the visible portion of this file.
494 declare <vscale x 16 x i8> @llvm.aarch64.sme.readq.horiz.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32, i32)
495 declare <vscale x 8 x i16> @llvm.aarch64.sme.readq.horiz.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i32, i32)
496 declare <vscale x 8 x half> @llvm.aarch64.sme.readq.horiz.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, i32, i32)
497 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.horiz.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32, i32)
498 declare <vscale x 4 x i32> @llvm.aarch64.sme.readq.horiz.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32, i32)
499 declare <vscale x 4 x float> @llvm.aarch64.sme.readq.horiz.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
500 declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.horiz.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
501 declare <vscale x 2 x double> @llvm.aarch64.sme.readq.horiz.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
; readq.vert: quadword reads.
502 declare <vscale x 16 x i8> @llvm.aarch64.sme.readq.vert.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32, i32)
503 declare <vscale x 8 x i16> @llvm.aarch64.sme.readq.vert.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i32, i32)
504 declare <vscale x 8 x half> @llvm.aarch64.sme.readq.vert.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, i32, i32)
505 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readq.vert.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, i32, i32)
506 declare <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32, i32)
507 declare <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
508 declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
509 declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)