1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s
4 target triple = "aarch64-linux"
6 ;MOVAZ (tile to vector, Multi)
; Horizontal x2 read of nxv16i8: tile 0 at %slice, then tile 0 at %slice + 14.
; The CHECK lines expect the +14 to appear as the movaz slice immediate 14:15
; with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
13 define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_hor_z8_i8_x2(i32 %tile, i32 %slice) #0 {
14 ; CHECK-LABEL: test_readz_hor_z8_i8_x2:
16 ; CHECK-NEXT: mov w12, w1
17 ; CHECK-NEXT: movaz { z0.b, z1.b }, za0h.b[w12, 0:1]
18 ; CHECK-NEXT: movaz { z0.b, z1.b }, za0h.b[w12, 14:15]
20 %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice)
21 %slice.max = add i32 %slice, 14
22 %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice.max)
23 ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
; Horizontal x2 read of nxv8i16: tile 0 at %slice, then tile 1 at %slice + 6.
; The CHECK lines expect the +6 to appear as the movaz slice immediate 6:7
; with no separate add; only the second result (%res2) is returned.
25 define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_hor_z16_i16_x2(i32 %slice) #0 {
26 ; CHECK-LABEL: test_readz_hor_z16_i16_x2:
28 ; CHECK-NEXT: mov w12, w0
29 ; CHECK-NEXT: movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
30 ; CHECK-NEXT: movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
32 %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 %slice)
33 %slice.max = add i32 %slice, 6
34 %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 %slice.max)
35 ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
; Horizontal x2 read of nxv4i32: tile 0 at %slice, then tile 3 at %slice + 2.
; The CHECK lines expect the +2 to appear as the movaz slice immediate 2:3
; with no separate add; only the second result (%res2) is returned.
38 define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_hor_z32_i32_x2(i32 %slice) #0 {
39 ; CHECK-LABEL: test_readz_hor_z32_i32_x2:
41 ; CHECK-NEXT: mov w12, w0
42 ; CHECK-NEXT: movaz { z0.s, z1.s }, za0h.s[w12, 0:1]
43 ; CHECK-NEXT: movaz { z0.s, z1.s }, za3h.s[w12, 2:3]
45 %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 %slice)
46 %slice.max = add i32 %slice, 2
47 %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 3, i32 %slice.max)
48 ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
; Horizontal x2 read of nxv2i64: tile 0 and tile 7, both at %slice (no offset).
; Unlike the narrower element types, %res (the first read) is returned; the
; CHECK lines show the second movaz still emitted, writing z2/z3.
51 define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_hor_z64_i64_x2(i32 %slice) #0 {
52 ; CHECK-LABEL: test_readz_hor_z64_i64_x2:
54 ; CHECK-NEXT: mov w12, w0
55 ; CHECK-NEXT: movaz { z0.d, z1.d }, za0h.d[w12, 0:1]
56 ; CHECK-NEXT: movaz { z2.d, z3.d }, za7h.d[w12, 0:1]
58 %res = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 %slice)
59 %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 7, i32 %slice)
60 ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %res
; Horizontal x2 read of nxv8bf16: tile 0 at %slice, then tile 1 at %slice + 6.
; The CHECK lines expect the +6 to appear as the movaz slice immediate 6:7
; with no separate add; only the second result (%res2) is returned.
63 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_hor_z16_bf16_x2(i32 %slice) #0 {
64 ; CHECK-LABEL: test_readz_hor_z16_bf16_x2:
66 ; CHECK-NEXT: mov w12, w0
67 ; CHECK-NEXT: movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
68 ; CHECK-NEXT: movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
70 %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 0, i32 %slice)
71 %slice.max = add i32 %slice, 6
72 %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 %slice.max)
73 ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
; Horizontal x2 read of nxv8f16: tile 0 at %slice, then tile 1 at %slice + 6.
; The CHECK lines expect the +6 to appear as the movaz slice immediate 6:7
; with no separate add; only the second result (%res2) is returned.
76 define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_hor_z16_f16_x2(i32 %slice) #0 {
77 ; CHECK-LABEL: test_readz_hor_z16_f16_x2:
79 ; CHECK-NEXT: mov w12, w0
80 ; CHECK-NEXT: movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
81 ; CHECK-NEXT: movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
83 %res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 %slice)
84 %slice.max = add i32 %slice, 6
85 %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 1, i32 %slice.max)
86 ret {<vscale x 8 x half>, <vscale x 8 x half>} %res2
; Horizontal x2 read of nxv4f32: tile 0 at %slice, then tile 3 at %slice + 2.
; The CHECK lines expect the +2 to appear as the movaz slice immediate 2:3
; with no separate add; only the second result (%res2) is returned.
89 define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_hor_z32_f32_x2(i32 %slice) #0 {
90 ; CHECK-LABEL: test_readz_hor_z32_f32_x2:
92 ; CHECK-NEXT: mov w12, w0
93 ; CHECK-NEXT: movaz { z0.s, z1.s }, za0h.s[w12, 0:1]
94 ; CHECK-NEXT: movaz { z0.s, z1.s }, za3h.s[w12, 2:3]
96 %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 0, i32 %slice)
97 %slice.max = add i32 %slice, 2
98 %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 %slice.max)
99 ret {<vscale x 4 x float>, <vscale x 4 x float>} %res2
; Horizontal x2 read of nxv2f64: tile 0 and tile 7, both at %slice (no offset).
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z2/z3.
102 define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_hor_z64_f64_x2(i32 %slice) #0 {
103 ; CHECK-LABEL: test_readz_hor_z64_f64_x2:
105 ; CHECK-NEXT: mov w12, w0
106 ; CHECK-NEXT: movaz { z0.d, z1.d }, za0h.d[w12, 0:1]
107 ; CHECK-NEXT: movaz { z2.d, z3.d }, za7h.d[w12, 0:1]
109 %res = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 0, i32 %slice)
110 %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 %slice)
111 ret {<vscale x 2 x double>, <vscale x 2 x double>} %res
; Vertical x2 read of nxv16i8: tile 0 at %slice, then tile 0 at %slice + 14.
; The CHECK lines expect the +14 to appear as the movaz slice immediate 14:15
; with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
118 define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_ver_z8_i8_x2(i32 %tile, i32 %slice) #0 {
119 ; CHECK-LABEL: test_readz_ver_z8_i8_x2:
121 ; CHECK-NEXT: mov w12, w1
122 ; CHECK-NEXT: movaz { z0.b, z1.b }, za0v.b[w12, 0:1]
123 ; CHECK-NEXT: movaz { z0.b, z1.b }, za0v.b[w12, 14:15]
125 %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice)
126 %slice.max = add i32 %slice, 14
127 %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice.max)
128 ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
; Vertical x2 read of nxv8i16: tile 0 at %slice, then tile 1 at %slice + 6.
; The CHECK lines expect the +6 to appear as the movaz slice immediate 6:7
; with no separate add; only the second result (%res2) is returned.
130 define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_ver_z16_i16_x2(i32 %slice) #0 {
131 ; CHECK-LABEL: test_readz_ver_z16_i16_x2:
133 ; CHECK-NEXT: mov w12, w0
134 ; CHECK-NEXT: movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
135 ; CHECK-NEXT: movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
137 %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 %slice)
138 %slice.max = add i32 %slice, 6
139 %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 %slice.max)
140 ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
; Vertical x2 read of nxv4i32: tile 0 at %slice, then tile 3 at %slice + 2.
; The CHECK lines expect the +2 to appear as the movaz slice immediate 2:3
; with no separate add; only the second result (%res2) is returned.
143 define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_ver_z32_i32_x2(i32 %slice) #0 {
144 ; CHECK-LABEL: test_readz_ver_z32_i32_x2:
146 ; CHECK-NEXT: mov w12, w0
147 ; CHECK-NEXT: movaz { z0.s, z1.s }, za0v.s[w12, 0:1]
148 ; CHECK-NEXT: movaz { z0.s, z1.s }, za3v.s[w12, 2:3]
150 %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 %slice)
151 %slice.max = add i32 %slice, 2
152 %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 3, i32 %slice.max)
153 ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
; Vertical x2 read of nxv2i64: tile 0 and tile 7, both at %slice (no offset).
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z2/z3.
156 define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_ver_z64_i64_x2(i32 %slice) #0 {
157 ; CHECK-LABEL: test_readz_ver_z64_i64_x2:
159 ; CHECK-NEXT: mov w12, w0
160 ; CHECK-NEXT: movaz { z0.d, z1.d }, za0v.d[w12, 0:1]
161 ; CHECK-NEXT: movaz { z2.d, z3.d }, za7v.d[w12, 0:1]
163 %res = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 %slice)
164 %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 7, i32 %slice)
165 ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %res
; Vertical x2 read of nxv8bf16: tile 0 at %slice, then tile 1 at %slice + 6.
; The CHECK lines expect the +6 to appear as the movaz slice immediate 6:7
; with no separate add; only the second result (%res2) is returned.
168 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_ver_z16_bf16_x2(i32 %slice) #0 {
169 ; CHECK-LABEL: test_readz_ver_z16_bf16_x2:
171 ; CHECK-NEXT: mov w12, w0
172 ; CHECK-NEXT: movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
173 ; CHECK-NEXT: movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
175 %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 0, i32 %slice)
176 %slice.max = add i32 %slice, 6
177 %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 %slice.max)
178 ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
; Vertical x2 read of nxv8f16: tile 0 at %slice, then tile 1 at %slice + 6.
; The CHECK lines expect the +6 to appear as the movaz slice immediate 6:7
; with no separate add; only the second result (%res2) is returned.
181 define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_ver_z16_f16_x2(i32 %slice) #0 {
182 ; CHECK-LABEL: test_readz_ver_z16_f16_x2:
184 ; CHECK-NEXT: mov w12, w0
185 ; CHECK-NEXT: movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
186 ; CHECK-NEXT: movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
188 %res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 %slice)
189 %slice.max = add i32 %slice, 6
190 %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 1, i32 %slice.max)
191 ret {<vscale x 8 x half>, <vscale x 8 x half>} %res2
; Vertical x2 read of nxv4f32: tile 0 at %slice, then tile 3 at %slice + 2.
; The CHECK lines expect the +2 to appear as the movaz slice immediate 2:3
; with no separate add; only the second result (%res2) is returned.
194 define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_ver_z32_f32_x2(i32 %slice) #0 {
195 ; CHECK-LABEL: test_readz_ver_z32_f32_x2:
197 ; CHECK-NEXT: mov w12, w0
198 ; CHECK-NEXT: movaz { z0.s, z1.s }, za0v.s[w12, 0:1]
199 ; CHECK-NEXT: movaz { z0.s, z1.s }, za3v.s[w12, 2:3]
201 %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 0, i32 %slice)
202 %slice.max = add i32 %slice, 2
203 %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 %slice.max)
204 ret {<vscale x 4 x float>, <vscale x 4 x float>} %res2
; Vertical x2 read of nxv2f64: tile 0 and tile 7, both at %slice (no offset).
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z2/z3.
207 define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_ver_z64_f64_x2(i32 %slice) #0 {
208 ; CHECK-LABEL: test_readz_ver_z64_f64_x2:
210 ; CHECK-NEXT: mov w12, w0
211 ; CHECK-NEXT: movaz { z0.d, z1.d }, za0v.d[w12, 0:1]
212 ; CHECK-NEXT: movaz { z2.d, z3.d }, za7v.d[w12, 0:1]
214 %res = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 0, i32 %slice)
215 %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 %slice)
216 ret {<vscale x 2 x double>, <vscale x 2 x double>} %res
; Horizontal x4 read of nxv16i8: tile 0 at %slice, then tile 0 at %slice + 12.
; The CHECK lines expect the +12 to appear as the movaz slice immediate 12:15
; with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
223 define {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_hor_z8_i8_x4(i32 %tile, i32 %slice) #0 {
224 ; CHECK-LABEL: test_readz_hor_z8_i8_x4:
226 ; CHECK-NEXT: mov w12, w1
227 ; CHECK-NEXT: movaz { z0.b - z3.b }, za0h.b[w12, 0:3]
228 ; CHECK-NEXT: movaz { z0.b - z3.b }, za0h.b[w12, 12:15]
230 %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice)
231 %slice.max = add i32 %slice, 12
232 %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice.max)
233 ret {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
; Horizontal x4 read of nxv8i16: tile 0 at %slice, then tile 1 at %slice + 4.
; The CHECK lines expect the +4 to appear as the movaz slice immediate 4:7
; with no separate add; only the second result (%res2) is returned.
235 define {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_hor_z16_i16_x4(i32 %slice) #0 {
236 ; CHECK-LABEL: test_readz_hor_z16_i16_x4:
238 ; CHECK-NEXT: mov w12, w0
239 ; CHECK-NEXT: movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
240 ; CHECK-NEXT: movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
242 %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 %slice)
243 %slice.max = add i32 %slice, 4
244 %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 %slice.max)
245 ret {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
; Horizontal x4 read of nxv4i32: tile 0 and tile 3, both at %slice (no offset).
; The second result (%res2) is returned; the CHECK lines show both movaz
; instructions writing z0-z3 with slice immediate 0:3.
248 define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_hor_z32_i32_x4(i32 %slice) #0 {
249 ; CHECK-LABEL: test_readz_hor_z32_i32_x4:
251 ; CHECK-NEXT: mov w12, w0
252 ; CHECK-NEXT: movaz { z0.s - z3.s }, za0h.s[w12, 0:3]
253 ; CHECK-NEXT: movaz { z0.s - z3.s }, za3h.s[w12, 0:3]
255 %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 %slice)
256 %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 3, i32 %slice)
257 ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
; Horizontal x4 read of nxv2i64: tile 0 and tile 7, both at %slice (no offset).
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z4-z7.
260 define {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_hor_z64_i64_x4(i32 %slice) #0 {
261 ; CHECK-LABEL: test_readz_hor_z64_i64_x4:
263 ; CHECK-NEXT: mov w12, w0
264 ; CHECK-NEXT: movaz { z0.d - z3.d }, za0h.d[w12, 0:3]
265 ; CHECK-NEXT: movaz { z4.d - z7.d }, za7h.d[w12, 0:3]
267 %res = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 %slice)
268 %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 7, i32 %slice)
269 ret {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} %res
; Horizontal x4 read of nxv8bf16: tile 0 at %slice, then tile 1 at %slice + 4.
; The CHECK lines expect the +4 to appear as the movaz slice immediate 4:7
; with no separate add; only the second result (%res2) is returned.
272 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_hor_z16_bf16_x4(i32 %slice) #0 {
273 ; CHECK-LABEL: test_readz_hor_z16_bf16_x4:
275 ; CHECK-NEXT: mov w12, w0
276 ; CHECK-NEXT: movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
277 ; CHECK-NEXT: movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
279 %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 0, i32 %slice)
280 %slice.max = add i32 %slice, 4
281 %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 %slice.max)
282 ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
; Horizontal x4 read of nxv8f16: tile 0 at %slice, then tile 1 at %slice + 4.
; The CHECK lines expect the +4 to appear as the movaz slice immediate 4:7
; with no separate add; only the second result (%res2) is returned.
285 define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_hor_z16_f16_x4(i32 %slice) #0 {
286 ; CHECK-LABEL: test_readz_hor_z16_f16_x4:
288 ; CHECK-NEXT: mov w12, w0
289 ; CHECK-NEXT: movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
290 ; CHECK-NEXT: movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
292 %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 %slice)
293 %slice.max = add i32 %slice, 4
294 %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 1, i32 %slice.max)
295 ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res2
; Horizontal x4 read of nxv4f32: tile 0 and tile 3, both at %slice (no offset).
; The second result (%res2) is returned; the CHECK lines show both movaz
; instructions writing z0-z3 with slice immediate 0:3.
298 define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_hor_z32_f32_x4(i32 %slice) #0 {
299 ; CHECK-LABEL: test_readz_hor_z32_f32_x4:
301 ; CHECK-NEXT: mov w12, w0
302 ; CHECK-NEXT: movaz { z0.s - z3.s }, za0h.s[w12, 0:3]
303 ; CHECK-NEXT: movaz { z0.s - z3.s }, za3h.s[w12, 0:3]
305 %res = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 0, i32 %slice)
306 %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 %slice)
307 ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res2
; Horizontal x4 read of nxv2f64: tile 0 and tile 7, both at %slice (no offset).
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z4-z7.
310 define {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_hor_z64_f64_x4(i32 %slice) #0 {
311 ; CHECK-LABEL: test_readz_hor_z64_f64_x4:
313 ; CHECK-NEXT: mov w12, w0
314 ; CHECK-NEXT: movaz { z0.d - z3.d }, za0h.d[w12, 0:3]
315 ; CHECK-NEXT: movaz { z4.d - z7.d }, za7h.d[w12, 0:3]
317 %res = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 0, i32 %slice)
318 %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 %slice)
319 ret {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} %res
; Vertical x4 read of nxv16i8: tile 0 at %slice, then tile 0 at %slice + 12.
; The CHECK lines expect the +12 to appear as the movaz slice immediate 12:15
; with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
326 define {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_ver_z8_i8_x4(i32 %tile, i32 %slice) #0 {
327 ; CHECK-LABEL: test_readz_ver_z8_i8_x4:
329 ; CHECK-NEXT: mov w12, w1
330 ; CHECK-NEXT: movaz { z0.b - z3.b }, za0v.b[w12, 0:3]
331 ; CHECK-NEXT: movaz { z0.b - z3.b }, za0v.b[w12, 12:15]
333 %res = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice)
334 %slice.max = add i32 %slice, 12
335 %res2 = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice.max)
336 ret {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} %res2
; Vertical x4 read of nxv8i16: tile 0 at %slice, then tile 1 at %slice + 4.
; The CHECK lines expect the +4 to appear as the movaz slice immediate 4:7
; with no separate add; only the second result (%res2) is returned.
338 define {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_ver_z16_i16_x4(i32 %slice) #0 {
339 ; CHECK-LABEL: test_readz_ver_z16_i16_x4:
341 ; CHECK-NEXT: mov w12, w0
342 ; CHECK-NEXT: movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
343 ; CHECK-NEXT: movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
345 %res = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 %slice)
346 %slice.max = add i32 %slice, 4
347 %res2 = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 %slice.max)
348 ret {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} %res2
; Vertical x4 read of nxv4i32: tile 0 and tile 3, both at %slice (no offset).
; The second result (%res2) is returned; the CHECK lines show both movaz
; instructions writing z0-z3 with slice immediate 0:3.
351 define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_ver_z32_i32_x4(i32 %slice) #0 {
352 ; CHECK-LABEL: test_readz_ver_z32_i32_x4:
354 ; CHECK-NEXT: mov w12, w0
355 ; CHECK-NEXT: movaz { z0.s - z3.s }, za0v.s[w12, 0:3]
356 ; CHECK-NEXT: movaz { z0.s - z3.s }, za3v.s[w12, 0:3]
358 %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 %slice)
359 %res2 = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 3, i32 %slice)
360 ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %res2
; Vertical x4 read of nxv2i64: tile 0 and tile 7, both at %slice (no offset).
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z4-z7.
363 define {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_ver_z64_i64_x4(i32 %slice) #0 {
364 ; CHECK-LABEL: test_readz_ver_z64_i64_x4:
366 ; CHECK-NEXT: mov w12, w0
367 ; CHECK-NEXT: movaz { z0.d - z3.d }, za0v.d[w12, 0:3]
368 ; CHECK-NEXT: movaz { z4.d - z7.d }, za7v.d[w12, 0:3]
370 %res = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 %slice)
371 %res2 = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 7, i32 %slice)
372 ret {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} %res
; Vertical x4 read of nxv8bf16: tile 0 at %slice, then tile 1 at %slice + 4.
; The CHECK lines expect the +4 to appear as the movaz slice immediate 4:7
; with no separate add; only the second result (%res2) is returned.
375 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_ver_z16_bf16_x4(i32 %slice) #0 {
376 ; CHECK-LABEL: test_readz_ver_z16_bf16_x4:
378 ; CHECK-NEXT: mov w12, w0
379 ; CHECK-NEXT: movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
380 ; CHECK-NEXT: movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
382 %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 0, i32 %slice)
383 %slice.max = add i32 %slice, 4
384 %res2 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 %slice.max)
385 ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res2
; Vertical x4 read of nxv8f16: tile 0 at %slice, then tile 1 at %slice + 4.
; The CHECK lines expect the +4 to appear as the movaz slice immediate 4:7
; with no separate add; only the second result (%res2) is returned.
388 define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_ver_z16_f16_x4(i32 %slice) #0 {
389 ; CHECK-LABEL: test_readz_ver_z16_f16_x4:
391 ; CHECK-NEXT: mov w12, w0
392 ; CHECK-NEXT: movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
393 ; CHECK-NEXT: movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
395 %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 %slice)
396 %slice.max = add i32 %slice, 4
397 %res2 = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 1, i32 %slice.max)
398 ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res2
; Vertical x4 read of nxv4f32: tile 0 and tile 3, both at %slice (no offset).
; The second result (%res2) is returned; the CHECK lines show both movaz
; instructions writing z0-z3 with slice immediate 0:3.
401 define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_ver_z32_f32_x4(i32 %slice) #0 {
402 ; CHECK-LABEL: test_readz_ver_z32_f32_x4:
404 ; CHECK-NEXT: mov w12, w0
405 ; CHECK-NEXT: movaz { z0.s - z3.s }, za0v.s[w12, 0:3]
406 ; CHECK-NEXT: movaz { z0.s - z3.s }, za3v.s[w12, 0:3]
408 %res = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 0, i32 %slice)
409 %res2 = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 %slice)
410 ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res2
; Vertical x4 read of nxv2f64: tile 0 and tile 7, both at %slice (no offset).
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z4-z7.
413 define {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_ver_z64_f64_x4(i32 %slice) #0 {
414 ; CHECK-LABEL: test_readz_ver_z64_f64_x4:
416 ; CHECK-NEXT: mov w12, w0
417 ; CHECK-NEXT: movaz { z0.d - z3.d }, za0v.d[w12, 0:3]
418 ; CHECK-NEXT: movaz { z4.d - z7.d }, za7v.d[w12, 0:3]
420 %res = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 0, i32 %slice)
421 %res2 = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 %slice)
422 ret {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} %res
; Declarations of the horizontal two-vector readz intrinsics called by the
; *_hor_*_x2 tests. The nxv16i8 entry previously carried a stray ".za8"
; component, so it did not name the intrinsic the i8 test actually calls.
426 declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32, i32)
427 declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32, i32)
428 declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32, i32)
429 declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32, i32)
430 declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32, i32)
431 declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32, i32)
432 declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32, i32)
433 declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32, i32)
; Declarations of the vertical two-vector readz intrinsics called by the
; *_ver_*_x2 tests. The nxv16i8 entry previously carried a stray ".za8"
; component, so it did not name the intrinsic the i8 test actually calls.
435 declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32, i32)
436 declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32, i32)
437 declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32, i32)
438 declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32, i32)
439 declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32, i32)
440 declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32, i32)
441 declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32, i32)
442 declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32, i32)
; Declarations of the horizontal four-vector readz intrinsics called by the
; *_hor_*_x4 tests. The nxv16i8 entry previously carried a stray ".za8"
; component, so it did not name the intrinsic the i8 test actually calls.
444 declare {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32, i32)
445 declare {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32, i32)
446 declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32, i32)
447 declare {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32, i32)
448 declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32, i32)
449 declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32, i32)
450 declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32, i32)
451 declare {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32, i32)
; Declarations of the vertical four-vector readz intrinsics called by the
; *_ver_*_x4 tests. The nxv16i8 entry previously carried a stray ".za8"
; component, so it did not name the intrinsic the i8 test actually calls.
453 declare {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32, i32)
454 declare {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32, i32)
455 declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32, i32)
456 declare {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32, i32)
457 declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32, i32)
458 declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32, i32)
459 declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32, i32)
460 declare {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32, i32)
462 ;MOVAZ (tile to vector, single)
; Single-vector horizontal read of nxv16i8: tile 0 at %slice, then tile 0 at
; %slice + 14. The CHECK lines expect the +14 to appear as the movaz slice
; immediate with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
467 define <vscale x 16 x i8> @test_readz_hor_z8_i8(i32 %tile, i32 %slice) #0 {
468 ; CHECK-LABEL: test_readz_hor_z8_i8:
470 ; CHECK-NEXT: mov w12, w1
471 ; CHECK-NEXT: movaz z0.b, za0h.b[w12, 0]
472 ; CHECK-NEXT: movaz z0.b, za0h.b[w12, 14]
474 %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 %slice)
475 %slice.max = add i32 %slice, 14
476 %res2 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 %slice.max)
477 ret <vscale x 16 x i8> %res2
; Single-vector horizontal read of nxv8i16: tile 0 at %slice, then tile 1 at
; %slice + 7. The CHECK lines expect the +7 to appear as the movaz slice
; immediate with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
480 define <vscale x 8 x i16> @test_readz_hor_z16_i16(i32 %tile, i32 %slice) #0 {
481 ; CHECK-LABEL: test_readz_hor_z16_i16:
483 ; CHECK-NEXT: mov w12, w1
484 ; CHECK-NEXT: movaz z0.h, za0h.h[w12, 0]
485 ; CHECK-NEXT: movaz z0.h, za1h.h[w12, 7]
487 %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 0, i32 %slice)
488 %slice.max = add i32 %slice, 7
489 %res2 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 1, i32 %slice.max)
490 ret <vscale x 8 x i16> %res2
; Single-vector horizontal read of nxv4i32: tile 0 at %slice, then tile 3 at
; %slice + 3. The CHECK lines expect the +3 to appear as the movaz slice
; immediate with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
493 define <vscale x 4 x i32> @test_readz_hor_z32_i32(i32 %tile, i32 %slice) #0 {
494 ; CHECK-LABEL: test_readz_hor_z32_i32:
496 ; CHECK-NEXT: mov w12, w1
497 ; CHECK-NEXT: movaz z0.s, za0h.s[w12, 0]
498 ; CHECK-NEXT: movaz z0.s, za3h.s[w12, 3]
500 %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 0, i32 %slice)
501 %slice.max = add i32 %slice, 3
502 %res2 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 3, i32 %slice.max)
503 ret <vscale x 4 x i32> %res2
; Single-vector horizontal read of nxv2i64: tile 0 at %slice, then tile 7 at
; %slice + 1. The CHECK lines expect the +1 to appear as the movaz slice
; immediate with no separate add. %res (the first read) is returned; the CHECK
; lines show the second movaz still emitted, writing z1.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
506 define <vscale x 2 x i64> @test_readz_hor_z64_i64(i32 %tile, i32 %slice) #0 {
507 ; CHECK-LABEL: test_readz_hor_z64_i64:
509 ; CHECK-NEXT: mov w12, w1
510 ; CHECK-NEXT: movaz z0.d, za0h.d[w12, 0]
511 ; CHECK-NEXT: movaz z1.d, za7h.d[w12, 1]
513 %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 0, i32 %slice)
514 %slice.max = add i32 %slice, 1
515 %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 7, i32 %slice.max)
516 ret <vscale x 2 x i64> %res
; Single-vector horizontal read of nxv8bf16: tile 0 at %slice, then tile 1 at
; %slice + 7. The CHECK lines expect the +7 to appear as the movaz slice
; immediate with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
519 define <vscale x 8 x bfloat> @test_readz_hor_z16_bf16(i32 %tile, i32 %slice) #0 {
520 ; CHECK-LABEL: test_readz_hor_z16_bf16:
522 ; CHECK-NEXT: mov w12, w1
523 ; CHECK-NEXT: movaz z0.h, za0h.h[w12, 0]
524 ; CHECK-NEXT: movaz z0.h, za1h.h[w12, 7]
526 %res = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 0, i32 %slice)
527 %slice.max = add i32 %slice, 7
528 %res2 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 1, i32 %slice.max)
529 ret <vscale x 8 x bfloat> %res2
; Single-vector horizontal read of nxv8f16: tile 0 at %slice, then tile 1 at
; %slice + 7. The CHECK lines expect the +7 to appear as the movaz slice
; immediate with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
532 define <vscale x 8 x half> @test_readz_hor_z16_f16(i32 %tile, i32 %slice) #0 {
533 ; CHECK-LABEL: test_readz_hor_z16_f16:
535 ; CHECK-NEXT: mov w12, w1
536 ; CHECK-NEXT: movaz z0.h, za0h.h[w12, 0]
537 ; CHECK-NEXT: movaz z0.h, za1h.h[w12, 7]
539 %res = call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 0, i32 %slice)
540 %slice.max = add i32 %slice, 7
541 %res2 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 1, i32 %slice.max)
542 ret <vscale x 8 x half> %res2
; Single-vector horizontal read of nxv4f32: tile 0 at %slice, then tile 3 at
; %slice + 3. The CHECK lines expect the +3 to appear as the movaz slice
; immediate with no separate add; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
545 define <vscale x 4 x float> @test_readz_hor_z32_f32(i32 %tile, i32 %slice) #0 {
546 ; CHECK-LABEL: test_readz_hor_z32_f32:
548 ; CHECK-NEXT: mov w12, w1
549 ; CHECK-NEXT: movaz z0.s, za0h.s[w12, 0]
550 ; CHECK-NEXT: movaz z0.s, za3h.s[w12, 3]
552 %res = call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 0, i32 %slice)
553 %slice.max = add i32 %slice, 3
554 %res2 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 3, i32 %slice.max)
555 ret <vscale x 4 x float> %res2
; Single-vector horizontal read of nxv2f64: tile 0 at %slice, then tile 7 at
; %slice + 1. The CHECK lines expect the +1 to appear as the movaz slice
; immediate with no separate add. %res (the first read) is returned; the CHECK
; lines show the second movaz still emitted, writing z1.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
558 define <vscale x 2 x double> @test_readz_hor_z64_f64(i32 %tile, i32 %slice) #0 {
559 ; CHECK-LABEL: test_readz_hor_z64_f64:
561 ; CHECK-NEXT: mov w12, w1
562 ; CHECK-NEXT: movaz z0.d, za0h.d[w12, 0]
563 ; CHECK-NEXT: movaz z1.d, za7h.d[w12, 1]
565 %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 0, i32 %slice)
566 %slice.max = add i32 %slice, 1
567 %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 7, i32 %slice.max)
568 ret <vscale x 2 x double> %res
; readz.q.horiz returning nxv16i8: tile 0 and tile 15, both at %slice.
; The CHECK lines expect .q-element movaz forms with slice immediate 0;
; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
571 define <vscale x 16 x i8> @test_readz_hor_z128_i8(i32 %tile, i32 %slice) #0 {
572 ; CHECK-LABEL: test_readz_hor_z128_i8:
574 ; CHECK-NEXT: mov w12, w1
575 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
576 ; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
578 %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 0, i32 %slice)
579 %res2 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 15, i32 %slice)
580 ret <vscale x 16 x i8> %res2
; readz.q.horiz returning nxv8i16: tile 0 and tile 15, both at %slice.
; The CHECK lines expect .q-element movaz forms with slice immediate 0;
; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
583 define <vscale x 8 x i16> @test_readz_hor_z128_i16(i32 %tile, i32 %slice) #0 {
584 ; CHECK-LABEL: test_readz_hor_z128_i16:
586 ; CHECK-NEXT: mov w12, w1
587 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
588 ; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
590 %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 0, i32 %slice)
591 %res2 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 15, i32 %slice)
592 ret <vscale x 8 x i16> %res2
; readz.q.horiz returning nxv4i32: tile 0 and tile 15, both at %slice.
; The CHECK lines expect .q-element movaz forms with slice immediate 0;
; only the second result (%res2) is returned.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
595 define <vscale x 4 x i32> @test_readz_hor_z128_i32(i32 %tile, i32 %slice) #0 {
596 ; CHECK-LABEL: test_readz_hor_z128_i32:
598 ; CHECK-NEXT: mov w12, w1
599 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
600 ; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
602 %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 0, i32 %slice)
603 %res2 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 15, i32 %slice)
604 ret <vscale x 4 x i32> %res2
; readz.q.horiz returning nxv2i64: tile 0 and tile 15, both at %slice.
; %res (the first read) is returned; the CHECK lines show the second movaz
; still emitted, writing z1.
; %tile is not referenced; the slice index is taken from w1 per the CHECK lines.
607 define <vscale x 2 x i64> @test_readz_hor_z128_i64(i32 %tile, i32 %slice) #0 {
608 ; CHECK-LABEL: test_readz_hor_z128_i64:
610 ; CHECK-NEXT: mov w12, w1
611 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
612 ; CHECK-NEXT: movaz z1.q, za15h.q[w12, 0]
614 %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 0, i32 %slice)
615 %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 15, i32 %slice)
616 ret <vscale x 2 x i64> %res
; Horizontal 128-bit (.q) readz with a <vscale x 8 x bfloat> result: tile 0 and
; then tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
619 define <vscale x 8 x bfloat> @test_readz_hor_z128_bf16(i32 %tile, i32 %slice) #0 {
620 ; CHECK-LABEL: test_readz_hor_z128_bf16:
622 ; CHECK-NEXT: mov w12, w1
623 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
624 ; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
626 %tile0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 0, i32 %slice)
627 %tile15 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 15, i32 %slice)
628 ret <vscale x 8 x bfloat> %tile15
; Horizontal 128-bit (.q) readz with a <vscale x 8 x half> result: tile 0 and
; then tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
631 define <vscale x 8 x half> @test_readz_hor_z128_f16(i32 %tile, i32 %slice) #0 {
632 ; CHECK-LABEL: test_readz_hor_z128_f16:
634 ; CHECK-NEXT: mov w12, w1
635 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
636 ; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
638 %tile0 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 0, i32 %slice)
639 %tile15 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 15, i32 %slice)
640 ret <vscale x 8 x half> %tile15
; Horizontal 128-bit (.q) readz with a <vscale x 4 x float> result: tile 0 and
; then tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
643 define <vscale x 4 x float> @test_readz_hor_z128_f32(i32 %tile, i32 %slice) #0 {
644 ; CHECK-LABEL: test_readz_hor_z128_f32:
646 ; CHECK-NEXT: mov w12, w1
647 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
648 ; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
650 %tile0 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 0, i32 %slice)
651 %tile15 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 15, i32 %slice)
652 ret <vscale x 4 x float> %tile15
; Horizontal 128-bit (.q) readz with a <vscale x 2 x double> result: tile 0 and
; then tile 15 are read at the same %slice, both through w12 with offset 0.
; The function returns the second read (%res2), like the f16/f32 variants of
; this test, so the final movaz is asserted to write z0.
655 define <vscale x 2 x double> @test_readz_hor_z128_f64(i32 %tile, i32 %slice) #0 {
656 ; CHECK-LABEL: test_readz_hor_z128_f64:
658 ; CHECK-NEXT: mov w12, w1
659 ; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
660 ; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
662 %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 0, i32 %slice)
663 %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 15, i32 %slice)
664 ret <vscale x 2 x double> %res2
; Vertical 8-bit (.b) readz with a <vscale x 16 x i8> result: tile 0 is read at
; %slice and again at %slice + 15; the second read is returned.
; NOTE(review): 15 is used because the single-vector .b form is documented to
; accept slice offsets 0-15, so this exercises the upper bound the way the
; .h/.s/.d variants do with 7/3/1 — confirm against the Arm ISA reference.
; The assertions expect the +15 as the immediate slice offset on w12.
670 define <vscale x 16 x i8> @test_readz_ver_z8_i8(i32 %tile, i32 %slice) #0 {
671 ; CHECK-LABEL: test_readz_ver_z8_i8:
673 ; CHECK-NEXT: mov w12, w1
674 ; CHECK-NEXT: movaz z0.b, za0v.b[w12, 0]
675 ; CHECK-NEXT: movaz z0.b, za0v.b[w12, 15]
677 %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32 0, i32 %slice)
678 %slice.max = add i32 %slice, 15
679 %res2 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32 0, i32 %slice.max)
680 ret <vscale x 16 x i8> %res2
; Vertical 16-bit (.h) readz with a <vscale x 8 x i16> result: tile 0 is read
; at %slice, then tile 1 at %slice + 7; the second read is returned.
; The assertions expect the +7 as the immediate slice offset on w12.
683 define <vscale x 8 x i16> @test_readz_ver_z16_i16(i32 %tile, i32 %slice) #0 {
684 ; CHECK-LABEL: test_readz_ver_z16_i16:
686 ; CHECK-NEXT: mov w12, w1
687 ; CHECK-NEXT: movaz z0.h, za0v.h[w12, 0]
688 ; CHECK-NEXT: movaz z0.h, za1v.h[w12, 7]
690 %at.base = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32 0, i32 %slice)
691 %slice.off = add i32 %slice, 7
692 %at.off = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32 1, i32 %slice.off)
693 ret <vscale x 8 x i16> %at.off
; Vertical 32-bit (.s) readz with a <vscale x 4 x i32> result: tile 0 is read
; at %slice, then tile 3 at %slice + 3; the second read is returned.
; The assertions expect the +3 as the immediate slice offset on w12.
696 define <vscale x 4 x i32> @test_readz_ver_z32_i32(i32 %tile, i32 %slice) #0 {
697 ; CHECK-LABEL: test_readz_ver_z32_i32:
699 ; CHECK-NEXT: mov w12, w1
700 ; CHECK-NEXT: movaz z0.s, za0v.s[w12, 0]
701 ; CHECK-NEXT: movaz z0.s, za3v.s[w12, 3]
703 %at.base = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32 0, i32 %slice)
704 %slice.off = add i32 %slice, 3
705 %at.off = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32 3, i32 %slice.off)
706 ret <vscale x 4 x i32> %at.off
; Vertical 64-bit (.d) readz with a <vscale x 2 x i64> result: tile 0 is read
; at %slice and tile 7 at %slice + 1. The assertions expect the +1 as the
; immediate slice offset on w12.
; The function returns the second read (%res2), like the i8/i16/i32 variants
; of this test, so the final movaz is asserted to write z0.
709 define <vscale x 2 x i64> @test_readz_ver_z64_i64(i32 %tile, i32 %slice) #0 {
710 ; CHECK-LABEL: test_readz_ver_z64_i64:
712 ; CHECK-NEXT: mov w12, w1
713 ; CHECK-NEXT: movaz z0.d, za0v.d[w12, 0]
714 ; CHECK-NEXT: movaz z0.d, za7v.d[w12, 1]
716 %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32 0, i32 %slice)
717 %slice.max = add i32 %slice, 1
718 %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32 7, i32 %slice.max)
719 ret <vscale x 2 x i64> %res2
; Vertical 16-bit (.h) readz with a <vscale x 8 x bfloat> result: tile 0 is
; read at %slice, then tile 1 at %slice + 7; the second read is returned.
; The assertions expect the +7 as the immediate slice offset on w12.
722 define <vscale x 8 x bfloat> @test_readz_ver_z16_bf16(i32 %tile, i32 %slice) #0 {
723 ; CHECK-LABEL: test_readz_ver_z16_bf16:
725 ; CHECK-NEXT: mov w12, w1
726 ; CHECK-NEXT: movaz z0.h, za0v.h[w12, 0]
727 ; CHECK-NEXT: movaz z0.h, za1v.h[w12, 7]
729 %at.base = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32 0, i32 %slice)
730 %slice.off = add i32 %slice, 7
731 %at.off = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32 1, i32 %slice.off)
732 ret <vscale x 8 x bfloat> %at.off
; Vertical 16-bit (.h) readz with a <vscale x 8 x half> result: tile 0 is read
; at %slice, then tile 1 at %slice + 7; the second read is returned.
; The assertions expect the +7 as the immediate slice offset on w12.
735 define <vscale x 8 x half> @test_readz_ver_z16_f16(i32 %tile, i32 %slice) #0 {
736 ; CHECK-LABEL: test_readz_ver_z16_f16:
738 ; CHECK-NEXT: mov w12, w1
739 ; CHECK-NEXT: movaz z0.h, za0v.h[w12, 0]
740 ; CHECK-NEXT: movaz z0.h, za1v.h[w12, 7]
742 %at.base = call <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32 0, i32 %slice)
743 %slice.off = add i32 %slice, 7
744 %at.off = call <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32 1, i32 %slice.off)
745 ret <vscale x 8 x half> %at.off
; Vertical 32-bit (.s) readz with a <vscale x 4 x float> result: tile 0 is read
; at %slice, then tile 3 at %slice + 3; the second read is returned.
; The assertions expect the +3 as the immediate slice offset on w12.
748 define <vscale x 4 x float> @test_readz_ver_z32_f32(i32 %tile, i32 %slice) #0 {
749 ; CHECK-LABEL: test_readz_ver_z32_f32:
751 ; CHECK-NEXT: mov w12, w1
752 ; CHECK-NEXT: movaz z0.s, za0v.s[w12, 0]
753 ; CHECK-NEXT: movaz z0.s, za3v.s[w12, 3]
755 %at.base = call <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32 0, i32 %slice)
756 %slice.off = add i32 %slice, 3
757 %at.off = call <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32 3, i32 %slice.off)
758 ret <vscale x 4 x float> %at.off
; Vertical 64-bit (.d) readz with a <vscale x 2 x double> result: tile 0 is
; read at %slice and tile 7 at %slice + 1. The assertions expect the +1 as the
; immediate slice offset on w12.
; The function returns the second read (%res2), like the f16/f32 variants of
; this test, so the final movaz is asserted to write z0.
761 define <vscale x 2 x double> @test_readz_ver_z64_f64(i32 %tile, i32 %slice) #0 {
762 ; CHECK-LABEL: test_readz_ver_z64_f64:
764 ; CHECK-NEXT: mov w12, w1
765 ; CHECK-NEXT: movaz z0.d, za0v.d[w12, 0]
766 ; CHECK-NEXT: movaz z0.d, za7v.d[w12, 1]
768 %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32 0, i32 %slice)
769 %slice.max = add i32 %slice, 1
770 %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32 7, i32 %slice.max)
771 ret <vscale x 2 x double> %res2
; Vertical 128-bit (.q) readz with a <vscale x 16 x i8> result: tile 0 and then
; tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
774 define <vscale x 16 x i8> @test_readz_ver_z128_i8(i32 %tile, i32 %slice) #0 {
775 ; CHECK-LABEL: test_readz_ver_z128_i8:
777 ; CHECK-NEXT: mov w12, w1
778 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
779 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
781 %tile0 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32 0, i32 %slice)
782 %tile15 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32 15, i32 %slice)
783 ret <vscale x 16 x i8> %tile15
; Vertical 128-bit (.q) readz with a <vscale x 8 x i16> result: tile 0 and then
; tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
786 define <vscale x 8 x i16> @test_readz_ver_z128_i16(i32 %tile, i32 %slice) #0 {
787 ; CHECK-LABEL: test_readz_ver_z128_i16:
789 ; CHECK-NEXT: mov w12, w1
790 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
791 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
793 %tile0 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32 0, i32 %slice)
794 %tile15 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32 15, i32 %slice)
795 ret <vscale x 8 x i16> %tile15
; Vertical 128-bit (.q) readz with a <vscale x 4 x i32> result: tile 0 and then
; tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
798 define <vscale x 4 x i32> @test_readz_ver_z128_i32(i32 %tile, i32 %slice) #0 {
799 ; CHECK-LABEL: test_readz_ver_z128_i32:
801 ; CHECK-NEXT: mov w12, w1
802 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
803 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
805 %tile0 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32 0, i32 %slice)
806 %tile15 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32 15, i32 %slice)
807 ret <vscale x 4 x i32> %tile15
; Vertical 128-bit (.q) readz with a <vscale x 2 x i64> result: tile 0 and then
; tile 15 are read at the same %slice, both through w12 with offset 0.
; The function returns the second read (%res2), like the i8/i16/i32 variants
; of this test, so the final movaz is asserted to write z0.
810 define <vscale x 2 x i64> @test_readz_ver_z128_i64(i32 %tile, i32 %slice) #0 {
811 ; CHECK-LABEL: test_readz_ver_z128_i64:
813 ; CHECK-NEXT: mov w12, w1
814 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
815 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
817 %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32 0, i32 %slice)
818 %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32 15, i32 %slice)
819 ret <vscale x 2 x i64> %res2
; Vertical 128-bit (.q) readz with a <vscale x 8 x bfloat> result: tile 0 and
; then tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
822 define <vscale x 8 x bfloat> @test_readz_ver_z128_bf16(i32 %tile, i32 %slice) #0 {
823 ; CHECK-LABEL: test_readz_ver_z128_bf16:
825 ; CHECK-NEXT: mov w12, w1
826 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
827 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
829 %tile0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32 0, i32 %slice)
830 %tile15 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32 15, i32 %slice)
831 ret <vscale x 8 x bfloat> %tile15
; Vertical 128-bit (.q) readz with a <vscale x 8 x half> result: tile 0 and
; then tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
834 define <vscale x 8 x half> @test_readz_ver_z128_f16(i32 %tile, i32 %slice) #0 {
835 ; CHECK-LABEL: test_readz_ver_z128_f16:
837 ; CHECK-NEXT: mov w12, w1
838 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
839 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
841 %tile0 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32 0, i32 %slice)
842 %tile15 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32 15, i32 %slice)
843 ret <vscale x 8 x half> %tile15
; Vertical 128-bit (.q) readz with a <vscale x 4 x float> result: tile 0 and
; then tile 15 are read at the same %slice; the second read is returned.
; Both movaz are expected to index through w12 with a zero offset.
846 define <vscale x 4 x float> @test_readz_ver_z128_f32(i32 %tile, i32 %slice) #0 {
847 ; CHECK-LABEL: test_readz_ver_z128_f32:
849 ; CHECK-NEXT: mov w12, w1
850 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
851 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
853 %tile0 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32 0, i32 %slice)
854 %tile15 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32 15, i32 %slice)
855 ret <vscale x 4 x float> %tile15
; Vertical 128-bit (.q) readz with a <vscale x 2 x double> result: tile 0 and
; then tile 15 are read at the same %slice, both through w12 with offset 0.
; The function returns the second read (%res2), like the f16/f32 variants of
; this test, so the final movaz is asserted to write z0.
858 define <vscale x 2 x double> @test_readz_ver_z128_f64(i32 %tile, i32 %slice) #0 {
859 ; CHECK-LABEL: test_readz_ver_z128_f64:
861 ; CHECK-NEXT: mov w12, w1
862 ; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
863 ; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
865 %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32 0, i32 %slice)
866 %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32 15, i32 %slice)
867 ret <vscale x 2 x double> %res2
; Declarations of the single-vector readz intrinsics called by the tests above.
; Every intrinsic takes two i32 operands (the tests pass a constant tile number
; and a slice index) and returns one scalable vector.

; readz.horiz: one overload per result vector type.
870 declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32, i32)
871 declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32, i32)
872 declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32, i32)
873 declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32, i32)
874 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32, i32)
875 declare <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32, i32)
876 declare <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32, i32)
877 declare <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32, i32)
; readz.q.horiz: the overloads used by the *_hor_z128_* tests (.q tile slices).
878 declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32, i32)
879 declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32, i32)
880 declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32, i32)
881 declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32, i32)
882 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32, i32)
883 declare <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32, i32)
884 declare <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32, i32)
885 declare <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32, i32)
; readz.vert: one overload per result vector type.
888 declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32, i32)
889 declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32, i32)
890 declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32, i32)
891 declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32, i32)
892 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32, i32)
893 declare <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32, i32)
894 declare <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32, i32)
895 declare <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32, i32)
; readz.q.vert: the overloads used by the *_ver_z128_* tests (.q tile slices).
896 declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32, i32)
897 declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32, i32)
898 declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32, i32)
899 declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32, i32)
900 declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32, i32)
901 declare <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32, i32)
902 declare <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32, i32)
903 declare <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32, i32)
905 ;MOVAZ (array to vector, Multi)
; Two-vector readz from the ZA array, <vscale x 16 x i8> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
912 define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x2(i32 %slice) #0 {
913 ; CHECK-LABEL: test_readz_z8_i8_x2:
915 ; CHECK-NEXT: mov w8, w0
916 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
917 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
919 %pair.base = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice)
920 %slice.off = add i32 %slice, 7
921 %pair.off = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice.off)
922 ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %pair.off
; Two-vector readz from the ZA array, <vscale x 8 x i16> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
925 define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x2(i32 %slice) #0 {
926 ; CHECK-LABEL: test_readz_z16_i16_x2:
928 ; CHECK-NEXT: mov w8, w0
929 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
930 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
932 %pair.base = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice)
933 %slice.off = add i32 %slice, 7
934 %pair.off = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice.off)
935 ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %pair.off
; Two-vector readz from the ZA array, <vscale x 4 x i32> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
938 define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x2(i32 %slice) #0 {
939 ; CHECK-LABEL: test_readz_z32_i32_x2:
941 ; CHECK-NEXT: mov w8, w0
942 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
943 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
945 %pair.base = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice)
946 %slice.off = add i32 %slice, 7
947 %pair.off = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice.off)
948 ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %pair.off
; Two-vector readz from the ZA array, <vscale x 2 x i64> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
951 define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x2(i32 %slice) #0 {
952 ; CHECK-LABEL: test_readz_z64_i64_x2:
954 ; CHECK-NEXT: mov w8, w0
955 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
956 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
958 %pair.base = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice)
959 %slice.off = add i32 %slice, 7
960 %pair.off = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice.off)
961 ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %pair.off
; Two-vector readz from the ZA array, <vscale x 8 x bfloat> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
964 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x2(i32 %slice) #0 {
965 ; CHECK-LABEL: test_readz_z16_bf16_x2:
967 ; CHECK-NEXT: mov w8, w0
968 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
969 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
971 %pair.base = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice)
972 %slice.off = add i32 %slice, 7
973 %pair.off = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice.off)
974 ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %pair.off
; Two-vector readz from the ZA array, <vscale x 8 x half> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
977 define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x2(i32 %slice) #0 {
978 ; CHECK-LABEL: test_readz_z16_f16_x2:
980 ; CHECK-NEXT: mov w8, w0
981 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
982 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
984 %pair.base = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice)
985 %slice.off = add i32 %slice, 7
986 %pair.off = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice.off)
987 ret {<vscale x 8 x half>, <vscale x 8 x half>} %pair.off
; Two-vector readz from the ZA array, <vscale x 4 x float> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
990 define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x2(i32 %slice) #0 {
991 ; CHECK-LABEL: test_readz_z32_f32_x2:
993 ; CHECK-NEXT: mov w8, w0
994 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
995 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
997 %pair.base = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice)
998 %slice.off = add i32 %slice, 7
999 %pair.off = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice.off)
1000 ret {<vscale x 4 x float>, <vscale x 4 x float>} %pair.off
; Two-vector readz from the ZA array, <vscale x 2 x double> pair: one read at
; %slice and one at %slice + 7; the second pair is returned.
; The assertions expect za.d[w8, N, vgx2] with N = 0 and then N = 7.
1003 define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x2(i32 %slice) #0 {
1004 ; CHECK-LABEL: test_readz_z64_f64_x2:
1006 ; CHECK-NEXT: mov w8, w0
1007 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
1008 ; CHECK-NEXT: movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
1010 %pair.base = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice)
1011 %slice.off = add i32 %slice, 7
1012 %pair.off = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice.off)
1013 ret {<vscale x 2 x double>, <vscale x 2 x double>} %pair.off
; Four-vector readz from the ZA array, <vscale x 16 x i8> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1020 define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x4(i32 %slice) #0 {
1021 ; CHECK-LABEL: test_readz_z8_i8_x4:
1023 ; CHECK-NEXT: mov w8, w0
1024 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1025 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1027 %quad.base = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice)
1028 %slice.off = add i32 %slice, 7
1029 %quad.off = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice.off)
1030 ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %quad.off
; Four-vector readz from the ZA array, <vscale x 8 x i16> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1033 define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x4(i32 %slice) #0 {
1034 ; CHECK-LABEL: test_readz_z16_i16_x4:
1036 ; CHECK-NEXT: mov w8, w0
1037 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1038 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1040 %quad.base = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice)
1041 %slice.off = add i32 %slice, 7
1042 %quad.off = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice.off)
1043 ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %quad.off
; Four-vector readz from the ZA array, <vscale x 4 x i32> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1046 define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x4(i32 %slice) #0 {
1047 ; CHECK-LABEL: test_readz_z32_i32_x4:
1049 ; CHECK-NEXT: mov w8, w0
1050 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1051 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1053 %quad.base = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice)
1054 %slice.off = add i32 %slice, 7
1055 %quad.off = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice.off)
1056 ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %quad.off
; Four-vector readz from the ZA array, <vscale x 2 x i64> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1059 define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x4(i32 %slice) #0 {
1060 ; CHECK-LABEL: test_readz_z64_i64_x4:
1062 ; CHECK-NEXT: mov w8, w0
1063 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1064 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1066 %quad.base = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice)
1067 %slice.off = add i32 %slice, 7
1068 %quad.off = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice.off)
1069 ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %quad.off
; Four-vector readz from the ZA array, <vscale x 8 x bfloat> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1072 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x4(i32 %slice) #0 {
1073 ; CHECK-LABEL: test_readz_z16_bf16_x4:
1075 ; CHECK-NEXT: mov w8, w0
1076 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1077 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1079 %quad.base = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice)
1080 %slice.off = add i32 %slice, 7
1081 %quad.off = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice.off)
1082 ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %quad.off
; Four-vector readz from the ZA array, <vscale x 8 x half> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1085 define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x4(i32 %slice) #0 {
1086 ; CHECK-LABEL: test_readz_z16_f16_x4:
1088 ; CHECK-NEXT: mov w8, w0
1089 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1090 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1092 %quad.base = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice)
1093 %slice.off = add i32 %slice, 7
1094 %quad.off = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice.off)
1095 ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %quad.off
; Four-vector readz from the ZA array, <vscale x 4 x float> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1098 define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x4(i32 %slice) #0 {
1099 ; CHECK-LABEL: test_readz_z32_f32_x4:
1101 ; CHECK-NEXT: mov w8, w0
1102 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1103 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1105 %quad.base = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice)
1106 %slice.off = add i32 %slice, 7
1107 %quad.off = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice.off)
1108 ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %quad.off
; Four-vector readz from the ZA array, <vscale x 2 x double> quad: one read at
; %slice and one at %slice + 7; the second quad is returned.
; The assertions expect za.d[w8, N, vgx4] with N = 0 and then N = 7.
1111 define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x4(i32 %slice) #0 {
1112 ; CHECK-LABEL: test_readz_z64_f64_x4:
1114 ; CHECK-NEXT: mov w8, w0
1115 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
1116 ; CHECK-NEXT: movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
1118 %quad.base = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice)
1119 %slice.off = add i32 %slice, 7
1120 %quad.off = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice.off)
1121 ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %quad.off
; Attribute group referenced as #0 by the test functions above; it enables the
; sme2p1 target feature for them.
1124 attributes #0 = { "target-features"="+sme2p1" }