1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
; Masked load of <vscale x 2 x i8> (align 1, undef passthru) zero-extended to
; <vscale x 2 x i64>. The visible CHECK line expects an `ld1b` into .d elements
; governed by p0/z.
8 define <vscale x 2 x i64> @masked_zload_nxv2i8(<vscale x 2 x i8>* %src, <vscale x 2 x i1> %mask) {
9 ; CHECK-LABEL: masked_zload_nxv2i8:
11 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
13 %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %src, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
14 %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
15 ret <vscale x 2 x i64> %ext
; Masked load of <vscale x 2 x i16> (align 1, undef passthru) zero-extended to
; <vscale x 2 x i64>. The visible CHECK line expects an `ld1h` into .d elements
; governed by p0/z.
18 define <vscale x 2 x i64> @masked_zload_nxv2i16(<vscale x 2 x i16>* %src, <vscale x 2 x i1> %mask) {
19 ; CHECK-LABEL: masked_zload_nxv2i16:
21 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
23 %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %src, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
24 %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
25 ret <vscale x 2 x i64> %ext
; Masked load of <vscale x 2 x i32> (align 1, undef passthru) zero-extended to
; <vscale x 2 x i64>. The visible CHECK line expects an `ld1w` into .d elements
; governed by p0/z.
28 define <vscale x 2 x i64> @masked_zload_nxv2i32(<vscale x 2 x i32>* %src, <vscale x 2 x i1> %mask) {
29 ; CHECK-LABEL: masked_zload_nxv2i32:
31 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
33 %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %src, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
34 %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
35 ret <vscale x 2 x i64> %ext
; Masked load of <vscale x 4 x i8> (align 1, undef passthru) zero-extended to
; <vscale x 4 x i32>. The visible CHECK line expects an `ld1b` into .s elements
; governed by p0/z.
38 define <vscale x 4 x i32> @masked_zload_nxv4i8(<vscale x 4 x i8>* %src, <vscale x 4 x i1> %mask) {
39 ; CHECK-LABEL: masked_zload_nxv4i8:
41 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
43 %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %src, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
44 %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
45 ret <vscale x 4 x i32> %ext
; Masked load of <vscale x 4 x i16> (align 1, undef passthru) zero-extended to
; <vscale x 4 x i32>. The visible CHECK line expects an `ld1h` into .s elements
; governed by p0/z.
48 define <vscale x 4 x i32> @masked_zload_nxv4i16(<vscale x 4 x i16>* %src, <vscale x 4 x i1> %mask) {
49 ; CHECK-LABEL: masked_zload_nxv4i16:
51 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
53 %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %src, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
54 %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
55 ret <vscale x 4 x i32> %ext
; Masked load of <vscale x 8 x i8> (align 1, undef passthru) zero-extended to
; <vscale x 8 x i16>. The visible CHECK line expects an `ld1b` into .h elements
; governed by p0/z.
58 define <vscale x 8 x i16> @masked_zload_nxv8i8(<vscale x 8 x i8>* %src, <vscale x 8 x i1> %mask) {
59 ; CHECK-LABEL: masked_zload_nxv8i8:
61 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
63 %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %src, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
64 %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
65 ret <vscale x 8 x i16> %ext
; Masked load of <vscale x 2 x i32> with a caller-supplied %passthru (not undef),
; zero-extended to <vscale x 2 x i64>. The CHECK lines expect three steps:
; an `ld1w` into z1.d under p0/z, an `and` of z0.d with #0xffffffff, and a
; p0/m predicated `mov` of z1.d into z0.d.
; NOTE(review): z0 presumably holds %passthru on entry, so the `and` would be
; the zero-extension of the passthru lanes - confirm against the calling convention.
68 define <vscale x 2 x i64> @masked_zload_passthru(<vscale x 2 x i32>* %src, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru) {
69 ; CHECK-LABEL: masked_zload_passthru:
71 ; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0]
72 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
73 ; CHECK-NEXT: mov z0.d, p0/m, z1.d
75 %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %src, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru)
76 %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
77 ret <vscale x 2 x i64> %ext
80 ; Return type requires splitting
; Masked load of <vscale x 8 x i16> (align 2, undef passthru) zero-extended to
; <vscale x 8 x i64>. The CHECK lines expect one `ld1h` into .h elements followed
; by two levels of uunpklo/uunpkhi (.h -> .s -> .d) leaving results in z0-z3.
81 define <vscale x 8 x i64> @masked_zload_nxv8i16(<vscale x 8 x i16>* %a, <vscale x 8 x i1> %mask) {
82 ; CHECK-LABEL: masked_zload_nxv8i16:
84 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
85 ; CHECK-NEXT: uunpklo z1.s, z0.h
86 ; CHECK-NEXT: uunpkhi z3.s, z0.h
87 ; CHECK-NEXT: uunpklo z0.d, z1.s
88 ; CHECK-NEXT: uunpkhi z1.d, z1.s
89 ; CHECK-NEXT: uunpklo z2.d, z3.s
90 ; CHECK-NEXT: uunpkhi z3.d, z3.s
92 %load = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %a, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
93 %ext = zext <vscale x 8 x i16> %load to <vscale x 8 x i64>
94 ret <vscale x 8 x i64> %ext
97 ; Masked load requires promotion
; Masked load of <vscale x 2 x i16> (align 2, undef passthru), zero-extended to
; <vscale x 2 x i32> and then converted with uitofp to <vscale x 2 x double>.
; The CHECK lines expect `ptrue p1.d`, an `ld1h` into .d elements under p0/z,
; and a `ucvtf` on .d elements predicated on p1/m.
98 define <vscale x 2 x double> @masked_zload_2i16_2f64(<vscale x 2 x i16>* noalias %in, <vscale x 2 x i1> %mask) {
99 ; CHECK-LABEL: masked_zload_2i16_2f64:
101 ; CHECK-NEXT: ptrue p1.d
102 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
103 ; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d
105 %wide.load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %in, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
106 %zext = zext <vscale x 2 x i16> %wide.load to <vscale x 2 x i32>
107 %res = uitofp <vscale x 2 x i32> %zext to <vscale x 2 x double>
108 ret <vscale x 2 x double> %res
111 ; Extending loads from unpacked to wide illegal types
; Masked load of <vscale x 4 x i8> (align 16, zeroinitializer passthru)
; zero-extended to <vscale x 4 x i64>. The CHECK lines expect an `ld1b` into
; .s elements, then uunpklo/uunpkhi to .d producing z0 and z1.
113 define <vscale x 4 x i64> @masked_zload_4i8_4i64(ptr %a, <vscale x 4 x i1> %b) {
114 ; CHECK-LABEL: masked_zload_4i8_4i64:
116 ; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0]
117 ; CHECK-NEXT: uunpklo z0.d, z1.s
118 ; CHECK-NEXT: uunpkhi z1.d, z1.s
120 %aval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8> *%a, i32 16, <vscale x 4 x i1> %b, <vscale x 4 x i8> zeroinitializer)
121 %aext = zext <vscale x 4 x i8> %aval to <vscale x 4 x i64>
122 ret <vscale x 4 x i64> %aext
; Masked load of <vscale x 4 x i16> (align 16, zeroinitializer passthru)
; zero-extended to <vscale x 4 x i64>. The CHECK lines expect an `ld1h` into
; .s elements, then uunpklo/uunpkhi to .d producing z0 and z1.
125 define <vscale x 4 x i64> @masked_zload_4i16_4i64(ptr %a, <vscale x 4 x i1> %b) {
126 ; CHECK-LABEL: masked_zload_4i16_4i64:
128 ; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
129 ; CHECK-NEXT: uunpklo z0.d, z1.s
130 ; CHECK-NEXT: uunpkhi z1.d, z1.s
132 %aval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16> *%a, i32 16, <vscale x 4 x i1> %b, <vscale x 4 x i16> zeroinitializer)
133 %aext = zext <vscale x 4 x i16> %aval to <vscale x 4 x i64>
134 ret <vscale x 4 x i64> %aext
; Masked load of <vscale x 8 x i8> (align 16, zeroinitializer passthru)
; zero-extended to <vscale x 8 x i32>. The CHECK lines expect an `ld1b` into
; .h elements, then uunpklo/uunpkhi to .s producing z0 and z1.
137 define <vscale x 8 x i32> @masked_zload_8i8_8i32(ptr %a, <vscale x 8 x i1> %b) {
138 ; CHECK-LABEL: masked_zload_8i8_8i32:
140 ; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
141 ; CHECK-NEXT: uunpklo z0.s, z1.h
142 ; CHECK-NEXT: uunpkhi z1.s, z1.h
144 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %b, <vscale x 8 x i8> zeroinitializer)
145 %aext = zext <vscale x 8 x i8> %aval to <vscale x 8 x i32>
146 ret <vscale x 8 x i32> %aext
; Masked load of <vscale x 8 x i8> (align 16, zeroinitializer passthru)
; zero-extended to <vscale x 8 x i64>. The CHECK lines expect one `ld1b` into
; .h elements followed by two levels of uunpklo/uunpkhi (.h -> .s -> .d)
; leaving results in z0-z3.
149 define <vscale x 8 x i64> @masked_zload_8i8_8i64(ptr %a, <vscale x 8 x i1> %b) {
150 ; CHECK-LABEL: masked_zload_8i8_8i64:
152 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
153 ; CHECK-NEXT: uunpklo z1.s, z0.h
154 ; CHECK-NEXT: uunpkhi z3.s, z0.h
155 ; CHECK-NEXT: uunpklo z0.d, z1.s
156 ; CHECK-NEXT: uunpkhi z1.d, z1.s
157 ; CHECK-NEXT: uunpklo z2.d, z3.s
158 ; CHECK-NEXT: uunpkhi z3.d, z3.s
160 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %b, <vscale x 8 x i8> zeroinitializer)
161 %aext = zext <vscale x 8 x i8> %aval to <vscale x 8 x i64>
162 ret <vscale x 8 x i64> %aext
; Two masked loads of <vscale x 4 x i8> from %a and %b under the same mask %c
; (align 16, zeroinitializer passthru), each zero-extended to <vscale x 4 x i64>
; and then added. The CHECK lines expect the predicate to be split with
; punpkhi/punpklo, four `ld1b` loads into .d elements (offsets #0 and
; #1, mul vl from each pointer), and two .d `add`s.
165 define <vscale x 4 x i64> @masked_zload_x2_4i8_4i64(ptr %a, ptr %b, <vscale x 4 x i1> %c) {
166 ; CHECK-LABEL: masked_zload_x2_4i8_4i64:
168 ; CHECK-NEXT: punpkhi p1.h, p0.b
169 ; CHECK-NEXT: punpklo p0.h, p0.b
170 ; CHECK-NEXT: ld1b { z1.d }, p1/z, [x0, #1, mul vl]
171 ; CHECK-NEXT: ld1b { z2.d }, p1/z, [x1, #1, mul vl]
172 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
173 ; CHECK-NEXT: ld1b { z3.d }, p0/z, [x1]
174 ; CHECK-NEXT: add z1.d, z1.d, z2.d
175 ; CHECK-NEXT: add z0.d, z0.d, z3.d
177 %aval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8> *%a, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer)
178 %bval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8> *%b, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer)
179 %aext = zext <vscale x 4 x i8> %aval to <vscale x 4 x i64>
180 %bext = zext <vscale x 4 x i8> %bval to <vscale x 4 x i64>
181 %res = add <vscale x 4 x i64> %aext, %bext
182 ret <vscale x 4 x i64> %res
; Two masked loads of <vscale x 4 x i16> from %a and %b under the same mask %c
; (align 16, zeroinitializer passthru), each zero-extended to <vscale x 4 x i64>
; and then added. The CHECK lines expect the predicate to be split with
; punpkhi/punpklo, four `ld1h` loads into .d elements (offsets #0 and
; #1, mul vl from each pointer), and two .d `add`s.
185 define <vscale x 4 x i64> @masked_zload_x2_4i16_4i64(ptr %a, ptr %b, <vscale x 4 x i1> %c) {
186 ; CHECK-LABEL: masked_zload_x2_4i16_4i64:
188 ; CHECK-NEXT: punpkhi p1.h, p0.b
189 ; CHECK-NEXT: punpklo p0.h, p0.b
190 ; CHECK-NEXT: ld1h { z1.d }, p1/z, [x0, #1, mul vl]
191 ; CHECK-NEXT: ld1h { z2.d }, p1/z, [x1, #1, mul vl]
192 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
193 ; CHECK-NEXT: ld1h { z3.d }, p0/z, [x1]
194 ; CHECK-NEXT: add z1.d, z1.d, z2.d
195 ; CHECK-NEXT: add z0.d, z0.d, z3.d
197 %aval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16> *%a, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer)
198 %bval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16> *%b, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer)
199 %aext = zext <vscale x 4 x i16> %aval to <vscale x 4 x i64>
200 %bext = zext <vscale x 4 x i16> %bval to <vscale x 4 x i64>
201 %res = add <vscale x 4 x i64> %aext, %bext
202 ret <vscale x 4 x i64> %res
; Two masked loads of <vscale x 8 x i8> from %a and %b under the same mask %c
; (align 16, zeroinitializer passthru), each zero-extended to <vscale x 8 x i32>
; and then added. The CHECK lines expect the predicate to be split with
; punpkhi/punpklo, four `ld1b` loads into .s elements (offsets #0 and
; #1, mul vl from each pointer), and two .s `add`s.
205 define <vscale x 8 x i32> @masked_zload_x2_8i8_8i32(ptr %a, ptr %b, <vscale x 8 x i1> %c) {
206 ; CHECK-LABEL: masked_zload_x2_8i8_8i32:
208 ; CHECK-NEXT: punpkhi p1.h, p0.b
209 ; CHECK-NEXT: punpklo p0.h, p0.b
210 ; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #1, mul vl]
211 ; CHECK-NEXT: ld1b { z2.s }, p1/z, [x1, #1, mul vl]
212 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
213 ; CHECK-NEXT: ld1b { z3.s }, p0/z, [x1]
214 ; CHECK-NEXT: add z1.s, z1.s, z2.s
215 ; CHECK-NEXT: add z0.s, z0.s, z3.s
217 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
218 %bval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%b, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
219 %aext = zext <vscale x 8 x i8> %aval to <vscale x 8 x i32>
220 %bext = zext <vscale x 8 x i8> %bval to <vscale x 8 x i32>
221 %res = add <vscale x 8 x i32> %aext, %bext
222 ret <vscale x 8 x i32> %res
; Two masked loads of <vscale x 8 x i8> from %a and %b under the same mask %c
; (align 16, zeroinitializer passthru), each zero-extended to <vscale x 8 x i64>
; and then added. The CHECK lines expect two levels of punpkhi/punpklo on the
; predicate (yielding p0-p3), eight `ld1b` loads into .d elements (offsets #0
; through #3, mul vl from each pointer), and four .d `add`s.
225 define <vscale x 8 x i64> @masked_zload_x2_8i8_8i64(ptr %a, ptr %b, <vscale x 8 x i1> %c) {
226 ; CHECK-LABEL: masked_zload_x2_8i8_8i64:
228 ; CHECK-NEXT: punpkhi p1.h, p0.b
229 ; CHECK-NEXT: punpklo p0.h, p0.b
230 ; CHECK-NEXT: punpkhi p2.h, p1.b
231 ; CHECK-NEXT: punpklo p1.h, p1.b
232 ; CHECK-NEXT: punpkhi p3.h, p0.b
233 ; CHECK-NEXT: punpklo p0.h, p0.b
234 ; CHECK-NEXT: ld1b { z3.d }, p2/z, [x0, #3, mul vl]
235 ; CHECK-NEXT: ld1b { z5.d }, p2/z, [x1, #3, mul vl]
236 ; CHECK-NEXT: ld1b { z2.d }, p1/z, [x0, #2, mul vl]
237 ; CHECK-NEXT: ld1b { z6.d }, p1/z, [x1, #2, mul vl]
238 ; CHECK-NEXT: ld1b { z1.d }, p3/z, [x0, #1, mul vl]
239 ; CHECK-NEXT: ld1b { z7.d }, p3/z, [x1, #1, mul vl]
240 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
241 ; CHECK-NEXT: ld1b { z4.d }, p0/z, [x1]
242 ; CHECK-NEXT: add z3.d, z3.d, z5.d
243 ; CHECK-NEXT: add z2.d, z2.d, z6.d
244 ; CHECK-NEXT: add z1.d, z1.d, z7.d
245 ; CHECK-NEXT: add z0.d, z0.d, z4.d
247 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
248 %bval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%b, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
249 %aext = zext <vscale x 8 x i8> %aval to <vscale x 8 x i64>
250 %bext = zext <vscale x 8 x i8> %bval to <vscale x 8 x i64>
251 %res = add <vscale x 8 x i64> %aext, %bext
252 ret <vscale x 8 x i64> %res
; Declarations of the llvm.masked.load intrinsic overloads called by the tests
; in this file. Operands are (pointer, i32 alignment, <vscale x N x i1> mask,
; passthru vector); the result has the passthru's type.
; NOTE(review): these use typed-pointer spelling (`<vscale x N x iM>*`) while
; several tests in this file take `ptr` arguments; presumably both parse as the
; same pointer type with this file's LLVM version - confirm before modernizing.
256 declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>*, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
257 declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
258 declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
259 declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>*, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
260 declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
261 declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>*, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
262 declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)