1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
; Masked load of <vscale x 2 x i8> (undef passthru, alignment 1) whose result is
; sign-extended to <vscale x 2 x i64>. The expected code is an ld1sb (signed
; byte load) into .d elements, predicated with zeroing on p0.
8 define <vscale x 2 x i64> @masked_sload_nxv2i8(<vscale x 2 x i8> *%a, <vscale x 2 x i1> %mask) {
9 ; CHECK-LABEL: masked_sload_nxv2i8:
11 ; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
13 %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8> *%a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
14 %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
15 ret <vscale x 2 x i64> %ext
; Masked load of <vscale x 2 x i16> (undef passthru, alignment 1) whose result
; is sign-extended to <vscale x 2 x i64>. The expected code is an ld1sh (signed
; halfword load) into .d elements, predicated with zeroing on p0.
18 define <vscale x 2 x i64> @masked_sload_nxv2i16(<vscale x 2 x i16> *%a, <vscale x 2 x i1> %mask) {
19 ; CHECK-LABEL: masked_sload_nxv2i16:
21 ; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
23 %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16> *%a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
24 %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
25 ret <vscale x 2 x i64> %ext
; Masked load of <vscale x 2 x i32> (undef passthru, alignment 1) whose result
; is sign-extended to <vscale x 2 x i64>. The expected code is an ld1sw (signed
; word load) into .d elements, predicated with zeroing on p0.
28 define <vscale x 2 x i64> @masked_sload_nxv2i32(<vscale x 2 x i32> *%a, <vscale x 2 x i1> %mask) {
29 ; CHECK-LABEL: masked_sload_nxv2i32:
31 ; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
33 %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32> *%a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
34 %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
35 ret <vscale x 2 x i64> %ext
; Masked load of <vscale x 4 x i8> (undef passthru, alignment 1) whose result is
; sign-extended to <vscale x 4 x i32>. The expected code is an ld1sb (signed
; byte load) into .s elements, predicated with zeroing on p0.
38 define <vscale x 4 x i32> @masked_sload_nxv4i8(<vscale x 4 x i8> *%a, <vscale x 4 x i1> %mask) {
39 ; CHECK-LABEL: masked_sload_nxv4i8:
41 ; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
43 %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8> *%a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
44 %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
45 ret <vscale x 4 x i32> %ext
; Masked load of <vscale x 4 x i16> (undef passthru, alignment 1) whose result
; is sign-extended to <vscale x 4 x i32>. The expected code is an ld1sh (signed
; halfword load) into .s elements, predicated with zeroing on p0.
48 define <vscale x 4 x i32> @masked_sload_nxv4i16(<vscale x 4 x i16> *%a, <vscale x 4 x i1> %mask) {
49 ; CHECK-LABEL: masked_sload_nxv4i16:
51 ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
53 %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16> *%a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
54 %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
55 ret <vscale x 4 x i32> %ext
; Masked load of <vscale x 8 x i8> (undef passthru, alignment 1) whose result is
; sign-extended to <vscale x 8 x i16>. The expected code is an ld1sb (signed
; byte load) into .h elements, predicated with zeroing on p0.
58 define <vscale x 8 x i16> @masked_sload_nxv8i8(<vscale x 8 x i8> *%a, <vscale x 8 x i1> %mask) {
59 ; CHECK-LABEL: masked_sload_nxv8i8:
61 ; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
63 %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
64 %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
65 ret <vscale x 8 x i16> %ext
; Same load + sext pattern as the nxv2i32 case, but the masked load is given a
; real passthru value (%passthru) instead of undef. The expected code therefore
; does more than a single load: an all-true predicate (ptrue p1.d), a zeroing
; ld1sw into z1, an sxtw of z0 under p1, and finally a mov that merges z1 into
; z0 under the load mask p0.
; NOTE(review): z0 presumably holds %passthru on entry (first vector argument);
; confirm against the AArch64 SVE calling convention.
68 define <vscale x 2 x i64> @masked_sload_passthru(<vscale x 2 x i32> *%a, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru) {
69 ; CHECK-LABEL: masked_sload_passthru:
71 ; CHECK-NEXT: ptrue p1.d
72 ; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
73 ; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
74 ; CHECK-NEXT: mov z0.d, p0/m, z1.d
76 %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32> *%a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru)
77 %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
78 ret <vscale x 2 x i64> %ext
81 ; Return type requires splitting
; Masked load of <vscale x 16 x i8> (undef passthru, alignment 2) sign-extended
; to <vscale x 16 x i32>. The expected code uses a plain ld1b into .b elements
; and then widens in two steps with sunpklo/sunpkhi (.b -> .h -> .s), producing
; the four result registers z0-z3.
82 define <vscale x 16 x i32> @masked_sload_nxv16i8(<vscale x 16 x i8>* %a, <vscale x 16 x i1> %mask) {
83 ; CHECK-LABEL: masked_sload_nxv16i8:
85 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
86 ; CHECK-NEXT: sunpklo z1.h, z0.b
87 ; CHECK-NEXT: sunpkhi z3.h, z0.b
88 ; CHECK-NEXT: sunpklo z0.s, z1.h
89 ; CHECK-NEXT: sunpkhi z1.s, z1.h
90 ; CHECK-NEXT: sunpklo z2.s, z3.h
91 ; CHECK-NEXT: sunpkhi z3.s, z3.h
93 %load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %a, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
94 %ext = sext <vscale x 16 x i8> %load to <vscale x 16 x i32>
95 ret <vscale x 16 x i32> %ext
98 ; Masked load requires promotion
; Masked load of <vscale x 4 x i8> (undef passthru, alignment 2), sign-extended
; to <vscale x 4 x i64> and then converted with sitofp to <vscale x 4 x double>.
; The expected code is an ld1sb into .s elements, sunpklo/sunpkhi to .d, and a
; predicated scvtf on each half under an all-true p1.
; NOTE(review): the function name ends in "4f32" but the result element type
; is double; the name is kept as-is because CHECK-LABEL depends on it.
99 define <vscale x 4 x double> @masked_sload_4i8_4f32(<vscale x 4 x i8>* noalias %in, <vscale x 4 x i1> %mask) {
100 ; CHECK-LABEL: masked_sload_4i8_4f32:
102 ; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
103 ; CHECK-NEXT: ptrue p1.d
104 ; CHECK-NEXT: sunpkhi z1.d, z0.s
105 ; CHECK-NEXT: sunpklo z0.d, z0.s
106 ; CHECK-NEXT: scvtf z0.d, p1/m, z0.d
107 ; CHECK-NEXT: scvtf z1.d, p1/m, z1.d
109 %wide.load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %in, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
110 %sext = sext <vscale x 4 x i8> %wide.load to <vscale x 4 x i64>
111 %res = sitofp <vscale x 4 x i64> %sext to <vscale x 4 x double>
112 ret <vscale x 4 x double> %res
116 ; Extending loads from unpacked to wide illegal types
; Single masked load of <vscale x 4 x i8> (zeroinitializer passthru, alignment
; 16) sign-extended to <vscale x 4 x i64>. The expected code is one ld1sb into
; .s elements followed by sunpklo/sunpkhi to produce the two .d result halves.
; The parameter is declared as `ptr` while the call still spells the pointer
; operand with the typed-pointer syntax.
118 define <vscale x 4 x i64> @masked_sload_4i8_4i64(ptr %a, <vscale x 4 x i1> %b) {
119 ; CHECK-LABEL: masked_sload_4i8_4i64:
121 ; CHECK-NEXT: ld1sb { z1.s }, p0/z, [x0]
122 ; CHECK-NEXT: sunpklo z0.d, z1.s
123 ; CHECK-NEXT: sunpkhi z1.d, z1.s
125 %aval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8> *%a, i32 16, <vscale x 4 x i1> %b, <vscale x 4 x i8> zeroinitializer)
126 %aext = sext <vscale x 4 x i8> %aval to <vscale x 4 x i64>
127 ret <vscale x 4 x i64> %aext
; Single masked load of <vscale x 4 x i16> (zeroinitializer passthru, alignment
; 16) sign-extended to <vscale x 4 x i64>. The expected code is one ld1sh into
; .s elements followed by sunpklo/sunpkhi to produce the two .d result halves.
130 define <vscale x 4 x i64> @masked_sload_4i16_4i64(ptr %a, <vscale x 4 x i1> %b) {
131 ; CHECK-LABEL: masked_sload_4i16_4i64:
133 ; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
134 ; CHECK-NEXT: sunpklo z0.d, z1.s
135 ; CHECK-NEXT: sunpkhi z1.d, z1.s
137 %aval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16> *%a, i32 16, <vscale x 4 x i1> %b, <vscale x 4 x i16> zeroinitializer)
138 %aext = sext <vscale x 4 x i16> %aval to <vscale x 4 x i64>
139 ret <vscale x 4 x i64> %aext
; Single masked load of <vscale x 8 x i8> (zeroinitializer passthru, alignment
; 16) sign-extended to <vscale x 8 x i32>. The expected code is one ld1sb into
; .h elements followed by sunpklo/sunpkhi to produce the two .s result halves.
142 define <vscale x 8 x i32> @masked_sload_8i8_8i32(ptr %a, <vscale x 8 x i1> %b) {
143 ; CHECK-LABEL: masked_sload_8i8_8i32:
145 ; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
146 ; CHECK-NEXT: sunpklo z0.s, z1.h
147 ; CHECK-NEXT: sunpkhi z1.s, z1.h
149 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %b, <vscale x 8 x i8> zeroinitializer)
150 %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i32>
151 ret <vscale x 8 x i32> %aext
; Single masked load of <vscale x 8 x i8> (zeroinitializer passthru, alignment
; 16) sign-extended to <vscale x 8 x i64>. The expected code is one ld1sb into
; .h elements, then two levels of sunpklo/sunpkhi (.h -> .s -> .d) yielding the
; four result registers z0-z3.
154 define <vscale x 8 x i64> @masked_sload_8i8_8i64(ptr %a, <vscale x 8 x i1> %b) {
155 ; CHECK-LABEL: masked_sload_8i8_8i64:
157 ; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
158 ; CHECK-NEXT: sunpklo z1.s, z0.h
159 ; CHECK-NEXT: sunpkhi z3.s, z0.h
160 ; CHECK-NEXT: sunpklo z0.d, z1.s
161 ; CHECK-NEXT: sunpkhi z1.d, z1.s
162 ; CHECK-NEXT: sunpklo z2.d, z3.s
163 ; CHECK-NEXT: sunpkhi z3.d, z3.s
165 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %b, <vscale x 8 x i8> zeroinitializer)
166 %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i64>
167 ret <vscale x 8 x i64> %aext
; Two masked loads of <vscale x 4 x i8> under the same mask %c (zeroinitializer
; passthru, alignment 16), each sign-extended to <vscale x 4 x i64> and then
; added. Unlike the single-load case, the expected code splits the predicate
; with punpkhi/punpklo and issues four ld1sb loads directly into .d elements
; (low half at [xN], high half at [xN, #1, mul vl]) before the two adds.
170 define <vscale x 4 x i64> @masked_sload_x2_4i8_4i64(ptr %a, ptr %b, <vscale x 4 x i1> %c) {
171 ; CHECK-LABEL: masked_sload_x2_4i8_4i64:
173 ; CHECK-NEXT: punpkhi p1.h, p0.b
174 ; CHECK-NEXT: punpklo p0.h, p0.b
175 ; CHECK-NEXT: ld1sb { z1.d }, p1/z, [x0, #1, mul vl]
176 ; CHECK-NEXT: ld1sb { z2.d }, p1/z, [x1, #1, mul vl]
177 ; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
178 ; CHECK-NEXT: ld1sb { z3.d }, p0/z, [x1]
179 ; CHECK-NEXT: add z1.d, z1.d, z2.d
180 ; CHECK-NEXT: add z0.d, z0.d, z3.d
182 %aval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8> *%a, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer)
183 %bval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8> *%b, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer)
184 %aext = sext <vscale x 4 x i8> %aval to <vscale x 4 x i64>
185 %bext = sext <vscale x 4 x i8> %bval to <vscale x 4 x i64>
186 %res = add <vscale x 4 x i64> %aext, %bext
187 ret <vscale x 4 x i64> %res
; Two masked loads of <vscale x 4 x i16> under the same mask %c (zeroinitializer
; passthru, alignment 16), each sign-extended to <vscale x 4 x i64> and then
; added. The expected code splits the predicate with punpkhi/punpklo and issues
; four ld1sh loads directly into .d elements (low half at [xN], high half at
; [xN, #1, mul vl]) before the two adds.
190 define <vscale x 4 x i64> @masked_sload_x2_4i16_4i64(ptr %a, ptr %b, <vscale x 4 x i1> %c) {
191 ; CHECK-LABEL: masked_sload_x2_4i16_4i64:
193 ; CHECK-NEXT: punpkhi p1.h, p0.b
194 ; CHECK-NEXT: punpklo p0.h, p0.b
195 ; CHECK-NEXT: ld1sh { z1.d }, p1/z, [x0, #1, mul vl]
196 ; CHECK-NEXT: ld1sh { z2.d }, p1/z, [x1, #1, mul vl]
197 ; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
198 ; CHECK-NEXT: ld1sh { z3.d }, p0/z, [x1]
199 ; CHECK-NEXT: add z1.d, z1.d, z2.d
200 ; CHECK-NEXT: add z0.d, z0.d, z3.d
202 %aval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16> *%a, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer)
203 %bval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16> *%b, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer)
204 %aext = sext <vscale x 4 x i16> %aval to <vscale x 4 x i64>
205 %bext = sext <vscale x 4 x i16> %bval to <vscale x 4 x i64>
206 %res = add <vscale x 4 x i64> %aext, %bext
207 ret <vscale x 4 x i64> %res
; Two masked loads of <vscale x 8 x i8> under the same mask %c (zeroinitializer
; passthru, alignment 16), each sign-extended to <vscale x 8 x i32> and then
; added. The expected code splits the predicate with punpkhi/punpklo and issues
; four ld1sb loads directly into .s elements (low half at [xN], high half at
; [xN, #1, mul vl]) before the two adds.
210 define <vscale x 8 x i32> @masked_sload_x2_8i8_8i32(ptr %a, ptr %b, <vscale x 8 x i1> %c) {
211 ; CHECK-LABEL: masked_sload_x2_8i8_8i32:
213 ; CHECK-NEXT: punpkhi p1.h, p0.b
214 ; CHECK-NEXT: punpklo p0.h, p0.b
215 ; CHECK-NEXT: ld1sb { z1.s }, p1/z, [x0, #1, mul vl]
216 ; CHECK-NEXT: ld1sb { z2.s }, p1/z, [x1, #1, mul vl]
217 ; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
218 ; CHECK-NEXT: ld1sb { z3.s }, p0/z, [x1]
219 ; CHECK-NEXT: add z1.s, z1.s, z2.s
220 ; CHECK-NEXT: add z0.s, z0.s, z3.s
222 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
223 %bval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%b, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
224 %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i32>
225 %bext = sext <vscale x 8 x i8> %bval to <vscale x 8 x i32>
226 %res = add <vscale x 8 x i32> %aext, %bext
227 ret <vscale x 8 x i32> %res
; Two masked loads of <vscale x 8 x i8> under the same mask %c (zeroinitializer
; passthru, alignment 16), each sign-extended to <vscale x 8 x i64> and then
; added. The expected code splits the predicate twice with punpkhi/punpklo
; (giving p0-p3), issues eight ld1sb loads directly into .d elements at
; vector-length offsets #0..#3 from each base pointer, and finishes with four
; adds into z0-z3.
230 define <vscale x 8 x i64> @masked_sload_x2_8i8_8i64(ptr %a, ptr %b, <vscale x 8 x i1> %c) {
231 ; CHECK-LABEL: masked_sload_x2_8i8_8i64:
233 ; CHECK-NEXT: punpkhi p1.h, p0.b
234 ; CHECK-NEXT: punpklo p0.h, p0.b
235 ; CHECK-NEXT: punpkhi p2.h, p1.b
236 ; CHECK-NEXT: punpklo p1.h, p1.b
237 ; CHECK-NEXT: punpkhi p3.h, p0.b
238 ; CHECK-NEXT: punpklo p0.h, p0.b
239 ; CHECK-NEXT: ld1sb { z3.d }, p2/z, [x0, #3, mul vl]
240 ; CHECK-NEXT: ld1sb { z5.d }, p2/z, [x1, #3, mul vl]
241 ; CHECK-NEXT: ld1sb { z2.d }, p1/z, [x0, #2, mul vl]
242 ; CHECK-NEXT: ld1sb { z6.d }, p1/z, [x1, #2, mul vl]
243 ; CHECK-NEXT: ld1sb { z1.d }, p3/z, [x0, #1, mul vl]
244 ; CHECK-NEXT: ld1sb { z7.d }, p3/z, [x1, #1, mul vl]
245 ; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
246 ; CHECK-NEXT: ld1sb { z4.d }, p0/z, [x1]
247 ; CHECK-NEXT: add z3.d, z3.d, z5.d
248 ; CHECK-NEXT: add z2.d, z2.d, z6.d
249 ; CHECK-NEXT: add z1.d, z1.d, z7.d
250 ; CHECK-NEXT: add z0.d, z0.d, z4.d
252 %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%a, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
253 %bval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8> *%b, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
254 %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i64>
255 %bext = sext <vscale x 8 x i8> %bval to <vscale x 8 x i64>
256 %res = add <vscale x 8 x i64> %aext, %bext
257 ret <vscale x 8 x i64> %res
; Declarations of the llvm.masked.load intrinsic overloads called in this file.
; Operands, in order: source pointer, i32 alignment, <N x i1> mask, and the
; passthru vector (same type as the result).
261 declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>*, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
262 declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
263 declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
264 declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>*, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
265 declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
266 declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>*, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
267 declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)