; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
; Gather <4 x i16> through i16 GEPs with i32 offsets, zero-extended to
; <4 x i32>: expect an extending vldrh.u32 gather with uxtw #1 scaled offsets.
define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_scaled_i16_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Same as zext_scaled_i16_i32 but with a sign-extended result: expect the
; signed extending gather vldrh.s32 with uxtw #1 scaled offsets.
define arm_aapcs_vfpcc <4 x i32> @sext_scaled_i16_i32(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_scaled_i16_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Full-width i32 gather with i32 offsets: expect vldrw.u32 with uxtw #2
; scaled offsets (element size 4 bytes).
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32(ptr %base, ptr %offptr) {
; CHECK-LABEL: scaled_i32_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; TODO: scaled_f16_i32
; Float gather with i32 offsets: same vldrw.u32 uxtw #2 selection as the
; integer case (the bitcast is a no-op left over from typed-pointer IR).
define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32(ptr %base, ptr %offptr) {
; CHECK-LABEL: scaled_f32_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %i32_ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs
  %ptrs = bitcast <4 x ptr> %i32_ptrs to <4 x ptr>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; i32 gather whose offsets are zero-extended i16: the extend folds into an
; extending offset load (vldrh.u32), then vldrw.u32 with uxtw #2 scaling.
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; As above but with sign-extended i16 offsets: the offsets are loaded with
; vldrh.s32 before the scaled vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: signed_scaled_i32_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Float gather with zero-extended i16 offsets: vldrh.u32 offset load plus
; scaled vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x ptr> %i32_ptrs to <4 x ptr>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Float gather with sign-extended i16 offsets: vldrh.s32 offset load plus
; scaled vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x ptr> %i32_ptrs to <4 x ptr>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; i16-element gather, signed i16 offsets, zero-extended result:
; vldrh.s32 offsets + vldrh.u32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16-element gather, signed i16 offsets, sign-extended result:
; vldrh.s32 offsets + vldrh.s32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; i16-element gather, unsigned i16 offsets, zero-extended result:
; vldrh.u32 offsets + vldrh.u32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q1, [r1]
; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16-element gather, unsigned i16 offsets, sign-extended result:
; vldrh.u32 offsets + vldrh.s32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q1, [r1]
; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i16>, ptr %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; i32 gather with zero-extended i8 offsets: vldrb.u32 offset load plus
; scaled vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; i32 gather with sign-extended i8 offsets: vldrb.s32 offset load plus
; scaled vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: signed_scaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.s32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Float gather with zero-extended i8 offsets: vldrb.u32 offset load plus
; scaled vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x ptr> %i32_ptrs to <4 x ptr>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Float gather with sign-extended i8 offsets: vldrb.s32 offset load plus
; scaled vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.s32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x ptr> %i32_ptrs to <4 x ptr>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; i16-element gather, signed i8 offsets, zero-extended result:
; vldrb.s32 offsets + vldrh.u32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.s32 q1, [r1]
; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16-element gather, signed i8 offsets, sign-extended result:
; vldrb.s32 offsets + vldrh.s32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.s32 q1, [r1]
; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; i16-element gather, unsigned i8 offsets, zero-extended result:
; vldrb.u32 offsets + vldrh.u32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; i16-element gather, unsigned i8 offsets, sign-extended result:
; vldrb.u32 offsets + vldrh.s32 extending gather, uxtw #1 scaling.
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r1]
; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i8>, ptr %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, ptr %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Two chained GEPs (variable offsets then constant +5 elements): the constant
; part is not folded into a scaled gather here; codegen materializes the
; addresses with vshl/vadd and uses the plain vector-of-pointers vldrw form.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep(ptr %base, ptr %offptr) {
; CHECK-LABEL: scaled_i32_i32_2gep:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: movs r2, #20
; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: vadd.i32 q1, q0, r2
; CHECK-NEXT: vldrw.u32 q0, [q1]
; CHECK-NEXT: bx lr
entry:
  %offs = load <4 x i32>, ptr %offptr, align 4
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> %offs
  %ptrs2 = getelementptr inbounds i32, <4 x ptr> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Two chained GEPs with fully constant offsets: the combined byte offsets
; (20,32,44,56) are emitted as a constant pool and used unscaled with vldrw.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(ptr %base) {
; CHECK-LABEL: scaled_i32_i32_2gep2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: adr r1, .LCPI21_0
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI21_0:
; CHECK-NEXT: .long 20 @ 0x14
; CHECK-NEXT: .long 32 @ 0x20
; CHECK-NEXT: .long 44 @ 0x2c
; CHECK-NEXT: .long 56 @ 0x38
entry:
  %ptrs = getelementptr inbounds i32, ptr %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %ptrs2 = getelementptr inbounds i32, <4 x ptr> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Masked-gather intrinsic declarations used by the tests above.
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i8>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x half>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)