1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
; Unscaled gather of eight i8 elements. The byte offsets are loaded as i16 and
; zero-extended to i32 before the getelementptr; the gathered bytes are then
; zero-extended to i16. The expected output is one contiguous offset load
; followed by a single vector-offset vldrb.u16.
define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrb.u16 q0, [r0, q1]
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  ; i8 element type in the GEP => offsets are plain byte offsets (unscaled).
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.zext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.zext
}
; Unscaled gather of eight i8 elements where the <8 x i8> offsets are passed
; to getelementptr directly, with no explicit extension in the IR. The
; expected output does not use a vector-offset gather: the offsets are loaded
; in two halves with vldrb.s32, added to the base, and each lane is fetched
; with a scalar ldrb before being inserted into q0 and widened by vmovlb.u8.
define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_unscaled_i8_i16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrb.s32 q0, [r1, #4]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r2, lr, d1
; CHECK-NEXT:    vmov r12, r3, d0
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    ldrb r6, [r2]
; CHECK-NEXT:    ldrb.w r2, [r12]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb.w lr, [lr]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.16 q0[3], r1
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    vmov.16 q0[5], r3
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], lr
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 2
  ; No zext/sext here: the i8 offsets are used as the GEP index as-is.
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i8> %offs
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.zext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.zext
}
; Gather of eight i16 elements indexed by i8 offsets that are sign-extended
; to i16 and scaled by the i16 element size in the getelementptr. The
; expected output is the expanded form: offsets loaded with vldrb.s32,
; shifted left by one, added to the base, then one scalar ldrh per lane.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(ptr %base, ptr %offptr) {
; CHECK-LABEL: scaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrb.s32 q0, [r1, #4]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r2, r12, d0
; CHECK-NEXT:    vmov r3, lr, d1
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh.w r12, [r12]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh.w lr, [lr]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.16 q0[3], r1
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    vmov.16 q0[5], r12
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], lr
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 2
  %offs.sext = sext <8 x i8> %offs to <8 x i16>
  ; i16 element type in the GEP => each offset is scaled by 2 bytes.
  %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %offs.sext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ; The gathered vector is the function result; a block must end in a
  ; terminator, matching the sibling scaled_v8i16_zext test.
  ret <8 x i16> %gather
}
; Gather of eight i16 elements indexed by i8 offsets that are zero-extended
; to i16 and scaled by the i16 element size in the getelementptr. The
; expected output is the expanded form: offsets loaded with vldrb.u32,
; shifted left by one, added to the base, then one scalar ldrh per lane.
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(ptr %base, ptr %offptr) {
; CHECK-LABEL: scaled_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r2, r12, d0
; CHECK-NEXT:    vmov r3, lr, d1
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh.w r12, [r12]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh.w lr, [lr]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.16 q0[3], r1
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    vmov.16 q0[5], r12
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], lr
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, ptr %offptr, align 2
  %offs.zext = zext <8 x i8> %offs to <8 x i16>
  ; i16 element type in the GEP => each offset is scaled by 2 bytes.
  %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %offs.zext
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}
; Unscaled gather of eight i8 elements addressed by zero-extended i16 byte
; offsets, with the gathered bytes sign-extended to i16. The expected output
; is one contiguous offset load followed by a single vector-offset vldrb.s16.
define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrb.s16 q0, [r0, q1]
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  ; The offsets are zero-extended; only the loaded data is sign-extended.
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.sext = sext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.sext
}
; Unscaled gather of eight i16 elements addressed by zero-extended i16 byte
; offsets. The expected output is one contiguous offset load followed by a
; single vector-offset vldrh.u16 with no shift on the offsets.
define arm_aapcs_vfpcc <8 x i16> @unscaled_i16_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  ; i8 element type in the GEP => byte offsets, even though i16 is gathered.
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  ; Source and destination types are both <8 x ptr>, so this cast is a no-op.
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}
; Unscaled gather of eight half elements addressed by zero-extended i16 byte
; offsets. The expected output is the same two-instruction sequence as the
; i16 variant: a contiguous offset load and a vector-offset vldrh.u16.
define arm_aapcs_vfpcc <8 x half> @unscaled_f16_i16(ptr %base, ptr %offptr) {
; CHECK-LABEL: unscaled_f16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
entry:
  %offs = load <8 x i16>, ptr %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  ; i8 element type in the GEP => byte offsets, even though half is gathered.
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  ; Source and destination types are both <8 x ptr>, so this cast is a no-op.
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}
; Unscaled gather of eight i8 elements addressed by i8 byte offsets that are
; zero-extended to i32, with the gathered bytes zero-extended to i16. The
; expected output is a widening offset load (vldrb.u16) followed by a single
; vector-offset vldrb.u16.
define arm_aapcs_vfpcc <8 x i16> @zext_unsigned_unscaled_i8_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vldrb.u16 q0, [r0, q1]
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.zext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.zext
}
; Unscaled gather of eight i8 elements addressed by i8 byte offsets that are
; zero-extended to i32, with the gathered bytes sign-extended to i16. The
; expected output is a widening offset load (vldrb.u16) followed by a single
; vector-offset vldrb.s16.
define arm_aapcs_vfpcc <8 x i16> @sext_unsigned_unscaled_i8_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vldrb.s16 q0, [r0, q1]
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  ; The offsets are zero-extended; only the loaded data is sign-extended.
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.sext = sext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %gather.sext
}
; Unscaled gather of eight i16 elements addressed by i8 byte offsets that are
; zero-extended to i32. The expected output is a widening offset load
; (vldrb.u16) followed by a single vector-offset vldrh.u16.
define arm_aapcs_vfpcc <8 x i16> @unsigned_unscaled_i16_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  ; i8 element type in the GEP => byte offsets, even though i16 is gathered.
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  ; Source and destination types are both <8 x ptr>, so this cast is a no-op.
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}
; Unscaled gather of eight half elements addressed by i8 byte offsets that
; are zero-extended to i32. The expected output is a widening offset load
; (vldrb.u16) followed by a single vector-offset vldrh.u16.
define arm_aapcs_vfpcc <8 x half> @unsigned_unscaled_f16_i8(ptr %base, ptr %offptr) {
; CHECK-LABEL: unsigned_unscaled_f16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vldrh.u16 q0, [r0, q1]
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  ; i8 element type in the GEP => byte offsets, even though half is gathered.
  %byte_ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  ; Source and destination types are both <8 x ptr>, so this cast is a no-op.
  %ptrs = bitcast <8 x ptr> %byte_ptrs to <8 x ptr>
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}
; Masked-gather intrinsic declarations used by the tests in this file, one per
; gathered element type. Operands: pointer vector, alignment, lane mask,
; pass-through value.
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) #1
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) #1
declare <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x half>) #1