1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; Open-coded multiply / shift / clamp on <16 x i8> operands, feeding an add
; reduction. Both inputs are sign-extended to i32, multiplied, shifted right
; arithmetically by 7 and clamped from above at 127. The CHECK lines expect
; this to be selected as one vqdmulh.s8 followed by vaddv.s8.
4 define arm_aapcs_vfpcc i32 @vqdmulh_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
5 ; CHECK-LABEL: vqdmulh_v16i8:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: vqdmulh.s8 q0, q1, q0
8 ; CHECK-NEXT: vaddv.s8 r0, q0
11 %l2 = sext <16 x i8> %s0 to <16 x i32>
12 %l5 = sext <16 x i8> %s1 to <16 x i32>
13 %l6 = mul nsw <16 x i32> %l5, %l2
; Shift amount is 7, i.e. the i8 element width minus one.
14 %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; icmp slt + select form a signed minimum against 127 (the largest i8 value).
15 %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
16 %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
17 %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
; Full-width <16 x i8> case: sext to i32, multiply, ashr by 7, clamp at 127,
; then truncate back to <16 x i8>. The CHECK lines expect a single vqdmulh.s8.
21 define arm_aapcs_vfpcc <16 x i8> @vqdmulh_v16i8_b(<16 x i8> %s0, <16 x i8> %s1) {
22 ; CHECK-LABEL: vqdmulh_v16i8_b:
23 ; CHECK: @ %bb.0: @ %entry
24 ; CHECK-NEXT: vqdmulh.s8 q0, q1, q0
27 %l2 = sext <16 x i8> %s0 to <16 x i32>
28 %l5 = sext <16 x i8> %s1 to <16 x i32>
29 %l6 = mul nsw <16 x i32> %l5, %l2
30 %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; Signed minimum against 127 (the largest i8 value).
31 %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
32 %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
33 %l10 = trunc <16 x i32> %l9 to <16 x i8>
; Eight-lane i8 case: sext to i32, multiply, ashr by 7, clamp at 127, trunc to
; <8 x i8>. The CHECK lines expect vqdmulh.s8 followed by one vmovlb.s8.
38 define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) {
39 ; CHECK-LABEL: vqdmulh_v8i8_b:
40 ; CHECK: @ %bb.0: @ %entry
41 ; CHECK-NEXT: vqdmulh.s8 q0, q1, q0
42 ; CHECK-NEXT: vmovlb.s8 q0, q0
45 %l2 = sext <8 x i8> %s0 to <8 x i32>
46 %l5 = sext <8 x i8> %s1 to <8 x i32>
47 %l6 = mul nsw <8 x i32> %l5, %l2
48 %l7 = ashr <8 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; Signed minimum against 127 (the largest i8 value).
49 %l8 = icmp slt <8 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
50 %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
51 %l10 = trunc <8 x i32> %l9 to <8 x i8>
; Four-lane i8 case: sext to i32, multiply, ashr by 7, clamp at 127, trunc to
; <4 x i8>. The CHECK lines expect vqdmulh.s8 followed by vmovlb.s8 and
; vmovlb.s16.
55 define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) {
56 ; CHECK-LABEL: vqdmulh_v4i8_b:
57 ; CHECK: @ %bb.0: @ %entry
58 ; CHECK-NEXT: vqdmulh.s8 q0, q1, q0
59 ; CHECK-NEXT: vmovlb.s8 q0, q0
60 ; CHECK-NEXT: vmovlb.s16 q0, q0
63 %l2 = sext <4 x i8> %s0 to <4 x i32>
64 %l5 = sext <4 x i8> %s1 to <4 x i32>
65 %l6 = mul nsw <4 x i32> %l5, %l2
66 %l7 = ashr <4 x i32> %l6, <i32 7, i32 7, i32 7, i32 7>
; Signed minimum against 127 (the largest i8 value).
67 %l8 = icmp slt <4 x i32> %l7, <i32 127, i32 127, i32 127, i32 127>
68 %l9 = select <4 x i1> %l8, <4 x i32> %l7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
69 %l10 = trunc <4 x i32> %l9 to <4 x i8>
; Double-width <32 x i8> case: sext to i32, multiply, ashr by 7, clamp at 127,
; trunc to <32 x i8>. The CHECK lines expect two vqdmulh.s8 instructions, one
; per q-register pair (q0/q2 and q1/q3).
73 define arm_aapcs_vfpcc <32 x i8> @vqdmulh_v32i8_b(<32 x i8> %s0, <32 x i8> %s1) {
74 ; CHECK-LABEL: vqdmulh_v32i8_b:
75 ; CHECK: @ %bb.0: @ %entry
76 ; CHECK-NEXT: vqdmulh.s8 q0, q2, q0
77 ; CHECK-NEXT: vqdmulh.s8 q1, q3, q1
80 %l2 = sext <32 x i8> %s0 to <32 x i32>
81 %l5 = sext <32 x i8> %s1 to <32 x i32>
82 %l6 = mul nsw <32 x i32> %l5, %l2
83 %l7 = ashr <32 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; Signed minimum against 127 (the largest i8 value).
84 %l8 = icmp slt <32 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
85 %l9 = select <32 x i1> %l8, <32 x i32> %l7, <32 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
86 %l10 = trunc <32 x i32> %l9 to <32 x i8>
; <8 x i16> operands feeding an add reduction: sext to i32, multiply, ashr by
; 15, clamp from above at 32767, then llvm.vector.reduce.add. The CHECK lines
; expect one vqdmulh.s16 followed by vaddv.s16.
90 define arm_aapcs_vfpcc i32 @vqdmulh_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
91 ; CHECK-LABEL: vqdmulh_v8i16:
92 ; CHECK: @ %bb.0: @ %entry
93 ; CHECK-NEXT: vqdmulh.s16 q0, q1, q0
94 ; CHECK-NEXT: vaddv.s16 r0, q0
97 %l2 = sext <8 x i16> %s0 to <8 x i32>
98 %l5 = sext <8 x i16> %s1 to <8 x i32>
99 %l6 = mul nsw <8 x i32> %l5, %l2
; Shift amount is 15, i.e. the i16 element width minus one.
100 %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; Signed minimum against 32767 (the largest i16 value).
101 %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
102 %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
103 %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
; Full-width <8 x i16> case: sext to i32, multiply, ashr by 15, clamp at 32767,
; trunc to <8 x i16>. The CHECK lines expect a single vqdmulh.s16.
107 define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_b(<8 x i16> %s0, <8 x i16> %s1) {
108 ; CHECK-LABEL: vqdmulh_v8i16_b:
109 ; CHECK: @ %bb.0: @ %entry
110 ; CHECK-NEXT: vqdmulh.s16 q0, q1, q0
113 %l2 = sext <8 x i16> %s0 to <8 x i32>
114 %l5 = sext <8 x i16> %s1 to <8 x i32>
115 %l6 = mul nsw <8 x i32> %l5, %l2
116 %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; Signed minimum against 32767 (the largest i16 value).
117 %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
118 %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
119 %l10 = trunc <8 x i32> %l9 to <8 x i16>
; Four-lane i16 case: sext to i32, multiply, ashr by 15, clamp at 32767, trunc
; to <4 x i16>. The CHECK lines expect vqdmulh.s16 followed by vmovlb.s16.
123 define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) {
124 ; CHECK-LABEL: vqdmulh_v4i16_b:
125 ; CHECK: @ %bb.0: @ %entry
126 ; CHECK-NEXT: vqdmulh.s16 q0, q1, q0
127 ; CHECK-NEXT: vmovlb.s16 q0, q0
130 %l2 = sext <4 x i16> %s0 to <4 x i32>
131 %l5 = sext <4 x i16> %s1 to <4 x i32>
132 %l6 = mul nsw <4 x i32> %l5, %l2
133 %l7 = ashr <4 x i32> %l6, <i32 15, i32 15, i32 15, i32 15>
; Signed minimum against 32767; the compare result is named %l4 here.
134 %l4 = icmp slt <4 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767>
135 %l9 = select <4 x i1> %l4, <4 x i32> %l7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
136 %l10 = trunc <4 x i32> %l9 to <4 x i16>
; Double-width <16 x i16> case: sext to i32, multiply, ashr by 15, clamp at
; 32767, trunc to <16 x i16>. The CHECK lines expect two vqdmulh.s16
; instructions, one per q-register pair (q0/q2 and q1/q3).
140 define arm_aapcs_vfpcc <16 x i16> @vqdmulh_v16i16_b(<16 x i16> %s0, <16 x i16> %s1) {
141 ; CHECK-LABEL: vqdmulh_v16i16_b:
142 ; CHECK: @ %bb.0: @ %entry
143 ; CHECK-NEXT: vqdmulh.s16 q0, q2, q0
144 ; CHECK-NEXT: vqdmulh.s16 q1, q3, q1
147 %l2 = sext <16 x i16> %s0 to <16 x i32>
148 %l5 = sext <16 x i16> %s1 to <16 x i32>
149 %l6 = mul nsw <16 x i32> %l5, %l2
150 %l7 = ashr <16 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; Signed minimum against 32767; the compare result is named %l16 here.
151 %l16 = icmp slt <16 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
152 %l9 = select <16 x i1> %l16, <16 x i32> %l7, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
153 %l10 = trunc <16 x i32> %l9 to <16 x i16>
; <8 x i16> multiply / shift / clamp using i22 as the intermediate type
; instead of i32. The CHECK lines contain no vqdmulh for this function: the
; expected code moves lanes through GPRs, multiplies with vmullb.s16, applies
; vshl.i32 #10 / vshr.s32 #10 (presumably sign-extending the 22-bit value
; inside each 32-bit lane, since 32 - 22 = 10), shifts right by 15, and
; assembles the result through a 16-byte stack slot with vstrh.32 / vldrw.u32.
157 define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) {
158 ; CHECK-LABEL: vqdmulh_v8i16_c:
159 ; CHECK: @ %bb.0: @ %entry
160 ; CHECK-NEXT: .pad #16
161 ; CHECK-NEXT: sub sp, #16
162 ; CHECK-NEXT: vmov.u16 r0, q0[6]
163 ; CHECK-NEXT: vmov.u16 r1, q0[4]
164 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
165 ; CHECK-NEXT: vmov.u16 r0, q0[7]
166 ; CHECK-NEXT: vmov.u16 r1, q0[5]
167 ; CHECK-NEXT: vmov.u16 r2, q0[0]
168 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
169 ; CHECK-NEXT: vmov.u16 r0, q1[6]
170 ; CHECK-NEXT: vmov.u16 r1, q1[4]
171 ; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
172 ; CHECK-NEXT: vmov.u16 r0, q1[7]
173 ; CHECK-NEXT: vmov.u16 r1, q1[5]
174 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
175 ; CHECK-NEXT: mov r0, sp
176 ; CHECK-NEXT: vmullb.s16 q2, q3, q2
177 ; CHECK-NEXT: vmov.u16 r1, q0[2]
178 ; CHECK-NEXT: vshl.i32 q2, q2, #10
179 ; CHECK-NEXT: vshr.s32 q2, q2, #10
180 ; CHECK-NEXT: vshr.s32 q2, q2, #15
181 ; CHECK-NEXT: vstrh.32 q2, [r0, #8]
182 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
183 ; CHECK-NEXT: vmov.u16 r1, q0[3]
184 ; CHECK-NEXT: vmov.u16 r2, q0[1]
185 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
186 ; CHECK-NEXT: vmov.u16 r1, q1[2]
187 ; CHECK-NEXT: vmov.u16 r2, q1[0]
188 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
189 ; CHECK-NEXT: vmov.u16 r1, q1[3]
190 ; CHECK-NEXT: vmov.u16 r2, q1[1]
191 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
192 ; CHECK-NEXT: vmullb.s16 q0, q0, q2
193 ; CHECK-NEXT: vshl.i32 q0, q0, #10
194 ; CHECK-NEXT: vshr.s32 q0, q0, #10
195 ; CHECK-NEXT: vshr.s32 q0, q0, #15
196 ; CHECK-NEXT: vstrh.32 q0, [r0]
197 ; CHECK-NEXT: vldrw.u32 q0, [r0]
198 ; CHECK-NEXT: add sp, #16
; The intermediate type is i22 rather than i32 for every step below.
200 %l2 = sext <8 x i16> %s0 to <8 x i22>
201 %l5 = sext <8 x i16> %s1 to <8 x i22>
202 %l6 = mul nsw <8 x i22> %l5, %l2
203 %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
204 %l8 = icmp slt <8 x i22> %l7, <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
205 %l9 = select <8 x i1> %l8, <8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
206 %l10 = trunc <8 x i22> %l9 to <8 x i16>
; <8 x i16> case wrapped in shuffles: both inputs are permuted with the mask
; <0,2,4,6,1,3,5,7> before the sext / mul / ashr 15 / clamp 32767 / trunc
; sequence, and the result is permuted with <0,4,1,5,2,6,3,7>, which restores
; the original lane order. The CHECK lines expect a single vqdmulh.s16 and no
; shuffle instructions.
211 define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) {
212 ; CHECK-LABEL: vqdmulh_v8i16_interleaved:
213 ; CHECK: @ %bb.0: @ %entry
214 ; CHECK-NEXT: vqdmulh.s16 q0, q1, q0
; Even lanes first, then odd lanes.
216 %0 = shufflevector <8 x i16> %s0, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
217 %1 = sext <8 x i16> %0 to <8 x i32>
; %l2 (and %l5 below) are not used by any instruction visible in this function.
218 %l2 = sext <8 x i16> %s0 to <8 x i32>
219 %2 = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
220 %3 = sext <8 x i16> %2 to <8 x i32>
221 %l5 = sext <8 x i16> %s1 to <8 x i32>
; The multiply uses the shuffled operands %3 and %1.
222 %l6 = mul nsw <8 x i32> %3, %1
223 %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
224 %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
225 %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
226 %l10 = trunc <8 x i32> %l9 to <8 x i16>
; Re-interleave the two halves, undoing the input permutation.
227 %4 = shufflevector <8 x i16> %l10, <8 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; De-interleaved variant: %s0a is truncated from <4 x i32> to <4 x i16>, and
; %s1 is split into its even lanes and its odd lanes. Each half goes through
; sext / mul by the sign-extended %s0 / ashr 15 / clamp 32767 / trunc, and the
; two <4 x i16> results are interleaved into the returned <8 x i16>.
; The CHECK lines expect two vqdmulh.s16 with a vrev32.16 between them,
; combined with vmovnt.i32.
232 define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) {
233 ; CHECK-LABEL: vqdmulh_v8i16_interleaved2:
235 ; CHECK-NEXT: vqdmulh.s16 q2, q1, q0
236 ; CHECK-NEXT: vrev32.16 q1, q1
237 ; CHECK-NEXT: vqdmulh.s16 q0, q1, q0
238 ; CHECK-NEXT: vmovnt.i32 q2, q0
239 ; CHECK-NEXT: vmov q0, q2
240 %s0 = trunc <4 x i32> %s0a to <4 x i16>
; Even lanes of %s1.
241 %strided.vec = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Odd lanes of %s1.
242 %strided.vec44 = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
243 %l7 = sext <4 x i16> %strided.vec to <4 x i32>
; %l8 is the shared second multiplicand for both halves.
244 %l8 = sext <4 x i16> %s0 to <4 x i32>
245 %l9 = mul nsw <4 x i32> %l7, %l8
246 %l10 = ashr <4 x i32> %l9, <i32 15, i32 15, i32 15, i32 15>
247 %l11 = icmp slt <4 x i32> %l10, <i32 32767, i32 32767, i32 32767, i32 32767>
248 %l12 = select <4 x i1> %l11, <4 x i32> %l10, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
249 %l13 = trunc <4 x i32> %l12 to <4 x i16>
250 %l14 = sext <4 x i16> %strided.vec44 to <4 x i32>
251 %l15 = mul nsw <4 x i32> %l14, %l8
252 %l16 = ashr <4 x i32> %l15, <i32 15, i32 15, i32 15, i32 15>
253 %l17 = icmp slt <4 x i32> %l16, <i32 32767, i32 32767, i32 32767, i32 32767>
254 %l18 = select <4 x i1> %l17, <4 x i32> %l16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>
255 %l19 = trunc <4 x i32> %l18 to <4 x i16>
; Interleave the even-lane result (%l13) with the odd-lane result (%l19).
256 %interleaved.vec = shufflevector <4 x i16> %l13, <4 x i16> %l19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
257 ret <8 x i16> %interleaved.vec
; <4 x i32> operands feeding an add reduction: sext to i64, multiply, ashr by
; 31, clamp from above at 2147483647, then llvm.vector.reduce.add on
; <4 x i64>. The CHECK lines expect one vqdmulh.s32 followed by vaddlv.s32
; into r0/r1.
260 define arm_aapcs_vfpcc i64 @vqdmulh_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
261 ; CHECK-LABEL: vqdmulh_v4i32:
262 ; CHECK: @ %bb.0: @ %entry
263 ; CHECK-NEXT: vqdmulh.s32 q0, q1, q0
264 ; CHECK-NEXT: vaddlv.s32 r0, r1, q0
267 %l2 = sext <4 x i32> %s0 to <4 x i64>
268 %l5 = sext <4 x i32> %s1 to <4 x i64>
269 %l6 = mul nsw <4 x i64> %l5, %l2
; Shift amount is 31, i.e. the i32 element width minus one.
270 %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
; Signed minimum against 2147483647 (the largest i32 value).
271 %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
272 %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
273 %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
; Full-width <4 x i32> case: sext to i64, multiply, ashr by 31, clamp at
; 2147483647, trunc to <4 x i32>. The CHECK lines expect a single vqdmulh.s32.
277 define arm_aapcs_vfpcc <4 x i32> @vqdmulh_v4i32_b(<4 x i32> %s0, <4 x i32> %s1) {
278 ; CHECK-LABEL: vqdmulh_v4i32_b:
279 ; CHECK: @ %bb.0: @ %entry
280 ; CHECK-NEXT: vqdmulh.s32 q0, q1, q0
283 %l2 = sext <4 x i32> %s0 to <4 x i64>
284 %l5 = sext <4 x i32> %s1 to <4 x i64>
285 %l6 = mul nsw <4 x i64> %l5, %l2
286 %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
; Signed minimum against 2147483647 (the largest i32 value).
287 %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
288 %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
289 %l10 = trunc <4 x i64> %l9 to <4 x i32>
; Two-lane i32 case: sext to i64, multiply, ashr by 31, clamp at 2147483647,
; trunc to <2 x i32>. The CHECK lines expect vqdmulh.s32, after which lanes
; s0 and s2 are moved to GPRs and the vector is rebuilt with each value in
; lanes 0/2 and its `asrs #31` result in lanes 1/3 (presumably sign-extending
; each 32-bit result to 64 bits for the <2 x i32> return -- confirm against
; the calling convention).
293 define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) {
294 ; CHECK-LABEL: vqdmulh_v2i32_b:
295 ; CHECK: @ %bb.0: @ %entry
296 ; CHECK-NEXT: vqdmulh.s32 q0, q1, q0
297 ; CHECK-NEXT: vmov r0, s2
298 ; CHECK-NEXT: vmov r1, s0
299 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
300 ; CHECK-NEXT: asrs r0, r0, #31
301 ; CHECK-NEXT: asrs r1, r1, #31
302 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
305 %l2 = sext <2 x i32> %s0 to <2 x i64>
306 %l5 = sext <2 x i32> %s1 to <2 x i64>
307 %l6 = mul nsw <2 x i64> %l5, %l2
308 %l7 = ashr <2 x i64> %l6, <i64 31, i64 31>
; Signed minimum against 2147483647 (the largest i32 value).
309 %l8 = icmp slt <2 x i64> %l7, <i64 2147483647, i64 2147483647>
310 %l9 = select <2 x i1> %l8, <2 x i64> %l7, <2 x i64> <i64 2147483647, i64 2147483647>
311 %l10 = trunc <2 x i64> %l9 to <2 x i32>
; Double-width <8 x i32> case: sext to i64, multiply, ashr by 31, clamp at
; 2147483647, trunc to <8 x i32>. The CHECK lines expect two vqdmulh.s32
; instructions, one per q-register pair (q0/q2 and q1/q3).
315 define arm_aapcs_vfpcc <8 x i32> @vqdmulh_v8i32_b(<8 x i32> %s0, <8 x i32> %s1) {
316 ; CHECK-LABEL: vqdmulh_v8i32_b:
317 ; CHECK: @ %bb.0: @ %entry
318 ; CHECK-NEXT: vqdmulh.s32 q0, q2, q0
319 ; CHECK-NEXT: vqdmulh.s32 q1, q3, q1
322 %l2 = sext <8 x i32> %s0 to <8 x i64>
323 %l5 = sext <8 x i32> %s1 to <8 x i64>
324 %l6 = mul nsw <8 x i64> %l5, %l2
325 %l7 = ashr <8 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
; Signed minimum against 2147483647 (the largest i32 value).
326 %l8 = icmp slt <8 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
327 %l9 = select <8 x i1> %l8, <8 x i64> %l7, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
328 %l10 = trunc <8 x i64> %l9 to <8 x i32>
; Quad-width <16 x i32> case: sext to i64, multiply, ashr by 31, clamp at
; 2147483647, trunc to <16 x i32>. The CHECK lines expect four vqdmulh.s32
; instructions on q0-q3, each with its other operand loaded into q4 from the
; stack at sp+16, sp+32, sp+48 and sp+64. q4 (d8, d9) is saved and restored
; with vpush / vpop around the body.
332 define arm_aapcs_vfpcc <16 x i32> @vqdmulh_v16i32_b(<16 x i32> %s0, <16 x i32> %s1) {
333 ; CHECK-LABEL: vqdmulh_v16i32_b:
334 ; CHECK: @ %bb.0: @ %entry
335 ; CHECK-NEXT: .vsave {d8, d9}
336 ; CHECK-NEXT: vpush {d8, d9}
337 ; CHECK-NEXT: add r0, sp, #16
338 ; CHECK-NEXT: vldrw.u32 q4, [r0]
339 ; CHECK-NEXT: add r0, sp, #32
340 ; CHECK-NEXT: vqdmulh.s32 q0, q4, q0
341 ; CHECK-NEXT: vldrw.u32 q4, [r0]
342 ; CHECK-NEXT: add r0, sp, #48
343 ; CHECK-NEXT: vqdmulh.s32 q1, q4, q1
344 ; CHECK-NEXT: vldrw.u32 q4, [r0]
345 ; CHECK-NEXT: add r0, sp, #64
346 ; CHECK-NEXT: vqdmulh.s32 q2, q4, q2
347 ; CHECK-NEXT: vldrw.u32 q4, [r0]
348 ; CHECK-NEXT: vqdmulh.s32 q3, q4, q3
349 ; CHECK-NEXT: vpop {d8, d9}
352 %l2 = sext <16 x i32> %s0 to <16 x i64>
353 %l5 = sext <16 x i32> %s1 to <16 x i64>
354 %l6 = mul nsw <16 x i64> %l5, %l2
355 %l7 = ashr <16 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
; Signed minimum against 2147483647 (the largest i32 value).
356 %l8 = icmp slt <16 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
357 %l9 = select <16 x i1> %l8, <16 x i64> %l7, <16 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
358 %l10 = trunc <16 x i64> %l9 to <16 x i32>
; Loop form of the i8 pattern: each iteration loads <16 x i8> from %x and %y
; at %index, applies sext / mul / ashr 7 / clamp 127 / trunc, and stores the
; result to %z. %index advances by 16 until it equals 1024. %n is not used by
; any instruction visible here.
; The CHECK lines expect a loop counter of 64 in lr (1024 / 16), a body of
; two post-incremented vldrb.u8 loads, one vqdmulh.s8 and a post-incremented
; vstrb.8 store, closed by `le lr`.
364 define void @vqdmulh_loop_i8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) local_unnamed_addr #0 {
365 ; CHECK-LABEL: vqdmulh_loop_i8:
366 ; CHECK: @ %bb.0: @ %entry
367 ; CHECK-NEXT: .save {r7, lr}
368 ; CHECK-NEXT: push {r7, lr}
369 ; CHECK-NEXT: mov.w lr, #64
370 ; CHECK-NEXT: .LBB17_1: @ %vector.body
371 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
372 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
373 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
374 ; CHECK-NEXT: vqdmulh.s8 q0, q1, q0
375 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
376 ; CHECK-NEXT: le lr, .LBB17_1
377 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
378 ; CHECK-NEXT: pop {r7, pc}
380 br label %vector.body
382 vector.body: ; preds = %vector.body, %entry
383 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
384 %0 = getelementptr inbounds i8, ptr %x, i32 %index
385 %wide.load = load <16 x i8>, ptr %0, align 1
386 %1 = sext <16 x i8> %wide.load to <16 x i32>
387 %2 = getelementptr inbounds i8, ptr %y, i32 %index
388 %wide.load26 = load <16 x i8>, ptr %2, align 1
389 %3 = sext <16 x i8> %wide.load26 to <16 x i32>
390 %4 = mul nsw <16 x i32> %3, %1
391 %5 = ashr <16 x i32> %4, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; Signed minimum against 127 (the largest i8 value).
392 %6 = icmp slt <16 x i32> %5, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
393 %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
394 %8 = trunc <16 x i32> %7 to <16 x i8>
395 %9 = getelementptr inbounds i8, ptr %z, i32 %index
396 store <16 x i8> %8, ptr %9, align 1
; 16 elements per iteration; exit once %index.next reaches 1024.
397 %index.next = add i32 %index, 16
398 %10 = icmp eq i32 %index.next, 1024
399 br i1 %10, label %for.cond.cleanup, label %vector.body
401 for.cond.cleanup: ; preds = %vector.body
; Loop form of the i16 pattern: each iteration loads <8 x i16> from %x and %y
; at %index, applies sext / mul / ashr 15 / clamp 32767 / trunc, and stores
; the result to %z. %index advances by 8 until it equals 1024. %n is not used
; by any instruction visible here.
; The CHECK lines expect a loop counter of 128 in lr (1024 / 8), a body of
; two post-incremented vldrh.u16 loads, one vqdmulh.s16 and a post-incremented
; vstrb.8 store, closed by `le lr`.
405 define void @vqdmulh_loop_i16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
406 ; CHECK-LABEL: vqdmulh_loop_i16:
407 ; CHECK: @ %bb.0: @ %entry
408 ; CHECK-NEXT: .save {r7, lr}
409 ; CHECK-NEXT: push {r7, lr}
410 ; CHECK-NEXT: mov.w lr, #128
411 ; CHECK-NEXT: .LBB18_1: @ %vector.body
412 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
413 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
414 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
415 ; CHECK-NEXT: vqdmulh.s16 q0, q1, q0
416 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
417 ; CHECK-NEXT: le lr, .LBB18_1
418 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
419 ; CHECK-NEXT: pop {r7, pc}
421 br label %vector.body
423 vector.body: ; preds = %vector.body, %entry
424 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
425 %0 = getelementptr inbounds i16, ptr %x, i32 %index
426 %wide.load = load <8 x i16>, ptr %0, align 2
427 %1 = sext <8 x i16> %wide.load to <8 x i32>
428 %2 = getelementptr inbounds i16, ptr %y, i32 %index
429 %wide.load30 = load <8 x i16>, ptr %2, align 2
430 %3 = sext <8 x i16> %wide.load30 to <8 x i32>
431 %4 = mul nsw <8 x i32> %3, %1
432 %5 = ashr <8 x i32> %4, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; Signed minimum against 32767 (the largest i16 value).
433 %6 = icmp slt <8 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
434 %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
435 %8 = trunc <8 x i32> %7 to <8 x i16>
436 %9 = getelementptr inbounds i16, ptr %z, i32 %index
437 store <8 x i16> %8, ptr %9, align 2
; 8 elements per iteration; exit once %index.next reaches 1024.
438 %index.next = add i32 %index, 8
439 %10 = icmp eq i32 %index.next, 1024
440 br i1 %10, label %for.cond.cleanup, label %vector.body
442 for.cond.cleanup: ; preds = %vector.body
; Loop form of the i32 pattern: each iteration loads <4 x i32> from %x and %y
; at %index, applies sext to i64 / mul / ashr 31 / clamp 2147483647 / trunc,
; and stores the result to %z. %index advances by 4 until it equals 1024. %n
; is not used by any instruction visible here.
; The CHECK lines expect a loop counter of 256 in lr (1024 / 4), a body of
; two post-incremented vldrw.u32 loads, one vqdmulh.s32 and a post-incremented
; vstrb.8 store, closed by `le lr`.
446 define void @vqdmulh_loop_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
447 ; CHECK-LABEL: vqdmulh_loop_i32:
448 ; CHECK: @ %bb.0: @ %entry
449 ; CHECK-NEXT: .save {r7, lr}
450 ; CHECK-NEXT: push {r7, lr}
451 ; CHECK-NEXT: mov.w lr, #256
452 ; CHECK-NEXT: .LBB19_1: @ %vector.body
453 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
454 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
455 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
456 ; CHECK-NEXT: vqdmulh.s32 q0, q1, q0
457 ; CHECK-NEXT: vstrb.8 q0, [r2], #16
458 ; CHECK-NEXT: le lr, .LBB19_1
459 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
460 ; CHECK-NEXT: pop {r7, pc}
462 br label %vector.body
464 vector.body: ; preds = %vector.body, %entry
465 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
466 %0 = getelementptr inbounds i32, ptr %x, i32 %index
467 %wide.load = load <4 x i32>, ptr %0, align 4
468 %1 = sext <4 x i32> %wide.load to <4 x i64>
469 %2 = getelementptr inbounds i32, ptr %y, i32 %index
470 %wide.load30 = load <4 x i32>, ptr %2, align 4
471 %3 = sext <4 x i32> %wide.load30 to <4 x i64>
472 %4 = mul nsw <4 x i64> %3, %1
473 %5 = ashr <4 x i64> %4, <i64 31, i64 31, i64 31, i64 31>
; Signed minimum against 2147483647 (the largest i32 value).
474 %6 = icmp slt <4 x i64> %5, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
475 %7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
476 %8 = trunc <4 x i64> %7 to <4 x i32>
477 %9 = getelementptr inbounds i32, ptr %z, i32 %index
478 store <4 x i32> %8, ptr %9, align 4
; 4 elements per iteration; exit once %index.next reaches 1024.
479 %index.next = add i32 %index, 4
480 %10 = icmp eq i32 %index.next, 1024
481 br i1 %10, label %for.cond.cleanup, label %vector.body
483 for.cond.cleanup: ; preds = %vector.body
; Clamp pattern on a type wider than any case above: <2 x double> is converted
; with fptosi to <2 x i128>, clamped from above at 18446744073709551616
; (2^64) and from below at zero, then truncated to <2 x i64>.
; The CHECK lines contain no vector instructions for this function: the
; expected code calls __fixdfti once per element and performs both clamps
; with scalar subs/sbcs/cset/csel sequences.
487 define <2 x i64> @large_i128(<2 x double> %x) {
488 ; CHECK-LABEL: large_i128:
489 ; CHECK: @ %bb.0: @ %entry
490 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
491 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
492 ; CHECK-NEXT: .pad #4
493 ; CHECK-NEXT: sub sp, #4
494 ; CHECK-NEXT: mov r8, r3
495 ; CHECK-NEXT: mov r5, r2
496 ; CHECK-NEXT: bl __fixdfti
497 ; CHECK-NEXT: subs r7, r2, #1
498 ; CHECK-NEXT: mov.w r9, #1
499 ; CHECK-NEXT: sbcs r7, r3, #0
500 ; CHECK-NEXT: mov.w r4, #0
501 ; CHECK-NEXT: cset r7, lt
502 ; CHECK-NEXT: cmp r7, #0
503 ; CHECK-NEXT: csel r0, r0, r7, ne
504 ; CHECK-NEXT: csel r3, r3, r7, ne
505 ; CHECK-NEXT: csel r2, r2, r9, ne
506 ; CHECK-NEXT: csel r1, r1, r7, ne
507 ; CHECK-NEXT: rsbs r7, r0, #0
508 ; CHECK-NEXT: sbcs.w r7, r4, r1
509 ; CHECK-NEXT: sbcs.w r2, r4, r2
510 ; CHECK-NEXT: sbcs.w r2, r4, r3
511 ; CHECK-NEXT: cset r2, lt
512 ; CHECK-NEXT: cmp r2, #0
513 ; CHECK-NEXT: csel r6, r0, r2, ne
514 ; CHECK-NEXT: csel r7, r1, r2, ne
515 ; CHECK-NEXT: mov r0, r5
516 ; CHECK-NEXT: mov r1, r8
517 ; CHECK-NEXT: bl __fixdfti
518 ; CHECK-NEXT: subs r5, r2, #1
519 ; CHECK-NEXT: sbcs r5, r3, #0
520 ; CHECK-NEXT: cset r5, lt
521 ; CHECK-NEXT: cmp r5, #0
522 ; CHECK-NEXT: csel r0, r0, r5, ne
523 ; CHECK-NEXT: csel r3, r3, r5, ne
524 ; CHECK-NEXT: csel r2, r2, r9, ne
525 ; CHECK-NEXT: csel r1, r1, r5, ne
526 ; CHECK-NEXT: rsbs r5, r0, #0
527 ; CHECK-NEXT: sbcs.w r5, r4, r1
528 ; CHECK-NEXT: sbcs.w r2, r4, r2
529 ; CHECK-NEXT: sbcs.w r2, r4, r3
530 ; CHECK-NEXT: cset r3, lt
531 ; CHECK-NEXT: cmp r3, #0
532 ; CHECK-NEXT: csel r2, r0, r3, ne
533 ; CHECK-NEXT: csel r3, r1, r3, ne
534 ; CHECK-NEXT: mov r0, r6
535 ; CHECK-NEXT: mov r1, r7
536 ; CHECK-NEXT: add sp, #4
537 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
539 %conv = fptosi <2 x double> %x to <2 x i128>
; Upper clamp: signed minimum against 2^64.
540 %0 = icmp slt <2 x i128> %conv, <i128 18446744073709551616, i128 18446744073709551616>
541 %spec.store.select = select <2 x i1> %0, <2 x i128> %conv, <2 x i128> <i128 18446744073709551616, i128 18446744073709551616>
; Lower clamp: signed maximum against zero.
542 %1 = icmp sgt <2 x i128> %spec.store.select, zeroinitializer
543 %spec.store.select7 = select <2 x i1> %1, <2 x i128> %spec.store.select, <2 x i128> zeroinitializer
544 %conv6 = trunc <2 x i128> %spec.store.select7 to <2 x i64>
; Declarations of the add-reduction intrinsics called by the reduction tests
; in this file (vqdmulh_v4i32, vqdmulh_v8i16 and vqdmulh_v16i8 respectively).
548 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
549 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
550 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)