Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGen / aarch64-neon-2velem.c
blob 786c8149f38c53d073079e5f56450a327277b770
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
4 // REQUIRES: aarch64-registered-target || arm-registered-target
6 #include <arm_neon.h>
8 // CHECK-LABEL: @test_vmla_lane_s16(
9 // CHECK-NEXT: entry:
10 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
11 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
13 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
14 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
15 // CHECK-NEXT: ret <4 x i16> [[ADD]]
17 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
18 return vmla_lane_s16(a, b, v, 3);
21 // CHECK-LABEL: @test_vmlaq_lane_s16(
22 // CHECK-NEXT: entry:
23 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
24 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
25 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
26 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
27 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
28 // CHECK-NEXT: ret <8 x i16> [[ADD]]
30 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
31 return vmlaq_lane_s16(a, b, v, 3);
34 // CHECK-LABEL: @test_vmla_lane_s32(
35 // CHECK-NEXT: entry:
36 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
37 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
38 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
39 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
40 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
41 // CHECK-NEXT: ret <2 x i32> [[ADD]]
43 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
44 return vmla_lane_s32(a, b, v, 1);
47 // CHECK-LABEL: @test_vmlaq_lane_s32(
48 // CHECK-NEXT: entry:
49 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
50 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
51 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
52 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
53 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
54 // CHECK-NEXT: ret <4 x i32> [[ADD]]
56 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
57 return vmlaq_lane_s32(a, b, v, 1);
60 // CHECK-LABEL: @test_vmla_laneq_s16(
61 // CHECK-NEXT: entry:
62 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
63 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
64 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
65 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
66 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
67 // CHECK-NEXT: ret <4 x i16> [[ADD]]
69 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
70 return vmla_laneq_s16(a, b, v, 7);
73 // CHECK-LABEL: @test_vmlaq_laneq_s16(
74 // CHECK-NEXT: entry:
75 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
76 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
77 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
78 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
79 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
80 // CHECK-NEXT: ret <8 x i16> [[ADD]]
82 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
83 return vmlaq_laneq_s16(a, b, v, 7);
86 // CHECK-LABEL: @test_vmla_laneq_s32(
87 // CHECK-NEXT: entry:
88 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
89 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
90 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
91 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
92 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
93 // CHECK-NEXT: ret <2 x i32> [[ADD]]
95 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
96 return vmla_laneq_s32(a, b, v, 3);
99 // CHECK-LABEL: @test_vmlaq_laneq_s32(
100 // CHECK-NEXT: entry:
101 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
102 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
103 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
104 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
105 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
106 // CHECK-NEXT: ret <4 x i32> [[ADD]]
108 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
109 return vmlaq_laneq_s32(a, b, v, 3);
112 // CHECK-LABEL: @test_vmls_lane_s16(
113 // CHECK-NEXT: entry:
114 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
115 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
116 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
117 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
118 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
119 // CHECK-NEXT: ret <4 x i16> [[SUB]]
121 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
122 return vmls_lane_s16(a, b, v, 3);
125 // CHECK-LABEL: @test_vmlsq_lane_s16(
126 // CHECK-NEXT: entry:
127 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
128 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
129 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
130 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
131 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
132 // CHECK-NEXT: ret <8 x i16> [[SUB]]
134 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
135 return vmlsq_lane_s16(a, b, v, 3);
138 // CHECK-LABEL: @test_vmls_lane_s32(
139 // CHECK-NEXT: entry:
140 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
141 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
142 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
143 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
144 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
145 // CHECK-NEXT: ret <2 x i32> [[SUB]]
147 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
148 return vmls_lane_s32(a, b, v, 1);
151 // CHECK-LABEL: @test_vmlsq_lane_s32(
152 // CHECK-NEXT: entry:
153 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
154 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
155 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
156 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
157 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
158 // CHECK-NEXT: ret <4 x i32> [[SUB]]
160 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
161 return vmlsq_lane_s32(a, b, v, 1);
164 // CHECK-LABEL: @test_vmls_laneq_s16(
165 // CHECK-NEXT: entry:
166 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
167 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
168 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
169 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
170 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
171 // CHECK-NEXT: ret <4 x i16> [[SUB]]
173 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
174 return vmls_laneq_s16(a, b, v, 7);
177 // CHECK-LABEL: @test_vmlsq_laneq_s16(
178 // CHECK-NEXT: entry:
179 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
180 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
181 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
182 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
183 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
184 // CHECK-NEXT: ret <8 x i16> [[SUB]]
186 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
187 return vmlsq_laneq_s16(a, b, v, 7);
190 // CHECK-LABEL: @test_vmls_laneq_s32(
191 // CHECK-NEXT: entry:
192 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
193 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
194 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
195 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
196 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
197 // CHECK-NEXT: ret <2 x i32> [[SUB]]
199 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
200 return vmls_laneq_s32(a, b, v, 3);
203 // CHECK-LABEL: @test_vmlsq_laneq_s32(
204 // CHECK-NEXT: entry:
205 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
206 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
207 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
208 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
209 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
210 // CHECK-NEXT: ret <4 x i32> [[SUB]]
212 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
213 return vmlsq_laneq_s32(a, b, v, 3);
216 // CHECK-LABEL: @test_vmul_lane_s16(
217 // CHECK-NEXT: entry:
218 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
219 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
220 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
221 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
222 // CHECK-NEXT: ret <4 x i16> [[MUL]]
224 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
225 return vmul_lane_s16(a, v, 3);
228 // CHECK-LABEL: @test_vmulq_lane_s16(
229 // CHECK-NEXT: entry:
230 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
231 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
232 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
233 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
234 // CHECK-NEXT: ret <8 x i16> [[MUL]]
236 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
237 return vmulq_lane_s16(a, v, 3);
240 // CHECK-LABEL: @test_vmul_lane_s32(
241 // CHECK-NEXT: entry:
242 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
243 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
244 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
245 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
246 // CHECK-NEXT: ret <2 x i32> [[MUL]]
248 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
249 return vmul_lane_s32(a, v, 1);
252 // CHECK-LABEL: @test_vmulq_lane_s32(
253 // CHECK-NEXT: entry:
254 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
255 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
256 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
257 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
258 // CHECK-NEXT: ret <4 x i32> [[MUL]]
260 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
261 return vmulq_lane_s32(a, v, 1);
264 // CHECK-LABEL: @test_vmul_lane_u16(
265 // CHECK-NEXT: entry:
266 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
267 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
268 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
269 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
270 // CHECK-NEXT: ret <4 x i16> [[MUL]]
272 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
273 return vmul_lane_u16(a, v, 3);
276 // CHECK-LABEL: @test_vmulq_lane_u16(
277 // CHECK-NEXT: entry:
278 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
279 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
280 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
281 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
282 // CHECK-NEXT: ret <8 x i16> [[MUL]]
284 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
285 return vmulq_lane_u16(a, v, 3);
288 // CHECK-LABEL: @test_vmul_lane_u32(
289 // CHECK-NEXT: entry:
290 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
291 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
292 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
293 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
294 // CHECK-NEXT: ret <2 x i32> [[MUL]]
296 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
297 return vmul_lane_u32(a, v, 1);
300 // CHECK-LABEL: @test_vmulq_lane_u32(
301 // CHECK-NEXT: entry:
302 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
303 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
304 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
305 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
306 // CHECK-NEXT: ret <4 x i32> [[MUL]]
308 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
309 return vmulq_lane_u32(a, v, 1);
312 // CHECK-LABEL: @test_vmul_laneq_s16(
313 // CHECK-NEXT: entry:
314 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
315 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
316 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
317 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
318 // CHECK-NEXT: ret <4 x i16> [[MUL]]
320 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
321 return vmul_laneq_s16(a, v, 7);
324 // CHECK-LABEL: @test_vmulq_laneq_s16(
325 // CHECK-NEXT: entry:
326 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
327 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
328 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
329 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
330 // CHECK-NEXT: ret <8 x i16> [[MUL]]
332 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
333 return vmulq_laneq_s16(a, v, 7);
336 // CHECK-LABEL: @test_vmul_laneq_s32(
337 // CHECK-NEXT: entry:
338 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
339 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
340 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
341 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
342 // CHECK-NEXT: ret <2 x i32> [[MUL]]
344 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
345 return vmul_laneq_s32(a, v, 3);
348 // CHECK-LABEL: @test_vmulq_laneq_s32(
349 // CHECK-NEXT: entry:
350 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
351 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
352 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
353 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
354 // CHECK-NEXT: ret <4 x i32> [[MUL]]
356 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
357 return vmulq_laneq_s32(a, v, 3);
360 // CHECK-LABEL: @test_vmul_laneq_u16(
361 // CHECK-NEXT: entry:
362 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
363 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
364 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
365 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
366 // CHECK-NEXT: ret <4 x i16> [[MUL]]
368 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
369 return vmul_laneq_u16(a, v, 7);
372 // CHECK-LABEL: @test_vmulq_laneq_u16(
373 // CHECK-NEXT: entry:
374 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
375 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
376 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
377 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
378 // CHECK-NEXT: ret <8 x i16> [[MUL]]
380 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
381 return vmulq_laneq_u16(a, v, 7);
384 // CHECK-LABEL: @test_vmul_laneq_u32(
385 // CHECK-NEXT: entry:
386 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
387 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
388 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
389 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
390 // CHECK-NEXT: ret <2 x i32> [[MUL]]
392 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
393 return vmul_laneq_u32(a, v, 3);
396 // CHECK-LABEL: @test_vmulq_laneq_u32(
397 // CHECK-NEXT: entry:
398 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
399 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
400 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
401 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
402 // CHECK-NEXT: ret <4 x i32> [[MUL]]
404 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
405 return vmulq_laneq_u32(a, v, 3);
408 // CHECK-LABEL: @test_vfma_lane_f32(
409 // CHECK-NEXT: entry:
410 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
411 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
412 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
413 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
414 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
415 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
416 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
417 // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
418 // CHECK-NEXT: ret <2 x float> [[FMLA2]]
420 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
421 return vfma_lane_f32(a, b, v, 1);
424 // CHECK-LABEL: @test_vfmaq_lane_f32(
425 // CHECK-NEXT: entry:
426 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
427 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
428 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
429 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
430 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
431 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
432 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
433 // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
434 // CHECK-NEXT: ret <4 x float> [[FMLA2]]
436 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
437 return vfmaq_lane_f32(a, b, v, 1);
440 // CHECK-LABEL: @test_vfma_laneq_f32(
441 // CHECK-NEXT: entry:
442 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
443 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
444 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
445 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
446 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
447 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
448 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
449 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
450 // CHECK-NEXT: ret <2 x float> [[TMP6]]
452 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
453 return vfma_laneq_f32(a, b, v, 3);
456 // CHECK-LABEL: @test_vfmaq_laneq_f32(
457 // CHECK-NEXT: entry:
458 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
459 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
460 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
461 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
462 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
463 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
464 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
465 // CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
466 // CHECK-NEXT: ret <4 x float> [[TMP6]]
468 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
469 return vfmaq_laneq_f32(a, b, v, 3);
472 // CHECK-LABEL: @test_vfms_lane_f32(
473 // CHECK-NEXT: entry:
474 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
475 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
476 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
477 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
478 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
479 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
480 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
481 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
482 // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
483 // CHECK-NEXT: ret <2 x float> [[FMLA2]]
485 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
486 return vfms_lane_f32(a, b, v, 1);
489 // CHECK-LABEL: @test_vfmsq_lane_f32(
490 // CHECK-NEXT: entry:
491 // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
492 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
493 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
494 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
495 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
496 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
497 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
498 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
499 // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
500 // CHECK-NEXT: ret <4 x float> [[FMLA2]]
502 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
503 return vfmsq_lane_f32(a, b, v, 1);
506 // CHECK-LABEL: @test_vfms_laneq_f32(
507 // CHECK-NEXT: entry:
508 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
509 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
510 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
511 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
512 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
513 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
514 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
515 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
516 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
517 // CHECK-NEXT: ret <2 x float> [[TMP6]]
519 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
520 return vfms_laneq_f32(a, b, v, 3);
523 // CHECK-LABEL: @test_vfmsq_laneq_f32(
524 // CHECK-NEXT: entry:
525 // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
526 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
527 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
528 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
529 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
530 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
531 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
532 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
533 // CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
534 // CHECK-NEXT: ret <4 x float> [[TMP6]]
536 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
537 return vfmsq_laneq_f32(a, b, v, 3);
540 // CHECK-LABEL: @test_vfmaq_lane_f64(
541 // CHECK-NEXT: entry:
542 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
543 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
544 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
545 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
546 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
547 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
548 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
549 // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
550 // CHECK-NEXT: ret <2 x double> [[FMLA2]]
552 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
553 return vfmaq_lane_f64(a, b, v, 0);
556 // CHECK-LABEL: @test_vfmaq_laneq_f64(
557 // CHECK-NEXT: entry:
558 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
559 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
560 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
561 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
562 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
563 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
564 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
565 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
566 // CHECK-NEXT: ret <2 x double> [[TMP6]]
568 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
569 return vfmaq_laneq_f64(a, b, v, 1);
572 // CHECK-LABEL: @test_vfmsq_lane_f64(
573 // CHECK-NEXT: entry:
574 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
575 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
576 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
577 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
578 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
579 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
580 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
581 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
582 // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
583 // CHECK-NEXT: ret <2 x double> [[FMLA2]]
585 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
586 return vfmsq_lane_f64(a, b, v, 0);
589 // CHECK-LABEL: @test_vfmsq_laneq_f64(
590 // CHECK-NEXT: entry:
591 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
592 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
593 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
594 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
595 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
596 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
597 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
598 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
599 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
600 // CHECK-NEXT: ret <2 x double> [[TMP6]]
602 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
603 return vfmsq_laneq_f64(a, b, v, 1);
606 // CHECK-LABEL: @test_vfmas_laneq_f32(
607 // CHECK-NEXT: entry:
608 // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
609 // CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
610 // CHECK-NEXT: ret float [[TMP0]]
612 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
613 return vfmas_laneq_f32(a, b, v, 3);
616 // CHECK-LABEL: @test_vfmsd_lane_f64(
617 // CHECK-NEXT: entry:
618 // CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
619 // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
620 // CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
621 // CHECK-NEXT: ret double [[TMP0]]
// Exercises the scalar vfmsd_lane_f64 with constant lane index 0.
// Expected IR (CHECK lines above): b is negated with fneg, element 0 is
// extracted from v, and llvm.fma.f64 combines them with a.
623 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
624 return vfmsd_lane_f64(a, b, v, 0);
627 // CHECK-LABEL: @test_vfmss_laneq_f32(
628 // CHECK-NEXT: entry:
629 // CHECK-NEXT: [[FNEG:%.*]] = fneg float [[B:%.*]]
630 // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
631 // CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
632 // CHECK-NEXT: ret float [[TMP0]]
// Exercises the scalar vfmss_laneq_f32 with constant lane index 3.
// Expected IR (CHECK lines above): b is negated with fneg, element 3 is
// extracted from v, and llvm.fma.f32 combines them with a.
634 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
635 return vfmss_laneq_f32(a, b, v, 3);
638 // CHECK-LABEL: @test_vfmsd_laneq_f64(
639 // CHECK-NEXT: entry:
640 // CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
641 // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
642 // CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
643 // CHECK-NEXT: ret double [[TMP0]]
// Exercises the scalar vfmsd_laneq_f64 with constant lane index 1.
// Expected IR (CHECK lines above): b is negated with fneg, element 1 is
// extracted from v, and llvm.fma.f64 combines them with a.
645 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
646 return vfmsd_laneq_f64(a, b, v, 1);
649 // CHECK-LABEL: @test_vmlal_lane_s16(
650 // CHECK-NEXT: entry:
651 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
652 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
653 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
654 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
655 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
656 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
657 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
658 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_lane_s16 with constant lane index 3.
// Expected IR (CHECK lines above): lane 3 of v is splatted, multiplied with b
// via llvm.aarch64.neon.smull.v4i32, and the product is added to a.
660 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
661 return vmlal_lane_s16(a, b, v, 3);
664 // CHECK-LABEL: @test_vmlal_lane_s32(
665 // CHECK-NEXT: entry:
666 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
667 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
668 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
669 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
670 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
671 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
672 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
673 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_lane_s32 with constant lane index 1.
// Expected IR (CHECK lines above): lane 1 of v is splatted, multiplied with b
// via llvm.aarch64.neon.smull.v2i64, and the product is added to a.
675 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
676 return vmlal_lane_s32(a, b, v, 1);
679 // CHECK-LABEL: @test_vmlal_laneq_s16(
680 // CHECK-NEXT: entry:
681 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
682 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
683 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
684 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
685 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
686 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
687 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
688 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_laneq_s16 with constant lane index 7 of the 8-element v.
// Expected IR (CHECK lines above): lane 7 of v is splatted into four elements,
// multiplied with b via llvm.aarch64.neon.smull.v4i32, and added to a.
690 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
691 return vmlal_laneq_s16(a, b, v, 7);
694 // CHECK-LABEL: @test_vmlal_laneq_s32(
695 // CHECK-NEXT: entry:
696 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
697 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
698 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
699 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
700 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
701 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
702 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
703 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_laneq_s32 with constant lane index 3 of the 4-element v.
// Expected IR (CHECK lines above): lane 3 of v is splatted into two elements,
// multiplied with b via llvm.aarch64.neon.smull.v2i64, and added to a.
705 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
706 return vmlal_laneq_s32(a, b, v, 3);
709 // CHECK-LABEL: @test_vmlal_high_lane_s16(
710 // CHECK-NEXT: entry:
711 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
712 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
713 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
714 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
715 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
716 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
717 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
718 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
719 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_high_lane_s16 with constant lane index 3.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 3 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.smull.v4i32, and
// the product is added to a.
721 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
722 return vmlal_high_lane_s16(a, b, v, 3);
725 // CHECK-LABEL: @test_vmlal_high_lane_s32(
726 // CHECK-NEXT: entry:
727 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
728 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
729 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
730 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
731 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
732 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
733 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
734 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
735 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_high_lane_s32 with constant lane index 1.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 1 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.smull.v2i64, and
// the product is added to a.
737 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
738 return vmlal_high_lane_s32(a, b, v, 1);
741 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
742 // CHECK-NEXT: entry:
743 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
744 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
745 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
746 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
747 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
748 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
749 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
750 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
751 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_high_laneq_s16 with constant lane index 7.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 7 of v
// is splatted into four elements, the two are multiplied via
// llvm.aarch64.neon.smull.v4i32, and the product is added to a.
753 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
754 return vmlal_high_laneq_s16(a, b, v, 7);
757 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
758 // CHECK-NEXT: entry:
759 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
760 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
761 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
762 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
763 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
764 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
765 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
766 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
767 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_high_laneq_s32 with constant lane index 3.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 3 of v
// is splatted into two elements, the two are multiplied via
// llvm.aarch64.neon.smull.v2i64, and the product is added to a.
769 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
770 return vmlal_high_laneq_s32(a, b, v, 3);
773 // CHECK-LABEL: @test_vmlsl_lane_s16(
774 // CHECK-NEXT: entry:
775 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
776 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
777 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
778 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
779 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
780 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
781 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
782 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_lane_s16 with constant lane index 3.
// Expected IR (CHECK lines above): lane 3 of v is splatted, multiplied with b
// via llvm.aarch64.neon.smull.v4i32, and the product is subtracted from a.
784 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
785 return vmlsl_lane_s16(a, b, v, 3);
788 // CHECK-LABEL: @test_vmlsl_lane_s32(
789 // CHECK-NEXT: entry:
790 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
791 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
792 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
793 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
794 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
795 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
796 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
797 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_lane_s32 with constant lane index 1.
// Expected IR (CHECK lines above): lane 1 of v is splatted, multiplied with b
// via llvm.aarch64.neon.smull.v2i64, and the product is subtracted from a.
799 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
800 return vmlsl_lane_s32(a, b, v, 1);
803 // CHECK-LABEL: @test_vmlsl_laneq_s16(
804 // CHECK-NEXT: entry:
805 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
806 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
807 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
808 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
809 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
810 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
811 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
812 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_laneq_s16 with constant lane index 7 of the 8-element v.
// Expected IR (CHECK lines above): lane 7 of v is splatted into four elements,
// multiplied with b via llvm.aarch64.neon.smull.v4i32, and subtracted from a.
814 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
815 return vmlsl_laneq_s16(a, b, v, 7);
818 // CHECK-LABEL: @test_vmlsl_laneq_s32(
819 // CHECK-NEXT: entry:
820 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
821 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
822 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
823 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
824 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
825 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
826 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
827 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_laneq_s32 with constant lane index 3 of the 4-element v.
// Expected IR (CHECK lines above): lane 3 of v is splatted into two elements,
// multiplied with b via llvm.aarch64.neon.smull.v2i64, and subtracted from a.
829 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
830 return vmlsl_laneq_s32(a, b, v, 3);
833 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
834 // CHECK-NEXT: entry:
835 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
836 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
837 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
838 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
839 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
840 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
841 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
842 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
843 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_high_lane_s16 with constant lane index 3.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 3 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.smull.v4i32, and
// the product is subtracted from a.
845 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
846 return vmlsl_high_lane_s16(a, b, v, 3);
849 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
850 // CHECK-NEXT: entry:
851 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
852 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
853 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
854 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
855 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
856 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
857 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
858 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
859 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_high_lane_s32 with constant lane index 1.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 1 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.smull.v2i64, and
// the product is subtracted from a.
861 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
862 return vmlsl_high_lane_s32(a, b, v, 1);
865 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
866 // CHECK-NEXT: entry:
867 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
868 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
869 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
870 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
871 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
872 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
873 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
874 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
875 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_high_laneq_s16 with constant lane index 7.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 7 of v
// is splatted into four elements, the two are multiplied via
// llvm.aarch64.neon.smull.v4i32, and the product is subtracted from a.
877 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
878 return vmlsl_high_laneq_s16(a, b, v, 7);
881 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
882 // CHECK-NEXT: entry:
883 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
884 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
885 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
886 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
887 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
888 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
889 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
890 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
891 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_high_laneq_s32 with constant lane index 3.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 3 of v
// is splatted into two elements, the two are multiplied via
// llvm.aarch64.neon.smull.v2i64, and the product is subtracted from a.
893 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
894 return vmlsl_high_laneq_s32(a, b, v, 3);
897 // CHECK-LABEL: @test_vmlal_lane_u16(
898 // CHECK-NEXT: entry:
899 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
900 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
901 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
902 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
903 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
904 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
905 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
906 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_lane_u16 with constant lane index 3.
// Expected IR (CHECK lines above): lane 3 of v is splatted, multiplied with b
// via llvm.aarch64.neon.umull.v4i32, and the product is added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; test_vmull_lane_u16 uses
// uint types instead. Confirm the signed types here are intentional.
908 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
909 return vmlal_lane_u16(a, b, v, 3);
912 // CHECK-LABEL: @test_vmlal_lane_u32(
913 // CHECK-NEXT: entry:
914 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
915 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
916 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
917 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
918 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
919 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
920 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
921 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_lane_u32 with constant lane index 1.
// Expected IR (CHECK lines above): lane 1 of v is splatted, multiplied with b
// via llvm.aarch64.neon.umull.v2i64, and the product is added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; test_vmull_lane_u32 uses
// uint types instead. Confirm the signed types here are intentional.
923 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
924 return vmlal_lane_u32(a, b, v, 1);
927 // CHECK-LABEL: @test_vmlal_laneq_u16(
928 // CHECK-NEXT: entry:
929 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
930 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
931 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
932 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
933 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
934 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
935 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
936 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_laneq_u16 with constant lane index 7 of the 8-element v.
// Expected IR (CHECK lines above): lane 7 of v is splatted into four elements,
// multiplied with b via llvm.aarch64.neon.umull.v4i32, and added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; confirm this is
// intentional.
938 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
939 return vmlal_laneq_u16(a, b, v, 7);
942 // CHECK-LABEL: @test_vmlal_laneq_u32(
943 // CHECK-NEXT: entry:
944 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
945 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
946 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
947 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
948 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
949 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
950 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
951 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_laneq_u32 with constant lane index 3 of the 4-element v.
// Expected IR (CHECK lines above): lane 3 of v is splatted into two elements,
// multiplied with b via llvm.aarch64.neon.umull.v2i64, and added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; confirm this is
// intentional.
953 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
954 return vmlal_laneq_u32(a, b, v, 3);
957 // CHECK-LABEL: @test_vmlal_high_lane_u16(
958 // CHECK-NEXT: entry:
959 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
960 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
961 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
962 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
963 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
964 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
965 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
966 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
967 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_high_lane_u16 with constant lane index 3.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 3 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.umull.v4i32, and
// the product is added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; confirm this is
// intentional.
969 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
970 return vmlal_high_lane_u16(a, b, v, 3);
973 // CHECK-LABEL: @test_vmlal_high_lane_u32(
974 // CHECK-NEXT: entry:
975 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
976 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
977 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
978 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
979 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
980 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
981 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
982 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
983 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_high_lane_u32 with constant lane index 1.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 1 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.umull.v2i64, and
// the product is added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; confirm this is
// intentional.
985 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
986 return vmlal_high_lane_u32(a, b, v, 1);
989 // CHECK-LABEL: @test_vmlal_high_laneq_u16(
990 // CHECK-NEXT: entry:
991 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
992 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
993 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
994 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
995 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
996 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
997 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
998 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
999 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Exercises vmlal_high_laneq_u16 with constant lane index 7.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 7 of v
// is splatted into four elements, the two are multiplied via
// llvm.aarch64.neon.umull.v4i32, and the product is added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; confirm this is
// intentional.
1001 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
1002 return vmlal_high_laneq_u16(a, b, v, 7);
1005 // CHECK-LABEL: @test_vmlal_high_laneq_u32(
1006 // CHECK-NEXT: entry:
1007 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1008 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1009 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1010 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1011 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1012 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1013 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1014 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
1015 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Exercises vmlal_high_laneq_u32 with constant lane index 3.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 3 of v
// is splatted into two elements, the two are multiplied via
// llvm.aarch64.neon.umull.v2i64, and the product is added to a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; confirm this is
// intentional.
1017 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
1018 return vmlal_high_laneq_u32(a, b, v, 3);
1021 // CHECK-LABEL: @test_vmlsl_lane_u16(
1022 // CHECK-NEXT: entry:
1023 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1024 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1025 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1026 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1027 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1028 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1029 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1030 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_lane_u16 with constant lane index 3.
// Expected IR (CHECK lines above): lane 3 of v is splatted, multiplied with b
// via llvm.aarch64.neon.umull.v4i32, and the product is subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; confirm this is
// intentional.
1032 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
1033 return vmlsl_lane_u16(a, b, v, 3);
1036 // CHECK-LABEL: @test_vmlsl_lane_u32(
1037 // CHECK-NEXT: entry:
1038 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1039 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1040 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1041 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1042 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1043 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1044 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1045 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_lane_u32 with constant lane index 1.
// Expected IR (CHECK lines above): lane 1 of v is splatted, multiplied with b
// via llvm.aarch64.neon.umull.v2i64, and the product is subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; confirm this is
// intentional.
1047 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
1048 return vmlsl_lane_u32(a, b, v, 1);
1051 // CHECK-LABEL: @test_vmlsl_laneq_u16(
1052 // CHECK-NEXT: entry:
1053 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1054 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1055 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1056 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1057 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1058 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1059 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1060 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_laneq_u16 with constant lane index 7 of the 8-element v.
// Expected IR (CHECK lines above): lane 7 of v is splatted into four elements,
// multiplied with b via llvm.aarch64.neon.umull.v4i32, and subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; confirm this is
// intentional.
1062 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
1063 return vmlsl_laneq_u16(a, b, v, 7);
1066 // CHECK-LABEL: @test_vmlsl_laneq_u32(
1067 // CHECK-NEXT: entry:
1068 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1069 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1070 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1071 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1072 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1073 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1074 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1075 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_laneq_u32 with constant lane index 3 of the 4-element v.
// Expected IR (CHECK lines above): lane 3 of v is splatted into two elements,
// multiplied with b via llvm.aarch64.neon.umull.v2i64, and subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; confirm this is
// intentional.
1077 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
1078 return vmlsl_laneq_u32(a, b, v, 3);
1081 // CHECK-LABEL: @test_vmlsl_high_lane_u16(
1082 // CHECK-NEXT: entry:
1083 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1084 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1085 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1086 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1087 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1088 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1089 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1090 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1091 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_high_lane_u16 with constant lane index 3.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 3 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.umull.v4i32, and
// the product is subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; confirm this is
// intentional.
1093 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
1094 return vmlsl_high_lane_u16(a, b, v, 3);
1097 // CHECK-LABEL: @test_vmlsl_high_lane_u32(
1098 // CHECK-NEXT: entry:
1099 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1100 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1101 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1102 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1103 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1104 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1105 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1106 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1107 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_high_lane_u32 with constant lane index 1.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 1 of v
// is splatted, the two are multiplied via llvm.aarch64.neon.umull.v2i64, and
// the product is subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; confirm this is
// intentional.
1109 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
1110 return vmlsl_high_lane_u32(a, b, v, 1);
1113 // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
1114 // CHECK-NEXT: entry:
1115 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1116 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1117 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1118 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1119 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1120 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1121 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1122 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
1123 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Exercises vmlsl_high_laneq_u16 with constant lane index 7.
// Expected IR (CHECK lines above): elements 4-7 of b are selected, lane 7 of v
// is splatted into four elements, the two are multiplied via
// llvm.aarch64.neon.umull.v4i32, and the product is subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u16 form and the CHECK lines expect umull; confirm this is
// intentional.
1125 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
1126 return vmlsl_high_laneq_u16(a, b, v, 7);
1129 // CHECK-LABEL: @test_vmlsl_high_laneq_u32(
1130 // CHECK-NEXT: entry:
1131 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1132 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1133 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1134 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1135 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1136 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1137 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1138 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
1139 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Exercises vmlsl_high_laneq_u32 with constant lane index 3.
// Expected IR (CHECK lines above): elements 2-3 of b are selected, lane 3 of v
// is splatted into two elements, the two are multiplied via
// llvm.aarch64.neon.umull.v2i64, and the product is subtracted from a.
// NOTE(review): the signature uses signed vector types although the intrinsic
// is the _u32 form and the CHECK lines expect umull; confirm this is
// intentional.
1141 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
1142 return vmlsl_high_laneq_u32(a, b, v, 3);
1145 // CHECK-LABEL: @test_vmull_lane_s16(
1146 // CHECK-NEXT: entry:
1147 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1148 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1149 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1150 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1151 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1152 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1153 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_lane_s16 with constant lane index 3.
// Expected IR (CHECK lines above): lane 3 of v is splatted and multiplied with
// a via llvm.aarch64.neon.smull.v4i32; the product is returned directly.
1155 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
1156 return vmull_lane_s16(a, v, 3);
1159 // CHECK-LABEL: @test_vmull_lane_s32(
1160 // CHECK-NEXT: entry:
1161 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1162 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1163 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1164 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1165 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1166 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1167 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_lane_s32 with constant lane index 1.
// Expected IR (CHECK lines above): lane 1 of v is splatted and multiplied with
// a via llvm.aarch64.neon.smull.v2i64; the product is returned directly.
1169 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
1170 return vmull_lane_s32(a, v, 1);
1173 // CHECK-LABEL: @test_vmull_lane_u16(
1174 // CHECK-NEXT: entry:
1175 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1176 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1177 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1178 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1179 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1180 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1181 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_lane_u16 with lane index 3. Per the autogenerated
// assertions above, the expected IR splats lane 3 of v and passes it with a
// to @llvm.aarch64.neon.umull.v4i32.
1183 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
1184 return vmull_lane_u16(a, v, 3);
1187 // CHECK-LABEL: @test_vmull_lane_u32(
1188 // CHECK-NEXT: entry:
1189 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1190 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1191 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1192 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1193 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1194 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1195 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_lane_u32 with lane index 1. Per the autogenerated
// assertions above, the expected IR splats lane 1 of v and passes it with a
// to @llvm.aarch64.neon.umull.v2i64.
1197 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
1198 return vmull_lane_u32(a, v, 1);
1201 // CHECK-LABEL: @test_vmull_high_lane_s16(
1202 // CHECK-NEXT: entry:
1203 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1204 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1205 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1206 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1207 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1208 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1209 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1210 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_high_lane_s16 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of a, splats
// lane 3 of v, and passes both to @llvm.aarch64.neon.smull.v4i32.
1212 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1213 return vmull_high_lane_s16(a, v, 3);
1216 // CHECK-LABEL: @test_vmull_high_lane_s32(
1217 // CHECK-NEXT: entry:
1218 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1219 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1220 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1221 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1222 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1223 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1224 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1225 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_high_lane_s32 with lane index 1. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of a, splats
// lane 1 of v, and passes both to @llvm.aarch64.neon.smull.v2i64.
1227 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1228 return vmull_high_lane_s32(a, v, 1);
1231 // CHECK-LABEL: @test_vmull_high_lane_u16(
1232 // CHECK-NEXT: entry:
1233 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1234 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1235 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1236 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1237 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1238 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1239 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1240 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_high_lane_u16 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of a, splats
// lane 3 of v, and passes both to @llvm.aarch64.neon.umull.v4i32.
1242 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
1243 return vmull_high_lane_u16(a, v, 3);
1246 // CHECK-LABEL: @test_vmull_high_lane_u32(
1247 // CHECK-NEXT: entry:
1248 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1249 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1250 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1251 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1252 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1253 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1254 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1255 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_high_lane_u32 with lane index 1. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of a, splats
// lane 1 of v, and passes both to @llvm.aarch64.neon.umull.v2i64.
1257 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
1258 return vmull_high_lane_u32(a, v, 1);
1261 // CHECK-LABEL: @test_vmull_laneq_s16(
1262 // CHECK-NEXT: entry:
1263 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1264 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1265 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1266 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1267 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1268 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1269 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_laneq_s16 with lane index 7 of the 8-element v. Per the
// autogenerated assertions above, the expected IR splats lane 7 of v into a
// 4-element vector and passes it with a to @llvm.aarch64.neon.smull.v4i32.
1271 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
1272 return vmull_laneq_s16(a, v, 7);
1275 // CHECK-LABEL: @test_vmull_laneq_s32(
1276 // CHECK-NEXT: entry:
1277 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1278 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1279 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1280 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1281 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1282 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1283 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_laneq_s32 with lane index 3 of the 4-element v. Per the
// autogenerated assertions above, the expected IR splats lane 3 of v into a
// 2-element vector and passes it with a to @llvm.aarch64.neon.smull.v2i64.
1285 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
1286 return vmull_laneq_s32(a, v, 3);
1289 // CHECK-LABEL: @test_vmull_laneq_u16(
1290 // CHECK-NEXT: entry:
1291 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1292 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1293 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1294 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1295 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1296 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1297 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_laneq_u16 with lane index 7 of the 8-element v. Per the
// autogenerated assertions above, the expected IR splats lane 7 of v into a
// 4-element vector and passes it with a to @llvm.aarch64.neon.umull.v4i32.
1299 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
1300 return vmull_laneq_u16(a, v, 7);
1303 // CHECK-LABEL: @test_vmull_laneq_u32(
1304 // CHECK-NEXT: entry:
1305 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1306 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1307 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1308 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1309 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1310 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1311 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_laneq_u32 with lane index 3 of the 4-element v. Per the
// autogenerated assertions above, the expected IR splats lane 3 of v into a
// 2-element vector and passes it with a to @llvm.aarch64.neon.umull.v2i64.
1313 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
1314 return vmull_laneq_u32(a, v, 3);
1317 // CHECK-LABEL: @test_vmull_high_laneq_s16(
1318 // CHECK-NEXT: entry:
1319 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1320 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1321 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1322 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1323 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1324 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1325 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1326 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_high_laneq_s16 with lane index 7. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of a, splats
// lane 7 of the 8-element v, and passes both to
// @llvm.aarch64.neon.smull.v4i32.
1328 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1329 return vmull_high_laneq_s16(a, v, 7);
1332 // CHECK-LABEL: @test_vmull_high_laneq_s32(
1333 // CHECK-NEXT: entry:
1334 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1335 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1336 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1337 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1338 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1339 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1340 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1341 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_high_laneq_s32 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of a, splats
// lane 3 of the 4-element v, and passes both to
// @llvm.aarch64.neon.smull.v2i64.
1343 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1344 return vmull_high_laneq_s32(a, v, 3);
1347 // CHECK-LABEL: @test_vmull_high_laneq_u16(
1348 // CHECK-NEXT: entry:
1349 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1350 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1351 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1352 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1353 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1354 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1355 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1356 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Exercises vmull_high_laneq_u16 with lane index 7. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of a, splats
// lane 7 of the 8-element v, and passes both to
// @llvm.aarch64.neon.umull.v4i32.
1358 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
1359 return vmull_high_laneq_u16(a, v, 7);
1362 // CHECK-LABEL: @test_vmull_high_laneq_u32(
1363 // CHECK-NEXT: entry:
1364 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1365 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1366 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1367 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1368 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1369 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1370 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1371 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Exercises vmull_high_laneq_u32 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of a, splats
// lane 3 of the 4-element v, and passes both to
// @llvm.aarch64.neon.umull.v2i64.
1373 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
1374 return vmull_high_laneq_u32(a, v, 3);
1377 // CHECK-LABEL: @test_vqdmlal_lane_s16(
1378 // CHECK-NEXT: entry:
1379 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1380 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1381 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1382 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1383 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1384 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1385 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1386 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1387 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_lane_s16 with lane index 3. Per the autogenerated
// assertions above, the expected IR splats lane 3 of v, calls
// @llvm.aarch64.neon.sqdmull.v4i32 on (b, splat), and combines that result
// with a via @llvm.aarch64.neon.sqadd.v4i32.
1389 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1390 return vqdmlal_lane_s16(a, b, v, 3);
1393 // CHECK-LABEL: @test_vqdmlal_lane_s32(
1394 // CHECK-NEXT: entry:
1395 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1396 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1397 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1398 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1399 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1400 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1401 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1402 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1403 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_lane_s32 with lane index 1. Per the autogenerated
// assertions above, the expected IR splats lane 1 of v, calls
// @llvm.aarch64.neon.sqdmull.v2i64 on (b, splat), and combines that result
// with a via @llvm.aarch64.neon.sqadd.v2i64.
1405 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1406 return vqdmlal_lane_s32(a, b, v, 1);
1409 // CHECK-LABEL: @test_vqdmlal_high_lane_s16(
1410 // CHECK-NEXT: entry:
1411 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1412 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1413 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1414 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1415 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1416 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1417 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1418 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1419 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1420 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_high_lane_s16 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of b, splats
// lane 3 of v, calls @llvm.aarch64.neon.sqdmull.v4i32 on those two, and
// combines the result with a via @llvm.aarch64.neon.sqadd.v4i32.
1422 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1423 return vqdmlal_high_lane_s16(a, b, v, 3);
1426 // CHECK-LABEL: @test_vqdmlal_high_lane_s32(
1427 // CHECK-NEXT: entry:
1428 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1429 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1430 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1431 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1432 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1433 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1434 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1435 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1436 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1437 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_high_lane_s32 with lane index 1. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of b, splats
// lane 1 of v, calls @llvm.aarch64.neon.sqdmull.v2i64 on those two, and
// combines the result with a via @llvm.aarch64.neon.sqadd.v2i64.
1439 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1440 return vqdmlal_high_lane_s32(a, b, v, 1);
1443 // CHECK-LABEL: @test_vqdmlsl_lane_s16(
1444 // CHECK-NEXT: entry:
1445 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1446 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1447 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1448 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1449 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
1450 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1451 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
1452 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1453 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_lane_s16 with lane index 3. Per the autogenerated
// assertions above, the expected IR splats lane 3 of v, calls
// @llvm.aarch64.neon.sqdmull.v4i32 on (b, splat), and combines that result
// with a via @llvm.aarch64.neon.sqsub.v4i32.
1455 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1456 return vqdmlsl_lane_s16(a, b, v, 3);
1459 // CHECK-LABEL: @test_vqdmlsl_lane_s32(
1460 // CHECK-NEXT: entry:
1461 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1462 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1463 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1464 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1465 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
1466 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1467 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
1468 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1469 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_lane_s32 with lane index 1. Per the autogenerated
// assertions above, the expected IR splats lane 1 of v, calls
// @llvm.aarch64.neon.sqdmull.v2i64 on (b, splat), and combines that result
// with a via @llvm.aarch64.neon.sqsub.v2i64.
1471 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1472 return vqdmlsl_lane_s32(a, b, v, 1);
1475 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
1476 // CHECK-NEXT: entry:
1477 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1478 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1479 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1480 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1481 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1482 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1483 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1484 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1485 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
1486 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_high_lane_s16 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of b, splats
// lane 3 of v, calls @llvm.aarch64.neon.sqdmull.v4i32 on those two, and
// combines the result with a via @llvm.aarch64.neon.sqsub.v4i32.
1488 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1489 return vqdmlsl_high_lane_s16(a, b, v, 3);
1492 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
1493 // CHECK-NEXT: entry:
1494 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
1495 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1496 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1497 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1498 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
1499 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1500 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1501 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1502 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
1503 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_high_lane_s32 with lane index 1. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of b, splats
// lane 1 of v, calls @llvm.aarch64.neon.sqdmull.v2i64 on those two, and
// combines the result with a via @llvm.aarch64.neon.sqsub.v2i64.
1505 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1506 return vqdmlsl_high_lane_s32(a, b, v, 1);
1509 // CHECK-LABEL: @test_vqdmull_lane_s16(
1510 // CHECK-NEXT: entry:
1511 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1512 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1513 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1514 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1515 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1516 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1517 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1518 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Exercises vqdmull_lane_s16 with lane index 3. Per the autogenerated
// assertions above, the expected IR splats lane 3 of v and returns the
// result of @llvm.aarch64.neon.sqdmull.v4i32 on (a, splat).
1520 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
1521 return vqdmull_lane_s16(a, v, 3);
1524 // CHECK-LABEL: @test_vqdmull_lane_s32(
1525 // CHECK-NEXT: entry:
1526 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1527 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1528 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1529 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1530 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1531 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1532 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1533 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Exercises vqdmull_lane_s32 with lane index 1. Per the autogenerated
// assertions above, the expected IR splats lane 1 of v and returns the
// result of @llvm.aarch64.neon.sqdmull.v2i64 on (a, splat).
1535 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
1536 return vqdmull_lane_s32(a, v, 1);
1539 // CHECK-LABEL: @test_vqdmull_laneq_s16(
1540 // CHECK-NEXT: entry:
1541 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1542 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1543 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1544 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1545 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1546 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
1547 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1548 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Exercises vqdmull_laneq_s16 with lane index 3 of the 8-element v (lane 3,
// not the top lane 7). Per the autogenerated assertions above, the expected
// IR splats that lane into a 4-element vector and returns the result of
// @llvm.aarch64.neon.sqdmull.v4i32 on (a, splat).
1550 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1551 return vqdmull_laneq_s16(a, v, 3);
1554 // CHECK-LABEL: @test_vqdmull_laneq_s32(
1555 // CHECK-NEXT: entry:
1556 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1557 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1558 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1559 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1560 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1561 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
1562 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1563 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Exercises vqdmull_laneq_s32 with lane index 3 of the 4-element v. Per the
// autogenerated assertions above, the expected IR splats that lane into a
// 2-element vector and returns the result of
// @llvm.aarch64.neon.sqdmull.v2i64 on (a, splat).
1565 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
1566 return vqdmull_laneq_s32(a, v, 3);
1569 // CHECK-LABEL: @test_vqdmull_high_lane_s16(
1570 // CHECK-NEXT: entry:
1571 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1572 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1573 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1574 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1575 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1576 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1577 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1578 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1579 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Exercises vqdmull_high_lane_s16 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of a, splats
// lane 3 of v, and returns the result of @llvm.aarch64.neon.sqdmull.v4i32
// on those two.
1581 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1582 return vqdmull_high_lane_s16(a, v, 3);
1585 // CHECK-LABEL: @test_vqdmull_high_lane_s32(
1586 // CHECK-NEXT: entry:
1587 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1588 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1589 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1590 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
1591 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1592 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1593 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1594 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1595 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Exercises vqdmull_high_lane_s32 with lane index 1. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of a, splats
// lane 1 of v, and returns the result of @llvm.aarch64.neon.sqdmull.v2i64
// on those two.
1597 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1598 return vqdmull_high_lane_s32(a, v, 1);
1601 // CHECK-LABEL: @test_vqdmull_high_laneq_s16(
1602 // CHECK-NEXT: entry:
1603 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1604 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1605 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1606 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1607 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1608 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
1609 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
1610 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1611 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Exercises vqdmull_high_laneq_s16 with lane index 7. Per the autogenerated
// assertions above, the expected IR extracts elements 4..7 of a, splats
// lane 7 of the 8-element v, and returns the result of
// @llvm.aarch64.neon.sqdmull.v4i32 on those two.
1613 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1614 return vqdmull_high_laneq_s16(a, v, 7);
1617 // CHECK-LABEL: @test_vqdmull_high_laneq_s32(
1618 // CHECK-NEXT: entry:
1619 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
1620 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
1621 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1622 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
1623 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1624 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
1625 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
1626 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1627 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Exercises vqdmull_high_laneq_s32 with lane index 3. Per the autogenerated
// assertions above, the expected IR extracts elements 2..3 of a, splats
// lane 3 of the 4-element v, and returns the result of
// @llvm.aarch64.neon.sqdmull.v2i64 on those two.
1629 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1630 return vqdmull_high_laneq_s32(a, v, 3);
1633 // CHECK-LABEL: @test_vqdmulh_lane_s16(
1634 // CHECK-NEXT: entry:
1635 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1636 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1637 // CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1638 // CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1639 // CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
1640 // CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
// Exercises vqdmulh_lane_s16 with lane index 3. Per the autogenerated
// assertions above, no splat is expected here: the lane is passed as the
// immediate operand (i32 3) of @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16.
1642 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1643 return vqdmulh_lane_s16(a, v, 3);
1646 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
1647 // CHECK-NEXT: entry:
1648 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
1649 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1650 // CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1651 // CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1652 // CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
1653 // CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
// Exercises vqdmulhq_lane_s16 with lane index 3. Per the autogenerated
// assertions above, no splat is expected here: the lane is passed as the
// immediate operand (i32 3) of @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16.
1655 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1656 return vqdmulhq_lane_s16(a, v, 3);
1659 // CHECK-LABEL: @test_vqdmulh_lane_s32(
1660 // CHECK-NEXT: entry:
1661 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1662 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1663 // CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1664 // CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1665 // CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
1666 // CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
// Exercises vqdmulh_lane_s32 with lane index 1. Per the autogenerated
// assertions above, no splat is expected here: the lane is passed as the
// immediate operand (i32 1) of @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32.
1668 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1669 return vqdmulh_lane_s32(a, v, 1);
1672 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
1673 // CHECK-NEXT: entry:
1674 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1675 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1676 // CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1677 // CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1678 // CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
1679 // CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
// Exercises vqdmulhq_lane_s32 with lane index 1. Per the autogenerated
// assertions above, no splat is expected here: the lane is passed as the
// immediate operand (i32 1) of @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32.
1681 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1682 return vqdmulhq_lane_s32(a, v, 1);
1685 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
1686 // CHECK-NEXT: entry:
1687 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
1688 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1689 // CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1690 // CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1691 // CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
1692 // CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
// Exercises vqrdmulh_lane_s16 with lane index 3. Per the autogenerated
// assertions above, no splat is expected here: the lane is passed as the
// immediate operand (i32 3) of @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16.
1694 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1695 return vqrdmulh_lane_s16(a, v, 3);
1698 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
1699 // CHECK-NEXT: entry:
1700 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
1701 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1702 // CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1703 // CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1704 // CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
1705 // CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
// Exercises vqrdmulhq_lane_s16 with lane index 3. Per the autogenerated
// assertions above, no splat is expected here: the lane is passed as the
// immediate operand (i32 3) of @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16.
1707 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1708 return vqrdmulhq_lane_s16(a, v, 3);
1711 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
1712 // CHECK-NEXT: entry:
1713 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
1714 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1715 // CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1716 // CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1717 // CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
1718 // CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
1720 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1721 return vqrdmulh_lane_s32(a, v, 1);
1724 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
1725 // CHECK-NEXT: entry:
1726 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
1727 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1728 // CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1729 // CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1730 // CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
1731 // CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
1733 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1734 return vqrdmulhq_lane_s32(a, v, 1);
1737 // CHECK-LABEL: @test_vmul_lane_f32(
1738 // CHECK-NEXT: entry:
1739 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1740 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1741 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
1742 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
1743 // CHECK-NEXT: ret <2 x float> [[MUL]]
1745 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
1746 return vmul_lane_f32(a, v, 1);
1750 // CHECK-LABEL: @test_vmul_lane_f64(
1751 // CHECK-NEXT: entry:
1752 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
1753 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1754 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1755 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1756 // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1757 // CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1758 // CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1759 // CHECK-NEXT: ret <1 x double> [[TMP5]]
1761 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
1762 return vmul_lane_f64(a, v, 0);
1765 // CHECK-LABEL: @test_vmulq_lane_f32(
1766 // CHECK-NEXT: entry:
1767 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1768 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1769 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1770 // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
1771 // CHECK-NEXT: ret <4 x float> [[MUL]]
1773 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
1774 return vmulq_lane_f32(a, v, 1);
1777 // CHECK-LABEL: @test_vmulq_lane_f64(
1778 // CHECK-NEXT: entry:
1779 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1780 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
1781 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
1782 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
1783 // CHECK-NEXT: ret <2 x double> [[MUL]]
1785 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
1786 return vmulq_lane_f64(a, v, 0);
1789 // CHECK-LABEL: @test_vmul_laneq_f32(
1790 // CHECK-NEXT: entry:
1791 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1792 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1793 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
1794 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
1795 // CHECK-NEXT: ret <2 x float> [[MUL]]
1797 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
1798 return vmul_laneq_f32(a, v, 3);
1801 // CHECK-LABEL: @test_vmul_laneq_f64(
1802 // CHECK-NEXT: entry:
1803 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
1804 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1805 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1806 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1807 // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1808 // CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1809 // CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1810 // CHECK-NEXT: ret <1 x double> [[TMP5]]
1812 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
1813 return vmul_laneq_f64(a, v, 1);
1816 // CHECK-LABEL: @test_vmulq_laneq_f32(
1817 // CHECK-NEXT: entry:
1818 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1819 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1820 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1821 // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
1822 // CHECK-NEXT: ret <4 x float> [[MUL]]
1824 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
1825 return vmulq_laneq_f32(a, v, 3);
1828 // CHECK-LABEL: @test_vmulq_laneq_f64(
1829 // CHECK-NEXT: entry:
1830 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1831 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1832 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
1833 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
1834 // CHECK-NEXT: ret <2 x double> [[MUL]]
1836 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
1837 return vmulq_laneq_f64(a, v, 1);
1840 // CHECK-LABEL: @test_vmulx_lane_f32(
1841 // CHECK-NEXT: entry:
1842 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1843 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1844 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
1845 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1846 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
1847 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
1848 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
1850 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
1851 return vmulx_lane_f32(a, v, 1);
1854 // CHECK-LABEL: @test_vmulxq_lane_f32(
1855 // CHECK-NEXT: entry:
1856 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
1857 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1858 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1859 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1860 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
1861 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
1862 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
1864 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
1865 return vmulxq_lane_f32(a, v, 1);
1868 // CHECK-LABEL: @test_vmulxq_lane_f64(
1869 // CHECK-NEXT: entry:
1870 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
1871 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
1872 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
1873 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
1874 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
1875 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
1876 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
1878 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
1879 return vmulxq_lane_f64(a, v, 0);
1882 // CHECK-LABEL: @test_vmulx_laneq_f32(
1883 // CHECK-NEXT: entry:
1884 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1885 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1886 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
1887 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1888 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
1889 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
1890 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
1892 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
1893 return vmulx_laneq_f32(a, v, 3);
1896 // CHECK-LABEL: @test_vmulxq_laneq_f32(
1897 // CHECK-NEXT: entry:
1898 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
1899 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1900 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1901 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1902 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
1903 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
1904 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
1906 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
1907 return vmulxq_laneq_f32(a, v, 3);
1910 // CHECK-LABEL: @test_vmulxq_laneq_f64(
1911 // CHECK-NEXT: entry:
1912 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
1913 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1914 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
1915 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
1916 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
1917 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
1918 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
1920 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
1921 return vmulxq_laneq_f64(a, v, 1);
1924 // CHECK-LABEL: @test_vmla_lane_s16_0(
1925 // CHECK-NEXT: entry:
1926 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1927 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1928 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
1929 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
1930 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
1931 // CHECK-NEXT: ret <4 x i16> [[ADD]]
1933 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1934 return vmla_lane_s16(a, b, v, 0);
1937 // CHECK-LABEL: @test_vmlaq_lane_s16_0(
1938 // CHECK-NEXT: entry:
1939 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
1940 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1941 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
1942 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
1943 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
1944 // CHECK-NEXT: ret <8 x i16> [[ADD]]
1946 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1947 return vmlaq_lane_s16(a, b, v, 0);
1950 // CHECK-LABEL: @test_vmla_lane_s32_0(
1951 // CHECK-NEXT: entry:
1952 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1953 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1954 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
1955 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
1956 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
1957 // CHECK-NEXT: ret <2 x i32> [[ADD]]
1959 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1960 return vmla_lane_s32(a, b, v, 0);
1963 // CHECK-LABEL: @test_vmlaq_lane_s32_0(
1964 // CHECK-NEXT: entry:
1965 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
1966 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1967 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
1968 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
1969 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
1970 // CHECK-NEXT: ret <4 x i32> [[ADD]]
1972 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1973 return vmlaq_lane_s32(a, b, v, 0);
1976 // CHECK-LABEL: @test_vmla_laneq_s16_0(
1977 // CHECK-NEXT: entry:
1978 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1979 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1980 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
1981 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
1982 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
1983 // CHECK-NEXT: ret <4 x i16> [[ADD]]
1985 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1986 return vmla_laneq_s16(a, b, v, 0);
1989 // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
1990 // CHECK-NEXT: entry:
1991 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
1992 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1993 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
1994 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
1995 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
1996 // CHECK-NEXT: ret <8 x i16> [[ADD]]
1998 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1999 return vmlaq_laneq_s16(a, b, v, 0);
2002 // CHECK-LABEL: @test_vmla_laneq_s32_0(
2003 // CHECK-NEXT: entry:
2004 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2005 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2006 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2007 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2008 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
2009 // CHECK-NEXT: ret <2 x i32> [[ADD]]
2011 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
2012 return vmla_laneq_s32(a, b, v, 0);
2015 // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
2016 // CHECK-NEXT: entry:
2017 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2018 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2019 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2020 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2021 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
2022 // CHECK-NEXT: ret <4 x i32> [[ADD]]
2024 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
2025 return vmlaq_laneq_s32(a, b, v, 0);
2028 // CHECK-LABEL: @test_vmls_lane_s16_0(
2029 // CHECK-NEXT: entry:
2030 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2031 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2032 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2033 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
2034 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
2035 // CHECK-NEXT: ret <4 x i16> [[SUB]]
2037 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
2038 return vmls_lane_s16(a, b, v, 0);
2041 // CHECK-LABEL: @test_vmlsq_lane_s16_0(
2042 // CHECK-NEXT: entry:
2043 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2044 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2045 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2046 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
2047 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
2048 // CHECK-NEXT: ret <8 x i16> [[SUB]]
2050 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
2051 return vmlsq_lane_s16(a, b, v, 0);
2054 // CHECK-LABEL: @test_vmls_lane_s32_0(
2055 // CHECK-NEXT: entry:
2056 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2057 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2058 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2059 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2060 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
2061 // CHECK-NEXT: ret <2 x i32> [[SUB]]
2063 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
2064 return vmls_lane_s32(a, b, v, 0);
2067 // CHECK-LABEL: @test_vmlsq_lane_s32_0(
2068 // CHECK-NEXT: entry:
2069 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2070 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2071 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2072 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2073 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
2074 // CHECK-NEXT: ret <4 x i32> [[SUB]]
2076 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
2077 return vmlsq_lane_s32(a, b, v, 0);
2080 // CHECK-LABEL: @test_vmls_laneq_s16_0(
2081 // CHECK-NEXT: entry:
2082 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2083 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2084 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2085 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
2086 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
2087 // CHECK-NEXT: ret <4 x i16> [[SUB]]
2089 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
2090 return vmls_laneq_s16(a, b, v, 0);
2093 // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
2094 // CHECK-NEXT: entry:
2095 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2096 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2097 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2098 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
2099 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
2100 // CHECK-NEXT: ret <8 x i16> [[SUB]]
2102 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
2103 return vmlsq_laneq_s16(a, b, v, 0);
2106 // CHECK-LABEL: @test_vmls_laneq_s32_0(
2107 // CHECK-NEXT: entry:
2108 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2109 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2110 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2111 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
2112 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
2113 // CHECK-NEXT: ret <2 x i32> [[SUB]]
2115 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
2116 return vmls_laneq_s32(a, b, v, 0);
2119 // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
2120 // CHECK-NEXT: entry:
2121 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2122 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2123 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2124 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
2125 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
2126 // CHECK-NEXT: ret <4 x i32> [[SUB]]
2128 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
2129 return vmlsq_laneq_s32(a, b, v, 0);
2132 // CHECK-LABEL: @test_vmul_lane_s16_0(
2133 // CHECK-NEXT: entry:
2134 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2135 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2136 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2137 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2138 // CHECK-NEXT: ret <4 x i16> [[MUL]]
2140 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
2141 return vmul_lane_s16(a, v, 0);
2144 // CHECK-LABEL: @test_vmulq_lane_s16_0(
2145 // CHECK-NEXT: entry:
2146 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2147 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2148 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2149 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2150 // CHECK-NEXT: ret <8 x i16> [[MUL]]
2152 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
2153 return vmulq_lane_s16(a, v, 0);
2156 // CHECK-LABEL: @test_vmul_lane_s32_0(
2157 // CHECK-NEXT: entry:
2158 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2159 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2160 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2161 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2162 // CHECK-NEXT: ret <2 x i32> [[MUL]]
2164 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
2165 return vmul_lane_s32(a, v, 0);
2168 // CHECK-LABEL: @test_vmulq_lane_s32_0(
2169 // CHECK-NEXT: entry:
2170 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2171 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2172 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2173 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2174 // CHECK-NEXT: ret <4 x i32> [[MUL]]
2176 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
2177 return vmulq_lane_s32(a, v, 0);
2180 // CHECK-LABEL: @test_vmul_lane_u16_0(
2181 // CHECK-NEXT: entry:
2182 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2183 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2184 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2185 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2186 // CHECK-NEXT: ret <4 x i16> [[MUL]]
2188 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2189 return vmul_lane_u16(a, v, 0);
2192 // CHECK-LABEL: @test_vmulq_lane_u16_0(
2193 // CHECK-NEXT: entry:
2194 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2195 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2196 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
2197 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2198 // CHECK-NEXT: ret <8 x i16> [[MUL]]
2200 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2201 return vmulq_lane_u16(a, v, 0);
2204 // CHECK-LABEL: @test_vmul_lane_u32_0(
2205 // CHECK-NEXT: entry:
2206 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2207 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2208 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2209 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2210 // CHECK-NEXT: ret <2 x i32> [[MUL]]
2212 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2213 return vmul_lane_u32(a, v, 0);
2216 // CHECK-LABEL: @test_vmulq_lane_u32_0(
2217 // CHECK-NEXT: entry:
2218 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2219 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2220 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
2221 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2222 // CHECK-NEXT: ret <4 x i32> [[MUL]]
2224 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2225 return vmulq_lane_u32(a, v, 0);
2228 // CHECK-LABEL: @test_vmul_laneq_s16_0(
2229 // CHECK-NEXT: entry:
2230 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2231 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2232 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2233 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2234 // CHECK-NEXT: ret <4 x i16> [[MUL]]
2236 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
2237 return vmul_laneq_s16(a, v, 0);
2240 // CHECK-LABEL: @test_vmulq_laneq_s16_0(
2241 // CHECK-NEXT: entry:
2242 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2243 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2244 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2245 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2246 // CHECK-NEXT: ret <8 x i16> [[MUL]]
2248 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
2249 return vmulq_laneq_s16(a, v, 0);
2252 // CHECK-LABEL: @test_vmul_laneq_s32_0(
2253 // CHECK-NEXT: entry:
2254 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2255 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2256 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2257 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2258 // CHECK-NEXT: ret <2 x i32> [[MUL]]
2260 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
2261 return vmul_laneq_s32(a, v, 0);
2264 // CHECK-LABEL: @test_vmulq_laneq_s32_0(
2265 // CHECK-NEXT: entry:
2266 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2267 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2268 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2269 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2270 // CHECK-NEXT: ret <4 x i32> [[MUL]]
2272 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
2273 return vmulq_laneq_s32(a, v, 0);
2276 // CHECK-LABEL: @test_vmul_laneq_u16_0(
2277 // CHECK-NEXT: entry:
2278 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2279 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2280 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2281 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
2282 // CHECK-NEXT: ret <4 x i16> [[MUL]]
2284 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2285 return vmul_laneq_u16(a, v, 0);
2288 // CHECK-LABEL: @test_vmulq_laneq_u16_0(
2289 // CHECK-NEXT: entry:
2290 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2291 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2292 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
2293 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
2294 // CHECK-NEXT: ret <8 x i16> [[MUL]]
2296 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2297 return vmulq_laneq_u16(a, v, 0);
2300 // CHECK-LABEL: @test_vmul_laneq_u32_0(
2301 // CHECK-NEXT: entry:
2302 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2303 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2304 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2305 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
2306 // CHECK-NEXT: ret <2 x i32> [[MUL]]
2308 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2309 return vmul_laneq_u32(a, v, 0);
2312 // CHECK-LABEL: @test_vmulq_laneq_u32_0(
2313 // CHECK-NEXT: entry:
2314 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2315 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2316 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
2317 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
2318 // CHECK-NEXT: ret <4 x i32> [[MUL]]
2320 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2321 return vmulq_laneq_u32(a, v, 0);
2324 // CHECK-LABEL: @test_vfma_lane_f32_0(
2325 // CHECK-NEXT: entry:
2326 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2327 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
2328 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2329 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2330 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
2331 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2332 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2333 // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
2334 // CHECK-NEXT: ret <2 x float> [[FMLA2]]
2336 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
2337 return vfma_lane_f32(a, b, v, 0);
2340 // CHECK-LABEL: @test_vfmaq_lane_f32_0(
2341 // CHECK-NEXT: entry:
2342 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2343 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
2344 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2345 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2346 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
2347 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2348 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2349 // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
2350 // CHECK-NEXT: ret <4 x float> [[FMLA2]]
2352 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
2353 return vfmaq_lane_f32(a, b, v, 0);
2356 // CHECK-LABEL: @test_vfma_laneq_f32_0(
2357 // CHECK-NEXT: entry:
2358 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2359 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
2360 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2361 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2362 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2363 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2364 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
2365 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
2366 // CHECK-NEXT: ret <2 x float> [[TMP6]]
// Forwards a, b, v and the constant 0 to vfma_laneq_f32 and returns the result.
// Expected IR (assertions above): the 4-element v is reduced to a 2-element
// splat through an all-zero shuffle mask and llvm.fma.v2f32 receives
// (splat, b, a) -- note the splat comes first here, unlike the _lane_ variant.
2368 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
2369 return vfma_laneq_f32(a, b, v, 0);
2372 // CHECK-LABEL: @test_vfmaq_laneq_f32_0(
2373 // CHECK-NEXT: entry:
2374 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2375 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
2376 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2377 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2378 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2379 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2380 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
2381 // CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2382 // CHECK-NEXT: ret <4 x float> [[TMP6]]
// Forwards a, b, v and the constant 0 to vfmaq_laneq_f32 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask and llvm.fma.v4f32 receives the operands (splat, b, a).
2384 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
2385 return vfmaq_laneq_f32(a, b, v, 0);
2388 // CHECK-LABEL: @test_vfms_lane_f32_0(
2389 // CHECK-NEXT: entry:
2390 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
2391 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2392 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
2393 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2394 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2395 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
2396 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2397 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2398 // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
2399 // CHECK-NEXT: ret <2 x float> [[FMLA2]]
// Forwards a, b, v and the constant 0 to vfms_lane_f32 and returns the result.
// Expected IR (assertions above): b is negated with fneg first, v is splatted
// through an all-zero shuffle mask, and llvm.fma.v2f32 receives
// (negated b, splat, a).
2401 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
2402 return vfms_lane_f32(a, b, v, 0);
2405 // CHECK-LABEL: @test_vfmsq_lane_f32_0(
2406 // CHECK-NEXT: entry:
2407 // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
2408 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2409 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
2410 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
2411 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
2412 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
2413 // CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2414 // CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2415 // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
2416 // CHECK-NEXT: ret <4 x float> [[FMLA2]]
// Forwards a, b, v and the constant 0 to vfmsq_lane_f32 and returns the result.
// Expected IR (assertions above): b is negated with fneg first, the 2-element
// v is splatted to 4 elements through an all-zero shuffle mask, and
// llvm.fma.v4f32 receives (negated b, splat, a).
2418 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
2419 return vfmsq_lane_f32(a, b, v, 0);
2422 // CHECK-LABEL: @test_vfms_laneq_f32_0(
2423 // CHECK-NEXT: entry:
2424 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
2425 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
2426 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
2427 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2428 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
2429 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
2430 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2431 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
2432 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
2433 // CHECK-NEXT: ret <2 x float> [[TMP6]]
// Forwards a, b, v and the constant 0 to vfms_laneq_f32 and returns the result.
// Expected IR (assertions above): b is negated with fneg first, the 4-element
// v is reduced to a 2-element splat through an all-zero shuffle mask, and
// llvm.fma.v2f32 receives (splat, negated b, a).
2435 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
2436 return vfms_laneq_f32(a, b, v, 0);
2439 // CHECK-LABEL: @test_vfmsq_laneq_f32_0(
2440 // CHECK-NEXT: entry:
2441 // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
2442 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
2443 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
2444 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
2445 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2446 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2447 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2448 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
2449 // CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2450 // CHECK-NEXT: ret <4 x float> [[TMP6]]
// Forwards a, b, v and the constant 0 to vfmsq_laneq_f32 and returns the result.
// Expected IR (assertions above): b is negated with fneg first, v is splatted
// through an all-zero shuffle mask, and llvm.fma.v4f32 receives
// (splat, negated b, a).
2452 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
2453 return vfmsq_laneq_f32(a, b, v, 0);
2456 // CHECK-LABEL: @test_vfmaq_laneq_f64_0(
2457 // CHECK-NEXT: entry:
2458 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
2459 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
2460 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
2461 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2462 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2463 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2464 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2465 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2466 // CHECK-NEXT: ret <2 x double> [[TMP6]]
// Forwards a, b, v and the constant 0 to vfmaq_laneq_f64 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask and llvm.fma.v2f64 receives the operands (splat, b, a).
2468 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
2469 return vfmaq_laneq_f64(a, b, v, 0);
2472 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
2473 // CHECK-NEXT: entry:
2474 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
2475 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
2476 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
2477 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
2478 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2479 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2480 // CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2481 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2482 // CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2483 // CHECK-NEXT: ret <2 x double> [[TMP6]]
// Forwards a, b, v and the constant 0 to vfmsq_laneq_f64 and returns the result.
// Expected IR (assertions above): b is negated with fneg first, v is splatted
// through an all-zero shuffle mask, and llvm.fma.v2f64 receives
// (splat, negated b, a).
2485 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
2486 return vfmsq_laneq_f64(a, b, v, 0);
2489 // CHECK-LABEL: @test_vmlal_lane_s16_0(
2490 // CHECK-NEXT: entry:
2491 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2492 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2493 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2494 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2495 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2496 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2497 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2498 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_lane_s16 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v4i32 is applied to (b, splat), and the
// product is added to a.
2500 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2501 return vmlal_lane_s16(a, b, v, 0);
2504 // CHECK-LABEL: @test_vmlal_lane_s32_0(
2505 // CHECK-NEXT: entry:
2506 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2507 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2508 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2509 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2510 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2511 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2512 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2513 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_lane_s32 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v2i64 is applied to (b, splat), and the
// product is added to a.
2515 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2516 return vmlal_lane_s32(a, b, v, 0);
2519 // CHECK-LABEL: @test_vmlal_laneq_s16_0(
2520 // CHECK-NEXT: entry:
2521 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2522 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2523 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2524 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2525 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2526 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2527 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2528 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_laneq_s16 and returns the result.
// Expected IR (assertions above): the 8-element v is reduced to a 4-element
// splat through an all-zero shuffle mask, llvm.aarch64.neon.smull.v4i32 is
// applied to (b, splat), and the product is added to a.
2530 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2531 return vmlal_laneq_s16(a, b, v, 0);
2534 // CHECK-LABEL: @test_vmlal_laneq_s32_0(
2535 // CHECK-NEXT: entry:
2536 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2537 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2538 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2539 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2540 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2541 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2542 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2543 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_laneq_s32 and returns the result.
// Expected IR (assertions above): the 4-element v is reduced to a 2-element
// splat through an all-zero shuffle mask, llvm.aarch64.neon.smull.v2i64 is
// applied to (b, splat), and the product is added to a.
2545 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2546 return vmlal_laneq_s32(a, b, v, 0);
2549 // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
2550 // CHECK-NEXT: entry:
2551 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2552 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2553 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2554 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2555 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2556 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2557 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2558 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2559 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_lane_s16 and returns the
// result. Expected IR (assertions above): elements 4..7 of b are extracted,
// v is splatted through an all-zero shuffle mask,
// llvm.aarch64.neon.smull.v4i32 is applied to (extracted half, splat), and
// the product is added to a.
2561 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2562 return vmlal_high_lane_s16(a, b, v, 0);
2565 // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
2566 // CHECK-NEXT: entry:
2567 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2568 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2569 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2570 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2571 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2572 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2573 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2574 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2575 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_lane_s32 and returns the
// result. Expected IR (assertions above): elements 2..3 of b are extracted,
// v is splatted through an all-zero shuffle mask,
// llvm.aarch64.neon.smull.v2i64 is applied to (extracted half, splat), and
// the product is added to a.
2577 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2578 return vmlal_high_lane_s32(a, b, v, 0);
2581 // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
2582 // CHECK-NEXT: entry:
2583 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2584 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2585 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2586 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2587 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2588 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2589 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2590 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2591 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_laneq_s16 and returns the
// result. Expected IR (assertions above): elements 4..7 of b are extracted,
// the 8-element v is reduced to a 4-element splat through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v4i32 is applied to (extracted half, splat),
// and the product is added to a.
2593 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2594 return vmlal_high_laneq_s16(a, b, v, 0);
2597 // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
2598 // CHECK-NEXT: entry:
2599 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2600 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2601 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2602 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2603 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2604 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2605 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2606 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2607 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_laneq_s32 and returns the
// result. Expected IR (assertions above): elements 2..3 of b are extracted,
// the 4-element v is reduced to a 2-element splat through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v2i64 is applied to (extracted half, splat),
// and the product is added to a.
2609 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2610 return vmlal_high_laneq_s32(a, b, v, 0);
2613 // CHECK-LABEL: @test_vmlsl_lane_s16_0(
2614 // CHECK-NEXT: entry:
2615 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2616 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2617 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2618 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2619 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2620 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2621 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2622 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_lane_s16 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v4i32 is applied to (b, splat), and the
// product is subtracted from a.
2624 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2625 return vmlsl_lane_s16(a, b, v, 0);
2628 // CHECK-LABEL: @test_vmlsl_lane_s32_0(
2629 // CHECK-NEXT: entry:
2630 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2631 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2632 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2633 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2634 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2635 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2636 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2637 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_lane_s32 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v2i64 is applied to (b, splat), and the
// product is subtracted from a.
2639 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2640 return vmlsl_lane_s32(a, b, v, 0);
2643 // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
2644 // CHECK-NEXT: entry:
2645 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2646 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2647 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2648 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2649 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2650 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2651 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2652 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_laneq_s16 and returns the result.
// Expected IR (assertions above): the 8-element v is reduced to a 4-element
// splat through an all-zero shuffle mask, llvm.aarch64.neon.smull.v4i32 is
// applied to (b, splat), and the product is subtracted from a.
2654 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2655 return vmlsl_laneq_s16(a, b, v, 0);
2658 // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
2659 // CHECK-NEXT: entry:
2660 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2661 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2662 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2663 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2664 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2665 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2666 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2667 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_laneq_s32 and returns the result.
// Expected IR (assertions above): the 4-element v is reduced to a 2-element
// splat through an all-zero shuffle mask, llvm.aarch64.neon.smull.v2i64 is
// applied to (b, splat), and the product is subtracted from a.
2669 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2670 return vmlsl_laneq_s32(a, b, v, 0);
2673 // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
2674 // CHECK-NEXT: entry:
2675 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2676 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2677 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2678 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2679 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2680 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2681 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2682 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2683 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_high_lane_s16 and returns the
// result. Expected IR (assertions above): elements 4..7 of b are extracted,
// v is splatted through an all-zero shuffle mask,
// llvm.aarch64.neon.smull.v4i32 is applied to (extracted half, splat), and
// the product is subtracted from a.
2685 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2686 return vmlsl_high_lane_s16(a, b, v, 0);
2689 // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
2690 // CHECK-NEXT: entry:
2691 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2692 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2693 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2694 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2695 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2696 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2697 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2698 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2699 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_high_lane_s32 and returns the
// result. Expected IR (assertions above): elements 2..3 of b are extracted,
// v is splatted through an all-zero shuffle mask,
// llvm.aarch64.neon.smull.v2i64 is applied to (extracted half, splat), and
// the product is subtracted from a.
2701 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2702 return vmlsl_high_lane_s32(a, b, v, 0);
2705 // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
2706 // CHECK-NEXT: entry:
2707 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2708 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2709 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2710 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2711 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2712 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2713 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2714 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2715 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_high_laneq_s16 and returns the
// result. Expected IR (assertions above): elements 4..7 of b are extracted,
// the 8-element v is reduced to a 4-element splat through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v4i32 is applied to (extracted half, splat),
// and the product is subtracted from a.
2717 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2718 return vmlsl_high_laneq_s16(a, b, v, 0);
2721 // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
2722 // CHECK-NEXT: entry:
2723 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2724 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2725 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2726 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2727 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2728 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2729 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2730 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2731 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_high_laneq_s32 and returns the
// result. Expected IR (assertions above): elements 2..3 of b are extracted,
// the 4-element v is reduced to a 2-element splat through an all-zero shuffle
// mask, llvm.aarch64.neon.smull.v2i64 is applied to (extracted half, splat),
// and the product is subtracted from a.
2733 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2734 return vmlsl_high_laneq_s32(a, b, v, 0);
2737 // CHECK-LABEL: @test_vmlal_lane_u16_0(
2738 // CHECK-NEXT: entry:
2739 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2740 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2741 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2742 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2743 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2744 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2745 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2746 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_lane_u16 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.umull.v4i32 is applied to (b, splat), and the
// product is added to a.
// NOTE(review): the signature uses signed vector types (int32x4_t, int16x4_t)
// with the unsigned _u16 intrinsic -- presumably deliberate and relying on
// implicit vector conversion; confirm.
2748 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2749 return vmlal_lane_u16(a, b, v, 0);
2752 // CHECK-LABEL: @test_vmlal_lane_u32_0(
2753 // CHECK-NEXT: entry:
2754 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2755 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2756 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2757 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2758 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2759 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2760 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2761 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_lane_u32 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.umull.v2i64 is applied to (b, splat), and the
// product is added to a.
// NOTE(review): the signature uses signed vector types (int64x2_t, int32x2_t)
// with the unsigned _u32 intrinsic -- presumably deliberate and relying on
// implicit vector conversion; confirm.
2763 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2764 return vmlal_lane_u32(a, b, v, 0);
2767 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
2768 // CHECK-NEXT: entry:
2769 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2770 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2771 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2772 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2773 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2774 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2775 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2776 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_laneq_u16 and returns the result.
// Expected IR (assertions above): the 8-element v is reduced to a 4-element
// splat through an all-zero shuffle mask, llvm.aarch64.neon.umull.v4i32 is
// applied to (b, splat), and the product is added to a.
// NOTE(review): the signature uses signed vector types (int32x4_t, int16x4_t,
// int16x8_t) with the unsigned _u16 intrinsic -- presumably deliberate and
// relying on implicit vector conversion; confirm.
2778 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2779 return vmlal_laneq_u16(a, b, v, 0);
2782 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
2783 // CHECK-NEXT: entry:
2784 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2785 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2786 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2787 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2788 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2789 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2790 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2791 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_laneq_u32 and returns the result.
// Expected IR (assertions above): the 4-element v is reduced to a 2-element
// splat through an all-zero shuffle mask, llvm.aarch64.neon.umull.v2i64 is
// applied to (b, splat), and the product is added to a.
// NOTE(review): the signature uses signed vector types (int64x2_t, int32x2_t,
// int32x4_t) with the unsigned _u32 intrinsic -- presumably deliberate and
// relying on implicit vector conversion; confirm.
2793 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2794 return vmlal_laneq_u32(a, b, v, 0);
2797 // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
2798 // CHECK-NEXT: entry:
2799 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2800 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2801 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2802 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2803 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2804 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2805 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2806 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2807 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_lane_u16 and returns the
// result. Expected IR (assertions above): elements 4..7 of b are extracted,
// v is splatted through an all-zero shuffle mask,
// llvm.aarch64.neon.umull.v4i32 is applied to (extracted half, splat), and
// the product is added to a.
// NOTE(review): the signature uses signed vector types (int32x4_t, int16x8_t,
// int16x4_t) with the unsigned _u16 intrinsic -- presumably deliberate and
// relying on implicit vector conversion; confirm.
2809 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2810 return vmlal_high_lane_u16(a, b, v, 0);
2813 // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
2814 // CHECK-NEXT: entry:
2815 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2816 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2817 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2818 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2819 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2820 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2821 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2822 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2823 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_lane_u32 and returns the
// result. Expected IR (assertions above): elements 2..3 of b are extracted,
// v is splatted through an all-zero shuffle mask,
// llvm.aarch64.neon.umull.v2i64 is applied to (extracted half, splat), and
// the product is added to a.
// NOTE(review): the signature uses signed vector types (int64x2_t, int32x4_t,
// int32x2_t) with the unsigned _u32 intrinsic -- presumably deliberate and
// relying on implicit vector conversion; confirm.
2825 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2826 return vmlal_high_lane_u32(a, b, v, 0);
2829 // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
2830 // CHECK-NEXT: entry:
2831 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2832 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2833 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2834 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2835 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2836 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2837 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2838 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
2839 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_laneq_u16 and returns the
// result. Expected IR (assertions above): elements 4..7 of b are extracted,
// the 8-element v is reduced to a 4-element splat through an all-zero shuffle
// mask, llvm.aarch64.neon.umull.v4i32 is applied to (extracted half, splat),
// and the product is added to a.
// NOTE(review): the signature uses signed vector types (int32x4_t, int16x8_t)
// with the unsigned _u16 intrinsic -- presumably deliberate and relying on
// implicit vector conversion; confirm.
2841 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2842 return vmlal_high_laneq_u16(a, b, v, 0);
2845 // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
2846 // CHECK-NEXT: entry:
2847 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2848 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2849 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2850 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2851 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2852 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2853 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2854 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
2855 // CHECK-NEXT: ret <2 x i64> [[ADD]]
// Forwards a, b, v and the constant 0 to vmlal_high_laneq_u32 and returns the
// result. Expected IR (assertions above): elements 2..3 of b are extracted,
// the 4-element v is reduced to a 2-element splat through an all-zero shuffle
// mask, llvm.aarch64.neon.umull.v2i64 is applied to (extracted half, splat),
// and the product is added to a.
// NOTE(review): the signature uses signed vector types (int64x2_t, int32x4_t)
// with the unsigned _u32 intrinsic -- presumably deliberate and relying on
// implicit vector conversion; confirm.
2857 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2858 return vmlal_high_laneq_u32(a, b, v, 0);
2861 // CHECK-LABEL: @test_vmlsl_lane_u16_0(
2862 // CHECK-NEXT: entry:
2863 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2864 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2865 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2866 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2867 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2868 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2869 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2870 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_lane_u16 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.umull.v4i32 is applied to (b, splat), and the
// product is subtracted from a.
// NOTE(review): the signature uses signed vector types (int32x4_t, int16x4_t)
// with the unsigned _u16 intrinsic -- presumably deliberate and relying on
// implicit vector conversion; confirm.
2872 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2873 return vmlsl_lane_u16(a, b, v, 0);
2876 // CHECK-LABEL: @test_vmlsl_lane_u32_0(
2877 // CHECK-NEXT: entry:
2878 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2879 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2880 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2881 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2882 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2883 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2884 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2885 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Forwards a, b, v and the constant 0 to vmlsl_lane_u32 and returns the result.
// Expected IR (assertions above): v is splatted through an all-zero shuffle
// mask, llvm.aarch64.neon.umull.v2i64 is applied to (b, splat), and the
// product is subtracted from a.
// NOTE(review): the signature uses signed vector types (int64x2_t, int32x2_t)
// with the unsigned _u32 intrinsic -- presumably deliberate and relying on
// implicit vector conversion; confirm.
2887 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2888 return vmlsl_lane_u32(a, b, v, 0);
2891 // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
2892 // CHECK-NEXT: entry:
2893 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2894 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2895 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2896 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
2897 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2898 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
2899 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2900 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Covers vmlsl_laneq_u16 at lane index 0. Here v is an <8 x i16> vector; the
// expected IR broadcasts its lane 0 into a <4 x i16> value, multiplies that with
// b through llvm.aarch64.neon.umull.v4i32, and subtracts the product from a.
// NOTE(review): the signature uses signed vector types with the _u16 intrinsic;
// presumably this leans on lax vector conversions — confirm before changing.
2902 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2903 return vmlsl_laneq_u16(a, b, v, 0);
2906 // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
2907 // CHECK-NEXT: entry:
2908 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2909 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2910 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2911 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
2912 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2913 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
2914 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2915 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Covers vmlsl_laneq_u32 at lane index 0. Here v is a <4 x i32> vector; the
// expected IR broadcasts its lane 0 into a <2 x i32> value, multiplies that with
// b through llvm.aarch64.neon.umull.v2i64, and subtracts the product from a.
// NOTE(review): the signature uses signed vector types with the _u32 intrinsic;
// presumably this leans on lax vector conversions — confirm before changing.
2917 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2918 return vmlsl_laneq_u32(a, b, v, 0);
2921 // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
2922 // CHECK-NEXT: entry:
2923 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2924 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2925 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2926 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2927 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2928 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2929 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2930 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2931 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Covers vmlsl_high_lane_u16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of b are extracted, lane 0 of v is broadcast, the two are
// multiplied through llvm.aarch64.neon.umull.v4i32, and the product is
// subtracted from a.
// NOTE(review): the signature uses signed vector types with the _u16 intrinsic;
// presumably this leans on lax vector conversions — confirm before changing.
2933 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2934 return vmlsl_high_lane_u16(a, b, v, 0);
2937 // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
2938 // CHECK-NEXT: entry:
2939 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2940 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
2941 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2942 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
2943 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2944 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2945 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2946 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2947 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Covers vmlsl_high_lane_u32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of b are extracted, lane 0 of v is broadcast, the two are
// multiplied through llvm.aarch64.neon.umull.v2i64, and the product is
// subtracted from a.
// NOTE(review): the signature uses signed vector types with the _u32 intrinsic;
// presumably this leans on lax vector conversions — confirm before changing.
2949 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2950 return vmlsl_high_lane_u32(a, b, v, 0);
2953 // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
2954 // CHECK-NEXT: entry:
2955 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2956 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
2957 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2958 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
2959 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2960 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2961 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
2962 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
2963 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Covers vmlsl_high_laneq_u16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of b are extracted, lane 0 of the <8 x i16> vector v is broadcast
// into a <4 x i16> value, the two are multiplied through
// llvm.aarch64.neon.umull.v4i32, and the product is subtracted from a.
// NOTE(review): the signature uses signed vector types with the _u16 intrinsic;
// presumably this leans on lax vector conversions — confirm before changing.
2965 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2966 return vmlsl_high_laneq_u16(a, b, v, 0);
2969 // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
2970 // CHECK-NEXT: entry:
2971 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
2972 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
2973 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2974 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
2975 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2976 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
2977 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
2978 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
2979 // CHECK-NEXT: ret <2 x i64> [[SUB]]
// Covers vmlsl_high_laneq_u32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of b are extracted, lane 0 of the <4 x i32> vector v is broadcast
// into a <2 x i32> value, the two are multiplied through
// llvm.aarch64.neon.umull.v2i64, and the product is subtracted from a.
// NOTE(review): the signature uses signed vector types with the _u32 intrinsic;
// presumably this leans on lax vector conversions — confirm before changing.
2981 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2982 return vmlsl_high_laneq_u32(a, b, v, 0);
2985 // CHECK-LABEL: @test_vmull_lane_s16_0(
2986 // CHECK-NEXT: entry:
2987 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
2988 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2989 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
2990 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
2991 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
2992 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
2993 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_lane_s16 at lane index 0. Expected IR (assertions above): lane 0
// of v is broadcast by a shufflevector with an all-zero mask and multiplied with
// a through llvm.aarch64.neon.smull.v4i32; that result is returned directly.
2995 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2996 return vmull_lane_s16(a, v, 0);
2999 // CHECK-LABEL: @test_vmull_lane_s32_0(
3000 // CHECK-NEXT: entry:
3001 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3002 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3003 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3004 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3005 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3006 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3007 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_lane_s32 at lane index 0. Expected IR (assertions above): lane 0
// of v is broadcast by a shufflevector with an all-zero mask and multiplied with
// a through llvm.aarch64.neon.smull.v2i64; that result is returned directly.
3009 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
3010 return vmull_lane_s32(a, v, 0);
3013 // CHECK-LABEL: @test_vmull_lane_u16_0(
3014 // CHECK-NEXT: entry:
3015 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3016 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3017 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3018 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3019 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3020 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3021 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_lane_u16 at lane index 0. Expected IR (assertions above): lane 0
// of v is broadcast by a shufflevector with an all-zero mask and multiplied with
// a through llvm.aarch64.neon.umull.v4i32; that result is returned directly.
3023 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
3024 return vmull_lane_u16(a, v, 0);
3027 // CHECK-LABEL: @test_vmull_lane_u32_0(
3028 // CHECK-NEXT: entry:
3029 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3030 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3031 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3032 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3033 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3034 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3035 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_lane_u32 at lane index 0. Expected IR (assertions above): lane 0
// of v is broadcast by a shufflevector with an all-zero mask and multiplied with
// a through llvm.aarch64.neon.umull.v2i64; that result is returned directly.
3037 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
3038 return vmull_lane_u32(a, v, 0);
3041 // CHECK-LABEL: @test_vmull_high_lane_s16_0(
3042 // CHECK-NEXT: entry:
3043 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3044 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3045 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3046 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3047 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3048 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3049 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3050 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_high_lane_s16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of a are extracted, lane 0 of v is broadcast, and the two are
// multiplied through llvm.aarch64.neon.smull.v4i32; that result is returned.
3052 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
3053 return vmull_high_lane_s16(a, v, 0);
3056 // CHECK-LABEL: @test_vmull_high_lane_s32_0(
3057 // CHECK-NEXT: entry:
3058 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3059 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3060 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3061 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3062 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3063 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3064 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3065 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_high_lane_s32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of a are extracted, lane 0 of v is broadcast, and the two are
// multiplied through llvm.aarch64.neon.smull.v2i64; that result is returned.
3067 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
3068 return vmull_high_lane_s32(a, v, 0);
3071 // CHECK-LABEL: @test_vmull_high_lane_u16_0(
3072 // CHECK-NEXT: entry:
3073 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3074 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3075 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3076 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3077 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3078 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3079 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3080 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_high_lane_u16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of a are extracted, lane 0 of v is broadcast, and the two are
// multiplied through llvm.aarch64.neon.umull.v4i32; that result is returned.
3082 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
3083 return vmull_high_lane_u16(a, v, 0);
3086 // CHECK-LABEL: @test_vmull_high_lane_u32_0(
3087 // CHECK-NEXT: entry:
3088 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3089 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3090 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3091 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3092 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3093 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3094 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3095 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_high_lane_u32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of a are extracted, lane 0 of v is broadcast, and the two are
// multiplied through llvm.aarch64.neon.umull.v2i64; that result is returned.
3097 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
3098 return vmull_high_lane_u32(a, v, 0);
3101 // CHECK-LABEL: @test_vmull_laneq_s16_0(
3102 // CHECK-NEXT: entry:
3103 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3104 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3105 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3106 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3107 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3108 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3109 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_laneq_s16 at lane index 0. Here v is an <8 x i16> vector; the
// expected IR broadcasts its lane 0 into a <4 x i16> value and multiplies that
// with a through llvm.aarch64.neon.smull.v4i32; that result is returned.
3111 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
3112 return vmull_laneq_s16(a, v, 0);
3115 // CHECK-LABEL: @test_vmull_laneq_s32_0(
3116 // CHECK-NEXT: entry:
3117 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3118 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3119 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3120 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3121 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3122 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3123 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_laneq_s32 at lane index 0. Here v is a <4 x i32> vector; the
// expected IR broadcasts its lane 0 into a <2 x i32> value and multiplies that
// with a through llvm.aarch64.neon.smull.v2i64; that result is returned.
3125 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
3126 return vmull_laneq_s32(a, v, 0);
3129 // CHECK-LABEL: @test_vmull_laneq_u16_0(
3130 // CHECK-NEXT: entry:
3131 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3132 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3133 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3134 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3135 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3136 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3137 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_laneq_u16 at lane index 0. Here v is an <8 x i16> vector; the
// expected IR broadcasts its lane 0 into a <4 x i16> value and multiplies that
// with a through llvm.aarch64.neon.umull.v4i32; that result is returned.
3139 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
3140 return vmull_laneq_u16(a, v, 0);
3143 // CHECK-LABEL: @test_vmull_laneq_u32_0(
3144 // CHECK-NEXT: entry:
3145 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3146 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3147 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3148 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3149 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3150 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3151 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_laneq_u32 at lane index 0. Here v is a <4 x i32> vector; the
// expected IR broadcasts its lane 0 into a <2 x i32> value and multiplies that
// with a through llvm.aarch64.neon.umull.v2i64; that result is returned.
3153 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
3154 return vmull_laneq_u32(a, v, 0);
3157 // CHECK-LABEL: @test_vmull_high_laneq_s16_0(
3158 // CHECK-NEXT: entry:
3159 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3160 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3161 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3162 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3163 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3164 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3165 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3166 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_high_laneq_s16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of a are extracted, lane 0 of the <8 x i16> vector v is broadcast
// into a <4 x i16> value, and the two are multiplied through
// llvm.aarch64.neon.smull.v4i32; that result is returned.
3168 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
3169 return vmull_high_laneq_s16(a, v, 0);
3172 // CHECK-LABEL: @test_vmull_high_laneq_s32_0(
3173 // CHECK-NEXT: entry:
3174 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3175 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3176 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3177 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3178 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3179 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3180 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3181 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_high_laneq_s32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of a are extracted, lane 0 of the <4 x i32> vector v is broadcast
// into a <2 x i32> value, and the two are multiplied through
// llvm.aarch64.neon.smull.v2i64; that result is returned.
3183 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
3184 return vmull_high_laneq_s32(a, v, 0);
3187 // CHECK-LABEL: @test_vmull_high_laneq_u16_0(
3188 // CHECK-NEXT: entry:
3189 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3190 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3191 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3192 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3193 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3194 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3195 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3196 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
// Covers vmull_high_laneq_u16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of a are extracted, lane 0 of the <8 x i16> vector v is broadcast
// into a <4 x i16> value, and the two are multiplied through
// llvm.aarch64.neon.umull.v4i32; that result is returned.
3198 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
3199 return vmull_high_laneq_u16(a, v, 0);
3202 // CHECK-LABEL: @test_vmull_high_laneq_u32_0(
3203 // CHECK-NEXT: entry:
3204 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3205 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3206 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3207 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3208 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3209 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3210 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3211 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
// Covers vmull_high_laneq_u32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of a are extracted, lane 0 of the <4 x i32> vector v is broadcast
// into a <2 x i32> value, and the two are multiplied through
// llvm.aarch64.neon.umull.v2i64; that result is returned.
3213 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
3214 return vmull_high_laneq_u32(a, v, 0);
3217 // CHECK-LABEL: @test_vqdmlal_lane_s16_0(
3218 // CHECK-NEXT: entry:
3219 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3220 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3221 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3222 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3223 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
3224 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3225 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
3226 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3227 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Covers vqdmlal_lane_s16 at lane index 0. Expected IR (assertions above): lane
// 0 of v is broadcast, combined with b through llvm.aarch64.neon.sqdmull.v4i32,
// and that result is added to a through llvm.aarch64.neon.sqadd.v4i32.
3229 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
3230 return vqdmlal_lane_s16(a, b, v, 0);
3233 // CHECK-LABEL: @test_vqdmlal_lane_s32_0(
3234 // CHECK-NEXT: entry:
3235 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3236 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3237 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3238 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3239 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
3240 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3241 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
3242 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3243 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Covers vqdmlal_lane_s32 at lane index 0. Expected IR (assertions above): lane
// 0 of v is broadcast, combined with b through llvm.aarch64.neon.sqdmull.v2i64,
// and that result is added to a through llvm.aarch64.neon.sqadd.v2i64.
3245 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
3246 return vqdmlal_lane_s32(a, b, v, 0);
3249 // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
3250 // CHECK-NEXT: entry:
3251 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3252 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3253 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3254 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3255 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3256 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3257 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3258 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3259 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3260 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Covers vqdmlal_high_lane_s16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of b are extracted, lane 0 of v is broadcast, the two are
// combined through llvm.aarch64.neon.sqdmull.v4i32, and that result is added to
// a through llvm.aarch64.neon.sqadd.v4i32.
3262 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
3263 return vqdmlal_high_lane_s16(a, b, v, 0);
3266 // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
3267 // CHECK-NEXT: entry:
3268 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3269 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3270 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3271 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3272 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3273 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3274 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3275 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3276 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3277 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Covers vqdmlal_high_lane_s32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of b are extracted, lane 0 of v is broadcast, the two are
// combined through llvm.aarch64.neon.sqdmull.v2i64, and that result is added to
// a through llvm.aarch64.neon.sqadd.v2i64.
3279 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
3280 return vqdmlal_high_lane_s32(a, b, v, 0);
3283 // CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
3284 // CHECK-NEXT: entry:
3285 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3286 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3287 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3288 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3289 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
3290 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3291 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
3292 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3293 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Covers vqdmlsl_lane_s16 at lane index 0. Expected IR (assertions above): lane
// 0 of v is broadcast, combined with b through llvm.aarch64.neon.sqdmull.v4i32,
// and that result is subtracted from a through llvm.aarch64.neon.sqsub.v4i32.
3295 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
3296 return vqdmlsl_lane_s16(a, b, v, 0);
3299 // CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
3300 // CHECK-NEXT: entry:
3301 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3302 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3303 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3304 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3305 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
3306 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3307 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
3308 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3309 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
// Covers vqdmlsl_lane_s32 at lane index 0. Expected IR (assertions above): lane
// 0 of v is broadcast, combined with b through llvm.aarch64.neon.sqdmull.v2i64,
// and that result is subtracted from a through llvm.aarch64.neon.sqsub.v2i64.
3311 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
3312 return vqdmlsl_lane_s32(a, b, v, 0);
3315 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
3316 // CHECK-NEXT: entry:
3317 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3318 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3319 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3320 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3321 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3322 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3323 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3324 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3325 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
3326 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Covers vqdmlsl_high_lane_s16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of b are extracted, lane 0 of v is broadcast, the two are
// combined through llvm.aarch64.neon.sqdmull.v4i32, and that result is
// subtracted from a through llvm.aarch64.neon.sqsub.v4i32.
3328 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
3329 return vqdmlsl_high_lane_s16(a, b, v, 0);
3332 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
3333 // CHECK-NEXT: entry:
3334 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3335 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3336 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3337 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3338 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3339 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3340 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3341 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3342 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
3343 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
// Covers vqdmlsl_high_lane_s32 at lane index 0. Expected IR (assertions above):
// elements 2-3 of b are extracted, lane 0 of v is broadcast, the two are
// combined through llvm.aarch64.neon.sqdmull.v2i64, and that result is
// subtracted from a through llvm.aarch64.neon.sqsub.v2i64.
3345 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
3346 return vqdmlsl_high_lane_s32(a, b, v, 0);
3349 // CHECK-LABEL: @test_vqdmull_lane_s16_0(
3350 // CHECK-NEXT: entry:
3351 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3352 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3353 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3354 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3355 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3356 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3357 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3358 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Covers vqdmull_lane_s16 at lane index 0. Expected IR (assertions above): lane
// 0 of v is broadcast and combined with a through
// llvm.aarch64.neon.sqdmull.v4i32. The call result itself is returned; the
// trailing bitcast to <16 x i8> in the expected output is not used by the ret.
3360 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
3361 return vqdmull_lane_s16(a, v, 0);
3364 // CHECK-LABEL: @test_vqdmull_lane_s32_0(
3365 // CHECK-NEXT: entry:
3366 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3367 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3368 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3369 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3370 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3371 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3372 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3373 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Covers vqdmull_lane_s32 at lane index 0. Expected IR (assertions above): lane
// 0 of v is broadcast and combined with a through
// llvm.aarch64.neon.sqdmull.v2i64. The call result itself is returned; the
// trailing bitcast to <16 x i8> in the expected output is not used by the ret.
3375 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
3376 return vqdmull_lane_s32(a, v, 0);
3379 // CHECK-LABEL: @test_vqdmull_laneq_s16_0(
3380 // CHECK-NEXT: entry:
3381 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3382 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3383 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3384 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3385 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3386 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
3387 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3388 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Covers vqdmull_laneq_s16 at lane index 0. Here v is an <8 x i16> vector; the
// expected IR broadcasts its lane 0 into a <4 x i16> value and combines it with
// a through llvm.aarch64.neon.sqdmull.v4i32. The call result itself is returned;
// the trailing bitcast to <16 x i8> is not used by the ret.
3390 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
3391 return vqdmull_laneq_s16(a, v, 0);
3394 // CHECK-LABEL: @test_vqdmull_laneq_s32_0(
3395 // CHECK-NEXT: entry:
3396 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3397 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3398 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3399 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3400 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3401 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
3402 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3403 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Covers vqdmull_laneq_s32 at lane index 0. Here v is a <4 x i32> vector; the
// expected IR broadcasts its lane 0 into a <2 x i32> value and combines it with
// a through llvm.aarch64.neon.sqdmull.v2i64. The call result itself is returned;
// the trailing bitcast to <16 x i8> is not used by the ret.
3405 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
3406 return vqdmull_laneq_s32(a, v, 0);
3409 // CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
3410 // CHECK-NEXT: entry:
3411 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3412 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3413 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3414 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
3415 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3416 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3417 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3418 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3419 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Covers vqdmull_high_lane_s16 at lane index 0. Expected IR (assertions above):
// elements 4-7 of a are extracted, lane 0 of v is broadcast, and the two are
// combined through llvm.aarch64.neon.sqdmull.v4i32. The call result itself is
// returned; the trailing bitcast to <16 x i8> is not used by the ret.
3421 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
3422 return vqdmull_high_lane_s16(a, v, 0);
3425 // CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
3426 // CHECK-NEXT: entry:
3427 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3428 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3429 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3430 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
3431 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3432 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3433 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3434 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3435 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Lane index 0 on a 64-bit v. Per the assertions above, the expected IR takes
// elements 2..3 of a, broadcasts element 0 of v into a <2 x i32>, and calls
// @llvm.aarch64.neon.sqdmull.v2i64 on the two.
3437 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
3438 return vqdmull_high_lane_s32(a, v, 0);
3441 // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
3442 // CHECK-NEXT: entry:
3443 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3444 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
3445 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3446 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
3447 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3448 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
3449 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
3450 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
3451 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
// Lane index 0 on a 128-bit v. Per the assertions above, the expected IR takes
// elements 4..7 of a, broadcasts element 0 of the <8 x i16> v into a
// <4 x i16>, and calls @llvm.aarch64.neon.sqdmull.v4i32 on the two.
3453 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
3454 return vqdmull_high_laneq_s16(a, v, 0);
3457 // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
3458 // CHECK-NEXT: entry:
3459 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3460 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
3461 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3462 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
3463 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3464 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
3465 // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
3466 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
3467 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
// Lane index 0 on a 128-bit v. Per the assertions above, the expected IR takes
// elements 2..3 of a, broadcasts element 0 of the <4 x i32> v into a
// <2 x i32>, and calls @llvm.aarch64.neon.sqdmull.v2i64 on the two.
3469 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
3470 return vqdmull_high_laneq_s32(a, v, 0);
3473 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
3474 // CHECK-NEXT: entry:
3475 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3476 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3477 // CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3478 // CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3479 // CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
3480 // CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
// Lane index 0. Per the assertions above, no shufflevector is expected: the
// lane is forwarded as the immediate `i32 0` operand of
// @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16, after a and v are round-tripped
// through <8 x i8> bitcasts.
3482 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
3483 return vqdmulh_lane_s16(a, v, 0);
3486 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
3487 // CHECK-NEXT: entry:
3488 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
3489 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3490 // CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3491 // CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3492 // CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
3493 // CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
// Lane index 0 with a 128-bit a and a 64-bit v. Per the assertions above, the
// lane is forwarded as the immediate `i32 0` operand of
// @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16 (no shufflevector).
3495 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
3496 return vqdmulhq_lane_s16(a, v, 0);
3499 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
3500 // CHECK-NEXT: entry:
3501 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3502 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3503 // CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3504 // CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3505 // CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
3506 // CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
// Lane index 0. Per the assertions above, the lane is forwarded as the
// immediate `i32 0` operand of @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32
// (no shufflevector).
3508 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
3509 return vqdmulh_lane_s32(a, v, 0);
3512 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
3513 // CHECK-NEXT: entry:
3514 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3515 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3516 // CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3517 // CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3518 // CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
3519 // CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
// Lane index 0 with a 128-bit a and a 64-bit v. Per the assertions above, the
// lane is forwarded as the immediate `i32 0` operand of
// @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32 (no shufflevector).
3521 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
3522 return vqdmulhq_lane_s32(a, v, 0);
3525 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
3526 // CHECK-NEXT: entry:
3527 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
3528 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3529 // CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3530 // CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3531 // CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
3532 // CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
// Lane index 0. Per the assertions above, the lane is forwarded as the
// immediate `i32 0` operand of @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16
// (no shufflevector).
3534 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
3535 return vqrdmulh_lane_s16(a, v, 0);
3538 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
3539 // CHECK-NEXT: entry:
3540 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
3541 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
3542 // CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3543 // CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3544 // CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
3545 // CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
// Lane index 0 with a 128-bit a and a 64-bit v. Per the assertions above, the
// lane is forwarded as the immediate `i32 0` operand of
// @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16 (no shufflevector).
3547 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
3548 return vqrdmulhq_lane_s16(a, v, 0);
3551 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
3552 // CHECK-NEXT: entry:
3553 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
3554 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3555 // CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3556 // CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3557 // CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
3558 // CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
// Lane index 0. Per the assertions above, the lane is forwarded as the
// immediate `i32 0` operand of @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32
// (no shufflevector).
3560 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
3561 return vqrdmulh_lane_s32(a, v, 0);
3564 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
3565 // CHECK-NEXT: entry:
3566 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3567 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
3568 // CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3569 // CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3570 // CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
3571 // CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
// Lane index 0 with a 128-bit a and a 64-bit v. Per the assertions above, the
// lane is forwarded as the immediate `i32 0` operand of
// @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32 (no shufflevector).
3573 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
3574 return vqrdmulhq_lane_s32(a, v, 0);
3577 // CHECK-LABEL: @test_vmul_lane_f32_0(
3578 // CHECK-NEXT: entry:
3579 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3580 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3581 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
3582 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3583 // CHECK-NEXT: ret <2 x float> [[MUL]]
// Lane index 0. Per the assertions above, the expected IR broadcasts element 0
// of v into a <2 x float> (zeroinitializer shuffle mask) and multiplies it
// with a using a plain `fmul`, with no intrinsic call.
3585 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
3586 return vmul_lane_f32(a, v, 0);
3589 // CHECK-LABEL: @test_vmulq_lane_f32_0(
3590 // CHECK-NEXT: entry:
3591 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3592 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3593 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
3594 // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3595 // CHECK-NEXT: ret <4 x float> [[MUL]]
// Lane index 0 with a 128-bit a and a 64-bit v. Per the assertions above, the
// expected IR widens the broadcast: element 0 of the <2 x float> v is shuffled
// into a <4 x float>, then multiplied with a using a plain `fmul`.
3597 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
3598 return vmulq_lane_f32(a, v, 0);
3601 // CHECK-LABEL: @test_vmul_laneq_f32_0(
3602 // CHECK-NEXT: entry:
3603 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3604 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3605 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
3606 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
3607 // CHECK-NEXT: ret <2 x float> [[MUL]]
// Lane index 0 with a 64-bit a and a 128-bit v. Per the assertions above, the
// expected IR narrows the broadcast: element 0 of the <4 x float> v is
// shuffled into a <2 x float>, then multiplied with a using a plain `fmul`.
3609 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
3610 return vmul_laneq_f32(a, v, 0);
3613 // CHECK-LABEL: @test_vmul_laneq_f64_0(
3614 // CHECK-NEXT: entry:
3615 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
3616 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3617 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
3618 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3619 // CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
3620 // CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
3621 // CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
3622 // CHECK-NEXT: ret <1 x double> [[TMP5]]
// Lane index 0 with a single-element a. Per the assertions above, the expected
// IR is scalar rather than a shuffle: a is bitcast to `double`, element 0 of v
// is read with `extractelement`, the two are multiplied with `fmul double`,
// and the product is bitcast back to <1 x double>.
3624 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
3625 return vmul_laneq_f64(a, v, 0);
3628 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
3629 // CHECK-NEXT: entry:
3630 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3631 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3632 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
3633 // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
3634 // CHECK-NEXT: ret <4 x float> [[MUL]]
// Lane index 0 with 128-bit a and v. Per the assertions above, the expected IR
// broadcasts element 0 of v into a <4 x float> (zeroinitializer shuffle mask)
// and multiplies it with a using a plain `fmul`.
3636 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3637 return vmulq_laneq_f32(a, v, 0);
3640 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
3641 // CHECK-NEXT: entry:
3642 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3643 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3644 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
3645 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
3646 // CHECK-NEXT: ret <2 x double> [[MUL]]
// Lane index 0 with 128-bit a and v. Per the assertions above, the expected IR
// broadcasts element 0 of v into a <2 x double> (zeroinitializer shuffle mask)
// and multiplies it with a using a plain `fmul`.
3648 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3649 return vmulq_laneq_f64(a, v, 0);
3652 // CHECK-LABEL: @test_vmulx_lane_f32_0(
3653 // CHECK-NEXT: entry:
3654 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3655 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3656 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
3657 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
3658 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
3659 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
3660 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
// Lane index 0. Per the assertions above, the expected IR broadcasts element 0
// of v into a <2 x float> and passes it, together with a, to
// @llvm.aarch64.neon.fmulx.v2f32.
3662 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
3663 return vmulx_lane_f32(a, v, 0);
3666 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
3667 // CHECK-NEXT: entry:
3668 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
3669 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3670 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
3671 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
3672 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
3673 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
3674 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
// Lane index 0 with a 128-bit a and a 64-bit v. Per the assertions above, the
// expected IR shuffles element 0 of the <2 x float> v into a <4 x float> and
// passes it, together with a, to @llvm.aarch64.neon.fmulx.v4f32.
3676 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
3677 return vmulxq_lane_f32(a, v, 0);
3680 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
3681 // CHECK-NEXT: entry:
3682 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
3683 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
3684 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
3685 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
3686 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
3687 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
3688 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
// Lane index 0 is the only element of the <1 x double> v. Per the assertions
// above, the expected IR shuffles it into a <2 x double> (zeroinitializer
// mask) and passes it, together with a, to @llvm.aarch64.neon.fmulx.v2f64.
3690 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
3691 return vmulxq_lane_f64(a, v, 0);
3694 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
3695 // CHECK-NEXT: entry:
3696 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3697 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3698 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
3699 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
3700 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
3701 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
3702 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
// Lane index 0 with a 64-bit a and a 128-bit v. Per the assertions above, the
// expected IR shuffles element 0 of the <4 x float> v into a <2 x float> and
// passes it, together with a, to @llvm.aarch64.neon.fmulx.v2f32.
3704 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
3705 return vmulx_laneq_f32(a, v, 0);
3708 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
3709 // CHECK-NEXT: entry:
3710 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
3711 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3712 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
3713 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
3714 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
3715 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
3716 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
// Lane index 0 with 128-bit a and v. Per the assertions above, the expected IR
// broadcasts element 0 of v into a <4 x float> and passes it, together with a,
// to @llvm.aarch64.neon.fmulx.v4f32.
3718 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3719 return vmulxq_laneq_f32(a, v, 0);
3722 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
3723 // CHECK-NEXT: entry:
3724 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
3725 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3726 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
3727 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
3728 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
3729 // CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
3730 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
// Lane index 0 with 128-bit a and v. Per the assertions above, the expected IR
// broadcasts element 0 of v into a <2 x double> and passes it, together with
// a, to @llvm.aarch64.neon.fmulx.v2f64.
3732 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3733 return vmulxq_laneq_f64(a, v, 0);
3736 // CHECK-LABEL: @test_vmull_high_n_s16(
3737 // CHECK-NEXT: entry:
3738 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3739 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
3740 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3741 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3742 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3743 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3744 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3745 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3746 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
// Scalar-operand ("_n") form. Per the assertions above, the expected IR takes
// elements 4..7 of a, builds a <4 x i16> by inserting b into every lane
// (insertelement chain starting from undef), and calls
// @llvm.aarch64.neon.smull.v4i32 on the two.
3748 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
3749 return vmull_high_n_s16(a, b);
3752 // CHECK-LABEL: @test_vmull_high_n_s32(
3753 // CHECK-NEXT: entry:
3754 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3755 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
3756 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3757 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3758 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3759 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3760 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
// Scalar-operand ("_n") form. Per the assertions above, the expected IR takes
// elements 2..3 of a, builds a <2 x i32> by inserting b into both lanes, and
// calls @llvm.aarch64.neon.smull.v2i64 on the two.
3762 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
3763 return vmull_high_n_s32(a, b);
3766 // CHECK-LABEL: @test_vmull_high_n_u16(
3767 // CHECK-NEXT: entry:
3768 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3769 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
3770 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3771 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3772 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3773 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3774 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3775 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3776 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
// Unsigned counterpart of the s16 case. Per the assertions above, the expected
// IR has the same shape (elements 4..7 of a, b inserted into all four lanes)
// but calls @llvm.aarch64.neon.umull.v4i32.
3778 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
3779 return vmull_high_n_u16(a, b);
3782 // CHECK-LABEL: @test_vmull_high_n_u32(
3783 // CHECK-NEXT: entry:
3784 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3785 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
3786 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3787 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3788 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3789 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3790 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
// Unsigned counterpart of the s32 case. Per the assertions above, the expected
// IR has the same shape (elements 2..3 of a, b inserted into both lanes) but
// calls @llvm.aarch64.neon.umull.v2i64.
3792 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
3793 return vmull_high_n_u32(a, b);
3796 // CHECK-LABEL: @test_vqdmull_high_n_s16(
3797 // CHECK-NEXT: entry:
3798 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3799 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
3800 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
3801 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
3802 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
3803 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3804 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3805 // CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3806 // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
3807 // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
// Scalar-operand ("_n") form. Per the assertions above, the expected IR takes
// elements 4..7 of a, builds a <4 x i16> with b in every lane, and calls
// @llvm.aarch64.neon.sqdmull.v4i32; the call result is what gets returned.
3809 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
3810 return vqdmull_high_n_s16(a, b);
3813 // CHECK-LABEL: @test_vqdmull_high_n_s32(
3814 // CHECK-NEXT: entry:
3815 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
3816 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
3817 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
3818 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3819 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3820 // CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3821 // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
3822 // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
// Scalar-operand ("_n") form. Per the assertions above, the expected IR takes
// elements 2..3 of a, builds a <2 x i32> with b in both lanes, and calls
// @llvm.aarch64.neon.sqdmull.v2i64; the call result is what gets returned.
3824 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
3825 return vqdmull_high_n_s32(a, b);
3828 // CHECK-LABEL: @test_vmlal_high_n_s16(
3829 // CHECK-NEXT: entry:
3830 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3831 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
3832 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3833 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3834 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3835 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3836 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3837 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3838 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3839 // CHECK-NEXT: ret <4 x i32> [[ADD_I]]
// Per the assertions above, the expected IR multiplies elements 4..7 of b by a
// <4 x i16> filled with c via @llvm.aarch64.neon.smull.v4i32, then
// accumulates into a with a plain vector `add`.
3841 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3842 return vmlal_high_n_s16(a, b, c);
3845 // CHECK-LABEL: @test_vmlal_high_n_s32(
3846 // CHECK-NEXT: entry:
3847 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3848 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
3849 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3850 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3851 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3852 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3853 // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3854 // CHECK-NEXT: ret <2 x i64> [[ADD_I]]
// Per the assertions above, the expected IR multiplies elements 2..3 of b by a
// <2 x i32> filled with c via @llvm.aarch64.neon.smull.v2i64, then
// accumulates into a with a plain vector `add`.
3856 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3857 return vmlal_high_n_s32(a, b, c);
3860 // CHECK-LABEL: @test_vmlal_high_n_u16(
3861 // CHECK-NEXT: entry:
3862 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3863 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
3864 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3865 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3866 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3867 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3868 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3869 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3870 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3871 // CHECK-NEXT: ret <4 x i32> [[ADD_I]]
// Unsigned counterpart of the s16 case. Per the assertions above, the expected
// IR multiplies elements 4..7 of b by a c-filled <4 x i16> via
// @llvm.aarch64.neon.umull.v4i32, then adds the product to a with `add`.
3873 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
3874 return vmlal_high_n_u16(a, b, c);
3877 // CHECK-LABEL: @test_vmlal_high_n_u32(
3878 // CHECK-NEXT: entry:
3879 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3880 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
3881 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3882 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3883 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3884 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3885 // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3886 // CHECK-NEXT: ret <2 x i64> [[ADD_I]]
// Unsigned counterpart of the s32 case. Per the assertions above, the expected
// IR multiplies elements 2..3 of b by a c-filled <2 x i32> via
// @llvm.aarch64.neon.umull.v2i64, then adds the product to a with `add`.
3888 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
3889 return vmlal_high_n_u32(a, b, c);
3892 // CHECK-LABEL: @test_vqdmlal_high_n_s16(
3893 // CHECK-NEXT: entry:
3894 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3895 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
3896 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3897 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3898 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3899 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
3900 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3901 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3902 // CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3903 // CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
3904 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
// Per the assertions above, the expected IR is two intrinsic calls rather than
// a multiply followed by `add`: @llvm.aarch64.neon.sqdmull.v4i32 on elements
// 4..7 of b and a c-filled <4 x i16>, then @llvm.aarch64.neon.sqadd.v4i32 of a
// and that product.
3906 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3907 return vqdmlal_high_n_s16(a, b, c);
3910 // CHECK-LABEL: @test_vqdmlal_high_n_s32(
3911 // CHECK-NEXT: entry:
3912 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3913 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
3914 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3915 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
3916 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3917 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3918 // CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3919 // CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
3920 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
// Per the assertions above, the expected IR is two intrinsic calls:
// @llvm.aarch64.neon.sqdmull.v2i64 on elements 2..3 of b and a c-filled
// <2 x i32>, then @llvm.aarch64.neon.sqadd.v2i64 of a and that product.
3922 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3923 return vqdmlal_high_n_s32(a, b, c);
3926 // CHECK-LABEL: @test_vmlsl_high_n_s16(
3927 // CHECK-NEXT: entry:
3928 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3929 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
3930 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
3931 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
3932 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
3933 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3934 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3935 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
3936 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
3937 // CHECK-NEXT: ret <4 x i32> [[SUB_I]]
// Per the assertions above, the expected IR multiplies elements 4..7 of b by a
// c-filled <4 x i16> via @llvm.aarch64.neon.smull.v4i32, then subtracts the
// product from a with a plain vector `sub` (a is the left operand).
3939 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3940 return vmlsl_high_n_s16(a, b, c);
3943 // CHECK-LABEL: @test_vmlsl_high_n_s32(
3944 // CHECK-NEXT: entry:
3945 // CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
3946 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
3947 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
3948 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3949 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3950 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
3951 // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
3952 // CHECK-NEXT: ret <2 x i64> [[SUB_I]]
// Per the assertions above, the expected IR multiplies elements 2..3 of b by a
// c-filled <2 x i32> via @llvm.aarch64.neon.smull.v2i64, then subtracts the
// product from a with a plain vector `sub` (a is the left operand).
3954 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3955 return vmlsl_high_n_s32(a, b, c);
// CHECK-LABEL: @test_vmlsl_high_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
// Returns vmlsl_high_n_u16(a, b, c). Expected IR: lanes 4-7 of b and a
// four-lane splat of c feed umull.v4i32; the product is subtracted from a.
uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlsl_high_n_u16(a, b, c);
}
// CHECK-LABEL: @test_vmlsl_high_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
// Returns vmlsl_high_n_u32(a, b, c). Expected IR: lanes 2-3 of b and a
// two-lane splat of c feed umull.v2i64; the product is subtracted from a.
uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlsl_high_n_u32(a, b, c);
}
// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
// Returns vqdmlsl_high_n_s16(a, b, c). Expected IR: lanes 4-7 of b and a
// four-lane splat of c feed sqdmull.v4i32, then sqsub.v4i32(a, product).
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlsl_high_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
// Returns vqdmlsl_high_n_s32(a, b, c). Expected IR: lanes 2-3 of b and a
// two-lane splat of c feed sqdmull.v2i64, then sqsub.v2i64(a, product).
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlsl_high_n_s32(a, b, c);
}
// CHECK-LABEL: @test_vmul_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x float> [[MUL_I]]
//
// Returns vmul_n_f32(a, b). Expected IR: b is inserted into both lanes of a
// <2 x float> and multiplied with a by a plain fmul.
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x float> [[MUL_I]]
//
// Returns vmulq_n_f32(a, b). Expected IR: b is inserted into all four lanes of
// a <4 x float> and multiplied with a by a plain fmul.
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x double> [[MUL_I]]
//
// Returns vmulq_n_f64(a, b). Expected IR: b is inserted into both lanes of a
// <2 x double> and multiplied with a by a plain fmul.
float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
  return vmulq_n_f64(a, b);
}
// CHECK-LABEL: @test_vfma_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
// Returns vfma_n_f32(a, b, n). Expected IR: n is inserted into both lanes and
// passed as llvm.fma.v2f32(b, splat, a), i.e. a is the addend operand.
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfma_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vfma_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double [[N:%.*]], i32 0
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
// Returns vfma_n_f64(a, b, n). Expected IR: n becomes a one-lane
// <1 x double> and is passed as llvm.fma.v1f64(b, n-vector, a).
float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
  return vfma_n_f64(a, b, n);
}
// CHECK-LABEL: @test_vfmaq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
// Returns vfmaq_n_f32(a, b, n). Expected IR: n is inserted into all four lanes
// and passed as llvm.fma.v4f32(b, splat, a).
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmaq_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vfms_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
// Returns vfms_n_f32(a, b, n). Expected IR: b is negated with fneg first, then
// llvm.fma.v2f32(-b, splat of n, a) is called.
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfms_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vfms_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double [[N:%.*]], i32 0
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
// Returns vfms_n_f64(a, b, n). Expected IR: b is negated with fneg first, then
// llvm.fma.v1f64(-b, one-lane vector of n, a) is called.
float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
  return vfms_n_f64(a, b, n);
}
// CHECK-LABEL: @test_vfmsq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
// Returns vfmsq_n_f32(a, b, n). Expected IR: b is negated with fneg first,
// then llvm.fma.v4f32(-b, splat of n, a) is called.
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmsq_n_f32(a, b, n);
}
// CHECK-LABEL: @test_vmul_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
// Returns vmul_n_s16(a, b). Expected IR: b is inserted into all four lanes of
// a <4 x i16> and multiplied with a by a plain mul.
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}
// CHECK-LABEL: @test_vmulq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
// Returns vmulq_n_s16(a, b). Expected IR: b is inserted into all eight lanes
// of an <8 x i16> and multiplied with a by a plain mul.
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}
// CHECK-LABEL: @test_vmul_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
// Returns vmul_n_s32(a, b). Expected IR: b is inserted into both lanes of a
// <2 x i32> and multiplied with a by a plain mul.
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
// Returns vmulq_n_s32(a, b). Expected IR: b is inserted into all four lanes of
// a <4 x i32> and multiplied with a by a plain mul.
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}
// CHECK-LABEL: @test_vmul_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
// Returns vmul_n_u16(a, b). Expected IR: b is inserted into all four lanes of
// a <4 x i16> and multiplied with a by a plain mul.
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}
// CHECK-LABEL: @test_vmulq_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
// Returns vmulq_n_u16(a, b). Expected IR: b is inserted into all eight lanes
// of an <8 x i16> and multiplied with a by a plain mul.
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}
// CHECK-LABEL: @test_vmul_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
// Returns vmul_n_u32(a, b). Expected IR: b is inserted into both lanes of a
// <2 x i32> and multiplied with a by a plain mul.
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}
// CHECK-LABEL: @test_vmulq_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
// Returns vmulq_n_u32(a, b). Expected IR: b is inserted into all four lanes of
// a <4 x i32> and multiplied with a by a plain mul.
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}
// CHECK-LABEL: @test_vmull_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
// Returns vmull_n_s16(a, b). Expected IR: a and a four-lane splat of b feed
// smull.v4i32, whose <4 x i32> result is returned.
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}
// CHECK-LABEL: @test_vmull_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
// Returns vmull_n_s32(a, b). Expected IR: a and a two-lane splat of b feed
// smull.v2i64, whose <2 x i64> result is returned.
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}
// CHECK-LABEL: @test_vmull_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
// Returns vmull_n_u16(a, b). Expected IR: a and a four-lane splat of b feed
// umull.v4i32, whose <4 x i32> result is returned.
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}
// CHECK-LABEL: @test_vmull_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
// Returns vmull_n_u32(a, b). Expected IR: a and a two-lane splat of b feed
// umull.v2i64, whose <2 x i64> result is returned.
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}
// CHECK-LABEL: @test_vqdmull_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
// Returns vqdmull_n_s16(a, b). Expected IR: a and a four-lane splat of b feed
// sqdmull.v4i32; the call result (not the trailing bitcast) is returned.
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
  return vqdmull_n_s16(a, b);
}
// CHECK-LABEL: @test_vqdmull_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
// Returns vqdmull_n_s32(a, b). Expected IR: a and a two-lane splat of b feed
// sqdmull.v2i64; the call result (not the trailing bitcast) is returned.
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
  return vqdmull_n_s32(a, b);
}
// CHECK-LABEL: @test_vqdmulh_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
//
// Returns vqdmulh_n_s16(a, b). Expected IR: a and a four-lane splat of b feed
// sqdmulh.v4i16; the call result (not the trailing bitcast) is returned.
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqdmulh_n_s16(a, b);
}
// CHECK-LABEL: @test_vqdmulhq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
//
// Returns vqdmulhq_n_s16(a, b). Expected IR: a and an eight-lane splat of b
// feed sqdmulh.v8i16; the call result (not the trailing bitcast) is returned.
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqdmulhq_n_s16(a, b);
}
// CHECK-LABEL: @test_vqdmulh_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
//
// Returns vqdmulh_n_s32(a, b). Expected IR: a and a two-lane splat of b feed
// sqdmulh.v2i32; the call result (not the trailing bitcast) is returned.
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqdmulh_n_s32(a, b);
}
// CHECK-LABEL: @test_vqdmulhq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
//
// Returns vqdmulhq_n_s32(a, b). Expected IR: a and a four-lane splat of b feed
// sqdmulh.v4i32; the call result (not the trailing bitcast) is returned.
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqdmulhq_n_s32(a, b);
}
// CHECK-LABEL: @test_vqrdmulh_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
//
// Returns vqrdmulh_n_s16(a, b). Expected IR: a and a four-lane splat of b feed
// sqrdmulh.v4i16; the call result (not the trailing bitcast) is returned.
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqrdmulh_n_s16(a, b);
}
// CHECK-LABEL: @test_vqrdmulhq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
//
// Returns vqrdmulhq_n_s16(a, b). Expected IR: a and an eight-lane splat of b
// feed sqrdmulh.v8i16; the call result (not the trailing bitcast) is returned.
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqrdmulhq_n_s16(a, b);
}
// CHECK-LABEL: @test_vqrdmulh_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
//
// Returns vqrdmulh_n_s32(a, b). Expected IR: a and a two-lane splat of b feed
// sqrdmulh.v2i32; the call result (not the trailing bitcast) is returned.
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqrdmulh_n_s32(a, b);
}
// CHECK-LABEL: @test_vqrdmulhq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
//
// Returns vqrdmulhq_n_s32(a, b). Expected IR: a and a four-lane splat of b
// feed sqrdmulh.v4i32; the call result (not the trailing bitcast) is returned.
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqrdmulhq_n_s32(a, b);
}
// CHECK-LABEL: @test_vmla_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
//
// Returns vmla_n_s16(a, b, c). Expected IR: b is multiplied by a four-lane
// splat of c and the product is added to a.
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmla_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmlaq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
// Returns vmlaq_n_s16(a, b, c). Expected IR: b is multiplied by an eight-lane
// splat of c and the product is added to a.
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}
// CHECK-LABEL: @test_vmla_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
//
// Returns vmla_n_s32(a, b, c). Expected IR: b is multiplied by a two-lane
// splat of c and the product is added to a.
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmla_n_s32(a, b, c);
}
4518 // CHECK-LABEL: @test_vmlaq_n_s32(
4519 // CHECK-NEXT: entry:
4520 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
4521 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4522 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4523 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4524 // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4525 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
4526 // CHECK-NEXT: ret <4 x i32> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlaq_n_s32.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i32>, multiplies that vector by b, and adds the product to a.
4528 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4529 return vmlaq_n_s32(a, b, c);
4532 // CHECK-LABEL: @test_vmla_n_u16(
4533 // CHECK-NEXT: entry:
4534 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4535 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4536 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4537 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4538 // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4539 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
4540 // CHECK-NEXT: ret <4 x i16> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmla_n_u16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, multiplies that vector by b, and adds the product to a.
4542 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
4543 return vmla_n_u16(a, b, c);
4546 // CHECK-LABEL: @test_vmlaq_n_u16(
4547 // CHECK-NEXT: entry:
4548 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
4549 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4550 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4551 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4552 // CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4553 // CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4554 // CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4555 // CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4556 // CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4557 // CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
4558 // CHECK-NEXT: ret <8 x i16> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlaq_n_u16.
// Per the assertions above, the expected IR inserts c into all 8 lanes of an
// <8 x i16>, multiplies that vector by b, and adds the product to a.
4560 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
4561 return vmlaq_n_u16(a, b, c);
4564 // CHECK-LABEL: @test_vmla_n_u32(
4565 // CHECK-NEXT: entry:
4566 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4567 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4568 // CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4569 // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
4570 // CHECK-NEXT: ret <2 x i32> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmla_n_u32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, multiplies that vector by b, and adds the product to a.
4572 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
4573 return vmla_n_u32(a, b, c);
4576 // CHECK-LABEL: @test_vmlaq_n_u32(
4577 // CHECK-NEXT: entry:
4578 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
4579 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4580 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4581 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4582 // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4583 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
4584 // CHECK-NEXT: ret <4 x i32> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlaq_n_u32.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i32>, multiplies that vector by b, and adds the product to a.
4586 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
4587 return vmlaq_n_u32(a, b, c);
4590 // CHECK-LABEL: @test_vmlal_n_s16(
4591 // CHECK-NEXT: entry:
4592 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4593 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4594 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4595 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4596 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4597 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4598 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4599 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4600 // CHECK-NEXT: ret <4 x i32> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlal_n_s16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, calls @llvm.aarch64.neon.smull.v4i32 on b and that vector, and
// adds the <4 x i32> product to a. The two <8 x i8> bitcasts are expected too,
// but no later expected instruction uses their results.
4602 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4603 return vmlal_n_s16(a, b, c);
4606 // CHECK-LABEL: @test_vmlal_n_s32(
4607 // CHECK-NEXT: entry:
4608 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4609 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4610 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4611 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4612 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4613 // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4614 // CHECK-NEXT: ret <2 x i64> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlal_n_s32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, calls @llvm.aarch64.neon.smull.v2i64 on b and that vector, and
// adds the <2 x i64> product to a. The two <8 x i8> bitcasts are expected too,
// but no later expected instruction uses their results.
4616 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4617 return vmlal_n_s32(a, b, c);
4620 // CHECK-LABEL: @test_vmlal_n_u16(
4621 // CHECK-NEXT: entry:
4622 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4623 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4624 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4625 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4626 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4627 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4628 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4629 // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4630 // CHECK-NEXT: ret <4 x i32> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlal_n_u16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, calls @llvm.aarch64.neon.umull.v4i32 on b and that vector, and
// adds the <4 x i32> product to a. The two <8 x i8> bitcasts are expected too,
// but no later expected instruction uses their results.
4632 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4633 return vmlal_n_u16(a, b, c);
4636 // CHECK-LABEL: @test_vmlal_n_u32(
4637 // CHECK-NEXT: entry:
4638 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4639 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4640 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4641 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4642 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4643 // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4644 // CHECK-NEXT: ret <2 x i64> [[ADD_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlal_n_u32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, calls @llvm.aarch64.neon.umull.v2i64 on b and that vector, and
// adds the <2 x i64> product to a. The two <8 x i8> bitcasts are expected too,
// but no later expected instruction uses their results.
4646 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4647 return vmlal_n_u32(a, b, c);
4650 // CHECK-LABEL: @test_vqdmlal_n_s16(
4651 // CHECK-NEXT: entry:
4652 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4653 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4654 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4655 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4656 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4657 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4658 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4659 // CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4660 // CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4661 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vqdmlal_n_s16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, calls @llvm.aarch64.neon.sqdmull.v4i32 on b and that vector, then
// calls @llvm.aarch64.neon.sqadd.v4i32 on a and the product and returns it.
4663 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4664 return vqdmlal_n_s16(a, b, c);
4667 // CHECK-LABEL: @test_vqdmlal_n_s32(
4668 // CHECK-NEXT: entry:
4669 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4670 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4671 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4672 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4673 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4674 // CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4675 // CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4676 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vqdmlal_n_s32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, calls @llvm.aarch64.neon.sqdmull.v2i64 on b and that vector, then
// calls @llvm.aarch64.neon.sqadd.v2i64 on a and the product and returns it.
4678 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4679 return vqdmlal_n_s32(a, b, c);
4682 // CHECK-LABEL: @test_vmls_n_s16(
4683 // CHECK-NEXT: entry:
4684 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4685 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4686 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4687 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4688 // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4689 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
4690 // CHECK-NEXT: ret <4 x i16> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmls_n_s16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, multiplies that vector by b, and subtracts the product from a.
4692 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
4693 return vmls_n_s16(a, b, c);
4696 // CHECK-LABEL: @test_vmlsq_n_s16(
4697 // CHECK-NEXT: entry:
4698 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
4699 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4700 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4701 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4702 // CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4703 // CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4704 // CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4705 // CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4706 // CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4707 // CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
4708 // CHECK-NEXT: ret <8 x i16> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsq_n_s16.
// Per the assertions above, the expected IR inserts c into all 8 lanes of an
// <8 x i16>, multiplies that vector by b, and subtracts the product from a.
4710 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
4711 return vmlsq_n_s16(a, b, c);
4714 // CHECK-LABEL: @test_vmls_n_s32(
4715 // CHECK-NEXT: entry:
4716 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4717 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4718 // CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4719 // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
4720 // CHECK-NEXT: ret <2 x i32> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmls_n_s32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, multiplies that vector by b, and subtracts the product from a.
4722 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
4723 return vmls_n_s32(a, b, c);
4726 // CHECK-LABEL: @test_vmlsq_n_s32(
4727 // CHECK-NEXT: entry:
4728 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
4729 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4730 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4731 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4732 // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4733 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
4734 // CHECK-NEXT: ret <4 x i32> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsq_n_s32.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i32>, multiplies that vector by b, and subtracts the product from a.
4736 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4737 return vmlsq_n_s32(a, b, c);
4740 // CHECK-LABEL: @test_vmls_n_u16(
4741 // CHECK-NEXT: entry:
4742 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4743 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4744 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4745 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4746 // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
4747 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
4748 // CHECK-NEXT: ret <4 x i16> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmls_n_u16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, multiplies that vector by b, and subtracts the product from a.
4750 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
4751 return vmls_n_u16(a, b, c);
4754 // CHECK-LABEL: @test_vmlsq_n_u16(
4755 // CHECK-NEXT: entry:
4756 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[C:%.*]], i32 0
4757 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4758 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4759 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4760 // CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
4761 // CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
4762 // CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
4763 // CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
4764 // CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
4765 // CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
4766 // CHECK-NEXT: ret <8 x i16> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsq_n_u16.
// Per the assertions above, the expected IR inserts c into all 8 lanes of an
// <8 x i16>, multiplies that vector by b, and subtracts the product from a.
4768 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
4769 return vmlsq_n_u16(a, b, c);
4772 // CHECK-LABEL: @test_vmls_n_u32(
4773 // CHECK-NEXT: entry:
4774 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4775 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4776 // CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
4777 // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
4778 // CHECK-NEXT: ret <2 x i32> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmls_n_u32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, multiplies that vector by b, and subtracts the product from a.
4780 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
4781 return vmls_n_u32(a, b, c);
4784 // CHECK-LABEL: @test_vmlsq_n_u32(
4785 // CHECK-NEXT: entry:
4786 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
4787 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4788 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
4789 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
4790 // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
4791 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
4792 // CHECK-NEXT: ret <4 x i32> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsq_n_u32.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i32>, multiplies that vector by b, and subtracts the product from a.
4794 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
4795 return vmlsq_n_u32(a, b, c);
4798 // CHECK-LABEL: @test_vmlsl_n_s16(
4799 // CHECK-NEXT: entry:
4800 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4801 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4802 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4803 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4804 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4805 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4806 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4807 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4808 // CHECK-NEXT: ret <4 x i32> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsl_n_s16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, calls @llvm.aarch64.neon.smull.v4i32 on b and that vector, and
// subtracts the <4 x i32> product from a. The two <8 x i8> bitcasts are
// expected too, but no later expected instruction uses their results.
4810 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4811 return vmlsl_n_s16(a, b, c);
4814 // CHECK-LABEL: @test_vmlsl_n_s32(
4815 // CHECK-NEXT: entry:
4816 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4817 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4818 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4819 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4820 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4821 // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4822 // CHECK-NEXT: ret <2 x i64> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsl_n_s32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, calls @llvm.aarch64.neon.smull.v2i64 on b and that vector, and
// subtracts the <2 x i64> product from a. The two <8 x i8> bitcasts are
// expected too, but no later expected instruction uses their results.
4824 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4825 return vmlsl_n_s32(a, b, c);
4828 // CHECK-LABEL: @test_vmlsl_n_u16(
4829 // CHECK-NEXT: entry:
4830 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4831 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4832 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4833 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4834 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4835 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4836 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4837 // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
4838 // CHECK-NEXT: ret <4 x i32> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsl_n_u16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, calls @llvm.aarch64.neon.umull.v4i32 on b and that vector, and
// subtracts the <4 x i32> product from a. The two <8 x i8> bitcasts are
// expected too, but no later expected instruction uses their results.
4840 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4841 return vmlsl_n_u16(a, b, c);
4844 // CHECK-LABEL: @test_vmlsl_n_u32(
4845 // CHECK-NEXT: entry:
4846 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4847 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4848 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4849 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4850 // CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4851 // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
4852 // CHECK-NEXT: ret <2 x i64> [[SUB_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vmlsl_n_u32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, calls @llvm.aarch64.neon.umull.v2i64 on b and that vector, and
// subtracts the <2 x i64> product from a. The two <8 x i8> bitcasts are
// expected too, but no later expected instruction uses their results.
4854 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4855 return vmlsl_n_u32(a, b, c);
4858 // CHECK-LABEL: @test_vqdmlsl_n_s16(
4859 // CHECK-NEXT: entry:
4860 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i32 0
4861 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
4862 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
4863 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
4864 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
4865 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
4866 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4867 // CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
4868 // CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
4869 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vqdmlsl_n_s16.
// Per the assertions above, the expected IR inserts c into all 4 lanes of a
// <4 x i16>, calls @llvm.aarch64.neon.sqdmull.v4i32 on b and that vector, then
// calls @llvm.aarch64.neon.sqsub.v4i32 on a and the product and returns it.
4871 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4872 return vqdmlsl_n_s16(a, b, c);
4875 // CHECK-LABEL: @test_vqdmlsl_n_s32(
4876 // CHECK-NEXT: entry:
4877 // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[C:%.*]], i32 0
4878 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
4879 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
4880 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
4881 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4882 // CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
4883 // CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
4884 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
// Codegen test wrapper: forwards a, b and the scalar c to vqdmlsl_n_s32.
// Per the assertions above, the expected IR inserts c into both lanes of a
// <2 x i32>, calls @llvm.aarch64.neon.sqdmull.v2i64 on b and that vector, then
// calls @llvm.aarch64.neon.sqsub.v2i64 on a and the product and returns it.
4886 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4887 return vqdmlsl_n_s32(a, b, c);
4890 // CHECK-LABEL: @test_vmla_lane_u16_0(
4891 // CHECK-NEXT: entry:
4892 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
4893 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4894 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
4895 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
4896 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
4897 // CHECK-NEXT: ret <4 x i16> [[ADD]]
// Codegen test wrapper: calls vmla_lane_u16 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips v through <8 x i8>
// bitcasts, builds a 4-lane vector with a zeroinitializer shuffle mask (every
// lane takes element 0 of v), multiplies it by b, and adds the product to a.
4899 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4900 return vmla_lane_u16(a, b, v, 0);
4903 // CHECK-LABEL: @test_vmlaq_lane_u16_0(
4904 // CHECK-NEXT: entry:
4905 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
4906 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4907 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
4908 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
4909 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
4910 // CHECK-NEXT: ret <8 x i16> [[ADD]]
// Codegen test wrapper: calls vmlaq_lane_u16 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips the <4 x i16> v
// through <8 x i8> bitcasts, builds an 8-lane vector with a zeroinitializer
// shuffle mask (every lane takes element 0 of v), multiplies it by b, and adds
// the product to a.
4912 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4913 return vmlaq_lane_u16(a, b, v, 0);
4916 // CHECK-LABEL: @test_vmla_lane_u32_0(
4917 // CHECK-NEXT: entry:
4918 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
4919 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4920 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
4921 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
4922 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
4923 // CHECK-NEXT: ret <2 x i32> [[ADD]]
// Codegen test wrapper: calls vmla_lane_u32 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips v through <8 x i8>
// bitcasts, builds a 2-lane vector with a zeroinitializer shuffle mask (every
// lane takes element 0 of v), multiplies it by b, and adds the product to a.
4925 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4926 return vmla_lane_u32(a, b, v, 0);
4929 // CHECK-LABEL: @test_vmlaq_lane_u32_0(
4930 // CHECK-NEXT: entry:
4931 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
4932 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4933 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
4934 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
4935 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
4936 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Codegen test wrapper: calls vmlaq_lane_u32 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips the <2 x i32> v
// through <8 x i8> bitcasts, builds a 4-lane vector with a zeroinitializer
// shuffle mask (every lane takes element 0 of v), multiplies it by b, and adds
// the product to a.
4938 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4939 return vmlaq_lane_u32(a, b, v, 0);
4942 // CHECK-LABEL: @test_vmla_laneq_u16_0(
4943 // CHECK-NEXT: entry:
4944 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4945 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4946 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
4947 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
4948 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
4949 // CHECK-NEXT: ret <4 x i16> [[ADD]]
// Codegen test wrapper: calls vmla_laneq_u16 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips the <8 x i16> v
// through <16 x i8> bitcasts, builds a 4-lane vector with a zeroinitializer
// shuffle mask (every lane takes element 0 of v), multiplies it by b, and adds
// the product to a.
4951 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4952 return vmla_laneq_u16(a, b, v, 0);
4955 // CHECK-LABEL: @test_vmlaq_laneq_u16_0(
4956 // CHECK-NEXT: entry:
4957 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4958 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4959 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
4960 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
4961 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
4962 // CHECK-NEXT: ret <8 x i16> [[ADD]]
// Codegen test wrapper: calls vmlaq_laneq_u16 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips v through <16 x i8>
// bitcasts, builds an 8-lane vector with a zeroinitializer shuffle mask (every
// lane takes element 0 of v), multiplies it by b, and adds the product to a.
4964 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4965 return vmlaq_laneq_u16(a, b, v, 0);
4968 // CHECK-LABEL: @test_vmla_laneq_u32_0(
4969 // CHECK-NEXT: entry:
4970 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
4971 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4972 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
4973 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
4974 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
4975 // CHECK-NEXT: ret <2 x i32> [[ADD]]
// Codegen test wrapper: calls vmla_laneq_u32 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips the <4 x i32> v
// through <16 x i8> bitcasts, builds a 2-lane vector with a zeroinitializer
// shuffle mask (every lane takes element 0 of v), multiplies it by b, and adds
// the product to a.
4977 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4978 return vmla_laneq_u32(a, b, v, 0);
4981 // CHECK-LABEL: @test_vmlaq_laneq_u32_0(
4982 // CHECK-NEXT: entry:
4983 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
4984 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4985 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
4986 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
4987 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
4988 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Codegen test wrapper: calls vmlaq_laneq_u32 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips v through <16 x i8>
// bitcasts, builds a 4-lane vector with a zeroinitializer shuffle mask (every
// lane takes element 0 of v), multiplies it by b, and adds the product to a.
4990 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4991 return vmlaq_laneq_u32(a, b, v, 0);
4994 // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
4995 // CHECK-NEXT: entry:
4996 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
4997 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4998 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
4999 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5000 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5001 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5002 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5003 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5004 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Codegen test wrapper: calls vqdmlal_laneq_s16 on a, b, v with lane index 0.
// Per the assertions above, the expected IR builds a 4-lane vector from
// element 0 of the <8 x i16> v (zeroinitializer shuffle mask), calls
// @llvm.aarch64.neon.sqdmull.v4i32 on b and that vector, then calls
// @llvm.aarch64.neon.sqadd.v4i32 on a and the product and returns it.
5006 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
5007 return vqdmlal_laneq_s16(a, b, v, 0);
5010 // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
5011 // CHECK-NEXT: entry:
5012 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5013 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5014 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5015 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5016 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5017 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5018 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5019 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5020 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Codegen test wrapper: calls vqdmlal_laneq_s32 on a, b, v with lane index 0.
// Per the assertions above, the expected IR builds a 2-lane vector from
// element 0 of the <4 x i32> v (zeroinitializer shuffle mask), calls
// @llvm.aarch64.neon.sqdmull.v2i64 on b and that vector, then calls
// @llvm.aarch64.neon.sqadd.v2i64 on a and the product and returns it.
5022 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
5023 return vqdmlal_laneq_s32(a, b, v, 0);
5026 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
5027 // CHECK-NEXT: entry:
5028 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5029 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5030 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5031 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5032 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5033 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5034 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5035 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5036 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5037 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Codegen test wrapper: calls vqdmlal_high_laneq_s16 on a, b, v with lane
// index 0. Per the assertions above, the expected IR first shuffles elements
// 4..7 out of b, builds a 4-lane vector from element 0 of v (zeroinitializer
// shuffle mask), calls @llvm.aarch64.neon.sqdmull.v4i32 on those two vectors,
// then calls @llvm.aarch64.neon.sqadd.v4i32 on a and the product.
5039 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
5040 return vqdmlal_high_laneq_s16(a, b, v, 0);
5043 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
5044 // CHECK-NEXT: entry:
5045 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5046 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5047 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5048 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5049 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5050 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5051 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5052 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5053 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5054 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Codegen test wrapper: calls vqdmlal_high_laneq_s32 on a, b, v with lane
// index 0. Per the assertions above, the expected IR first shuffles elements
// 2..3 out of b, builds a 2-lane vector from element 0 of v (zeroinitializer
// shuffle mask), calls @llvm.aarch64.neon.sqdmull.v2i64 on those two vectors,
// then calls @llvm.aarch64.neon.sqadd.v2i64 on a and the product.
5056 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
5057 return vqdmlal_high_laneq_s32(a, b, v, 0);
5060 // CHECK-LABEL: @test_vmls_lane_u16_0(
5061 // CHECK-NEXT: entry:
5062 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5063 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5064 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
5065 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5066 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5067 // CHECK-NEXT: ret <4 x i16> [[SUB]]
// Codegen test wrapper: calls vmls_lane_u16 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips v through <8 x i8>
// bitcasts, builds a 4-lane vector with a zeroinitializer shuffle mask (every
// lane takes element 0 of v), multiplies it by b, and subtracts the product
// from a.
5069 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5070 return vmls_lane_u16(a, b, v, 0);
5073 // CHECK-LABEL: @test_vmlsq_lane_u16_0(
5074 // CHECK-NEXT: entry:
5075 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5076 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5077 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
5078 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5079 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5080 // CHECK-NEXT: ret <8 x i16> [[SUB]]
// Codegen test wrapper: calls vmlsq_lane_u16 on a, b, v with lane index 0.
// Per the assertions above, the expected IR round-trips the <4 x i16> v
// through <8 x i8> bitcasts, builds an 8-lane vector with a zeroinitializer
// shuffle mask (every lane takes element 0 of v), multiplies it by b, and
// subtracts the product from a.
5082 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5083 return vmlsq_lane_u16(a, b, v, 0);
5086 // CHECK-LABEL: @test_vmls_lane_u32_0(
5087 // CHECK-NEXT: entry:
5088 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5089 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5090 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
5091 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5092 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5093 // CHECK-NEXT: ret <2 x i32> [[SUB]]
// Calls vmls_lane_u32 with constant lane index 0. The FileCheck assertions above expect a
// shufflevector of v with a <2 x i32> zeroinitializer mask, then mul with b and sub from a.
5095 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5096 return vmls_lane_u32(a, b, v, 0);
5099 // CHECK-LABEL: @test_vmlsq_lane_u32_0(
5100 // CHECK-NEXT: entry:
5101 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5102 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5103 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
5104 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5105 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5106 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Calls vmlsq_lane_u32 with constant lane index 0. The FileCheck assertions above expect a
// shufflevector of v with a <4 x i32> zeroinitializer mask, then mul with b and sub from a.
5108 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5109 return vmlsq_lane_u32(a, b, v, 0);
5112 // CHECK-LABEL: @test_vmls_laneq_u16_0(
5113 // CHECK-NEXT: entry:
5114 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5115 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5116 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5117 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5118 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5119 // CHECK-NEXT: ret <4 x i16> [[SUB]]
// Calls vmls_laneq_u16 with constant lane index 0. The FileCheck assertions above expect a
// shufflevector of the <8 x i16> v with a <4 x i32> zeroinitializer mask, then mul and sub.
5121 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5122 return vmls_laneq_u16(a, b, v, 0);
5125 // CHECK-LABEL: @test_vmlsq_laneq_u16_0(
5126 // CHECK-NEXT: entry:
5127 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5128 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5129 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
5130 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5131 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5132 // CHECK-NEXT: ret <8 x i16> [[SUB]]
// Calls vmlsq_laneq_u16 with constant lane index 0. The FileCheck assertions above expect a
// shufflevector of the <8 x i16> v with an <8 x i32> zeroinitializer mask, then mul and sub.
5134 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5135 return vmlsq_laneq_u16(a, b, v, 0);
5138 // CHECK-LABEL: @test_vmls_laneq_u32_0(
5139 // CHECK-NEXT: entry:
5140 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5141 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5142 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5143 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5144 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5145 // CHECK-NEXT: ret <2 x i32> [[SUB]]
// Calls vmls_laneq_u32 with constant lane index 0. The FileCheck assertions above expect a
// shufflevector of the <4 x i32> v with a <2 x i32> zeroinitializer mask, then mul and sub.
5147 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5148 return vmls_laneq_u32(a, b, v, 0);
5151 // CHECK-LABEL: @test_vmlsq_laneq_u32_0(
5152 // CHECK-NEXT: entry:
5153 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5154 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5155 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
5156 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5157 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5158 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Calls vmlsq_laneq_u32 with constant lane index 0. The FileCheck assertions above expect a
// shufflevector of the <4 x i32> v with a <4 x i32> zeroinitializer mask, then mul and sub.
5160 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5161 return vmlsq_laneq_u32(a, b, v, 0);
5164 // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
5165 // CHECK-NEXT: entry:
5166 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5167 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5168 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5169 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5170 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5171 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5172 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5173 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5174 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s16 with constant lane index 0. The FileCheck assertions above expect a
// zeroinitializer-mask shuffle of v, then sqdmull.v4i32(b, lane) and sqsub.v4i32(a, product).
5176 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
5177 return vqdmlsl_laneq_s16(a, b, v, 0);
5180 // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
5181 // CHECK-NEXT: entry:
5182 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5183 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5184 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5185 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5186 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5187 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5188 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5189 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5190 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s32 with constant lane index 0. The FileCheck assertions above expect a
// zeroinitializer-mask shuffle of v, then sqdmull.v2i64(b, lane) and sqsub.v2i64(a, product).
5192 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
5193 return vqdmlsl_laneq_s32(a, b, v, 0);
5196 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
5197 // CHECK-NEXT: entry:
5198 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5199 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5200 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5201 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
5202 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5203 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5204 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5205 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5206 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5207 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s16 with constant lane index 0. The FileCheck assertions above expect
// elements 4..7 of b to be extracted, then sqdmull.v4i32 with the lane-0 shuffle of v and sqsub.v4i32 with a.
5209 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
5210 return vqdmlsl_high_laneq_s16(a, b, v, 0);
5213 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
5214 // CHECK-NEXT: entry:
5215 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5216 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5217 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5218 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
5219 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5220 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5221 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5222 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5223 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5224 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s32 with constant lane index 0. The FileCheck assertions above expect
// elements 2..3 of b to be extracted, then sqdmull.v2i64 with the lane-0 shuffle of v and sqsub.v2i64 with a.
5226 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
5227 return vqdmlsl_high_laneq_s32(a, b, v, 0);
5230 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
5231 // CHECK-NEXT: entry:
5232 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5233 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5234 // CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5235 // CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5236 // CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
5237 // CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
// Calls vqdmulh_laneq_s16 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16 with i32 0 as the lane operand.
5239 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
5240 return vqdmulh_laneq_s16(a, v, 0);
5243 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
5244 // CHECK-NEXT: entry:
5245 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5246 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5247 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5248 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5249 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
5250 // CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
// Calls vqdmulhq_laneq_s16 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16 with i32 0 as the lane operand.
5252 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
5253 return vqdmulhq_laneq_s16(a, v, 0);
5256 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
5257 // CHECK-NEXT: entry:
5258 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5259 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5260 // CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5261 // CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5262 // CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
5263 // CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
// Calls vqdmulh_laneq_s32 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32 with i32 0 as the lane operand.
5265 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
5266 return vqdmulh_laneq_s32(a, v, 0);
5269 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
5270 // CHECK-NEXT: entry:
5271 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5272 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5273 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5274 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5275 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
5276 // CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
// Calls vqdmulhq_laneq_s32 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32 with i32 0 as the lane operand.
5278 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
5279 return vqdmulhq_laneq_s32(a, v, 0);
5282 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
5283 // CHECK-NEXT: entry:
5284 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5285 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5286 // CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5287 // CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5288 // CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
5289 // CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
// Calls vqrdmulh_laneq_s16 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16 with i32 0 as the lane operand.
5291 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
5292 return vqrdmulh_laneq_s16(a, v, 0);
5295 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
5296 // CHECK-NEXT: entry:
5297 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5298 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5299 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5300 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5301 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
5302 // CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
// Calls vqrdmulhq_laneq_s16 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16 with i32 0 as the lane operand.
5304 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
5305 return vqrdmulhq_laneq_s16(a, v, 0);
5308 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
5309 // CHECK-NEXT: entry:
5310 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5311 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5312 // CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5313 // CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5314 // CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
5315 // CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
// Calls vqrdmulh_laneq_s32 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32 with i32 0 as the lane operand.
5317 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
5318 return vqrdmulh_laneq_s32(a, v, 0);
5321 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
5322 // CHECK-NEXT: entry:
5323 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5324 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5325 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5326 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5327 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
5328 // CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
// Calls vqrdmulhq_laneq_s32 with constant lane index 0. The FileCheck assertions above expect a
// single call to @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32 with i32 0 as the lane operand.
5330 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
5331 return vqrdmulhq_laneq_s32(a, v, 0);
5334 // CHECK-LABEL: @test_vmla_lane_u16(
5335 // CHECK-NEXT: entry:
5336 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5337 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5338 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5339 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5340 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
5341 // CHECK-NEXT: ret <4 x i16> [[ADD]]
// Calls vmla_lane_u16 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of v whose 4-element mask is all 3, then mul with b and add to a.
5343 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5344 return vmla_lane_u16(a, b, v, 3);
5347 // CHECK-LABEL: @test_vmlaq_lane_u16(
5348 // CHECK-NEXT: entry:
5349 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5350 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5351 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5352 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5353 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
5354 // CHECK-NEXT: ret <8 x i16> [[ADD]]
// Calls vmlaq_lane_u16 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of v whose 8-element mask is all 3, then mul with b and add to a.
5356 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5357 return vmlaq_lane_u16(a, b, v, 3);
5360 // CHECK-LABEL: @test_vmla_lane_u32(
5361 // CHECK-NEXT: entry:
5362 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5363 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5364 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
5365 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5366 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
5367 // CHECK-NEXT: ret <2 x i32> [[ADD]]
// Calls vmla_lane_u32 with constant lane index 1. The FileCheck assertions above expect a
// shufflevector of v whose 2-element mask is all 1, then mul with b and add to a.
5369 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5370 return vmla_lane_u32(a, b, v, 1);
5373 // CHECK-LABEL: @test_vmlaq_lane_u32(
5374 // CHECK-NEXT: entry:
5375 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5376 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5377 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5378 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5379 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
5380 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Calls vmlaq_lane_u32 with constant lane index 1. The FileCheck assertions above expect a
// shufflevector of v whose 4-element mask is all 1, then mul with b and add to a.
5382 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5383 return vmlaq_lane_u32(a, b, v, 1);
5386 // CHECK-LABEL: @test_vmla_laneq_u16(
5387 // CHECK-NEXT: entry:
5388 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5389 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5390 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5391 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5392 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
5393 // CHECK-NEXT: ret <4 x i16> [[ADD]]
// Calls vmla_laneq_u16 with constant lane index 7. The FileCheck assertions above expect a
// shufflevector of the <8 x i16> v whose 4-element mask is all 7, then mul with b and add to a.
5395 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5396 return vmla_laneq_u16(a, b, v, 7);
5399 // CHECK-LABEL: @test_vmlaq_laneq_u16(
5400 // CHECK-NEXT: entry:
5401 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5402 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5403 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5404 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5405 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
5406 // CHECK-NEXT: ret <8 x i16> [[ADD]]
// Calls vmlaq_laneq_u16 with constant lane index 7. The FileCheck assertions above expect a
// shufflevector of the <8 x i16> v whose 8-element mask is all 7, then mul with b and add to a.
5408 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5409 return vmlaq_laneq_u16(a, b, v, 7);
5412 // CHECK-LABEL: @test_vmla_laneq_u32(
5413 // CHECK-NEXT: entry:
5414 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5415 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5416 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5417 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5418 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
5419 // CHECK-NEXT: ret <2 x i32> [[ADD]]
// Calls vmla_laneq_u32 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of the <4 x i32> v whose 2-element mask is all 3, then mul with b and add to a.
5421 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5422 return vmla_laneq_u32(a, b, v, 3);
5425 // CHECK-LABEL: @test_vmlaq_laneq_u32(
5426 // CHECK-NEXT: entry:
5427 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5428 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5429 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5430 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5431 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
5432 // CHECK-NEXT: ret <4 x i32> [[ADD]]
// Calls vmlaq_laneq_u32 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of the <4 x i32> v whose 4-element mask is all 3, then mul with b and add to a.
5434 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5435 return vmlaq_laneq_u32(a, b, v, 3);
5438 // CHECK-LABEL: @test_vqdmlal_laneq_s16(
5439 // CHECK-NEXT: entry:
5440 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5441 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5442 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5443 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5444 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5445 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5446 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5447 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5448 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s16 with constant lane index 7. The FileCheck assertions above expect an
// all-7 mask shuffle of v, then sqdmull.v4i32(b, lane) and sqadd.v4i32(a, product).
5450 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
5451 return vqdmlal_laneq_s16(a, b, v, 7);
5454 // CHECK-LABEL: @test_vqdmlal_laneq_s32(
5455 // CHECK-NEXT: entry:
5456 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5457 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5458 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5459 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5460 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5461 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5462 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5463 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5464 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s32 with constant lane index 3. The FileCheck assertions above expect an
// all-3 mask shuffle of v, then sqdmull.v2i64(b, lane) and sqadd.v2i64(a, product).
5466 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
5467 return vqdmlal_laneq_s32(a, b, v, 3);
5470 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
5471 // CHECK-NEXT: entry:
5472 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5473 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5474 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5475 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5476 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5477 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5478 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5479 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5480 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5481 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s16 with constant lane index 7. The FileCheck assertions above expect
// elements 4..7 of b to be extracted, then sqdmull.v4i32 with the all-7 shuffle of v and sqadd.v4i32 with a.
5483 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
5484 return vqdmlal_high_laneq_s16(a, b, v, 7);
5487 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
5488 // CHECK-NEXT: entry:
5489 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5490 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5491 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5492 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5493 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5494 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5495 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5496 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5497 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5498 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s32 with constant lane index 3. The FileCheck assertions above expect
// elements 2..3 of b to be extracted, then sqdmull.v2i64 with the all-3 shuffle of v and sqadd.v2i64 with a.
5500 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
5501 return vqdmlal_high_laneq_s32(a, b, v, 3);
5504 // CHECK-LABEL: @test_vmls_lane_u16(
5505 // CHECK-NEXT: entry:
5506 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5507 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5508 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5509 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5510 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5511 // CHECK-NEXT: ret <4 x i16> [[SUB]]
// Calls vmls_lane_u16 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of v whose 4-element mask is all 3, then mul with b and sub from a.
5513 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
5514 return vmls_lane_u16(a, b, v, 3);
5517 // CHECK-LABEL: @test_vmlsq_lane_u16(
5518 // CHECK-NEXT: entry:
5519 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
5520 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5521 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5522 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5523 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5524 // CHECK-NEXT: ret <8 x i16> [[SUB]]
// Calls vmlsq_lane_u16 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of v whose 8-element mask is all 3, then mul with b and sub from a.
5526 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
5527 return vmlsq_lane_u16(a, b, v, 3);
5530 // CHECK-LABEL: @test_vmls_lane_u32(
5531 // CHECK-NEXT: entry:
5532 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5533 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5534 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
5535 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5536 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5537 // CHECK-NEXT: ret <2 x i32> [[SUB]]
// Calls vmls_lane_u32 with constant lane index 1. The FileCheck assertions above expect a
// shufflevector of v whose 2-element mask is all 1, then mul with b and sub from a.
5539 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
5540 return vmls_lane_u32(a, b, v, 1);
5543 // CHECK-LABEL: @test_vmlsq_lane_u32(
5544 // CHECK-NEXT: entry:
5545 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
5546 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5547 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5548 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5549 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5550 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Calls vmlsq_lane_u32 with constant lane index 1. The FileCheck assertions above expect a
// shufflevector of v whose 4-element mask is all 1, then mul with b and sub from a.
5552 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
5553 return vmlsq_lane_u32(a, b, v, 1);
5556 // CHECK-LABEL: @test_vmls_laneq_u16(
5557 // CHECK-NEXT: entry:
5558 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5559 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5560 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5561 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
5562 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
5563 // CHECK-NEXT: ret <4 x i16> [[SUB]]
// Calls vmls_laneq_u16 with constant lane index 7. The FileCheck assertions above expect a
// shufflevector of the <8 x i16> v whose 4-element mask is all 7, then mul with b and sub from a.
5565 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
5566 return vmls_laneq_u16(a, b, v, 7);
5569 // CHECK-LABEL: @test_vmlsq_laneq_u16(
5570 // CHECK-NEXT: entry:
5571 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5572 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5573 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5574 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
5575 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
5576 // CHECK-NEXT: ret <8 x i16> [[SUB]]
// Calls vmlsq_laneq_u16 with constant lane index 7. The FileCheck assertions above expect a
// shufflevector of the <8 x i16> v whose 8-element mask is all 7, then mul with b and sub from a.
5578 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
5579 return vmlsq_laneq_u16(a, b, v, 7);
5582 // CHECK-LABEL: @test_vmls_laneq_u32(
5583 // CHECK-NEXT: entry:
5584 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5585 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5586 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5587 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
5588 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
5589 // CHECK-NEXT: ret <2 x i32> [[SUB]]
// Calls vmls_laneq_u32 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of the <4 x i32> v whose 2-element mask is all 3, then mul with b and sub from a.
5591 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
5592 return vmls_laneq_u32(a, b, v, 3);
5595 // CHECK-LABEL: @test_vmlsq_laneq_u32(
5596 // CHECK-NEXT: entry:
5597 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5598 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5599 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5600 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
5601 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
5602 // CHECK-NEXT: ret <4 x i32> [[SUB]]
// Calls vmlsq_laneq_u32 with constant lane index 3. The FileCheck assertions above expect a
// shufflevector of the <4 x i32> v whose 4-element mask is all 3, then mul with b and sub from a.
5604 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
5605 return vmlsq_laneq_u32(a, b, v, 3);
5608 // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
5609 // CHECK-NEXT: entry:
5610 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5611 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5612 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5613 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5614 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
5615 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5616 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
5617 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5618 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s16 with constant lane index 7. The FileCheck assertions above expect an
// all-7 mask shuffle of v, then sqdmull.v4i32(b, lane) and sqsub.v4i32(a, product).
5620 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
5621 return vqdmlsl_laneq_s16(a, b, v, 7);
5624 // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
5625 // CHECK-NEXT: entry:
5626 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5627 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5628 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5629 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5630 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
5631 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5632 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
5633 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5634 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s32 with constant lane index 3. The FileCheck assertions above expect an
// all-3 mask shuffle of v, then sqdmull.v2i64(b, lane) and sqsub.v2i64(a, product).
5636 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
5637 return vqdmlsl_laneq_s32(a, b, v, 3);
5640 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
5641 // CHECK-NEXT: entry:
5642 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5643 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5644 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5645 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
5646 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5647 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
5648 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
5649 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
5650 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
5651 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s16 with constant lane index 7. The FileCheck assertions above expect
// elements 4..7 of b to be extracted, then sqdmull.v4i32 with the all-7 shuffle of v and sqsub.v4i32 with a.
5653 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
5654 return vqdmlsl_high_laneq_s16(a, b, v, 7);
// Exercises vqdmlsl_high_laneq_s32 with constant lane index 3 of `v`.
// Expected IR: elements 2-3 of `b` are extracted with a shufflevector, lane 3 of
// `v` is splatted to <2 x i32>, both are passed to sqdmull.v2i64, and that
// result is passed together with `a` to sqsub.v2i64.
5657 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
5658 // CHECK-NEXT: entry:
5659 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
5660 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5661 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5662 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
5663 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
5664 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
5665 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
5666 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
5667 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
5668 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
5670 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
5671 return vqdmlsl_high_laneq_s32(a, b, v, 3);
// Exercises vqdmulh_laneq_s16 with constant lane index 7 of `v`.
// Expected IR: `a` and `v` are each bitcast to a byte vector and back, then
// passed with the lane index (i32 7) directly to
// llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16; no lane-splat shufflevector
// appears between entry and ret.
5674 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
5675 // CHECK-NEXT: entry:
5676 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5677 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5678 // CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5679 // CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5680 // CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
5681 // CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
5683 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
5684 return vqdmulh_laneq_s16(a, v, 7);
// Exercises vqdmulhq_laneq_s16 with constant lane index 7 of `v`.
// Expected IR: `a` and `v` are each bitcast to <16 x i8> and back to <8 x i16>,
// then passed with the lane index (i32 7) directly to
// llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16.
5687 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
5688 // CHECK-NEXT: entry:
5689 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5690 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5691 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5692 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5693 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
5694 // CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
5696 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
5697 return vqdmulhq_laneq_s16(a, v, 7);
// Exercises vqdmulh_laneq_s32 with constant lane index 3 of `v`.
// Expected IR: `a` and `v` are each bitcast to a byte vector and back, then
// passed with the lane index (i32 3) directly to
// llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32.
5700 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
5701 // CHECK-NEXT: entry:
5702 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5703 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5704 // CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5705 // CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5706 // CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
5707 // CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
5709 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
5710 return vqdmulh_laneq_s32(a, v, 3);
// Exercises vqdmulhq_laneq_s32 with constant lane index 3 of `v`.
// Expected IR: `a` and `v` are each bitcast to <16 x i8> and back to <4 x i32>,
// then passed with the lane index (i32 3) directly to
// llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32.
5713 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
5714 // CHECK-NEXT: entry:
5715 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5716 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5717 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5718 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5719 // CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
5720 // CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
5722 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
5723 return vqdmulhq_laneq_s32(a, v, 3);
// Exercises vqrdmulh_laneq_s16 with constant lane index 7 of `v`.
// Expected IR: `a` and `v` are each bitcast to a byte vector and back, then
// passed with the lane index (i32 7) directly to
// llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16.
5726 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
5727 // CHECK-NEXT: entry:
5728 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
5729 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5730 // CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
5731 // CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5732 // CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
5733 // CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
5735 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
5736 return vqrdmulh_laneq_s16(a, v, 7);
// Exercises vqrdmulhq_laneq_s16 with constant lane index 7 of `v`.
// Expected IR: `a` and `v` are each bitcast to <16 x i8> and back to <8 x i16>,
// then passed with the lane index (i32 7) directly to
// llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16.
5739 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
5740 // CHECK-NEXT: entry:
5741 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
5742 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
5743 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
5744 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
5745 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
5746 // CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
5748 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
5749 return vqrdmulhq_laneq_s16(a, v, 7);
// Exercises vqrdmulh_laneq_s32 with constant lane index 3 of `v`.
// Expected IR: `a` and `v` are each bitcast to a byte vector and back, then
// passed with the lane index (i32 3) directly to
// llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32.
5752 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
5753 // CHECK-NEXT: entry:
5754 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
5755 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5756 // CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
5757 // CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5758 // CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
5759 // CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
5761 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
5762 return vqrdmulh_laneq_s32(a, v, 3);
// Exercises vqrdmulhq_laneq_s32 with constant lane index 3 of `v`.
// Expected IR: `a` and `v` are each bitcast to <16 x i8> and back to <4 x i32>,
// then passed with the lane index (i32 3) directly to
// llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32.
5765 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
5766 // CHECK-NEXT: entry:
5767 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
5768 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
5769 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5770 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5771 // CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
5772 // CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
5774 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
5775 return vqrdmulhq_laneq_s32(a, v, 3);