// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
// REQUIRES: aarch64-registered-target
// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics
13 // CHECK-LABEL: @test_vfmlal_low_f16(
15 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
16 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
17 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
18 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
19 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
21 float32x2_t
test_vfmlal_low_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
22 return vfmlal_low_f16(a
, b
, c
);
25 // CHECK-LABEL: @test_vfmlsl_low_f16(
27 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
28 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
29 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
30 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
31 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
33 float32x2_t
test_vfmlsl_low_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
34 return vfmlsl_low_f16(a
, b
, c
);
37 // CHECK-LABEL: @test_vfmlal_high_f16(
39 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
40 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
41 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
42 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
43 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
45 float32x2_t
test_vfmlal_high_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
46 return vfmlal_high_f16(a
, b
, c
);
49 // CHECK-LABEL: @test_vfmlsl_high_f16(
51 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
52 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
53 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
54 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
55 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
57 float32x2_t
test_vfmlsl_high_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
58 return vfmlsl_high_f16(a
, b
, c
);
61 // CHECK-LABEL: @test_vfmlalq_low_f16(
63 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
64 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
65 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
66 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
67 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
69 float32x4_t
test_vfmlalq_low_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
70 return vfmlalq_low_f16(a
, b
, c
);
73 // CHECK-LABEL: @test_vfmlslq_low_f16(
75 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
76 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
77 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
78 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
79 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
81 float32x4_t
test_vfmlslq_low_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
82 return vfmlslq_low_f16(a
, b
, c
);
85 // CHECK-LABEL: @test_vfmlalq_high_f16(
87 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
88 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
89 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
90 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
91 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
93 float32x4_t
test_vfmlalq_high_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
94 return vfmlalq_high_f16(a
, b
, c
);
97 // CHECK-LABEL: @test_vfmlslq_high_f16(
99 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
100 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
101 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
102 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
103 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
105 float32x4_t
test_vfmlslq_high_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
106 return vfmlslq_high_f16(a
, b
, c
);
111 // CHECK-LABEL: @test_vfmlal_lane_low_f16(
112 // CHECK-NEXT: entry:
113 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
114 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
115 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
116 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
117 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
118 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
119 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
120 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
121 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
122 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
123 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
124 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
125 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
126 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
127 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
128 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
129 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
130 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
131 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
132 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
133 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
134 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
135 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0
136 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
137 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
138 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
139 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
140 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
141 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0
142 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
143 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
144 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
145 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
146 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
147 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
148 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
149 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
151 float32x2_t
test_vfmlal_lane_low_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
152 return vfmlal_lane_low_f16(a
, b
, c
, 0);
155 // CHECK-LABEL: @test_vfmlal_lane_high_f16(
156 // CHECK-NEXT: entry:
157 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
158 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
159 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
160 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
161 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
162 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
163 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
164 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
165 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
166 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
167 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
168 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
169 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
170 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
171 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
172 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
173 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
174 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
175 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
176 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
177 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
178 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
179 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 1
180 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
181 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
182 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
183 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
184 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
185 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1
186 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
187 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
188 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
189 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
190 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
191 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
192 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
193 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
195 float32x2_t
test_vfmlal_lane_high_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
196 return vfmlal_lane_high_f16(a
, b
, c
, 1);
199 // CHECK-LABEL: @test_vfmlalq_lane_low_f16(
200 // CHECK-NEXT: entry:
201 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
202 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
203 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
204 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
205 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
206 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
207 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
208 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
209 // CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
210 // CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2
211 // CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
212 // CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2
213 // CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
214 // CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2
215 // CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
216 // CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2
217 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
218 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
219 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
220 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
221 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
222 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
223 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
224 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
225 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
226 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
227 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
228 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
229 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
230 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
231 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2
232 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
233 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
234 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
235 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
236 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
237 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2
238 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
239 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
240 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
241 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85134]], align 8
242 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, ptr [[__REINT_85134]], align 8
243 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2
244 // CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_85135]], align 2
245 // CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85135]], align 2
246 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
247 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85144]], align 8
248 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, ptr [[__REINT_85144]], align 8
249 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2
250 // CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_85145]], align 2
251 // CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85145]], align 2
252 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
253 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85154]], align 8
254 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, ptr [[__REINT_85154]], align 8
255 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2
256 // CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_85155]], align 2
257 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85155]], align 2
258 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
259 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85164]], align 8
260 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, ptr [[__REINT_85164]], align 8
261 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2
262 // CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_85165]], align 2
263 // CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85165]], align 2
264 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
265 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
266 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
267 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
268 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
269 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
271 float32x4_t
test_vfmlalq_lane_low_f16(float32x4_t a
, float16x8_t b
, float16x4_t c
) {
272 return vfmlalq_lane_low_f16(a
, b
, c
, 2);
275 // CHECK-LABEL: @test_vfmlalq_lane_high_f16(
276 // CHECK-NEXT: entry:
277 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
278 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
279 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
280 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
281 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
282 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
283 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
284 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
285 // CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
286 // CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2
287 // CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
288 // CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2
289 // CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
290 // CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2
291 // CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
292 // CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2
293 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
294 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
295 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
296 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
297 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
298 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
299 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
300 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
301 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
302 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
303 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
304 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
305 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
306 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
307 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3
308 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
309 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
310 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
311 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
312 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
313 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 3
314 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
315 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
316 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
317 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85134]], align 8
318 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, ptr [[__REINT_85134]], align 8
319 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3
320 // CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_85135]], align 2
321 // CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85135]], align 2
322 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
323 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85144]], align 8
324 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, ptr [[__REINT_85144]], align 8
325 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3
326 // CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_85145]], align 2
327 // CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85145]], align 2
328 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
329 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85154]], align 8
330 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, ptr [[__REINT_85154]], align 8
331 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3
332 // CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_85155]], align 2
333 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85155]], align 2
334 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
335 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85164]], align 8
336 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, ptr [[__REINT_85164]], align 8
337 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3
338 // CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_85165]], align 2
339 // CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85165]], align 2
340 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
341 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
342 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
343 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
344 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
345 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
347 float32x4_t
test_vfmlalq_lane_high_f16(float32x4_t a
, float16x8_t b
, float16x4_t c
) {
348 return vfmlalq_lane_high_f16(a
, b
, c
, 3);
351 // CHECK-LABEL: @test_vfmlal_laneq_low_f16(
352 // CHECK-NEXT: entry:
353 // CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
354 // CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
355 // CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
356 // CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
357 // CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
358 // CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
359 // CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
360 // CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
361 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
362 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
363 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
364 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
365 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
366 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
367 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
368 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
369 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
370 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
371 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
372 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
373 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
374 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
375 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4
376 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
377 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
378 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
379 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
380 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
381 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4
382 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
383 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
384 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
385 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
386 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
387 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
388 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
389 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
391 float32x2_t
test_vfmlal_laneq_low_f16(float32x2_t a
, float16x4_t b
, float16x8_t c
) {
392 return vfmlal_laneq_low_f16(a
, b
, c
, 4);
395 // CHECK-LABEL: @test_vfmlal_laneq_high_f16(
396 // CHECK-NEXT: entry:
397 // CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
398 // CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
399 // CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
400 // CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
401 // CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
402 // CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
403 // CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
404 // CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
405 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
406 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
407 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
408 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
409 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
410 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
411 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
412 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
413 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
414 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
415 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
416 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
417 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
418 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
419 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5
420 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
421 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
422 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
423 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
424 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
425 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5
426 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
427 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
428 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
429 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
430 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
431 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
432 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
433 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
435 float32x2_t
test_vfmlal_laneq_high_f16(float32x2_t a
, float16x4_t b
, float16x8_t c
) {
436 return vfmlal_laneq_high_f16(a
, b
, c
, 5);
439 // CHECK-LABEL: @test_vfmlalq_laneq_low_f16(
440 // CHECK-NEXT: entry:
441 // CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
442 // CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
443 // CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
444 // CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
445 // CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
446 // CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
447 // CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
448 // CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
449 // CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
450 // CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2
451 // CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
452 // CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2
453 // CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
454 // CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2
455 // CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
456 // CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2
457 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
458 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
459 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
460 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
461 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
462 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
463 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
464 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
465 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
466 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
467 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
468 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
469 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
470 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
471 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6
472 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
473 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
474 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
475 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
476 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
477 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6
478 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
479 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
480 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
481 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85434]], align 16
482 // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, ptr [[__REINT_85434]], align 16
483 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6
484 // CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85435]], align 2
485 // CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85435]], align 2
486 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
487 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85444]], align 16
488 // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, ptr [[__REINT_85444]], align 16
489 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6
490 // CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85445]], align 2
491 // CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85445]], align 2
492 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
493 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85454]], align 16
494 // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, ptr [[__REINT_85454]], align 16
495 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6
496 // CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85455]], align 2
497 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85455]], align 2
498 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
499 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85464]], align 16
500 // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, ptr [[__REINT_85464]], align 16
501 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6
502 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85465]], align 2
503 // CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85465]], align 2
504 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
505 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
506 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
507 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
508 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
509 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
// Wrapper exercising vfmlalq_laneq_low_f16 with constant lane index 6.
// The CHECK lines above expect it to lower to the
// @llvm.aarch64.neon.fmlal.v4f32.v8f16 intrinsic, with lane 6 of c
// splatted into the third operand (the VECINIT insertelement chain).
511 float32x4_t
test_vfmlalq_laneq_low_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
512 return vfmlalq_laneq_low_f16(a
, b
, c
, 6);
515 // CHECK-LABEL: @test_vfmlalq_laneq_high_f16(
516 // CHECK-NEXT: entry:
517 // CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
518 // CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
519 // CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
520 // CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
521 // CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
522 // CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
523 // CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
524 // CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
525 // CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
526 // CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2
527 // CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
528 // CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2
529 // CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
530 // CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2
531 // CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
532 // CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2
533 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
534 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
535 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
536 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
537 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
538 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
539 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
540 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
541 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
542 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
543 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
544 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
545 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
546 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
547 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7
548 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
549 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
550 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
551 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
552 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
553 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7
554 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
555 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
556 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
557 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85434]], align 16
558 // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, ptr [[__REINT_85434]], align 16
559 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7
560 // CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85435]], align 2
561 // CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85435]], align 2
562 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
563 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85444]], align 16
564 // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, ptr [[__REINT_85444]], align 16
565 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7
566 // CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85445]], align 2
567 // CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85445]], align 2
568 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
569 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85454]], align 16
570 // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, ptr [[__REINT_85454]], align 16
571 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7
572 // CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85455]], align 2
573 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85455]], align 2
574 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
575 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85464]], align 16
576 // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, ptr [[__REINT_85464]], align 16
577 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7
578 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85465]], align 2
579 // CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85465]], align 2
580 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
581 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
582 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
583 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
584 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
585 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
// Wrapper exercising vfmlalq_laneq_high_f16 with constant lane index 7.
// The CHECK lines above expect it to lower to the
// @llvm.aarch64.neon.fmlal2.v4f32.v8f16 intrinsic, with lane 7 of c
// splatted into the third operand (the VECINIT insertelement chain).
587 float32x4_t
test_vfmlalq_laneq_high_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
588 return vfmlalq_laneq_high_f16(a
, b
, c
, 7);
591 // CHECK-LABEL: @test_vfmlsl_lane_low_f16(
592 // CHECK-NEXT: entry:
593 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
594 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
595 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
596 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
597 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
598 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
599 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
600 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
601 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
602 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
603 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
604 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
605 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
606 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
607 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
608 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
609 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
610 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
611 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
612 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
613 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
614 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
615 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0
616 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
617 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
618 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
619 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
620 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
621 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0
622 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
623 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
624 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
625 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
626 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
627 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
628 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
629 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
// Wrapper exercising vfmlsl_lane_low_f16 with constant lane index 0.
// The CHECK lines above expect it to lower to the
// @llvm.aarch64.neon.fmlsl.v2f32.v4f16 intrinsic, with lane 0 of c
// splatted into the third operand.
631 float32x2_t
test_vfmlsl_lane_low_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
632 return vfmlsl_lane_low_f16(a
, b
, c
, 0);
635 // CHECK-LABEL: @test_vfmlsl_lane_high_f16(
636 // CHECK-NEXT: entry:
637 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
638 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
639 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
640 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
641 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
642 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
643 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
644 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
645 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
646 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
647 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
648 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
649 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
650 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
651 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
652 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
653 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
654 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
655 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
656 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
657 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
658 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
659 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 1
660 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
661 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
662 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
663 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
664 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
665 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1
666 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
667 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
668 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
669 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
670 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
671 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
672 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
673 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
// Wrapper exercising vfmlsl_lane_high_f16 with constant lane index 1.
// The CHECK lines above expect it to lower to the
// @llvm.aarch64.neon.fmlsl2.v2f32.v4f16 intrinsic, with lane 1 of c
// splatted into the third operand.
675 float32x2_t
test_vfmlsl_lane_high_f16(float32x2_t a
, float16x4_t b
, float16x4_t c
) {
676 return vfmlsl_lane_high_f16(a
, b
, c
, 1);
679 // CHECK-LABEL: @test_vfmlslq_lane_low_f16(
680 // CHECK-NEXT: entry:
681 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
682 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
683 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
684 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
685 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
686 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
687 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
688 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
689 // CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
690 // CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2
691 // CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
692 // CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2
693 // CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
694 // CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2
695 // CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
696 // CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2
697 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
698 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
699 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
700 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
701 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
702 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
703 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
704 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
705 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
706 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
707 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
708 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
709 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
710 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
711 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2
712 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
713 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
714 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
715 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
716 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
717 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2
718 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
719 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
720 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
721 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85134]], align 8
722 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, ptr [[__REINT_85134]], align 8
723 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2
724 // CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_85135]], align 2
725 // CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85135]], align 2
726 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
727 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85144]], align 8
728 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, ptr [[__REINT_85144]], align 8
729 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2
730 // CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_85145]], align 2
731 // CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85145]], align 2
732 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
733 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85154]], align 8
734 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, ptr [[__REINT_85154]], align 8
735 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2
736 // CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_85155]], align 2
737 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85155]], align 2
738 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
739 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85164]], align 8
740 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, ptr [[__REINT_85164]], align 8
741 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2
742 // CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_85165]], align 2
743 // CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85165]], align 2
744 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
745 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
746 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
747 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
748 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
749 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
// Wrapper exercising vfmlslq_lane_low_f16 with constant lane index 2.
// Note c is a 64-bit float16x4_t here (the _lane variant); the CHECK lines
// above expect lane 2 of c to be broadcast into an 8 x half vector and the
// call to lower to @llvm.aarch64.neon.fmlsl.v4f32.v8f16.
751 float32x4_t
test_vfmlslq_lane_low_f16(float32x4_t a
, float16x8_t b
, float16x4_t c
) {
752 return vfmlslq_lane_low_f16(a
, b
, c
, 2);
755 // CHECK-LABEL: @test_vfmlslq_lane_high_f16(
756 // CHECK-NEXT: entry:
757 // CHECK-NEXT: [[__REINT_851:%.*]] = alloca <4 x half>, align 8
758 // CHECK-NEXT: [[__REINT1_851:%.*]] = alloca i16, align 2
759 // CHECK-NEXT: [[__REINT_8514:%.*]] = alloca <4 x half>, align 8
760 // CHECK-NEXT: [[__REINT1_8515:%.*]] = alloca i16, align 2
761 // CHECK-NEXT: [[__REINT_85114:%.*]] = alloca <4 x half>, align 8
762 // CHECK-NEXT: [[__REINT1_85115:%.*]] = alloca i16, align 2
763 // CHECK-NEXT: [[__REINT_85124:%.*]] = alloca <4 x half>, align 8
764 // CHECK-NEXT: [[__REINT1_85125:%.*]] = alloca i16, align 2
765 // CHECK-NEXT: [[__REINT_85134:%.*]] = alloca <4 x half>, align 8
766 // CHECK-NEXT: [[__REINT1_85135:%.*]] = alloca i16, align 2
767 // CHECK-NEXT: [[__REINT_85144:%.*]] = alloca <4 x half>, align 8
768 // CHECK-NEXT: [[__REINT1_85145:%.*]] = alloca i16, align 2
769 // CHECK-NEXT: [[__REINT_85154:%.*]] = alloca <4 x half>, align 8
770 // CHECK-NEXT: [[__REINT1_85155:%.*]] = alloca i16, align 2
771 // CHECK-NEXT: [[__REINT_85164:%.*]] = alloca <4 x half>, align 8
772 // CHECK-NEXT: [[__REINT1_85165:%.*]] = alloca i16, align 2
773 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_851]], align 8
774 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_851]], align 8
775 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
776 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_851]], align 2
777 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_851]], align 2
778 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
779 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8514]], align 8
780 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_8514]], align 8
781 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
782 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8515]], align 2
783 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8515]], align 2
784 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
785 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85114]], align 8
786 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__REINT_85114]], align 8
787 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3
788 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_85115]], align 2
789 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85115]], align 2
790 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
791 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85124]], align 8
792 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__REINT_85124]], align 8
793 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 3
794 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_85125]], align 2
795 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85125]], align 2
796 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
797 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85134]], align 8
798 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, ptr [[__REINT_85134]], align 8
799 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3
800 // CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_85135]], align 2
801 // CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85135]], align 2
802 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
803 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85144]], align 8
804 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, ptr [[__REINT_85144]], align 8
805 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3
806 // CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_85145]], align 2
807 // CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85145]], align 2
808 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
809 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85154]], align 8
810 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, ptr [[__REINT_85154]], align 8
811 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3
812 // CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_85155]], align 2
813 // CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85155]], align 2
814 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
815 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_85164]], align 8
816 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, ptr [[__REINT_85164]], align 8
817 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3
818 // CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_85165]], align 2
819 // CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85165]], align 2
820 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
821 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
822 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
823 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
824 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
825 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
// Wrapper exercising vfmlslq_lane_high_f16 with constant lane index 3.
// Note c is a 64-bit float16x4_t here (the _lane variant); the CHECK lines
// above expect lane 3 of c to be broadcast into an 8 x half vector and the
// call to lower to @llvm.aarch64.neon.fmlsl2.v4f32.v8f16.
827 float32x4_t
test_vfmlslq_lane_high_f16(float32x4_t a
, float16x8_t b
, float16x4_t c
) {
828 return vfmlslq_lane_high_f16(a
, b
, c
, 3);
831 // CHECK-LABEL: @test_vfmlsl_laneq_low_f16(
832 // CHECK-NEXT: entry:
833 // CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
834 // CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
835 // CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
836 // CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
837 // CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
838 // CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
839 // CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
840 // CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
841 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
842 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
843 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
844 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
845 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
846 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
847 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
848 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
849 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
850 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
851 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
852 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
853 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
854 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
855 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4
856 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
857 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
858 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
859 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
860 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
861 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4
862 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
863 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
864 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
865 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
866 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
867 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
868 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
869 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
// Wrapper exercising vfmlsl_laneq_low_f16 with constant lane index 4.
// Note c is a 128-bit float16x8_t here (the _laneq variant); the CHECK
// lines above expect lane 4 of c to be broadcast into a 4 x half vector
// and the call to lower to @llvm.aarch64.neon.fmlsl.v2f32.v4f16.
871 float32x2_t
test_vfmlsl_laneq_low_f16(float32x2_t a
, float16x4_t b
, float16x8_t c
) {
872 return vfmlsl_laneq_low_f16(a
, b
, c
, 4);
875 // CHECK-LABEL: @test_vfmlsl_laneq_high_f16(
876 // CHECK-NEXT: entry:
877 // CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
878 // CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
879 // CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
880 // CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
881 // CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
882 // CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
883 // CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
884 // CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
885 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
886 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5
// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5
// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
915 float32x2_t
test_vfmlsl_laneq_high_f16(float32x2_t a
, float16x4_t b
, float16x8_t c
) {
916 return vfmlsl_laneq_high_f16(a
, b
, c
, 5);
// CHECK-LABEL: @test_vfmlslq_laneq_low_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2
// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85434]], align 16
// CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, ptr [[__REINT_85434]], align 16
// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85435]], align 2
// CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85435]], align 2
// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85444]], align 16
// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, ptr [[__REINT_85444]], align 16
// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85445]], align 2
// CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85445]], align 2
// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85454]], align 16
// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, ptr [[__REINT_85454]], align 16
// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85455]], align 2
// CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85455]], align 2
// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85464]], align 16
// CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, ptr [[__REINT_85464]], align 16
// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6
// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85465]], align 2
// CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85465]], align 2
// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
991 float32x4_t
test_vfmlslq_laneq_low_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
992 return vfmlslq_laneq_low_f16(a
, b
, c
, 6);
// CHECK-LABEL: @test_vfmlslq_laneq_high_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[__REINT_854:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_854:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_8544:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_8545:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85414:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85415:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85424:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85425:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85434:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85435:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85444:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85445:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85454:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85455:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[__REINT_85464:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[__REINT1_85465:%.*]] = alloca i16, align 2
// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_854]], align 16
// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_854]], align 16
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_854]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_854]], align 2
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8544]], align 16
// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__REINT_8544]], align 16
// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8545]], align 2
// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_8545]], align 2
// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85414]], align 16
// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__REINT_85414]], align 16
// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85415]], align 2
// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85415]], align 2
// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85424]], align 16
// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__REINT_85424]], align 16
// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85425]], align 2
// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85425]], align 2
// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85434]], align 16
// CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, ptr [[__REINT_85434]], align 16
// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85435]], align 2
// CHECK-NEXT: [[TMP19:%.*]] = load half, ptr [[__REINT1_85435]], align 2
// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85444]], align 16
// CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, ptr [[__REINT_85444]], align 16
// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85445]], align 2
// CHECK-NEXT: [[TMP23:%.*]] = load half, ptr [[__REINT1_85445]], align 2
// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85454]], align 16
// CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, ptr [[__REINT_85454]], align 16
// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85455]], align 2
// CHECK-NEXT: [[TMP27:%.*]] = load half, ptr [[__REINT1_85455]], align 2
// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85464]], align 16
// CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, ptr [[__REINT_85464]], align 16
// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7
// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85465]], align 2
// CHECK-NEXT: [[TMP31:%.*]] = load half, ptr [[__REINT1_85465]], align 2
// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
1067 float32x4_t
test_vfmlslq_laneq_high_f16(float32x4_t a
, float16x8_t b
, float16x8_t c
) {
1068 return vfmlslq_laneq_high_f16(a
, b
, c
, 7);