1 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
; Intrinsic declarations used by the functions in this file: the AArch64 NEON
; fmulx intrinsic and the generic llvm.fma intrinsic, each in scalar half,
; <4 x half> and <8 x half> forms.
; NOTE(review): attribute group #1 on the scalar fma declaration is not defined
; in the lines visible here - presumably it is defined elsewhere in the file.
3 declare half @llvm.aarch64.neon.fmulx.f16(half, half)
4 declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
5 declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>)
6 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
7 declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
8 declare half @llvm.fma.f16(half, half, half) #1
; Fused multiply-add by lane, 4 x half, lane taken from a <4 x half> vector.
; The all-zero shuffle mask broadcasts element 0 of %c to every lane; the
; result is fma(%b, splat, %a), i.e. %a + %b * %c[0] per lane.
; Expected output: a dup of v2.h[0] followed by a vector fmla into v0.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
10 define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
11 ; CHECK-LABEL: t_vfma_lane_f16:
12 ; CHECK: dup v2.4h, v2.h[0]
13 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
16 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
17 %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %lane1, <4 x half> %a)
; Fused multiply-add by lane, 8 x half, lane taken from a <4 x half> vector.
; Element 0 of the narrower %c is broadcast to eight lanes by the all-zero
; shuffle mask, then fma(%b, splat, %a) is computed.
; Expected output: dup v2.8h from v2.h[0], then a vector fmla into v0.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
21 define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
22 ; CHECK-LABEL: t_vfmaq_lane_f16:
23 ; CHECK: dup v2.8h, v2.h[0]
24 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
27 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
28 %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %lane1, <8 x half> %a)
; Fused multiply-add by lane, 4 x half, lane taken from an <8 x half> vector.
; Element 0 of %c is broadcast to four lanes; here the splat is the FIRST
; multiplicand: fma(splat, %b, %a).
; Expected output: dup v2.4h from v2.h[0], then fmla v0.4h, v1.4h, v2.4h
; (note the multiplicand order differs from the t_vfma_lane_f16 expectation).
; %lane is not referenced in the visible body; the lane index is fixed at 0.
32 define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
33 ; CHECK-LABEL: t_vfma_laneq_f16:
34 ; CHECK: dup v2.4h, v2.h[0]
35 ; CHECK-NEXT: fmla v0.4h, v1.4h, v2.4h
38 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
39 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %b, <4 x half> %a)
; Fused multiply-add by lane, 8 x half, lane taken from an <8 x half> vector.
; Element 0 of %c is broadcast to all eight lanes and used as the first
; multiplicand: fma(splat, %b, %a).
; Expected output: dup v2.8h from v2.h[0], then fmla v0.8h, v1.8h, v2.8h.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
43 define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
44 ; CHECK-LABEL: t_vfmaq_laneq_f16:
45 ; CHECK: dup v2.8h, v2.h[0]
46 ; CHECK-NEXT: fmla v0.8h, v1.8h, v2.8h
49 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
50 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %b, <8 x half> %a)
; Fused multiply-add by scalar, 4 x half.
; The scalar %c is inserted into element 0 of an undef vector and then
; broadcast with an all-zero shuffle mask; the result is fma(%b, splat, %a).
; Expected output: dup v2.4h from v2.h[0], then a vector fmla into v0.
; NOTE(review): the call carries attribute group #4, which is not defined in
; the lines visible here.
54 define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
55 ; CHECK-LABEL: t_vfma_n_f16:
56 ; CHECK: dup v2.4h, v2.h[0]
57 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
60 %vecinit = insertelement <4 x half> undef, half %c, i32 0
61 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
62 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a) #4
; Fused multiply-add by scalar, 8 x half.
; The scalar %c is inserted into element 0 of an undef vector and broadcast
; to eight lanes; the result is fma(%b, splat, %a).
; Expected output: dup v2.8h from v2.h[0], then a vector fmla into v0.
66 define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
67 ; CHECK-LABEL: t_vfmaq_n_f16:
68 ; CHECK: dup v2.8h, v2.h[0]
69 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
72 %vecinit = insertelement <8 x half> undef, half %c, i32 0
73 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
74 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a) #4
; Scalar half fused multiply-add with one multiplicand read from a vector.
; Element 0 of the <4 x half> %c is extracted and used in fma(%b, elt, %a).
; Expected output: a single scalar fmadd on h registers (no lane-indexed form).
; %lane is not referenced in the visible body; the element index is fixed at 0.
78 define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
79 ; CHECK-LABEL: t_vfmah_lane_f16:
80 ; CHECK: fmadd h0, h1, h2, h0
83 %extract = extractelement <4 x half> %c, i32 0
84 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Scalar half fused multiply-add with one multiplicand read from a vector.
; Same as the <4 x half> variant but %c is <8 x half>; element 0 is extracted
; and used in fma(%b, elt, %a).
; Expected output: a single scalar fmadd on h registers.
; %lane is not referenced in the visible body; the element index is fixed at 0.
88 define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
89 ; CHECK-LABEL: t_vfmah_laneq_f16:
90 ; CHECK: fmadd h0, h1, h2, h0
93 %extract = extractelement <8 x half> %c, i32 0
94 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Fused multiply-subtract by lane, 4 x half, lane taken from a <4 x half>.
; 0xH8000 is the half encoding of -0.0, so the fsub computes the negation of
; %b. Element 0 of %c is broadcast and the result is fma(-%b, splat, %a).
; Expected output: an explicit fneg of v1, a dup of v2.h[0], then fmla.
; NOTE(review): unlike the laneq variants in this file, the expectation here is
; fneg + fmla rather than a single fmls - presumably because the negated value
; is the first fma operand; confirm this is the intended codegen.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
98 define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
99 ; CHECK-LABEL: t_vfms_lane_f16:
100 ; CHECK: fneg v1.4h, v1.4h
101 ; CHECK-NEXT: dup v2.4h, v2.h[0]
102 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
105 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
106 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
107 %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %lane1, <4 x half> %a)
108 ret <4 x half> %fmla3
; Fused multiply-subtract by lane, 8 x half, lane taken from a <4 x half>.
; The fsub from a vector of -0.0 (0xH8000) negates %b; element 0 of %c is
; broadcast to eight lanes and the result is fma(-%b, splat, %a).
; Expected output: fneg of v1, dup v2.8h from v2.h[0], then a vector fmla.
; NOTE(review): the expectation is fneg + fmla rather than a single fmls as in
; the laneq variants - confirm this is the intended codegen.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
111 define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
112 ; CHECK-LABEL: t_vfmsq_lane_f16:
113 ; CHECK: fneg v1.8h, v1.8h
114 ; CHECK-NEXT: dup v2.8h, v2.h[0]
115 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
118 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
119 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
120 %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %lane1, <8 x half> %a)
121 ret <8 x half> %fmla3
; Fused multiply-subtract by lane, 4 x half, lane taken from an <8 x half>.
; The fsub from -0.0 (0xH8000) negates %b; element 0 of %c is broadcast and
; used as the first multiplicand: fma(splat, -%b, %a).
; Expected output: dup v2.4h from v2.h[0], then a single fmls into v0 - the
; negation is absorbed into the instruction, with no separate fneg expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
124 define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
125 ; CHECK-LABEL: t_vfms_laneq_f16:
126 ; CHECK: dup v2.4h, v2.h[0]
127 ; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h
130 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
131 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
132 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %sub, <4 x half> %a)
; Fused multiply-subtract by lane, 8 x half, lane taken from an <8 x half>.
; The fsub from -0.0 (0xH8000) negates %b; element 0 of %c is broadcast to all
; eight lanes and used as the first multiplicand: fma(splat, -%b, %a).
; Expected output: dup v2.8h from v2.h[0], then a single fmls into v0, with no
; separate fneg expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
136 define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
137 ; CHECK-LABEL: t_vfmsq_laneq_f16:
138 ; CHECK: dup v2.8h, v2.h[0]
139 ; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
142 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
143 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
144 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %sub, <8 x half> %a)
; Fused multiply-subtract by scalar, 4 x half.
; The fsub from -0.0 (0xH8000) negates %b. The scalar %c is inserted into
; element 0 of an undef vector and broadcast; the result is
; fma(-%b, splat, %a).
; Expected output: fneg of v1, dup v2.4h from v2.h[0], then a vector fmla.
; NOTE(review): fneg + fmla is expected here rather than a single fmls -
; confirm this is the intended codegen.
148 define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
149 ; CHECK-LABEL: t_vfms_n_f16:
150 ; CHECK: fneg v1.4h, v1.4h
151 ; CHECK-NEXT: dup v2.4h, v2.h[0]
152 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
155 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
156 %vecinit = insertelement <4 x half> undef, half %c, i32 0
157 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
158 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %vecinit3, <4 x half> %a) #4
; Fused multiply-subtract by scalar, 8 x half.
; The fsub from -0.0 (0xH8000) negates %b. The scalar %c is inserted into
; element 0 of an undef vector and broadcast to eight lanes; the result is
; fma(-%b, splat, %a).
; Expected output: fneg of v1, dup v2.8h from v2.h[0], then a vector fmla.
; NOTE(review): fneg + fmla is expected here rather than a single fmls -
; confirm this is the intended codegen.
162 define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
163 ; CHECK-LABEL: t_vfmsq_n_f16:
164 ; CHECK: fneg v1.8h, v1.8h
165 ; CHECK-NEXT: dup v2.8h, v2.h[0]
166 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
169 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
170 %vecinit = insertelement <8 x half> undef, half %c, i32 0
171 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
172 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %vecinit7, <8 x half> %a) #4
; Scalar half fused multiply-subtract with one multiplicand read from a vector.
; The fsub from -0.0 (0xH8000) negates %b; element 0 of the <4 x half> %c is
; extracted and the result is fma(-%b, elt, %a).
; Expected output: a single scalar fmsub on h registers.
; %lane is not referenced in the visible body; the element index is fixed at 0.
176 define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
177 ; CHECK-LABEL: t_vfmsh_lane_f16:
178 ; CHECK: fmsub h0, h1, h2, h0
181 %0 = fsub half 0xH8000, %b
182 %extract = extractelement <4 x half> %c, i32 0
183 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
; Scalar half fused multiply-subtract with one multiplicand read from a vector.
; Same as the <4 x half> variant but %c is <8 x half>: %b is negated via fsub
; from -0.0 (0xH8000), element 0 of %c is extracted, and fma(-%b, elt, %a) is
; computed.
; Expected output: a single scalar fmsub on h registers.
; %lane is not referenced in the visible body; the element index is fixed at 0.
187 define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
188 ; CHECK-LABEL: t_vfmsh_laneq_f16:
189 ; CHECK: fmsub h0, h1, h2, h0
192 %0 = fsub half 0xH8000, %b
193 %extract = extractelement <8 x half> %c, i32 0
194 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
; Multiply by lane, 4 x half, lane taken from an <8 x half> vector.
; Element 0 of %b is broadcast to four lanes and multiplied with %a.
; Expected output: the by-element fmul form (second source v1.h[0]), with no
; separate dup expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
198 define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
199 ; CHECK-LABEL: t_vmul_laneq_f16:
200 ; CHECK: fmul v0.4h, v0.4h, v1.h[0]
203 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> zeroinitializer
204 %mul = fmul <4 x half> %shuffle, %a
; Multiply by lane, 8 x half, lane taken from an <8 x half> vector.
; Element 0 of %b is broadcast to all eight lanes and multiplied with %a.
; Expected output: the by-element fmul form (second source v1.h[0]), with no
; separate dup expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
208 define dso_local <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
209 ; CHECK-LABEL: t_vmulq_laneq_f16:
210 ; CHECK: fmul v0.8h, v0.8h, v1.h[0]
213 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> zeroinitializer
214 %mul = fmul <8 x half> %shuffle, %a
; Scalar half multiply with one operand read from a <4 x half> vector.
; Element 0 of %c is extracted and multiplied with %a.
; Expected output: scalar fmul in by-element form, reading v1.h[0] directly.
; %lane is not referenced in the visible body; the element index is fixed at 0.
218 define dso_local half @t_vmulh_lane_f16(half %a, <4 x half> %c, i32 %lane) {
219 ; CHECK-LABEL: t_vmulh_lane_f16:
220 ; CHECK: fmul h0, h0, v1.h[0]
223 %0 = extractelement <4 x half> %c, i32 0
224 %1 = fmul half %0, %a
; Scalar half multiply with one operand read from an <8 x half> vector.
; Element 0 of %c is extracted and multiplied with %a.
; Expected output: scalar fmul in by-element form, reading v1.h[0] directly.
; %lane is not referenced in the visible body; the element index is fixed at 0.
228 define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
229 ; CHECK-LABEL: t_vmulh_laneq_f16:
230 ; CHECK: fmul h0, h0, v1.h[0]
233 %0 = extractelement <8 x half> %c, i32 0
234 %1 = fmul half %0, %a
; Scalar half call to the AArch64 NEON fmulx intrinsic on %a and %b.
; Expected output: a single scalar fmulx on h registers.
238 define dso_local half @t_vmulx_f16(half %a, half %b) {
239 ; CHECK-LABEL: t_vmulx_f16:
240 ; CHECK: fmulx h0, h0, h1
243 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
; Scalar half fmulx with the second operand read from a <4 x half> vector.
; Element 3 (the highest index of a 4-element vector) of %b is extracted and
; passed to the scalar fmulx intrinsic together with %a.
; Expected output: scalar fmulx in by-element form, reading v1.h[3] directly.
; %lane is not referenced in the visible body; the element index is fixed at 3.
247 define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
248 ; CHECK-LABEL: t_vmulxh_lane_f16:
249 ; CHECK: fmulx h0, h0, v1.h[3]
252 %extract = extractelement <4 x half> %b, i32 3
253 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
; fmulx by lane, 4 x half, lane taken from a <4 x half> vector.
; Element 0 of %b is broadcast to four lanes and passed as the second operand
; of the v4f16 fmulx intrinsic, with %a as the first.
; Expected output: the by-element fmulx form (second source v1.h[0]), with no
; separate dup expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
257 define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
258 ; CHECK-LABEL: t_vmulx_lane_f16:
259 ; CHECK: fmulx v0.4h, v0.4h, v1.h[0]
262 %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> zeroinitializer
263 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %shuffle) #4
264 ret <4 x half> %vmulx2.i
; fmulx by lane, 8 x half, lane taken from a <4 x half> vector.
; Element 0 of the narrower %b is broadcast to eight lanes and passed as the
; second operand of the v8f16 fmulx intrinsic, with %a as the first.
; Expected output: the by-element fmulx form (second source v1.h[0]), with no
; separate dup expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
267 define dso_local <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
268 ; CHECK-LABEL: t_vmulxq_lane_f16:
269 ; CHECK: fmulx v0.8h, v0.8h, v1.h[0]
272 %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> zeroinitializer
273 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %shuffle) #4
274 ret <8 x half> %vmulx2.i
; fmulx by lane, 4 x half, lane taken from an <8 x half> vector.
; Element 0 of %b is broadcast to four lanes and passed as the second operand
; of the v4f16 fmulx intrinsic, with %a as the first.
; Expected output: the by-element fmulx form (second source v1.h[0]), with no
; separate dup expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
277 define dso_local <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
278 ; CHECK-LABEL: t_vmulx_laneq_f16:
279 ; CHECK: fmulx v0.4h, v0.4h, v1.h[0]
282 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> zeroinitializer
283 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %shuffle) #4
284 ret <4 x half> %vmulx2.i
; fmulx by lane, 8 x half, lane taken from an <8 x half> vector.
; Element 0 of %b is broadcast to all eight lanes and passed as the second
; operand of the v8f16 fmulx intrinsic, with %a as the first.
; Expected output: the by-element fmulx form (second source v1.h[0]), with no
; separate dup expected.
; %lane is not referenced in the visible body; the lane index is fixed at 0.
287 define dso_local <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
288 ; CHECK-LABEL: t_vmulxq_laneq_f16:
289 ; CHECK: fmulx v0.8h, v0.8h, v1.h[0]
292 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> zeroinitializer
293 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %shuffle) #4
294 ret <8 x half> %vmulx2.i
; Scalar half fmulx with the second operand read from an <8 x half> vector.
; Element 7 (the highest index of an 8-element vector) of %b is extracted and
; passed to the scalar fmulx intrinsic together with %a.
; Expected output: scalar fmulx in by-element form, reading v1.h[7] directly.
; %lane is not referenced in the visible body; the element index is fixed at 7.
297 define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
298 ; CHECK-LABEL: t_vmulxh_laneq_f16:
299 ; CHECK: fmulx h0, h0, v1.h[7]
302 %extract = extractelement <8 x half> %b, i32 7
303 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
; fmulx by scalar, 4 x half.
; The scalar %c is inserted into element 0 of an undef vector and broadcast
; with an all-zero shuffle mask; the splat is the second operand of the v4f16
; fmulx intrinsic, with %a as the first.
; Expected output: dup v1.4h from v1.h[0], then a full-vector fmulx (not the
; by-element form).
307 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
308 ; CHECK-LABEL: t_vmulx_n_f16:
309 ; CHECK: dup v1.4h, v1.h[0]
310 ; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.4h
313 %vecinit = insertelement <4 x half> undef, half %c, i32 0
314 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
315 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %vecinit3) #4
316 ret <4 x half> %vmulx2.i
; fmulx by scalar, 8 x half.
; The scalar %c is inserted into element 0 of an undef vector and broadcast to
; eight lanes; the splat is the second operand of the v8f16 fmulx intrinsic,
; with %a as the first.
; Expected output: dup v1.8h from v1.h[0], then a full-vector fmulx (not the
; by-element form).
319 define dso_local <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
320 ; CHECK-LABEL: t_vmulxq_n_f16:
321 ; CHECK: dup v1.8h, v1.h[0]
322 ; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.8h
325 %vecinit = insertelement <8 x half> undef, half %c, i32 0
326 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
327 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %vecinit7) #4
328 ret <8 x half> %vmulx2.i