1 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
3 declare half @llvm.aarch64.neon.fmulx.f16(half, half)
4 declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
5 declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>)
6 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
7 declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
8 declare half @llvm.fma.f16(half, half, half) #1
; Test: 4-lane half-precision fused multiply-add where one multiplicand is
; lane 0 of the <4 x half> vector %c broadcast to every lane.
; Computes fma(%b, splat(%c[0]), %a) through @llvm.fma.v4f16.
; Expected output: a dup of v2.h[0] into v2.4h, then on the very next line a
; 4h fmla accumulating into v0.
; NOTE(review): %lane is not referenced in the visible body; the name suggests
; this mirrors the ACLE vfma_lane_f16 intrinsic -- confirm.
10 define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
11 ; CHECK-LABEL: t_vfma_lane_f16:
12 ; CHECK: dup v2.4h, v2.h[0]
13 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
; All-zero shuffle mask: every result lane takes element 0 of %c.
16 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
; %a is the addend (third fma operand); %b and the splat are the multiplicands.
17 %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %lane1, <4 x half> %a)
; Test: 8-lane half-precision fused multiply-add where one multiplicand is
; lane 0 of the <4 x half> vector %c widened and broadcast to eight lanes.
; Computes fma(%b, splat(%c[0]), %a) through @llvm.fma.v8f16.
; Expected output: a dup of v2.h[0] into v2.8h, immediately followed by an
; 8h fmla accumulating into v0.
; NOTE(review): %lane is not referenced in the visible body.
21 define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
22 ; CHECK-LABEL: t_vfmaq_lane_f16:
23 ; CHECK: dup v2.8h, v2.h[0]
24 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
; The 8-element all-zero mask both widens (4 -> 8 lanes) and splats element 0.
27 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
28 %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %lane1, <8 x half> %a)
; Test: 4-lane half-precision fused multiply-add where one multiplicand is
; lane 0 of the <8 x half> vector %c narrowed and broadcast to four lanes.
; Computes fma(splat(%c[0]), %b, %a) through @llvm.fma.v4f16; note the splat
; is the FIRST fma operand here, unlike the non-q lane variant.
; Expected output: a dup of v2.h[0] into v2.4h, immediately followed by a
; 4h fmla into v0 with source operands in v1, v2 order.
; NOTE(review): %lane is not referenced in the visible body.
32 define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
33 ; CHECK-LABEL: t_vfma_laneq_f16:
34 ; CHECK: dup v2.4h, v2.h[0]
35 ; CHECK-NEXT: fmla v0.4h, v1.4h, v2.4h
; The 4-element all-zero mask narrows (8 -> 4 lanes) and splats element 0.
38 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
39 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %b, <4 x half> %a)
; Test: 8-lane half-precision fused multiply-add where one multiplicand is
; lane 0 of the <8 x half> vector %c broadcast to every lane.
; Computes fma(splat(%c[0]), %b, %a) through @llvm.fma.v8f16.
; Expected output: a dup of v2.h[0] into v2.8h, immediately followed by an
; 8h fmla into v0 with source operands in v1, v2 order.
; NOTE(review): %lane is not referenced in the visible body.
43 define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
44 ; CHECK-LABEL: t_vfmaq_laneq_f16:
45 ; CHECK: dup v2.8h, v2.h[0]
46 ; CHECK-NEXT: fmla v0.8h, v1.8h, v2.8h
; All-zero shuffle mask: every result lane takes element 0 of %c.
49 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
50 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %b, <8 x half> %a)
; Test: 4-lane half-precision fused multiply-add by a scalar. The scalar %c
; is placed in element 0 of an undef vector and then splatted to all lanes.
; Computes fma(%b, splat(%c), %a) through @llvm.fma.v4f16.
; Expected output: a dup of v2.h[0] into v2.4h, immediately followed by a
; 4h fmla accumulating into v0.
54 define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
55 ; CHECK-LABEL: t_vfma_n_f16:
56 ; CHECK: dup v2.4h, v2.h[0]
57 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
; insertelement + all-zero shuffle mask is the scalar-to-vector broadcast.
60 %vecinit = insertelement <4 x half> undef, half %c, i32 0
61 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
62 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a) #4
; Test: 8-lane half-precision fused multiply-add by a scalar. The scalar %c
; is placed in element 0 of an undef vector and then splatted to all lanes.
; Computes fma(%b, splat(%c), %a) through @llvm.fma.v8f16.
; Expected output: a dup of v2.h[0] into v2.8h, immediately followed by an
; 8h fmla accumulating into v0.
66 define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
67 ; CHECK-LABEL: t_vfmaq_n_f16:
68 ; CHECK: dup v2.8h, v2.h[0]
69 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
; insertelement + all-zero shuffle mask is the scalar-to-vector broadcast.
72 %vecinit = insertelement <8 x half> undef, half %c, i32 0
73 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
74 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a) #4
; Test: scalar half-precision fused multiply-add whose second multiplicand is
; element 0 extracted from the <4 x half> vector %c.
; Computes fma(%b, %c[0], %a) through @llvm.fma.f16.
; Expected output: a scalar fmadd h0, h1, h2, h0.
; NOTE(review): %lane is not referenced in the visible body.
78 define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
79 ; CHECK-LABEL: t_vfmah_lane_f16:
80 ; CHECK: fmadd h0, h1, h2, h0
; Constant index 0: the lowest element of %c.
83 %extract = extractelement <4 x half> %c, i32 0
84 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Test: scalar half-precision fused multiply-add whose second multiplicand is
; element 0 extracted from the <8 x half> vector %c.
; Computes fma(%b, %c[0], %a) through @llvm.fma.f16.
; Expected output: a scalar fmadd h0, h1, h2, h0.
; NOTE(review): %lane is not referenced in the visible body.
88 define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
89 ; CHECK-LABEL: t_vfmah_laneq_f16:
90 ; CHECK: fmadd h0, h1, h2, h0
; Constant index 0: the lowest element of %c.
93 %extract = extractelement <8 x half> %c, i32 0
94 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Test: 4-lane half-precision fused multiply-subtract written as an fma on a
; negated multiplicand. %b is negated, lane 0 of the <4 x half> vector %c is
; broadcast, and the result is fma(-%b, splat(%c[0]), %a).
; Expected output, on three consecutive lines: an explicit fneg of v1.4h, a
; dup of v2.h[0], then a 4h fmla into v0 (i.e. a separate negate, not fmls).
; NOTE(review): %lane is not referenced in the visible body.
98 define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
99 ; CHECK-LABEL: t_vfms_lane_f16:
100 ; CHECK: fneg v1.4h, v1.4h
101 ; CHECK-NEXT: dup v2.4h, v2.h[0]
102 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
105 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; All-zero shuffle mask: every result lane takes element 0 of %c.
106 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
107 %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %lane1, <4 x half> %a)
108 ret <4 x half> %fmla3
; Test: 8-lane half-precision fused multiply-subtract written as an fma on a
; negated multiplicand. %b is negated, lane 0 of the <4 x half> vector %c is
; widened and broadcast, and the result is fma(-%b, splat(%c[0]), %a).
; Expected output, on three consecutive lines: an explicit fneg of v1.8h, a
; dup of v2.h[0], then an 8h fmla into v0.
; NOTE(review): %lane is not referenced in the visible body.
111 define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
112 ; CHECK-LABEL: t_vfmsq_lane_f16:
113 ; CHECK: fneg v1.8h, v1.8h
114 ; CHECK-NEXT: dup v2.8h, v2.h[0]
115 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
118 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; The 8-element all-zero mask both widens (4 -> 8 lanes) and splats element 0.
119 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
120 %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %lane1, <8 x half> %a)
121 ret <8 x half> %fmla3
; Test: 4-lane half-precision fused multiply-subtract with the splat as the
; FIRST fma operand: fma(splat(%c[0]), -%b, %a), where %c is <8 x half>.
; Expected output: a dup of v2.h[0] into v2.4h, immediately followed by a
; 4h fmls into v0. Unlike the non-q lane variant, the visible directives
; expect the negation to be folded into fmls rather than a separate fneg.
; NOTE(review): %lane is not referenced in the visible body.
124 define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
125 ; CHECK-LABEL: t_vfms_laneq_f16:
126 ; CHECK: dup v2.4h, v2.h[0]
127 ; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
130 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; The 4-element all-zero mask narrows (8 -> 4 lanes) and splats element 0.
131 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
132 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %sub, <4 x half> %a)
; Test: 8-lane half-precision fused multiply-subtract with the splat as the
; FIRST fma operand: fma(splat(%c[0]), -%b, %a), where %c is <8 x half>.
; Expected output: a dup of v2.h[0] into v2.8h, immediately followed by an
; 8h fmls into v0 (negation folded into the instruction, no separate fneg in
; the visible directives).
; NOTE(review): %lane is not referenced in the visible body.
136 define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
137 ; CHECK-LABEL: t_vfmsq_laneq_f16:
138 ; CHECK: dup v2.8h, v2.h[0]
139 ; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
142 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; All-zero shuffle mask: every result lane takes element 0 of %c.
143 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
144 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %sub, <8 x half> %a)
; Test: 4-lane half-precision fused multiply-subtract by a scalar. %b is
; negated and the scalar %c is broadcast, giving fma(-%b, splat(%c), %a).
; Expected output, on three consecutive lines: an explicit fneg of v1.4h, a
; dup of v2.h[0], then a 4h fmla into v0.
148 define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
149 ; CHECK-LABEL: t_vfms_n_f16:
150 ; CHECK: fneg v1.4h, v1.4h
151 ; CHECK-NEXT: dup v2.4h, v2.h[0]
152 ; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
155 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; insertelement + all-zero shuffle mask is the scalar-to-vector broadcast.
156 %vecinit = insertelement <4 x half> undef, half %c, i32 0
157 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
158 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %vecinit3, <4 x half> %a) #4
; Test: 8-lane half-precision fused multiply-subtract by a scalar. %b is
; negated and the scalar %c is broadcast, giving fma(-%b, splat(%c), %a).
; Expected output, on three consecutive lines: an explicit fneg of v1.8h, a
; dup of v2.h[0], then an 8h fmla into v0.
162 define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
163 ; CHECK-LABEL: t_vfmsq_n_f16:
164 ; CHECK: fneg v1.8h, v1.8h
165 ; CHECK-NEXT: dup v2.8h, v2.h[0]
166 ; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
169 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
; insertelement + all-zero shuffle mask is the scalar-to-vector broadcast.
170 %vecinit = insertelement <8 x half> undef, half %c, i32 0
171 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
172 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %vecinit7, <8 x half> %a) #4
; Test: scalar half-precision fused multiply-subtract written as an fma on a
; negated multiplicand: fma(-%b, %c[0], %a), where %c is <4 x half>.
; The visible directive expects a scalar fmadd h0, h1, h2, h0.
; NOTE(review): the IR negates %b, yet no directive for the negation is
; visible in this extract (a line appears to be missing between the label and
; the fmadd directive) -- confirm against the full file.
; NOTE(review): %lane is not referenced in the visible body.
176 define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
177 ; CHECK-LABEL: t_vfmsh_lane_f16:
179 ; CHECK: fmadd h0, h1, h2, h0
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
182 %0 = fsub half 0xH8000, %b
; Constant index 0: the lowest element of %c.
183 %extract = extractelement <4 x half> %c, i32 0
184 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
; Test: scalar half-precision fused multiply-subtract written as an fma on a
; negated multiplicand: fma(-%b, %c[0], %a), where %c is <8 x half>.
; The visible directive expects a scalar fmadd h0, h1, h2, h0 on the line
; directly after the previous match.
; NOTE(review): the only visible directive after the label is a NEXT-form one,
; and the sibling 4-lane test uses the plain form instead; a directive line
; (possibly for the negation of %b) appears to be missing from this extract
; -- confirm against the full file.
; NOTE(review): %lane is not referenced in the visible body.
188 define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
189 ; CHECK-LABEL: t_vfmsh_laneq_f16:
191 ; CHECK-NEXT: fmadd h0, h1, h2, h0
; 0xH8000 is the half bit pattern for -0.0, so this fsub is the negation of %b.
194 %0 = fsub half 0xH8000, %b
; Constant index 0: the lowest element of %c.
195 %extract = extractelement <8 x half> %c, i32 0
196 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
; Test: 4-lane half-precision multiply of %a by lane 0 of the <8 x half>
; vector %b, expressed as a splat followed by a plain fmul.
; Expected output: a single by-element fmul v0.4h, v0.4h, v1.h[0]; the
; visible directives expect no separate dup.
; NOTE(review): %lane is not referenced in the visible body.
200 define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
201 ; CHECK-LABEL: t_vmul_laneq_f16:
202 ; CHECK: fmul v0.4h, v0.4h, v1.h[0]
; The 4-element all-zero mask narrows (8 -> 4 lanes) and splats element 0.
205 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> zeroinitializer
; The splat is the left operand in IR; the expected instruction lists v0 first.
206 %mul = fmul <4 x half> %shuffle, %a
; Test: 8-lane half-precision multiply of %a by lane 0 of the <8 x half>
; vector %b, expressed as a splat followed by a plain fmul.
; Expected output: a single by-element fmul v0.8h, v0.8h, v1.h[0]; the
; visible directives expect no separate dup.
; NOTE(review): %lane is not referenced in the visible body.
210 define dso_local <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
211 ; CHECK-LABEL: t_vmulq_laneq_f16:
212 ; CHECK: fmul v0.8h, v0.8h, v1.h[0]
; All-zero shuffle mask: every result lane takes element 0 of %b.
215 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> zeroinitializer
216 %mul = fmul <8 x half> %shuffle, %a
; Test: scalar half-precision multiply of %a by element 0 of the <4 x half>
; vector %c.
; Expected output: a scalar by-element fmul h0, h0, v1.h[0], i.e. the
; extract is folded into the multiply.
; NOTE(review): %lane is not referenced in the visible body.
220 define dso_local half @t_vmulh_lane_f16(half %a, <4 x half> %c, i32 %lane) {
221 ; CHECK-LABEL: t_vmulh_lane_f16:
222 ; CHECK: fmul h0, h0, v1.h[0]
; Constant index 0: the lowest element of %c.
225 %0 = extractelement <4 x half> %c, i32 0
226 %1 = fmul half %0, %a
; Test: scalar half-precision multiply of %a by element 0 of the <8 x half>
; vector %c.
; Expected output: a scalar by-element fmul h0, h0, v1.h[0], i.e. the
; extract is folded into the multiply.
; NOTE(review): %lane is not referenced in the visible body.
230 define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
231 ; CHECK-LABEL: t_vmulh_laneq_f16:
232 ; CHECK: fmul h0, h0, v1.h[0]
; Constant index 0: the lowest element of %c.
235 %0 = extractelement <8 x half> %c, i32 0
236 %1 = fmul half %0, %a
; Test: scalar half-precision fmulx of %a and %b via the
; @llvm.aarch64.neon.fmulx.f16 intrinsic.
; Expected output: a scalar fmulx h0, h0, h1.
240 define dso_local half @t_vmulx_f16(half %a, half %b) {
241 ; CHECK-LABEL: t_vmulx_f16:
242 ; CHECK: fmulx h0, h0, h1
; Direct intrinsic call; both operands are passed through unchanged.
245 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
; Test: scalar half-precision fmulx of %a by element 3 of the <4 x half>
; vector %b (the highest index of a four-element vector).
; Expected output: a scalar by-element fmulx h0, h0, v1.h[3], i.e. the
; extract is folded into the instruction.
; NOTE(review): %lane is not referenced in the visible body; the lane index
; is the constant 3.
249 define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
250 ; CHECK-LABEL: t_vmulxh_lane_f16:
251 ; CHECK: fmulx h0, h0, v1.h[3]
254 %extract = extractelement <4 x half> %b, i32 3
255 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
; Test: 4-lane half-precision fmulx of %a by lane 0 of the <4 x half> vector
; %b, expressed as a splat passed to @llvm.aarch64.neon.fmulx.v4f16.
; Expected output: a single by-element fmulx v0.4h, v0.4h, v1.h[0]; the
; visible directives expect no separate dup.
; NOTE(review): %lane is not referenced in the visible body.
259 define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
260 ; CHECK-LABEL: t_vmulx_lane_f16:
261 ; CHECK: fmulx v0.4h, v0.4h, v1.h[0]
; All-zero shuffle mask: every result lane takes element 0 of %b.
264 %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
265 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %shuffle) #4
266 ret <4 x half> %vmulx2.i
; Test: 8-lane half-precision fmulx of %a by lane 0 of the <4 x half> vector
; %b, expressed as a widening splat passed to @llvm.aarch64.neon.fmulx.v8f16.
; Expected output: a single by-element fmulx v0.8h, v0.8h, v1.h[0]; the
; visible directives expect no separate dup.
; NOTE(review): %lane is not referenced in the visible body.
269 define dso_local <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
270 ; CHECK-LABEL: t_vmulxq_lane_f16:
271 ; CHECK: fmulx v0.8h, v0.8h, v1.h[0]
; The 8-element all-zero mask both widens (4 -> 8 lanes) and splats element 0.
274 %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
275 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %shuffle) #4
276 ret <8 x half> %vmulx2.i
; Test: 4-lane half-precision fmulx of %a by lane 0 of the <8 x half> vector
; %b, expressed as a narrowing splat passed to
; @llvm.aarch64.neon.fmulx.v4f16.
; Expected output: a single by-element fmulx v0.4h, v0.4h, v1.h[0]; the
; visible directives expect no separate dup.
; NOTE(review): %lane is not referenced in the visible body.
279 define dso_local <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
280 ; CHECK-LABEL: t_vmulx_laneq_f16:
281 ; CHECK: fmulx v0.4h, v0.4h, v1.h[0]
; The 4-element all-zero mask narrows (8 -> 4 lanes) and splats element 0.
284 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
285 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %shuffle) #4
286 ret <4 x half> %vmulx2.i
; Test: 8-lane half-precision fmulx of %a by lane 0 of the <8 x half> vector
; %b, expressed as a splat passed to @llvm.aarch64.neon.fmulx.v8f16.
; Expected output: a single by-element fmulx v0.8h, v0.8h, v1.h[0]; the
; visible directives expect no separate dup.
; NOTE(review): %lane is not referenced in the visible body.
289 define dso_local <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
290 ; CHECK-LABEL: t_vmulxq_laneq_f16:
291 ; CHECK: fmulx v0.8h, v0.8h, v1.h[0]
; All-zero shuffle mask: every result lane takes element 0 of %b.
294 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
295 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %shuffle) #4
296 ret <8 x half> %vmulx2.i
; Test: scalar half-precision fmulx of %a by element 7 of the <8 x half>
; vector %b (the highest index of an eight-element vector).
; Expected output: a scalar by-element fmulx h0, h0, v1.h[7], i.e. the
; extract is folded into the instruction.
; NOTE(review): %lane is not referenced in the visible body; the lane index
; is the constant 7.
299 define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
300 ; CHECK-LABEL: t_vmulxh_laneq_f16:
301 ; CHECK: fmulx h0, h0, v1.h[7]
304 %extract = extractelement <8 x half> %b, i32 7
305 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
; Test: 4-lane half-precision fmulx of %a by a scalar. The scalar %c is
; placed in element 0 of an undef vector, splatted to all lanes, and passed
; to @llvm.aarch64.neon.fmulx.v4f16.
; Expected output: a dup of v1.h[0] into v1.4h, immediately followed by a
; full-vector fmulx v0.4h, v0.4h, v1.4h (not the by-element form).
309 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
310 ; CHECK-LABEL: t_vmulx_n_f16:
311 ; CHECK: dup v1.4h, v1.h[0]
312 ; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.4h
; insertelement + all-zero shuffle mask is the scalar-to-vector broadcast.
315 %vecinit = insertelement <4 x half> undef, half %c, i32 0
316 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
317 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %vecinit3) #4
318 ret <4 x half> %vmulx2.i
; Test: 8-lane half-precision fmulx of %a by a scalar. The scalar %c is
; placed in element 0 of an undef vector, splatted to all lanes, and passed
; to @llvm.aarch64.neon.fmulx.v8f16.
; Expected output: a dup of v1.h[0] into v1.8h, immediately followed by a
; full-vector fmulx v0.8h, v0.8h, v1.8h (not the by-element form).
321 define dso_local <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
322 ; CHECK-LABEL: t_vmulxq_n_f16:
323 ; CHECK: dup v1.8h, v1.h[0]
324 ; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.8h
; insertelement + all-zero shuffle mask is the scalar-to-vector broadcast.
327 %vecinit = insertelement <8 x half> undef, half %c, i32 0
328 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
; The call references attribute group #4, whose definition is not visible here.
329 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %vecinit7) #4
330 ret <8 x half> %vmulx2.i