1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
; Intrinsics called by the test functions in this file: the AArch64 NEON fmulx
; intrinsic and the generic llvm.fma intrinsic, each declared for scalar half,
; <4 x half> and <8 x half>.
; NOTE(review): attribute group #1 on llvm.fma.f16 is not defined in the visible
; part of the file - confirm it exists further down.
4 declare half @llvm.aarch64.neon.fmulx.f16(half, half)
5 declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
6 declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>)
7 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
8 declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
9 declare half @llvm.fma.f16(half, half, half) #1
; Vector fused multiply-add by lane, <4 x half> result, <4 x half> lane source.
; Lane 0 of %c is splatted and used as the second multiplicand of
; llvm.fma.v4f16(%b, splat, %a). The expected assembly is the by-element form
; "fmla v0.4h, v1.4h, v2.h[0]", preceded by the kill annotation that redefines
; d2 as q2. The %lane argument is not referenced by the visible instructions.
11 define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
12 ; CHECK-LABEL: t_vfma_lane_f16:
13 ; CHECK-NEXT: .cfi_startproc
14 ; CHECK-NEXT: // %bb.0: // %entry
15 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
16 ; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
19 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
20 %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %lane1, <4 x half> %a)
; Vector fused multiply-add by lane, <8 x half> result, <4 x half> lane source.
; Lane 0 of the 4-element %c is splatted to 8 elements and passed to
; llvm.fma.v8f16(%b, splat, %a). The expected assembly is
; "fmla v0.8h, v1.8h, v2.h[0]" after the d2 -> q2 kill annotation.
; The %lane argument is not referenced by the visible instructions.
24 define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
25 ; CHECK-LABEL: t_vfmaq_lane_f16:
26 ; CHECK-NEXT: .cfi_startproc
27 ; CHECK-NEXT: // %bb.0: // %entry
28 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
29 ; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
32 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
33 %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %lane1, <8 x half> %a)
; Vector fused multiply-add by lane, <4 x half> result, <8 x half> lane source.
; Lane 0 of the 8-element %c is splatted to 4 elements; here the splat is the
; FIRST multiplicand: llvm.fma.v4f16(splat, %b, %a). The expected assembly is
; still "fmla v0.4h, v1.4h, v2.h[0]", with no kill annotation before it.
; The %lane argument is not referenced by the visible instructions.
37 define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
38 ; CHECK-LABEL: t_vfma_laneq_f16:
39 ; CHECK-NEXT: .cfi_startproc
40 ; CHECK-NEXT: // %bb.0: // %entry
41 ; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
44 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
45 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %b, <4 x half> %a)
; Vector fused multiply-add by lane, <8 x half> result, <8 x half> lane source.
; Lane 0 of %c is splatted and used as the first multiplicand of
; llvm.fma.v8f16(splat, %b, %a). The expected assembly is
; "fmla v0.8h, v1.8h, v2.h[0]" with no kill annotation before it.
; The %lane argument is not referenced by the visible instructions.
49 define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
50 ; CHECK-LABEL: t_vfmaq_laneq_f16:
51 ; CHECK-NEXT: .cfi_startproc
52 ; CHECK-NEXT: // %bb.0: // %entry
53 ; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
56 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
57 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %b, <8 x half> %a)
; Vector fused multiply-add by scalar, <4 x half> result.
; The scalar half %c is inserted into element 0 of a vector and splatted, then
; passed to llvm.fma.v4f16(%b, splat, %a). The expected assembly folds the splat
; into the by-element form "fmla v0.4h, v1.4h, v2.h[0]" (no separate dup), after
; the kill annotation that redefines h2 as q2.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
61 define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
62 ; CHECK-LABEL: t_vfma_n_f16:
63 ; CHECK-NEXT: .cfi_startproc
64 ; CHECK-NEXT: // %bb.0: // %entry
65 ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
66 ; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
69 %vecinit = insertelement <4 x half> undef, half %c, i32 0
70 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
71 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a) #4
; Vector fused multiply-add by scalar, <8 x half> result.
; The scalar half %c is inserted into element 0 and splatted to 8 elements, then
; passed to llvm.fma.v8f16(%b, splat, %a). The expected assembly is the
; by-element form "fmla v0.8h, v1.8h, v2.h[0]" (no separate dup), after the
; h2 -> q2 kill annotation.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
75 define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
76 ; CHECK-LABEL: t_vfmaq_n_f16:
77 ; CHECK-NEXT: .cfi_startproc
78 ; CHECK-NEXT: // %bb.0: // %entry
79 ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
80 ; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
83 %vecinit = insertelement <8 x half> undef, half %c, i32 0
84 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
85 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a) #4
; Scalar fused multiply-add by lane, <4 x half> lane source.
; Element 0 is extracted from %c and passed to llvm.fma.f16(%b, elt, %a).
; The expected assembly is the scalar by-element form "fmla h0, h1, v2.h[0]",
; after the d2 -> q2 kill annotation.
; The %lane argument is not referenced by the visible instructions.
89 define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
90 ; CHECK-LABEL: t_vfmah_lane_f16:
91 ; CHECK-NEXT: .cfi_startproc
92 ; CHECK-NEXT: // %bb.0: // %entry
93 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
94 ; CHECK-NEXT: fmla h0, h1, v2.h[0]
97 %extract = extractelement <4 x half> %c, i32 0
98 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Scalar fused multiply-add by lane, <8 x half> lane source.
; Element 0 is extracted from %c and passed to llvm.fma.f16(%b, elt, %a).
; The expected assembly is "fmla h0, h1, v2.h[0]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
102 define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
103 ; CHECK-LABEL: t_vfmah_laneq_f16:
104 ; CHECK-NEXT: .cfi_startproc
105 ; CHECK-NEXT: // %bb.0: // %entry
106 ; CHECK-NEXT: fmla h0, h1, v2.h[0]
109 %extract = extractelement <8 x half> %c, i32 0
110 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Vector fused multiply-subtract by lane, <4 x half> result, <4 x half> lane
; source. %b is negated by subtracting it from a vector of 0xH8000 (the half
; bit pattern for -0.0); lane 0 of %c is splatted; the two are multiplied and
; added to %a via llvm.fma.v4f16(-%b, splat, %a). The expected assembly folds
; the negation into "fmls v0.4h, v1.4h, v2.h[0]" after the d2 -> q2 kill
; annotation. The %lane argument is not referenced by the visible instructions.
114 define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
115 ; CHECK-LABEL: t_vfms_lane_f16:
116 ; CHECK-NEXT: .cfi_startproc
117 ; CHECK-NEXT: // %bb.0: // %entry
118 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
119 ; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
122 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
123 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
124 %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %lane1, <4 x half> %a)
125 ret <4 x half> %fmla3
; Vector fused multiply-subtract by lane, <8 x half> result, <4 x half> lane
; source. %b is negated via fsub from a vector of 0xH8000 (half -0.0), lane 0 of
; the 4-element %c is splatted to 8 elements, and both feed
; llvm.fma.v8f16(-%b, splat, %a). The expected assembly is
; "fmls v0.8h, v1.8h, v2.h[0]" after the d2 -> q2 kill annotation.
; The %lane argument is not referenced by the visible instructions.
128 define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
129 ; CHECK-LABEL: t_vfmsq_lane_f16:
130 ; CHECK-NEXT: .cfi_startproc
131 ; CHECK-NEXT: // %bb.0: // %entry
132 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
133 ; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
136 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
137 %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
138 %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %lane1, <8 x half> %a)
139 ret <8 x half> %fmla3
; Vector fused multiply-subtract by lane, <4 x half> result, <8 x half> lane
; source. %b is negated via fsub from a vector of 0xH8000 (half -0.0); lane 0 of
; %c is splatted to 4 elements and is the FIRST multiplicand:
; llvm.fma.v4f16(splat, -%b, %a). The expected assembly is
; "fmls v0.4h, v1.4h, v2.h[0]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
142 define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
143 ; CHECK-LABEL: t_vfms_laneq_f16:
144 ; CHECK-NEXT: .cfi_startproc
145 ; CHECK-NEXT: // %bb.0: // %entry
146 ; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
149 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
150 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
151 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %sub, <4 x half> %a)
; Vector fused multiply-subtract by lane, <8 x half> result, <8 x half> lane
; source. %b is negated via fsub from a vector of 0xH8000 (half -0.0); lane 0 of
; %c is splatted and used as the first multiplicand of
; llvm.fma.v8f16(splat, -%b, %a). The expected assembly is
; "fmls v0.8h, v1.8h, v2.h[0]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
155 define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
156 ; CHECK-LABEL: t_vfmsq_laneq_f16:
157 ; CHECK-NEXT: .cfi_startproc
158 ; CHECK-NEXT: // %bb.0: // %entry
159 ; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
162 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
163 %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
164 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %sub, <8 x half> %a)
; Vector fused multiply-subtract by scalar, <4 x half> result.
; %b is negated via fsub from a vector of 0xH8000 (half -0.0); the scalar %c is
; inserted into element 0 and splatted; both feed
; llvm.fma.v4f16(-%b, splat, %a). The expected assembly is the by-element form
; "fmls v0.4h, v1.4h, v2.h[0]" (no separate dup), after the h2 -> q2 kill
; annotation.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
168 define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
169 ; CHECK-LABEL: t_vfms_n_f16:
170 ; CHECK-NEXT: .cfi_startproc
171 ; CHECK-NEXT: // %bb.0: // %entry
172 ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
173 ; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
176 %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
177 %vecinit = insertelement <4 x half> undef, half %c, i32 0
178 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
179 %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %vecinit3, <4 x half> %a) #4
; Vector fused multiply-subtract by scalar, <8 x half> result.
; %b is negated via fsub from a vector of 0xH8000 (half -0.0); the scalar %c is
; inserted into element 0 and splatted to 8 elements; both feed
; llvm.fma.v8f16(-%b, splat, %a). The expected assembly is
; "fmls v0.8h, v1.8h, v2.h[0]" (no separate dup), after the h2 -> q2 kill
; annotation.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
183 define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
184 ; CHECK-LABEL: t_vfmsq_n_f16:
185 ; CHECK-NEXT: .cfi_startproc
186 ; CHECK-NEXT: // %bb.0: // %entry
187 ; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
188 ; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
191 %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
192 %vecinit = insertelement <8 x half> undef, half %c, i32 0
193 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
194 %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %vecinit7, <8 x half> %a) #4
; Scalar fused multiply-subtract by lane, <4 x half> lane source.
; %b is negated via fsub from 0xH8000 (half -0.0); element 0 of %c is extracted;
; both feed llvm.fma.f16(-%b, elt, %a). The expected assembly is
; "fmls h0, h1, v2.h[0]" after the d2 -> q2 kill annotation.
; The %lane argument is not referenced by the visible instructions.
198 define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
199 ; CHECK-LABEL: t_vfmsh_lane_f16:
200 ; CHECK-NEXT: .cfi_startproc
201 ; CHECK-NEXT: // %bb.0: // %entry
202 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
203 ; CHECK-NEXT: fmls h0, h1, v2.h[0]
206 %0 = fsub half 0xH8000, %b
207 %extract = extractelement <4 x half> %c, i32 0
208 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
; Scalar fused multiply-subtract by lane, <8 x half> lane source.
; %b is negated via fsub from 0xH8000 (half -0.0); element 0 of %c is extracted;
; both feed llvm.fma.f16(-%b, elt, %a). The expected assembly is
; "fmls h0, h1, v2.h[0]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
212 define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
213 ; CHECK-LABEL: t_vfmsh_laneq_f16:
214 ; CHECK-NEXT: .cfi_startproc
215 ; CHECK-NEXT: // %bb.0: // %entry
216 ; CHECK-NEXT: fmls h0, h1, v2.h[0]
219 %0 = fsub half 0xH8000, %b
220 %extract = extractelement <8 x half> %c, i32 0
221 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
; Vector multiply by lane, <4 x half> result, <8 x half> lane source.
; Lane 0 of %b is splatted to 4 elements and multiplied with %a (the splat is
; the left operand of the fmul). The expected assembly is the by-element form
; "fmul v0.4h, v0.4h, v1.h[0]".
; The %lane argument is not referenced by the visible instructions.
225 define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
226 ; CHECK-LABEL: t_vmul_laneq_f16:
227 ; CHECK-NEXT: .cfi_startproc
228 ; CHECK-NEXT: // %bb.0: // %entry
229 ; CHECK-NEXT: fmul v0.4h, v0.4h, v1.h[0]
232 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> zeroinitializer
233 %mul = fmul <4 x half> %shuffle, %a
; Vector multiply by lane, <8 x half> result, <8 x half> lane source.
; Lane 0 of %b is splatted and multiplied with %a (the splat is the left
; operand of the fmul). The expected assembly is
; "fmul v0.8h, v0.8h, v1.h[0]".
; The %lane argument is not referenced by the visible instructions.
237 define dso_local <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
238 ; CHECK-LABEL: t_vmulq_laneq_f16:
239 ; CHECK-NEXT: .cfi_startproc
240 ; CHECK-NEXT: // %bb.0: // %entry
241 ; CHECK-NEXT: fmul v0.8h, v0.8h, v1.h[0]
244 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> zeroinitializer
245 %mul = fmul <8 x half> %shuffle, %a
; Scalar multiply by lane, <4 x half> lane source.
; Element 0 of %c is extracted and multiplied with the scalar %a. The expected
; assembly is the scalar by-element form "fmul h0, h0, v1.h[0]", after the kill
; annotation that redefines d1 as q1.
; The %lane argument is not referenced by the visible instructions.
249 define dso_local half @t_vmulh_lane_f16(half %a, <4 x half> %c, i32 %lane) {
250 ; CHECK-LABEL: t_vmulh_lane_f16:
251 ; CHECK-NEXT: .cfi_startproc
252 ; CHECK-NEXT: // %bb.0: // %entry
253 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
254 ; CHECK-NEXT: fmul h0, h0, v1.h[0]
257 %0 = extractelement <4 x half> %c, i32 0
258 %1 = fmul half %0, %a
; Scalar multiply by lane, <8 x half> lane source.
; Element 0 of %c is extracted and multiplied with the scalar %a. The expected
; assembly is "fmul h0, h0, v1.h[0]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
262 define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
263 ; CHECK-LABEL: t_vmulh_laneq_f16:
264 ; CHECK-NEXT: .cfi_startproc
265 ; CHECK-NEXT: // %bb.0: // %entry
266 ; CHECK-NEXT: fmul h0, h0, v1.h[0]
269 %0 = extractelement <8 x half> %c, i32 0
270 %1 = fmul half %0, %a
; Scalar fmulx on two half arguments via llvm.aarch64.neon.fmulx.f16(%a, %b).
; The expected assembly is the plain scalar instruction "fmulx h0, h0, h1".
274 define dso_local half @t_vmulx_f16(half %a, half %b) {
275 ; CHECK-LABEL: t_vmulx_f16:
276 ; CHECK-NEXT: .cfi_startproc
277 ; CHECK-NEXT: // %bb.0: // %entry
278 ; CHECK-NEXT: fmulx h0, h0, h1
281 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
; Scalar fmulx by lane, <4 x half> lane source, using the highest lane (3).
; Element 3 of %b is extracted and passed as the second operand of
; llvm.aarch64.neon.fmulx.f16. The expected assembly is
; "fmulx h0, h0, v1.h[3]" after the d1 -> q1 kill annotation.
; The %lane argument is not referenced by the visible instructions.
285 define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
286 ; CHECK-LABEL: t_vmulxh_lane_f16:
287 ; CHECK-NEXT: .cfi_startproc
288 ; CHECK-NEXT: // %bb.0: // %entry
289 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
290 ; CHECK-NEXT: fmulx h0, h0, v1.h[3]
293 %extract = extractelement <4 x half> %b, i32 3
294 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
; Vector fmulx by lane, <4 x half> result, <4 x half> lane source.
; Lane 0 of %b is splatted and passed as the second operand of
; llvm.aarch64.neon.fmulx.v4f16. The expected assembly is the by-element form
; "fmulx v0.4h, v0.4h, v1.h[0]" after the d1 -> q1 kill annotation.
; The %lane argument is not referenced by the visible instructions.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
298 define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
299 ; CHECK-LABEL: t_vmulx_lane_f16:
300 ; CHECK-NEXT: .cfi_startproc
301 ; CHECK-NEXT: // %bb.0: // %entry
302 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
303 ; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.h[0]
306 %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> zeroinitializer
307 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %shuffle) #4
308 ret <4 x half> %vmulx2.i
; Vector fmulx by lane, <8 x half> result, <4 x half> lane source.
; Lane 0 of the 4-element %b is splatted to 8 elements and passed as the second
; operand of llvm.aarch64.neon.fmulx.v8f16. The expected assembly is
; "fmulx v0.8h, v0.8h, v1.h[0]" after the d1 -> q1 kill annotation.
; The %lane argument is not referenced by the visible instructions.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
311 define dso_local <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
312 ; CHECK-LABEL: t_vmulxq_lane_f16:
313 ; CHECK-NEXT: .cfi_startproc
314 ; CHECK-NEXT: // %bb.0: // %entry
315 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
316 ; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.h[0]
319 %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> zeroinitializer
320 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %shuffle) #4
321 ret <8 x half> %vmulx2.i
; Vector fmulx by lane, <4 x half> result, <8 x half> lane source.
; Lane 0 of the 8-element %b is splatted to 4 elements and passed as the second
; operand of llvm.aarch64.neon.fmulx.v4f16. The expected assembly is
; "fmulx v0.4h, v0.4h, v1.h[0]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
324 define dso_local <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
325 ; CHECK-LABEL: t_vmulx_laneq_f16:
326 ; CHECK-NEXT: .cfi_startproc
327 ; CHECK-NEXT: // %bb.0: // %entry
328 ; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.h[0]
331 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> zeroinitializer
332 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %shuffle) #4
333 ret <4 x half> %vmulx2.i
; Vector fmulx by lane, <8 x half> result, <8 x half> lane source.
; Lane 0 of %b is splatted and passed as the second operand of
; llvm.aarch64.neon.fmulx.v8f16. The expected assembly is
; "fmulx v0.8h, v0.8h, v1.h[0]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
336 define dso_local <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
337 ; CHECK-LABEL: t_vmulxq_laneq_f16:
338 ; CHECK-NEXT: .cfi_startproc
339 ; CHECK-NEXT: // %bb.0: // %entry
340 ; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.h[0]
343 %shuffle = shufflevector <8 x half> %b, <8 x half> undef, <8 x i32> zeroinitializer
344 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %shuffle) #4
345 ret <8 x half> %vmulx2.i
; Scalar fmulx by lane, <8 x half> lane source, using the highest lane (7).
; Element 7 of %b is extracted and passed as the second operand of
; llvm.aarch64.neon.fmulx.f16. The expected assembly is
; "fmulx h0, h0, v1.h[7]" with no kill annotation.
; The %lane argument is not referenced by the visible instructions.
348 define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
349 ; CHECK-LABEL: t_vmulxh_laneq_f16:
350 ; CHECK-NEXT: .cfi_startproc
351 ; CHECK-NEXT: // %bb.0: // %entry
352 ; CHECK-NEXT: fmulx h0, h0, v1.h[7]
355 %extract = extractelement <8 x half> %b, i32 7
356 %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
; Vector fmulx by scalar, <4 x half> result.
; The scalar %c is inserted into element 0 and splatted, then passed as the
; second operand of llvm.aarch64.neon.fmulx.v4f16. Unlike the lane variants of
; this intrinsic, the expected assembly here keeps an explicit
; "dup v1.4h, v1.h[0]" followed by the full-vector "fmulx v0.4h, v0.4h, v1.4h",
; after the kill annotation that redefines h1 as q1.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
360 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
361 ; CHECK-LABEL: t_vmulx_n_f16:
362 ; CHECK-NEXT: .cfi_startproc
363 ; CHECK-NEXT: // %bb.0: // %entry
364 ; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
365 ; CHECK-NEXT: dup v1.4h, v1.h[0]
366 ; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.4h
369 %vecinit = insertelement <4 x half> undef, half %c, i32 0
370 %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
371 %vmulx2.i = tail call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %vecinit3) #4
372 ret <4 x half> %vmulx2.i
; Vector fmulx by scalar, <8 x half> result.
; The scalar %c is inserted into element 0 and splatted to 8 elements, then
; passed as the second operand of llvm.aarch64.neon.fmulx.v8f16. The expected
; assembly keeps an explicit "dup v1.8h, v1.h[0]" followed by the full-vector
; "fmulx v0.8h, v0.8h, v1.8h", after the h1 -> q1 kill annotation.
; NOTE(review): attribute group #4 on the call is not defined in the visible
; part of the file.
375 define dso_local <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
376 ; CHECK-LABEL: t_vmulxq_n_f16:
377 ; CHECK-NEXT: .cfi_startproc
378 ; CHECK-NEXT: // %bb.0: // %entry
379 ; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
380 ; CHECK-NEXT: dup v1.8h, v1.h[0]
381 ; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.8h
384 %vecinit = insertelement <8 x half> undef, half %c, i32 0
385 %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
386 %vmulx2.i = tail call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %vecinit7) #4
387 ret <8 x half> %vmulx2.i
; Scalar fused multiply-add by lane, <4 x half> lane source, highest lane (3).
; Element 3 of %c is extracted and passed to llvm.fma.f16(%b, elt, %a).
; The expected assembly is "fmla h0, h1, v2.h[3]" after the d2 -> q2 kill
; annotation.
390 define dso_local half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
391 ; CHECK-LABEL: t_vfmah_lane3_f16:
392 ; CHECK-NEXT: .cfi_startproc
393 ; CHECK-NEXT: // %bb.0: // %entry
394 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
395 ; CHECK-NEXT: fmla h0, h1, v2.h[3]
398 %extract = extractelement <4 x half> %c, i32 3
399 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Scalar fused multiply-add by lane, <8 x half> lane source, highest lane (7).
; Element 7 of %c is extracted and passed to llvm.fma.f16(%b, elt, %a).
; The expected assembly is "fmla h0, h1, v2.h[7]" with no kill annotation.
403 define dso_local half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
404 ; CHECK-LABEL: t_vfmah_laneq7_f16:
405 ; CHECK-NEXT: .cfi_startproc
406 ; CHECK-NEXT: // %bb.0: // %entry
407 ; CHECK-NEXT: fmla h0, h1, v2.h[7]
410 %extract = extractelement <8 x half> %c, i32 7
411 %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
; Scalar fused multiply-subtract by lane, <4 x half> lane source, lane 3.
; %b is negated via fsub from 0xH8000 (half -0.0); element 3 of %c is extracted;
; both feed llvm.fma.f16(-%b, elt, %a). The expected assembly is
; "fmls h0, h1, v2.h[3]" after the d2 -> q2 kill annotation.
415 define dso_local half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
416 ; CHECK-LABEL: t_vfmsh_lane3_f16:
417 ; CHECK-NEXT: .cfi_startproc
418 ; CHECK-NEXT: // %bb.0: // %entry
419 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
420 ; CHECK-NEXT: fmls h0, h1, v2.h[3]
423 %0 = fsub half 0xH8000, %b
424 %extract = extractelement <4 x half> %c, i32 3
425 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
; Scalar fused multiply-subtract by lane, <8 x half> lane source, lane 7.
; %b is negated via fsub from 0xH8000 (half -0.0); element 7 of %c is extracted;
; both feed llvm.fma.f16(-%b, elt, %a). The expected assembly is
; "fmls h0, h1, v2.h[7]" with no kill annotation.
429 define dso_local half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
430 ; CHECK-LABEL: t_vfmsh_laneq7_f16:
431 ; CHECK-NEXT: .cfi_startproc
432 ; CHECK-NEXT: // %bb.0: // %entry
433 ; CHECK-NEXT: fmls h0, h1, v2.h[7]
436 %0 = fsub half 0xH8000, %b
437 %extract = extractelement <8 x half> %c, i32 7
438 %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
442 define dso_local half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
443 ; CHECK-LABEL: t_fadd_vfmah_f16:
444 ; CHECK-NEXT: .cfi_startproc
445 ; CHECK-NEXT: // %bb.0: // %entry
446 ; CHECK-NEXT: fadd v2.4h, v2.4h, v3.4h
447 ; CHECK-NEXT: fmla h0, h1, v2.h[3]
450 %0 = fadd <4 x half> %c, %d
451 %extract = extractelement <4 x half> %0, i32 3
452 %1 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)