; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,NO-SZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,HAS-SZ

; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag is set.
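; Rough sketch of why nsz is needed here (illustrative comment, not a test function):
;   %t   = FMA(a, b, +0.0)
;   %res = FADD(acc, %t)        ; candidate for folding into FMA(a, b, acc)
; If a*b is -0.0, the +0.0 addend turns %t into +0.0 (-0.0 + +0.0 == +0.0), so with
; acc == -0.0 the unfolded form returns +0.0 while FMA(a, b, acc) returns -0.0.
; The two results differ only in the sign of a zero, which nsz allows us to ignore.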
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test1:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test1:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test2:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test2:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test3:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test3:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test4:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test4:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test5:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test5:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test6:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test6:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) regardless of whether the nsz flag is set.
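; Illustrative note (not a test function): x + (-0.0) == x for every x, including
; x == -0.0 (since -0.0 + -0.0 == -0.0), so FMA(a, b, -0.0) is exactly the product
; a*b and FADD(acc, FMA(a, b, -0.0)) == acc + a*b == FMA(a, b, acc) unconditionally.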
define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test13:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test14:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test15:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test17:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test18:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)