1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-UNSAFE
3 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
4 ; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
5 ; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
6 ; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
8 ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
9 ; are not converted from f16 to f32.
10 ; GCN-LABEL: {{^}}dotproduct_f16
11 ; GFX900: v_fma_legacy_f16
12 ; GFX900: v_fma_legacy_f16
14 ; GFX906: v_mul_f16_e32
15 ; GFX906: v_mul_f16_e32
17 ; GFX906-UNSAFE: v_fma_legacy_f16
19 ; GFX906-CONTRACT: v_mac_f16_e32
20 ; GFX906-DENORM-CONTRACT: v_fma_legacy_f16
; Kernel under test: a two-element dot product kept entirely in half precision.
;   %src1, %src2 - addrspace(1) pointers to the <2 x half> operands
;   %dst         - addrspace(1) pointer to the half accumulator; it is loaded,
;                  added to, and then overwritten with the result
; Computes src1[0]*src2[0] + (src1[1]*src2[1] + *dst) with no fpext to float.
21 define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
22 <2 x half> addrspace(1)* %src2,
23 half addrspace(1)* nocapture %dst) {
25 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
26 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Element 0 of each operand.
28 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
29 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
; Element 1 of each operand.
31 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
32 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
; Lane-wise half products, then a two-step accumulation onto the value loaded from %dst.
34 %mul2 = fmul half %src1.el2, %src2.el2
35 %mul1 = fmul half %src1.el1, %src2.el1
36 %acc = load half, half addrspace(1)* %dst, align 2
37 %acc1 = fadd half %mul2, %acc
38 %acc2 = fadd half %mul1, %acc1
39 store half %acc2, half addrspace(1)* %dst, align 2
44 ; We only want to generate fdot2 if the vector elements of the dot product are converted
45 ; from f16 to f32 and the vectors are of type <2 x half>.
46 ; GCN-LABEL: {{^}}dotproduct_f16_f32
47 ; GFX900: v_mad_mix_f32
48 ; GFX900: v_mad_mix_f32
51 ; GFX906: v_mac_f32_e32
53 ; GFX906-UNSAFE: v_dot2_f32_f16
55 ; GFX906-CONTRACT: v_dot2_f32_f16
57 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
; Kernel under test: a two-element dot product whose half inputs are widened to float.
;   %src1, %src2 - addrspace(1) pointers to the <2 x half> operands
;   %dst         - addrspace(1) pointer to the float accumulator; it is loaded,
;                  added to, and then overwritten with the result
; Every extracted element is fpext'ed to float before it is multiplied, and the
; products are accumulated in float: src1[0]*src2[0] + (src1[1]*src2[1] + *dst).
58 define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
59 <2 x half> addrspace(1)* %src2,
60 float addrspace(1)* nocapture %dst) {
62 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
63 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Element 0 of each operand, widened to float.
65 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
66 %csrc1.el1 = fpext half %src1.el1 to float
67 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
68 %csrc2.el1 = fpext half %src2.el1 to float
; Element 1 of each operand, widened to float.
70 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
71 %csrc1.el2 = fpext half %src1.el2 to float
72 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
73 %csrc2.el2 = fpext half %src2.el2 to float
; Lane-wise float products, then a two-step accumulation onto the value loaded from %dst.
75 %mul2 = fmul float %csrc1.el2, %csrc2.el2
76 %mul1 = fmul float %csrc1.el1, %csrc2.el1
77 %acc = load float, float addrspace(1)* %dst, align 4
78 %acc1 = fadd float %mul2, %acc
79 %acc2 = fadd float %mul1, %acc1
80 store float %acc2, float addrspace(1)* %dst, align 4
84 ; Same dot product as dotproduct_f16_f32, but one fmul has its operands commuted; fdot2 should
85 ; still be generated when the elements are converted from f16 to f32 and the vectors are <2 x half>.
86 ; GCN-LABEL: {{^}}dotproduct_diffvecorder
87 ; GFX900: v_mad_mix_f32
88 ; GFX900: v_mad_mix_f32
91 ; GFX906: v_mac_f32_e32
93 ; GFX906-UNSAFE: v_dot2_f32_f16
95 ; GFX906-CONTRACT: v_dot2_f32_f16
96 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
; Kernel under test: a float-widened two-element dot product in which the
; element-1 product lists its operands in the opposite order (src2 first, src1 second).
;   %src1, %src2 - addrspace(1) pointers to the <2 x half> operands
;   %dst         - addrspace(1) pointer to the float accumulator; it is loaded,
;                  added to, and then overwritten with the result
97 define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
98 <2 x half> addrspace(1)* %src2,
99 float addrspace(1)* nocapture %dst) {
101 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
102 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Element 0 of each operand, widened to float.
104 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
105 %csrc1.el1 = fpext half %src1.el1 to float
106 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
107 %csrc2.el1 = fpext half %src2.el1 to float
; Element 1 of each operand, widened to float.
109 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
110 %csrc1.el2 = fpext half %src1.el2 to float
111 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
112 %csrc2.el2 = fpext half %src2.el2 to float
; %mul2 takes the src2 element as its first operand and the src1 element as its
; second; %mul1 keeps the src1-then-src2 order.
114 %mul2 = fmul float %csrc2.el2, %csrc1.el2
115 %mul1 = fmul float %csrc1.el1, %csrc2.el1
116 %acc = load float, float addrspace(1)* %dst, align 4
117 %acc1 = fadd float %mul2, %acc
118 %acc2 = fadd float %mul1, %acc1
119 store float %acc2, float addrspace(1)* %dst, align 4
123 ; Tests to make sure dot product is not generated when the vectors are not of <2 x half>.
124 ; GCN-LABEL: {{^}}dotproduct_v4f16
125 ; GFX900: v_mad_mix_f32
128 ; GFX906: v_mac_f32_e32
130 ; GFX906-UNSAFE: v_fma_mix_f32
132 ; GFX906-CONTRACT: v_fma_mix_f32
133 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; Kernel under test: the same float-widened multiply-accumulate shape, but the
; operands are <4 x half> vectors of which only elements 0 and 1 are used.
;   %src1, %src2 - addrspace(1) pointers to the <4 x half> operands
;   %dst         - addrspace(1) pointer to the float accumulator; it is loaded,
;                  added to, and then overwritten with the result
134 define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
135 <4 x half> addrspace(1)* %src2,
136 float addrspace(1)* nocapture %dst) {
138 %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
139 %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2
; Element 0 of each four-element operand, widened to float.
141 %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
142 %csrc1.el1 = fpext half %src1.el1 to float
143 %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
144 %csrc2.el1 = fpext half %src2.el1 to float
; Element 1 of each four-element operand, widened to float.
146 %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
147 %csrc1.el2 = fpext half %src1.el2 to float
148 %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
149 %csrc2.el2 = fpext half %src2.el2 to float
; Lane-wise float products, then a two-step accumulation onto the value loaded from %dst.
151 %mul2 = fmul float %csrc1.el2, %csrc2.el2
152 %mul1 = fmul float %csrc1.el1, %csrc2.el1
153 %acc = load float, float addrspace(1)* %dst, align 4
154 %acc1 = fadd float %mul2, %acc
155 %acc2 = fadd float %mul1, %acc1
156 store float %acc2, float addrspace(1)* %dst, align 4
160 ; GCN-LABEL: {{^}}NotAdotproduct
161 ; GFX900: v_mad_mix_f32
162 ; GFX900: v_mad_mix_f32
165 ; GFX906: v_mac_f32_e32
167 ; GFX906-UNSAFE: v_fma_mix_f32
169 ; GFX906-CONTRACT: v_fma_mix_f32
170 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; Kernel under test: a multiply-accumulate that is not a dot product of %src1 and
; %src2, because each product multiplies two elements of the same vector.
;   %src1, %src2 - addrspace(1) pointers to the <2 x half> operands
;   %dst         - addrspace(1) pointer to the float accumulator; it is loaded,
;                  added to, and then overwritten with the result
; Computes src2[0]*src2[1] + (src1[1]*src1[0] + *dst) in float.
171 define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
172 <2 x half> addrspace(1)* %src2,
173 float addrspace(1)* nocapture %dst) {
175 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
176 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Element 0 of each operand, widened to float.
178 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
179 %csrc1.el1 = fpext half %src1.el1 to float
180 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
181 %csrc2.el1 = fpext half %src2.el1 to float
; Element 1 of each operand, widened to float.
183 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
184 %csrc1.el2 = fpext half %src1.el2 to float
185 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
186 %csrc2.el2 = fpext half %src2.el2 to float
; %mul2 uses both elements of src1 and %mul1 uses both elements of src2, so
; neither product pairs a src1 element with a src2 element.
188 %mul2 = fmul float %csrc1.el2, %csrc1.el1
189 %mul1 = fmul float %csrc2.el1, %csrc2.el2
190 %acc = load float, float addrspace(1)* %dst, align 4
191 %acc1 = fadd float %mul2, %acc
192 %acc2 = fadd float %mul1, %acc1
193 store float %acc2, float addrspace(1)* %dst, align 4
197 ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
198 ; GFX900: v_mad_mix_f32
199 ; GFX900: v_mad_mix_f32
202 ; GFX906: v_mac_f32_e32
204 ; GFX906-UNSAFE: v_fma_mix_f32
206 ; GFX906-CONTRACT: v_fma_mix_f32
207 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; Kernel under test: a multiply-accumulate that is not a dot product of %src1 and
; %src2, because each product pairs elements taken from different indices.
;   %src1, %src2 - addrspace(1) pointers to the <2 x half> operands
;   %dst         - addrspace(1) pointer to the float accumulator; it is loaded,
;                  added to, and then overwritten with the result
; Computes src1[0]*src2[1] + (src1[1]*src2[0] + *dst) in float.
208 define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
209 <2 x half> addrspace(1)* %src2,
210 float addrspace(1)* nocapture %dst) {
212 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
213 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Element 0 of each operand, widened to float.
215 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
216 %csrc1.el1 = fpext half %src1.el1 to float
217 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
218 %csrc2.el1 = fpext half %src2.el1 to float
; Element 1 of each operand, widened to float.
220 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
221 %csrc1.el2 = fpext half %src1.el2 to float
222 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
223 %csrc2.el2 = fpext half %src2.el2 to float
; Cross-index products: element 1 of src1 with element 0 of src2, and
; element 0 of src1 with element 1 of src2.
225 %mul2 = fmul float %csrc1.el2, %csrc2.el1
226 %mul1 = fmul float %csrc1.el1, %csrc2.el2
227 %acc = load float, float addrspace(1)* %dst, align 4
228 %acc1 = fadd float %mul2, %acc
229 %acc2 = fadd float %mul1, %acc1
230 store float %acc2, float addrspace(1)* %dst, align 4