1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
5 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
6 ; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
7 ; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
8 ; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
10 ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
11 ; are not converted from f16 to f32.
12 ; GCN-LABEL: {{^}}dotproduct_f16
16 ; GFX906: v_mul_f16_e32
17 ; GFX906: v_mul_f16_e32
19 ; GFX906-DL-UNSAFE: v_fma_f16
20 ; GFX10-CONTRACT: v_fmac_f16
22 ; GFX906-CONTRACT: v_mac_f16_e32
23 ; GFX906-DENORM-CONTRACT: v_fma_f16
; Two-term dot product computed entirely in half precision. No fpext appears
; anywhere in this kernel, and the running sum is read from and written back
; to the half value at %dst.
24 define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
25 <2 x half> addrspace(1)* %src2,
26 half addrspace(1)* nocapture %dst) {
; Fetch both <2 x half> operands from addrspace(1).
28 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
29 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Lane 0 of each operand.
31 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
32 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
; Lane 1 of each operand.
34 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
35 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
; Same-lane products, still in half.
37 %mul2 = fmul half %src1.el2, %src2.el2
38 %mul1 = fmul half %src1.el1, %src2.el1
; Accumulate onto the value already stored at %dst. The lane-1 product is
; added first, then the lane-0 product, and the sum is stored back.
39 %acc = load half, half addrspace(1)* %dst, align 2
40 %acc1 = fadd half %mul2, %acc
41 %acc2 = fadd half %mul1, %acc1
42 store half %acc2, half addrspace(1)* %dst, align 2
47 ; We only want to generate fdot2 if the vector elements of the dot product are converted
48 ; from f16 to f32 and the vectors are of type <2 x half>.
49 ; GCN-LABEL: {{^}}dotproduct_f16_f32
50 ; GFX900: v_mad_mix_f32
51 ; GFX900: v_mad_mix_f32
54 ; GFX906: v_mac_f32_e32
56 ; GFX906-DL-UNSAFE: v_dot2_f32_f16
57 ; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32
59 ; GFX906-CONTRACT: v_dot2_f32_f16
61 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
; Two-term dot product over <2 x half> operands where every extracted element
; is extended to float before the multiply, and the sum is accumulated into
; the float at %dst. Both products pair the same lane of %src1 and %src2.
62 define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
63 <2 x half> addrspace(1)* %src2,
64 float addrspace(1)* nocapture %dst) {
; Fetch both <2 x half> operands from addrspace(1).
66 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
67 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Lane 0 of each operand, widened half -> float.
69 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
70 %csrc1.el1 = fpext half %src1.el1 to float
71 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
72 %csrc2.el1 = fpext half %src2.el1 to float
; Lane 1 of each operand, widened half -> float.
74 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
75 %csrc1.el2 = fpext half %src1.el2 to float
76 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
77 %csrc2.el2 = fpext half %src2.el2 to float
; Same-lane products in float, with the %src1 element as the first operand.
79 %mul2 = fmul float %csrc1.el2, %csrc2.el2
80 %mul1 = fmul float %csrc1.el1, %csrc2.el1
; Accumulate onto the float already stored at %dst. The lane-1 product is
; added first, then the lane-0 product, and the sum is stored back.
81 %acc = load float, float addrspace(1)* %dst, align 4
82 %acc1 = fadd float %mul2, %acc
83 %acc2 = fadd float %mul1, %acc1
84 store float %acc2, float addrspace(1)* %dst, align 4
88 ; We only want to generate fdot2 if the vector elements of the dot product are converted
89 ; from f16 to f32 and the vectors are of type <2 x half>.
90 ; GCN-LABEL: {{^}}dotproduct_diffvecorder
91 ; GFX900: v_mad_mix_f32
92 ; GFX900: v_mad_mix_f32
95 ; GFX906: v_mac_f32_e32
97 ; GFX906-DL-UNSAFE: v_dot2_f32_f16
98 ; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32
100 ; GFX906-CONTRACT: v_dot2_f32_f16
101 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
; Same shape as a two-term f16 -> f32 dot product, except that the lane-1
; multiply lists its operands in the opposite order (%src2 element first).
; The lane-0 multiply keeps the %src1 element first.
102 define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
103 <2 x half> addrspace(1)* %src2,
104 float addrspace(1)* nocapture %dst) {
; Fetch both <2 x half> operands from addrspace(1).
106 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
107 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Lane 0 of each operand, widened half -> float.
109 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
110 %csrc1.el1 = fpext half %src1.el1 to float
111 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
112 %csrc2.el1 = fpext half %src2.el1 to float
; Lane 1 of each operand, widened half -> float.
114 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
115 %csrc1.el2 = fpext half %src1.el2 to float
116 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
117 %csrc2.el2 = fpext half %src2.el2 to float
; Note the swapped operand order on %mul2 (src2 lane 1 times src1 lane 1).
; Do not "tidy" this, the ordering is the point of the test.
119 %mul2 = fmul float %csrc2.el2, %csrc1.el2
120 %mul1 = fmul float %csrc1.el1, %csrc2.el1
; Accumulate onto the float already stored at %dst. The lane-1 product is
; added first, then the lane-0 product, and the sum is stored back.
121 %acc = load float, float addrspace(1)* %dst, align 4
122 %acc1 = fadd float %mul2, %acc
123 %acc2 = fadd float %mul1, %acc1
124 store float %acc2, float addrspace(1)* %dst, align 4
128 ; Tests to make sure dot product is not generated when the vectors are not of <2 x half>.
129 ; GCN-LABEL: {{^}}dotproduct_v4f16
130 ; GFX900: v_mad_mix_f32
133 ; GFX906: v_mac_f32_e32
135 ; GCN-DL-UNSAFE: v_fma_mix_f32
137 ; GFX906-CONTRACT: v_fma_mix_f32
138 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; Dot-product-shaped expression whose operands are <4 x half> rather than
; <2 x half>. Only lanes 0 and 1 of each vector are read. Elements are
; extended to float and the sum is accumulated into the float at %dst.
139 define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
140 <4 x half> addrspace(1)* %src2,
141 float addrspace(1)* nocapture %dst) {
; Fetch both <4 x half> operands from addrspace(1).
143 %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
144 %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2
; Lane 0 of each operand, widened half -> float.
146 %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
147 %csrc1.el1 = fpext half %src1.el1 to float
148 %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
149 %csrc2.el1 = fpext half %src2.el1 to float
; Lane 1 of each operand, widened half -> float. Lanes 2 and 3 are never used.
151 %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
152 %csrc1.el2 = fpext half %src1.el2 to float
153 %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
154 %csrc2.el2 = fpext half %src2.el2 to float
; Same-lane products in float.
156 %mul2 = fmul float %csrc1.el2, %csrc2.el2
157 %mul1 = fmul float %csrc1.el1, %csrc2.el1
; Accumulate onto the float already stored at %dst. The lane-1 product is
; added first, then the lane-0 product, and the sum is stored back.
158 %acc = load float, float addrspace(1)* %dst, align 4
159 %acc1 = fadd float %mul2, %acc
160 %acc2 = fadd float %mul1, %acc1
161 store float %acc2, float addrspace(1)* %dst, align 4
165 ; GCN-LABEL: {{^}}NotAdotproduct
166 ; GFX900: v_mad_mix_f32
167 ; GFX900: v_mad_mix_f32
170 ; GFX906: v_mac_f32_e32
172 ; GCN-DL-UNSAFE: v_fma_mix_f32
174 ; GFX906-CONTRACT: v_fma_mix_f32
175 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; Looks like the f16 -> f32 dot-product shape, but each multiply takes both of
; its operands from the SAME source vector (src1 lane 1 times src1 lane 0, and
; src2 lane 0 times src2 lane 1), so the expression is not S1 dot S2.
176 define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
177 <2 x half> addrspace(1)* %src2,
178 float addrspace(1)* nocapture %dst) {
; Fetch both <2 x half> operands from addrspace(1).
180 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
181 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Lane 0 of each operand, widened half -> float.
183 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
184 %csrc1.el1 = fpext half %src1.el1 to float
185 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
186 %csrc2.el1 = fpext half %src2.el1 to float
; Lane 1 of each operand, widened half -> float.
188 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
189 %csrc1.el2 = fpext half %src1.el2 to float
190 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
191 %csrc2.el2 = fpext half %src2.el2 to float
; Products within a single vector. %mul2 uses only %src1 elements and
; %mul1 uses only %src2 elements.
193 %mul2 = fmul float %csrc1.el2, %csrc1.el1
194 %mul1 = fmul float %csrc2.el1, %csrc2.el2
; Accumulate onto the float already stored at %dst, adding %mul2 first and
; then %mul1, and store the sum back.
195 %acc = load float, float addrspace(1)* %dst, align 4
196 %acc1 = fadd float %mul2, %acc
197 %acc2 = fadd float %mul1, %acc1
198 store float %acc2, float addrspace(1)* %dst, align 4
202 ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
203 ; GFX900: v_mad_mix_f32
204 ; GFX900: v_mad_mix_f32
207 ; GFX906: v_mac_f32_e32
209 ; GCN-DL-UNSAFE: v_fma_mix_f32
211 ; GFX906-CONTRACT: v_fma_mix_f32
212 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
; Looks like the f16 -> f32 dot-product shape, but each multiply pairs
; DIFFERENT lanes of the two vectors (src1 lane 1 times src2 lane 0, and
; src1 lane 0 times src2 lane 1), so the lane indices do not line up.
213 define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
214 <2 x half> addrspace(1)* %src2,
215 float addrspace(1)* nocapture %dst) {
; Fetch both <2 x half> operands from addrspace(1).
217 %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
218 %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2
; Lane 0 of each operand, widened half -> float.
220 %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
221 %csrc1.el1 = fpext half %src1.el1 to float
222 %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
223 %csrc2.el1 = fpext half %src2.el1 to float
; Lane 1 of each operand, widened half -> float.
225 %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
226 %csrc1.el2 = fpext half %src1.el2 to float
227 %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
228 %csrc2.el2 = fpext half %src2.el2 to float
; Cross-lane products. Each multiply mixes lane 1 of one vector with lane 0
; of the other.
230 %mul2 = fmul float %csrc1.el2, %csrc2.el1
231 %mul1 = fmul float %csrc1.el1, %csrc2.el2
; Accumulate onto the float already stored at %dst, adding %mul2 first and
; then %mul1, and store the sum back.
232 %acc = load float, float addrspace(1)* %dst, align 4
233 %acc1 = fadd float %mul2, %acc
234 %acc2 = fadd float %mul1, %acc1
235 store float %acc2, float addrspace(1)* %dst, align 4