; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s

; FIXME: We would still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

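; Same 3-pair chain, but %b and %c are in the default (flat) address space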
; GCN-LABEL: @test1_as_3_0_0(
; GFX89: load <2 x half>, <2 x half> addrspace(3)*
; GFX89: load <2 x half>, <2 x half>*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}

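; As above, but the loads are in the default (flat) address space and the stores go to LDS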
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GFX89: load <2 x half>, <2 x half>*
; GFX89: load <2 x half>, <2 x half>*
; GFX89: fmul <2 x half>
; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

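; Pair of llvm.fma.f16 calls that should be vectorized to llvm.fma.v2f16 on gfx9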
; GCN-LABEL: @test1_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

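; Multiply both loaded lanes by the same scalar kernel argument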
; GCN-LABEL: @mul_scalar_v2f16(
; GFX9: load <2 x half>
; GFX9: fmul <2 x half>
; GFX9: store <2 x half>
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}

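; Pair of llvm.fabs.f16 calls that should be vectorized to llvm.fabs.v2f16 on gfx9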
; GCN-LABEL: @fabs_v2f16
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

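; fabs applied to the first fma operand in both lanes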
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fabs.v2f16(
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)
  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)
  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

; FIXME: Should do vector load and extract component for fabs
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GFX9: call half @llvm.fabs.f16(
; GFX9: load <2 x half>
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.fma.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}

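; Pair of llvm.canonicalize.f16 calls that should be vectorized to llvm.canonicalize.v2f16 on gfx9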
; GCN-LABEL: @canonicalize_v2f16
; GFX9: load <2 x half>
; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
; GFX9: store <2 x half>
define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }