; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s

; FIXME: We would still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}
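
; Same chain with mixed address spaces: %a is in LDS (addrspace 3), %b and %c are generic pointers.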
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
; GCN-LABEL: @test1_as_3_0_0(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half* [[B:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half>* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half* [[C:%.*]] to <2 x half>*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half>* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
  store half %mul5, half* %arrayidx5, align 2
  ret void
}
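
; Loads from generic pointers, store into LDS.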
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half* [[A:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half* [[B:%.*]] to <2 x half>*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half>* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
  %i3 = load half, half* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
  %i4 = load half, half* %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}
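
; Two scalar @llvm.fma.f16 calls over adjacent elements combine into one @llvm.fma.v2f16.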
define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
; GCN-LABEL: @test1_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
; GCN-NEXT:    [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}
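
; A uniform scalar multiplier is broadcast into both lanes with insertelement.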
define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
; GCN-LABEL: @mul_scalar_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
; GCN-NEXT:    [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[SCALAR]], i32 1
; GCN-NEXT:    [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]]
; GCN-NEXT:    [[TMP6:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP5]], <2 x half> addrspace(3)* [[TMP6]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %mul5, half addrspace(3)* %arrayidx5, align 2
  ret void
}
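
; Two scalar @llvm.fabs.f16 calls combine into one @llvm.fabs.v2f16.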
define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
; GCN-LABEL: @fabs_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
; GCN-NEXT:    [[TMP4:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP3]], <2 x half> addrspace(3)* [[TMP4]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
  ret void
}
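
; fabs feeding fma: both intrinsics vectorize together.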
define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[B:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP6:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP5]], align 2
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]])
; GCN-NEXT:    [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]])
; GCN-NEXT:    [[TMP9:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP8]], <2 x half> addrspace(3)* [[TMP9]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)
  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)
  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}
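
; Only lane 0 of the second fma operand goes through fabs, so that operand is
; assembled with insertelements instead of being loaded as a vector.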
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GCN-NEXT:    [[I1:%.*]] = load half, half addrspace(3)* [[B:%.*]], align 2
; GCN-NEXT:    [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1
; GCN-NEXT:    [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
; GCN-NEXT:    [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
; GCN-NEXT:    [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1
; GCN-NEXT:    [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]])
; GCN-NEXT:    [[TMP8:%.*]] = bitcast half addrspace(3)* [[D:%.*]] to <2 x half> addrspace(3)*
; GCN-NEXT:    store <2 x half> [[TMP7]], <2 x half> addrspace(3)* [[TMP8]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
  %i2 = load half, half addrspace(3)* %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, half addrspace(3)* %d, align 2
  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
  store half %fma1, half addrspace(3)* %arrayidx6, align 2
  ret void
}
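
; @llvm.canonicalize is vectorized for gfx9 but stays scalar on VI.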
define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
; GFX9-LABEL: @canonicalize_v2f16(
; GFX9-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
; GFX9-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
; GFX9-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]])
; GFX9-NEXT:    [[TMP4:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
; GFX9-NEXT:    store <2 x half> [[TMP3]], <2 x half> addrspace(3)* [[TMP4]], align 2
; GFX9-NEXT:    ret void
;
; VI-LABEL: @canonicalize_v2f16(
; VI-NEXT:    [[I0:%.*]] = load half, half addrspace(3)* [[A:%.*]], align 2
; VI-NEXT:    [[CANONICALIZE0:%.*]] = call half @llvm.canonicalize.f16(half [[I0]])
; VI-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds half, half addrspace(3)* [[A]], i64 1
; VI-NEXT:    [[I3:%.*]] = load half, half addrspace(3)* [[ARRAYIDX3]], align 2
; VI-NEXT:    [[CANONICALIZE1:%.*]] = call half @llvm.canonicalize.f16(half [[I3]])
; VI-NEXT:    store half [[CANONICALIZE0]], half addrspace(3)* [[C:%.*]], align 2
; VI-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half addrspace(3)* [[C]], i64 1
; VI-NEXT:    store half [[CANONICALIZE1]], half addrspace(3)* [[ARRAYIDX5]], align 2
; VI-NEXT:    ret void
;
  %i0 = load half, half addrspace(3)* %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, half addrspace(3)* %c, align 2
  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
  store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }