1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce -S -slp-threshold=-100 -slp-vectorize-hor-store | FileCheck %s --check-prefix=GFX9
; Test data. Each *arr global is a 32-element zero-initialized array (align 16)
; that the functions in this file load consecutive elements from; each *var
; global is the scalar that the matching function stores its select-of-constants
; result to (@var: i32, @var64: i64, @fvar: float, @dvar: double).
4 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
5 @arr64 = local_unnamed_addr global [32 x i64] zeroinitializer, align 16
6 @var = global i32 zeroinitializer, align 8
7 @var64 = global i64 zeroinitializer, align 8
9 @farr = local_unnamed_addr global [32 x float] zeroinitializer, align 16
10 @fvar = global float zeroinitializer, align 8
12 @darr = local_unnamed_addr global [32 x double] zeroinitializer, align 16
13 @dvar = global double zeroinitializer, align 8
15 ; Tests whether the min/max reduction pattern is vectorized if SLP starts at the store.
; smaxv6: a signed-max chain (icmp sgt + select) over six consecutive i32
; loads from @arr (indices 0..5). The first compare, %cmp1, has a second use:
; it also selects the constant (3 or 4) that is stored to @var.
;
; Expected output per the checks: loads 0 and 1 are merged into one <2 x i32>
; load but their compare/select stay scalar (the compare result is reused by
; the store), loads 2..5 become a single <4 x i32> load reduced with
; llvm.vector.reduce.smax.v4i32, and the two partial results are combined with
; one more icmp sgt + select.
16 define i32 @smaxv6() {
17 ; GFX9-LABEL: @smaxv6(
18 ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @arr, align 16
19 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
20 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
21 ; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
22 ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
23 ; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
24 ; GFX9-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
25 ; GFX9-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
26 ; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP5]], i32 [[SELECT1]]
27 ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
28 ; GFX9-NEXT: store i32 [[STORE_SELECT]], ptr @var, align 8
29 ; GFX9-NEXT: ret i32 [[OP_RDX1]]
; Elements 0 and 1: this compare is the one with the extra use in the store below.
31 %load1 = load i32, ptr @arr, align 16
32 %load2 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
33 %cmp1 = icmp sgt i32 %load1, %load2
34 %select1 = select i1 %cmp1, i32 %load1, i32 %load2
; Elements 2..5: each step compares the running maximum against the next load.
36 %load3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
37 %cmp2 = icmp sgt i32 %select1, %load3
38 %select2 = select i1 %cmp2, i32 %select1, i32 %load3
40 %load4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
41 %cmp3 = icmp sgt i32 %select2, %load4
42 %select3 = select i1 %cmp3, i32 %select2, i32 %load4
44 %load5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
45 %cmp4 = icmp sgt i32 %select3, %load5
46 %select4 = select i1 %cmp4, i32 %select3, i32 %load5
48 %load6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
49 %cmp5 = icmp sgt i32 %select4, %load6
50 %select5 = select i1 %cmp5, i32 %select4, i32 %load6
; Second use of %cmp1, outside the min/max chain.
52 %store-select = select i1 %cmp1, i32 3, i32 4
53 store i32 %store-select, ptr @var, align 8
; sminv6: a signed-min chain (icmp slt + select) over six consecutive i64
; loads from @arr64 (indices 0..5). As in the i32 max variant, the first
; compare %cmp1 is also used to select the constant (3 or 4) stored to @var64.
;
; Expected output per the checks: a <2 x i64> load for elements 0 and 1 with a
; scalar compare/select, a <4 x i64> load for elements 2..5 reduced with
; llvm.vector.reduce.smin.v4i64, then one final icmp slt + select.
57 define i64 @sminv6() {
58 ; GFX9-LABEL: @sminv6(
59 ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @arr64, align 16
60 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
61 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
62 ; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
63 ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
64 ; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([32 x i64], ptr @arr64, i64 0, i64 2), align 16
65 ; GFX9-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
66 ; GFX9-NEXT: [[OP_RDX:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
67 ; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i64 [[TMP5]], i64 [[SELECT1]]
68 ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
69 ; GFX9-NEXT: store i64 [[STORE_SELECT]], ptr @var64, align 8
70 ; GFX9-NEXT: ret i64 [[OP_RDX1]]
; Elements 0 and 1: this compare is the one with the extra use in the store below.
72 %load1 = load i64, ptr @arr64, align 16
73 %load2 = load i64, ptr getelementptr inbounds ([32 x i64], ptr @arr64, i64 0, i64 1), align 8
74 %cmp1 = icmp slt i64 %load1, %load2
75 %select1 = select i1 %cmp1, i64 %load1, i64 %load2
; Elements 2..5: each step compares the running minimum against the next load.
77 %load3 = load i64, ptr getelementptr inbounds ([32 x i64], ptr @arr64, i64 0, i64 2), align 16
78 %cmp2 = icmp slt i64 %select1, %load3
79 %select2 = select i1 %cmp2, i64 %select1, i64 %load3
81 %load4 = load i64, ptr getelementptr inbounds ([32 x i64], ptr @arr64, i64 0, i64 3), align 8
82 %cmp3 = icmp slt i64 %select2, %load4
83 %select3 = select i1 %cmp3, i64 %select2, i64 %load4
85 %load5 = load i64, ptr getelementptr inbounds ([32 x i64], ptr @arr64, i64 0, i64 4), align 16
86 %cmp4 = icmp slt i64 %select3, %load5
87 %select4 = select i1 %cmp4, i64 %select3, i64 %load5
89 %load6 = load i64, ptr getelementptr inbounds ([32 x i64], ptr @arr64, i64 0, i64 5), align 8
90 %cmp5 = icmp slt i64 %select4, %load6
91 %select5 = select i1 %cmp5, i64 %select4, i64 %load6
; Second use of %cmp1, outside the min/max chain.
93 %store-select = select i1 %cmp1, i64 3, i64 4
94 store i64 %store-select, ptr @var64, align 8
98 ; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
99 ; with fastmath on the select.
; fmaxv6: a floating-point max chain (fcmp fast ogt + select) over six
; consecutive float loads from @farr (indices 0..5). The fast-math flags are
; on the compares only; the selects carry none. %cmp1 is also used to select
; the constant (3.0 or 4.0) stored to @fvar.
;
; Expected output per the checks: only loads 0 and 1 are merged into a
; <2 x float> load; the rest of the chain stays scalar and no reduction
; intrinsic is emitted.
100 define float @fmaxv6() {
101 ; GFX9-LABEL: @fmaxv6(
102 ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @farr, align 16
103 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
104 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
105 ; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
106 ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
107 ; GFX9-NEXT: [[LOAD3:%.*]] = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 2), align 8
108 ; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]]
109 ; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]]
110 ; GFX9-NEXT: [[LOAD4:%.*]] = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 3), align 4
111 ; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]]
112 ; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]]
113 ; GFX9-NEXT: [[LOAD5:%.*]] = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 4), align 16
114 ; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]]
115 ; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]]
116 ; GFX9-NEXT: [[LOAD6:%.*]] = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 5), align 4
117 ; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]]
118 ; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]]
119 ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
120 ; GFX9-NEXT: store float [[STORE_SELECT]], ptr @fvar, align 8
121 ; GFX9-NEXT: ret float [[SELECT5]]
; Elements 0 and 1: this compare is the one with the extra use in the store below.
123 %load1 = load float, ptr @farr, align 16
124 %load2 = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 1), align 4
125 %cmp1 = fcmp fast ogt float %load1, %load2
126 %select1 = select i1 %cmp1, float %load1, float %load2
; Elements 2..5: each step compares the running maximum against the next load.
128 %load3 = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 2), align 8
129 %cmp2 = fcmp fast ogt float %select1, %load3
130 %select2 = select i1 %cmp2, float %select1, float %load3
132 %load4 = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 3), align 4
133 %cmp3 = fcmp fast ogt float %select2, %load4
134 %select3 = select i1 %cmp3, float %select2, float %load4
136 %load5 = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 4), align 16
137 %cmp4 = fcmp fast ogt float %select3, %load5
138 %select4 = select i1 %cmp4, float %select3, float %load5
140 %load6 = load float, ptr getelementptr inbounds ([32 x float], ptr @farr, i64 0, i64 5), align 4
141 %cmp5 = fcmp fast ogt float %select4, %load6
142 %select5 = select i1 %cmp5, float %select4, float %load6
; Second use of %cmp1, outside the min/max chain.
144 %store-select = select i1 %cmp1, float 3.0, float 4.0
145 store float %store-select, ptr @fvar, align 8
149 ; FIXME: Use fminnum intrinsics to match what InstCombine creates for fcmp+select
150 ; with fastmath on the select.
; dminv6: a floating-point min chain (fcmp fast olt + select) over six
; consecutive double loads from @darr (indices 0..5). The fast-math flags are
; on the compares only; the selects carry none. %cmp1 is also used to select
; the constant (3.0 or 4.0) stored to @dvar.
;
; Expected output per the checks: only loads 0 and 1 are merged into a
; <2 x double> load; the rest of the chain stays scalar and no reduction
; intrinsic is emitted.
;
; NOTE(review): the loads at indices 1, 3 and 5 use "align 4", the same values
; as the float variant of this test; this looks like a carry-over rather than
; a deliberate under-alignment - confirm. Changing it would require
; regenerating the GFX9 check lines.
151 define double @dminv6() {
152 ; GFX9-LABEL: @dminv6(
153 ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @darr, align 16
154 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
155 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
156 ; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
157 ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
158 ; GFX9-NEXT: [[LOAD3:%.*]] = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 2), align 8
159 ; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]]
160 ; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]]
161 ; GFX9-NEXT: [[LOAD4:%.*]] = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 3), align 4
162 ; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]]
163 ; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]]
164 ; GFX9-NEXT: [[LOAD5:%.*]] = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 4), align 16
165 ; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]]
166 ; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]]
167 ; GFX9-NEXT: [[LOAD6:%.*]] = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 5), align 4
168 ; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]]
169 ; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]]
170 ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
171 ; GFX9-NEXT: store double [[STORE_SELECT]], ptr @dvar, align 8
172 ; GFX9-NEXT: ret double [[SELECT5]]
; Elements 0 and 1: this compare is the one with the extra use in the store below.
174 %load1 = load double, ptr @darr, align 16
175 %load2 = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 1), align 4
176 %cmp1 = fcmp fast olt double %load1, %load2
177 %select1 = select i1 %cmp1, double %load1, double %load2
; Elements 2..5: each step compares the running minimum against the next load.
179 %load3 = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 2), align 8
180 %cmp2 = fcmp fast olt double %select1, %load3
181 %select2 = select i1 %cmp2, double %select1, double %load3
183 %load4 = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 3), align 4
184 %cmp3 = fcmp fast olt double %select2, %load4
185 %select3 = select i1 %cmp3, double %select2, double %load4
187 %load5 = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 4), align 16
188 %cmp4 = fcmp fast olt double %select3, %load5
189 %select4 = select i1 %cmp4, double %select3, double %load5
191 %load6 = load double, ptr getelementptr inbounds ([32 x double], ptr @darr, i64 0, i64 5), align 4
192 %cmp5 = fcmp fast olt double %select4, %load6
193 %select5 = select i1 %cmp5, double %select4, double %load6
; Second use of %cmp1, outside the min/max chain.
195 %store-select = select i1 %cmp1, double 3.0, double 4.0
196 store double %store-select, ptr @dvar, align 8
; smax_wdiff_valuenum: a signed-max chain whose first step does not compare two
; loads. It takes an unnamed i32 argument (unused in the visible body) and
; %v1. Element 0 of a <2 x i32> load from @arr is extracted twice, as %elt1
; for the compare and as %ex0 for the select, so the compare and the select
; name the same element through two different SSA values; the other operand
; of both is %v1.
; NOTE(review): "wdiff_valuenum" presumably means "with different value
; numbers" for those two extracts - confirm against the original commit.
;
; Expected output per the checks: the vector load, both extractelements and
; the first compare/select are kept as written; scalar loads 2..5 become a
; <4 x i32> load reduced with llvm.vector.reduce.smax.v4i32, combined with
; the first select by one more icmp sgt + select. %cmp1 still selects the
; constant (3 or 4) stored to @var.
200 define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
201 ; GFX9-LABEL: @smax_wdiff_valuenum(
202 ; GFX9-NEXT: [[VLOAD:%.*]] = load <2 x i32>, ptr @arr, align 16
203 ; GFX9-NEXT: [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
204 ; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
205 ; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
206 ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
207 ; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
208 ; GFX9-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
209 ; GFX9-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
210 ; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP3]], i32 [[SELECT1]]
211 ; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
212 ; GFX9-NEXT: store i32 [[STOREVAL]], ptr @var, align 8
213 ; GFX9-NEXT: ret i32 [[OP_RDX1]]
; The same vector element (index 0) is extracted twice on purpose; see header.
215 %vload = load <2 x i32>, ptr @arr, align 16
216 %elt1 = extractelement <2 x i32> %vload, i32 0
217 %cmp1 = icmp sgt i32 %elt1, %v1
218 %ex0 = extractelement <2 x i32> %vload, i32 0
219 %select1 = select i1 %cmp1, i32 %ex0, i32 %v1
; Elements 2..5: each step compares the running maximum against the next load.
221 %load3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
222 %cmp2 = icmp sgt i32 %select1, %load3
223 %select2 = select i1 %cmp2, i32 %select1, i32 %load3
225 %load4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
226 %cmp3 = icmp sgt i32 %select2, %load4
227 %select3 = select i1 %cmp3, i32 %select2, i32 %load4
229 %load5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
230 %cmp4 = icmp sgt i32 %select3, %load5
231 %select4 = select i1 %cmp4, i32 %select3, i32 %load5
233 %load6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
234 %cmp5 = icmp sgt i32 %select4, %load6
235 %select5 = select i1 %cmp5, i32 %select4, i32 %load6
; Second use of %cmp1, outside the min/max chain.
237 %storeval = select i1 %cmp1, i32 3, i32 4
238 store i32 %storeval, ptr @var, align 8