1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4 ; GCN-LABEL: {{^}}reduction_fadd_v4f16:
5 ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
6 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9 ; VI-NEXT: v_add_f16_e32
10 ; VI-NEXT: v_add_f16_e32
; Shuffle-based fadd reduction over a <4 x half> vector. After the two
; shuffle+fadd steps, lane 0 holds (e0 + e2) + (e1 + e3), which is extracted.
11 define half @reduction_fadd_v4f16(<4 x half> %vec4) {
; Step 1: bring elements 2 and 3 down to lanes 0 and 1 and add pairwise.
13 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
14 %bin.rdx = fadd <4 x half> %vec4, %rdx.shuf
; Step 2: bring lane 1 of the partial sums down to lane 0 and add.
15 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
16 %bin.rdx2 = fadd <4 x half> %bin.rdx, %rdx.shuf1
; Only lane 0 is defined by the reduction; the other lanes involve undef.
17 %res = extractelement <4 x half> %bin.rdx2, i32 0
21 ; GCN-LABEL: {{^}}reduction_fsub_v4f16:
23 ; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
24 ; GFX9-NEXT: v_sub_f16_sdwa v0, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
25 ; GFX9-NEXT: s_setpc_b64
28 ; VI-NEXT: v_sub_f16_e32
29 ; VI-NEXT: v_sub_f16_e32
30 ; VI-NEXT: s_setpc_b64
; Same shuffle-reduction shape as the fadd case, but with fsub and no
; fast-math flags. Lane 0 ends up as (e0 - e2) - (e1 - e3).
31 define half @reduction_fsub_v4f16(<4 x half> %vec4) {
; Step 1: lanes 0-1 minus elements 2-3.
33 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
34 %bin.rdx = fsub <4 x half> %vec4, %rdx.shuf
; Step 2: lane 0 of the partial result minus lane 1 of the partial result.
35 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
36 %bin.rdx2 = fsub <4 x half> %bin.rdx, %rdx.shuf1
37 %res = extractelement <4 x half> %bin.rdx2, i32 0
41 ; Make sure nsz is preserved when the operations are split.
42 ; GCN-LABEL: {{^}}reduction_fsub_v4f16_preserve_fmf:
44 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
45 ; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
46 ; GFX9-NEXT: s_setpc_b64
49 ; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
50 ; VI-NEXT: v_sub_f16_e32 v0, v1, v0
51 ; VI-NEXT: v_add_f16_e32 v0, v2, v0
52 ; VI-NEXT: s_setpc_b64
; fsub shuffle reduction where both vector fsubs carry the nsz flag, followed
; by a scalar negation of the extracted lane-0 result.
53 define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) {
; Step 1: lanes 0-1 minus elements 2-3, with nsz.
55 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
56 %bin.rdx = fsub nsz <4 x half> %vec4, %rdx.shuf
; Step 2: lane 0 minus lane 1 of the partial result, with nsz.
57 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
58 %bin.rdx2 = fsub nsz <4 x half> %bin.rdx, %rdx.shuf1
59 %res = extractelement <4 x half> %bin.rdx2, i32 0
; Negate the reduced value, written as a subtraction from -0.0. This fsub
; itself carries no fast-math flags.
60 %neg.res = fsub half -0.0, %res
64 ; GCN-LABEL: {{^}}reduction_fmul_half4:
65 ; GFX9: v_pk_mul_f16 [[MUL:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
66 ; GFX9-NEXT: v_mul_f16_sdwa v{{[0-9]+}}, [[MUL]], [[MUL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
69 ; VI-NEXT: v_mul_f16_e32
70 ; VI-NEXT: v_mul_f16_e32
; Shuffle-based fmul reduction over a <4 x half> vector. Lane 0 ends up as
; (e0 * e2) * (e1 * e3).
71 define half @reduction_fmul_half4(<4 x half> %vec4) {
; Step 1: multiply lanes 0-1 by elements 2-3.
73 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
74 %bin.rdx = fmul <4 x half> %vec4, %rdx.shuf
; Step 2: multiply lane 0 by lane 1 of the partial products.
75 %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
76 %bin.rdx2 = fmul <4 x half> %bin.rdx, %rdx.shuf1
77 %res = extractelement <4 x half> %bin.rdx2, i32 0
81 ; GCN-LABEL: {{^}}reduction_v4i16:
82 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
83 ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
86 ; VI-NEXT: v_add_u16_e32
87 ; VI-NEXT: v_add_u16_e32
; Shuffle-based integer add reduction over a <4 x i16> vector. Lane 0 ends up
; as (e0 + e2) + (e1 + e3).
88 define i16 @reduction_v4i16(<4 x i16> %vec4) {
; Step 1: add elements 2-3 onto lanes 0-1.
90 %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
91 %bin.rdx = add <4 x i16> %vec4, %rdx.shuf
; Step 2: add lane 1 of the partial sums onto lane 0.
92 %rdx.shuf1 = shufflevector <4 x i16> %bin.rdx, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
93 %bin.rdx2 = add <4 x i16> %bin.rdx, %rdx.shuf1
94 %res = extractelement <4 x i16> %bin.rdx2, i32 0
98 ; GCN-LABEL: {{^}}reduction_half8:
99 ; GFX9: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
100 ; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
101 ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
102 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
105 ; VI-NEXT: v_add_f16_sdwa
106 ; VI-NEXT: v_add_f16_e32
107 ; VI-NEXT: v_add_f16_e32
108 ; VI-NEXT: v_add_f16_e32
109 ; VI-NEXT: v_add_f16_e32
110 ; VI-NEXT: v_add_f16_e32
; Three-step shuffle-based fadd reduction over an <8 x half> vector
; (8 -> 4 -> 2 -> 1 live lanes); the result is taken from lane 0.
112 define half @reduction_half8(<8 x half> %vec8) {
; Step 1: add elements 4-7 onto lanes 0-3.
114 %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
115 %bin.rdx = fadd <8 x half> %vec8, %rdx.shuf
; Step 2: add lanes 2-3 of the partial sums onto lanes 0-1.
116 %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
117 %bin.rdx2 = fadd <8 x half> %bin.rdx, %rdx.shuf1
; Step 3: add lane 1 onto lane 0.
118 %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
119 %bin.rdx4 = fadd <8 x half> %bin.rdx2, %rdx.shuf3
120 %res = extractelement <8 x half> %bin.rdx4, i32 0
124 ; GCN-LABEL: {{^}}reduction_v8i16:
125 ; GFX9: v_pk_add_u16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
126 ; GFX9-NEXT: v_pk_add_u16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
127 ; GFX9-NEXT: v_pk_add_u16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
128 ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
131 ; VI-NEXT: v_add_u16_sdwa
132 ; VI-NEXT: v_add_u16_e32
133 ; VI-NEXT: v_add_u16_e32
134 ; VI-NEXT: v_add_u16_e32
135 ; VI-NEXT: v_add_u16_e32
136 ; VI-NEXT: v_add_u16_e32
; Three-step shuffle-based integer add reduction over an <8 x i16> vector
; (8 -> 4 -> 2 -> 1 live lanes); the result is taken from lane 0.
138 define i16 @reduction_v8i16(<8 x i16> %vec8) {
; Step 1: add elements 4-7 onto lanes 0-3.
140 %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
141 %bin.rdx = add <8 x i16> %vec8, %rdx.shuf
; Step 2: add lanes 2-3 of the partial sums onto lanes 0-1.
142 %rdx.shuf1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
143 %bin.rdx2 = add <8 x i16> %bin.rdx, %rdx.shuf1
; Step 3: add lane 1 onto lane 0.
144 %rdx.shuf3 = shufflevector <8 x i16> %bin.rdx2, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
145 %bin.rdx4 = add <8 x i16> %bin.rdx2, %rdx.shuf3
146 %res = extractelement <8 x i16> %bin.rdx4, i32 0
150 ; GCN-LABEL: {{^}}reduction_half16:
151 ; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
152 ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
153 ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
154 ; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
155 ; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
156 ; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
157 ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
158 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
161 ; VI-NEXT: v_add_f16_sdwa
162 ; VI-NEXT: v_add_f16_sdwa
163 ; VI-NEXT: v_add_f16_sdwa
164 ; VI-NEXT: v_add_f16_e32
165 ; VI-NEXT: v_add_f16_e32
166 ; VI-NEXT: v_add_f16_e32
167 ; VI-NEXT: v_add_f16_e32
168 ; VI-NEXT: v_add_f16_e32
169 ; VI-NEXT: v_add_f16_e32
170 ; VI-NEXT: v_add_f16_e32
171 ; VI-NEXT: v_add_f16_e32
172 ; VI-NEXT: v_add_f16_e32
173 ; VI-NEXT: v_add_f16_e32
174 ; VI-NEXT: v_add_f16_e32
; Four-step shuffle-based fadd reduction over a <16 x half> vector
; (16 -> 8 -> 4 -> 2 -> 1 live lanes); the result is taken from lane 0.
176 define half @reduction_half16(<16 x half> %vec16) {
; Step 1: add elements 8-15 onto lanes 0-7.
178 %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
179 %bin.rdx = fadd <16 x half> %vec16, %rdx.shuf
; Step 2: add lanes 4-7 of the partial sums onto lanes 0-3.
180 %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
181 %bin.rdx2 = fadd <16 x half> %bin.rdx, %rdx.shuf1
; Step 3: add lanes 2-3 onto lanes 0-1.
182 %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
183 %bin.rdx4 = fadd <16 x half> %bin.rdx2, %rdx.shuf3
; Step 4: add lane 1 onto lane 0.
184 %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
185 %bin.rdx6 = fadd <16 x half> %bin.rdx4, %rdx.shuf5
186 %res = extractelement <16 x half> %bin.rdx6, i32 0
190 ; GCN-LABEL: {{^}}reduction_min_v4i16:
191 ; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
192 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
195 ; VI-NEXT: v_min_u16_e32
196 ; VI-NEXT: v_min_u16_e32
; Shuffle-based minimum reduction over a <4 x i16> vector, written as
; icmp + select. The predicate is ult, so this is an unsigned minimum even
; though the function name does not carry a "u".
197 define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
; Step 1: per-lane unsigned min of lanes 0-1 against elements 2-3.
199 %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
200 %rdx.minmax.cmp = icmp ult <4 x i16> %vec4, %rdx.shuf
201 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
; Step 2: unsigned min of lane 0 against lane 1 of the partial result.
202 %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
203 %rdx.minmax.cmp2 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf1
204 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
205 %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
209 ; GCN-LABEL: {{^}}reduction_umin_v8i16:
210 ; GFX9: v_pk_min_u16 [[MIN1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
211 ; GFX9-NEXT: v_pk_min_u16 [[MIN2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
212 ; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
213 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
216 ; VI-NEXT: v_min_u16_sdwa
217 ; VI-NEXT: v_min_u16_e32
218 ; VI-NEXT: v_min_u16_e32
219 ; VI-NEXT: v_min_u16_e32
220 ; VI-NEXT: v_min_u16_e32
221 ; VI-NEXT: v_min_u16_e32
; Three-step shuffle-based unsigned minimum reduction over an <8 x i16>
; vector, each step written as icmp ult + select; result taken from lane 0.
222 define i16 @reduction_umin_v8i16(<8 x i16> %vec8) {
; Step 1: unsigned min of lanes 0-3 against elements 4-7.
224 %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
225 %rdx.minmax.cmp = icmp ult <8 x i16> %vec8, %rdx.shuf
226 %rdx.minmax.select = select <8 x i1> %rdx.minmax.cmp, <8 x i16> %vec8, <8 x i16> %rdx.shuf
; Step 2: unsigned min of lanes 0-1 against lanes 2-3 of the partial result.
227 %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
228 %rdx.minmax.cmp2 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf1
229 %rdx.minmax.select3 = select <8 x i1> %rdx.minmax.cmp2, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf1
; Step 3: unsigned min of lane 0 against lane 1.
230 %rdx.shuf4 = shufflevector <8 x i16> %rdx.minmax.select3, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
231 %rdx.minmax.cmp5 = icmp ult <8 x i16> %rdx.minmax.select3, %rdx.shuf4
232 %rdx.minmax.select6 = select <8 x i1> %rdx.minmax.cmp5, <8 x i16> %rdx.minmax.select3, <8 x i16> %rdx.shuf4
233 %res = extractelement <8 x i16> %rdx.minmax.select6, i32 0
237 ; Test that, without SLP vectorization, more instructions are generated.
238 ; GCN-LABEL: {{^}}reduction_umin_v8i16_woslp:
239 ; GFX9: v_lshrrev_b32_e32
240 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
241 ; GFX9-NEXT: v_lshrrev_b32_e32
242 ; GFX9-NEXT: v_min3_u16
243 ; GFX9-NEXT: v_lshrrev_b32_e32
244 ; GFX9-NEXT: v_min3_u16
245 ; GFX9-NEXT: v_min3_u16
; Scalar (non-vectorized) form of the unsigned minimum over an <8 x i16>
; vector: every element is extracted and folded into a running minimum with
; a linear chain of icmp ult + select.
246 define i16 @reduction_umin_v8i16_woslp(<8 x i16> %vec8) {
; Extract all eight elements as scalars.
248 %elt0 = extractelement <8 x i16> %vec8, i64 0
249 %elt1 = extractelement <8 x i16> %vec8, i64 1
250 %elt2 = extractelement <8 x i16> %vec8, i64 2
251 %elt3 = extractelement <8 x i16> %vec8, i64 3
252 %elt4 = extractelement <8 x i16> %vec8, i64 4
253 %elt5 = extractelement <8 x i16> %vec8, i64 5
254 %elt6 = extractelement <8 x i16> %vec8, i64 6
255 %elt7 = extractelement <8 x i16> %vec8, i64 7
; Running minimum: %minK is the unsigned min of elements 0..K.
257 %cmp0 = icmp ult i16 %elt1, %elt0
258 %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
259 %cmp1 = icmp ult i16 %elt2, %min1
260 %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
261 %cmp2 = icmp ult i16 %elt3, %min2
262 %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
264 %cmp3 = icmp ult i16 %elt4, %min3
265 %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
266 %cmp4 = icmp ult i16 %elt5, %min4
267 %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
269 %cmp5 = icmp ult i16 %elt6, %min5
270 %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
271 %cmp6 = icmp ult i16 %elt7, %min6
; %min7 is the unsigned minimum of all eight elements.
272 %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
277 ; GCN-LABEL: {{^}}reduction_smin_v16i16:
278 ; GFX9: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
279 ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
280 ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
281 ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
282 ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
283 ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
284 ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
285 ; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
288 ; VI-NEXT: v_min_i16_sdwa
289 ; VI-NEXT: v_min_i16_sdwa
290 ; VI-NEXT: v_min_i16_sdwa
291 ; VI-NEXT: v_min_i16_e32
292 ; VI-NEXT: v_min_i16_e32
293 ; VI-NEXT: v_min_i16_e32
294 ; VI-NEXT: v_min_i16_e32
295 ; VI-NEXT: v_min_i16_e32
296 ; VI-NEXT: v_min_i16_e32
297 ; VI-NEXT: v_min_i16_e32
298 ; VI-NEXT: v_min_i16_e32
299 ; VI-NEXT: v_min_i16_e32
300 ; VI-NEXT: v_min_i16_e32
301 ; VI-NEXT: v_min_i16_e32
; Four-step shuffle-based signed minimum reduction over a <16 x i16> vector
; (16 -> 8 -> 4 -> 2 -> 1 live lanes), each step written as icmp slt + select;
; the result is taken from lane 0.
302 define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
; Step 1: signed min of lanes 0-7 against elements 8-15.
304 %rdx.shuf = shufflevector <16 x i16> %vec16, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
305 %rdx.minmax.cmp = icmp slt <16 x i16> %vec16, %rdx.shuf
306 %rdx.minmax.select = select <16 x i1> %rdx.minmax.cmp, <16 x i16> %vec16, <16 x i16> %rdx.shuf
; Step 2: signed min of lanes 0-3 against lanes 4-7 of the partial result.
307 %rdx.shuf1 = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
308 %rdx.minmax.cmp2 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf1
309 %rdx.minmax.select3 = select <16 x i1> %rdx.minmax.cmp2, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf1
; Step 3: signed min of lanes 0-1 against lanes 2-3.
310 %rdx.shuf4 = shufflevector <16 x i16> %rdx.minmax.select3, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
311 %rdx.minmax.cmp5 = icmp slt <16 x i16> %rdx.minmax.select3, %rdx.shuf4
312 %rdx.minmax.select6 = select <16 x i1> %rdx.minmax.cmp5, <16 x i16> %rdx.minmax.select3, <16 x i16> %rdx.shuf4
; Step 4: signed min of lane 0 against lane 1.
313 %rdx.shuf7 = shufflevector <16 x i16> %rdx.minmax.select6, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
314 %rdx.minmax.cmp8 = icmp slt <16 x i16> %rdx.minmax.select6, %rdx.shuf7
315 %rdx.minmax.select9 = select <16 x i1> %rdx.minmax.cmp8, <16 x i16> %rdx.minmax.select6, <16 x i16> %rdx.shuf7
316 %res = extractelement <16 x i16> %rdx.minmax.select9, i32 0
320 ; Test that, without SLP vectorization, more instructions are generated.
321 ; GCN-LABEL: {{^}}reduction_smin_v16i16_woslp:
322 ; GFX9: v_lshrrev_b32_e32
323 ; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
324 ; GFX9-NEXT: v_lshrrev_b32_e32
325 ; GFX9-NEXT: v_min3_i16
326 ; GFX9-NEXT: v_lshrrev_b32_e32
327 ; GFX9-NEXT: v_min3_i16
328 ; GFX9-NEXT: v_lshrrev_b32_e32
329 ; GFX9-NEXT: v_min3_i16
330 ; GFX9-NEXT: v_lshrrev_b32_e32
331 ; GFX9-NEXT: v_min3_i16
332 ; GFX9-NEXT: v_lshrrev_b32_e32
333 ; GFX9-NEXT: v_min3_i16
334 ; GFX9-NEXT: v_lshrrev_b32_e32
335 ; GFX9-NEXT: v_min3_i16
336 ; GFX9-NEXT: v_min3_i16
; Scalar (non-vectorized) form of the signed minimum over a <16 x i16>
; vector: every element is extracted and folded into a running minimum with
; a linear chain of icmp slt + select.
337 define i16 @reduction_smin_v16i16_woslp(<16 x i16> %vec16) {
; Extract all sixteen elements as scalars.
339 %elt0 = extractelement <16 x i16> %vec16, i64 0
340 %elt1 = extractelement <16 x i16> %vec16, i64 1
341 %elt2 = extractelement <16 x i16> %vec16, i64 2
342 %elt3 = extractelement <16 x i16> %vec16, i64 3
343 %elt4 = extractelement <16 x i16> %vec16, i64 4
344 %elt5 = extractelement <16 x i16> %vec16, i64 5
345 %elt6 = extractelement <16 x i16> %vec16, i64 6
346 %elt7 = extractelement <16 x i16> %vec16, i64 7
348 %elt8 = extractelement <16 x i16> %vec16, i64 8
349 %elt9 = extractelement <16 x i16> %vec16, i64 9
350 %elt10 = extractelement <16 x i16> %vec16, i64 10
351 %elt11 = extractelement <16 x i16> %vec16, i64 11
352 %elt12 = extractelement <16 x i16> %vec16, i64 12
353 %elt13 = extractelement <16 x i16> %vec16, i64 13
354 %elt14 = extractelement <16 x i16> %vec16, i64 14
355 %elt15 = extractelement <16 x i16> %vec16, i64 15
; Running minimum: %minK is the signed min of elements 0..K.
357 %cmp0 = icmp slt i16 %elt1, %elt0
358 %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
359 %cmp1 = icmp slt i16 %elt2, %min1
360 %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
361 %cmp2 = icmp slt i16 %elt3, %min2
362 %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
364 %cmp3 = icmp slt i16 %elt4, %min3
365 %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
366 %cmp4 = icmp slt i16 %elt5, %min4
367 %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
369 %cmp5 = icmp slt i16 %elt6, %min5
370 %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
371 %cmp6 = icmp slt i16 %elt7, %min6
372 %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
374 %cmp7 = icmp slt i16 %elt8, %min7
375 %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
376 %cmp8 = icmp slt i16 %elt9, %min8
377 %min9 = select i1 %cmp8, i16 %elt9, i16 %min8
379 %cmp9 = icmp slt i16 %elt10, %min9
380 %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
381 %cmp10 = icmp slt i16 %elt11, %min10
382 %min11 = select i1 %cmp10, i16 %elt11, i16 %min10
384 %cmp11 = icmp slt i16 %elt12, %min11
385 %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
386 %cmp12 = icmp slt i16 %elt13, %min12
387 %min13 = select i1 %cmp12, i16 %elt13, i16 %min12
389 %cmp13 = icmp slt i16 %elt14, %min13
390 %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
391 %cmp14 = icmp slt i16 %elt15, %min14
; %min15 is the signed minimum of all sixteen elements.
392 %min15 = select i1 %cmp14, i16 %elt15, i16 %min14
398 ; GCN-LABEL: {{^}}reduction_umax_v4i16:
399 ; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
400 ; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
403 ; VI-NEXT: v_max_u16_e32
404 ; VI-NEXT: v_max_u16_e32
; Shuffle-based unsigned maximum reduction over a <4 x i16> vector, written
; as icmp ugt + select; the result is taken from lane 0.
405 define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
; Step 1: unsigned max of lanes 0-1 against elements 2-3.
407 %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
408 %rdx.minmax.cmp = icmp ugt <4 x i16> %vec4, %rdx.shuf
409 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
; Step 2: unsigned max of lane 0 against lane 1 of the partial result.
410 %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
411 %rdx.minmax.cmp2 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf1
412 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
413 %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
417 ; GCN-LABEL: {{^}}reduction_smax_v4i16:
418 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
419 ; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
422 ; VI-NEXT: v_max_i16_e32
423 ; VI-NEXT: v_max_i16_e32
; Shuffle-based signed maximum reduction over a <4 x i16> vector, written as
; icmp sgt + select; the result is taken from lane 0.
; NOTE(review): this is the only function here that references attribute
; group #0, and no definition of #0 is visible in this part of the file.
; Confirm it is defined, or drop the reference.
424 define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
; Step 1: signed max of lanes 0-1 against elements 2-3.
426 %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
427 %rdx.minmax.cmp = icmp sgt <4 x i16> %vec4, %rdx.shuf
428 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
; Step 2: signed max of lane 0 against lane 1 of the partial result.
429 %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
430 %rdx.minmax.cmp2 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf1
431 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
432 %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
436 ; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
438 ; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
439 ; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
440 ; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
442 ; FIXME: Extra canonicalize leftover
443 ; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
444 ; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
446 ; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
447 ; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
448 ; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
449 ; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
451 ; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
452 ; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
453 ; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
; Shuffle-based floating-point maximum reduction over a <4 x half> vector,
; using the llvm.maxnum.v4f16 intrinsic for each step; result from lane 0.
454 define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
; Step 1: maxnum of lanes 0-1 against elements 2-3.
456 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
457 %rdx.minmax = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
; Step 2: maxnum of lane 0 against lane 1 of the partial result.
458 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
459 %rdx.minmax3 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
460 %res = extractelement <4 x half> %rdx.minmax3, i32 0
464 ; GCN-LABEL: {{^}}reduction_minnum_v4f16:
466 ; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
467 ; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
468 ; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
470 ; FIXME: Extra canonicalize leftover
471 ; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
472 ; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
475 ; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
476 ; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
477 ; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
478 ; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
480 ; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
481 ; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
482 ; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
; Shuffle-based floating-point minimum reduction over a <4 x half> vector,
; using the llvm.minnum.v4f16 intrinsic for each step; result from lane 0.
483 define half @reduction_minnum_v4f16(<4 x half> %vec4) {
; Step 1: minnum of lanes 0-1 against elements 2-3.
485 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
486 %rdx.minmax = call <4 x half> @llvm.minnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
; Step 2: minnum of lane 0 against lane 1 of the partial result.
487 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
488 %rdx.minmax3 = call <4 x half> @llvm.minnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
489 %res = extractelement <4 x half> %rdx.minmax3, i32 0
493 ; FIXME: Need to preserve fast math flags when fmaxnum is matched
494 ; directly from the IR to avoid unnecessary quieting.
496 ; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
497 ; XGFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
498 ; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
501 ; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
502 ; XVI-NEXT: v_max_f16_e32 v0, v0, v1
503 ; XVI-NEXT: v_max_f16_e32 v0, v0, v2
504 ; XVI-NEXT: s_setpc_b64
507 ; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
508 ; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
509 ; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
511 ; FIXME: Extra canonicalize leftover
512 ; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
513 ; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
515 ; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
516 ; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
517 ; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
518 ; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
520 ; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
521 ; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
522 ; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
; Shuffle-based floating-point maximum reduction over a <4 x half> vector,
; written as a compare+select pattern (fcmp nnan nsz ogt, then select)
; instead of a maxnum intrinsic call; the result is taken from lane 0.
523 define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
; Step 1: per-lane max of lanes 0-1 against elements 2-3.
525 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
526 %rdx.minmax.cmp = fcmp nnan nsz ogt <4 x half> %vec4, %rdx.shuf
527 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
; Step 2: max of lane 0 against lane 1 of the partial result.
528 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
529 %rdx.minmax.cmp2 = fcmp nnan nsz ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
530 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
531 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
535 ; FIXME: Need to preserve fast math flags when fminnum is matched
536 ; directly from the IR to avoid unnecessary quieting.
538 ; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
539 ; XGFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
540 ; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
543 ; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
544 ; XVI-NEXT: v_min_f16_e32 v0, v0, v1
545 ; XVI-NEXT: v_min_f16_e32 v0, v0, v2
546 ; XVI-NEXT: s_setpc_b64
549 ; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
550 ; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
551 ; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
553 ; FIXME: Extra canonicalize leftover
554 ; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
555 ; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
558 ; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
559 ; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
560 ; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
561 ; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
563 ; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
564 ; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
565 ; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
; Shuffle-based floating-point minimum reduction over a <4 x half> vector,
; written as a compare+select pattern (fcmp nnan nsz olt, then select)
; instead of a minnum intrinsic call; the result is taken from lane 0.
566 define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
; Step 1: per-lane min of lanes 0-1 against elements 2-3.
568 %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
569 %rdx.minmax.cmp = fcmp nnan nsz olt <4 x half> %vec4, %rdx.shuf
570 %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
; Step 2: min of lane 0 against lane 1 of the partial result.
571 %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
572 %rdx.minmax.cmp2 = fcmp nnan nsz olt <4 x half> %rdx.minmax.select, %rdx.shuf1
573 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
574 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
; Declarations of the <4 x half> minnum/maxnum intrinsics called by
; reduction_minnum_v4f16 and reduction_maxnum_v4f16.
578 declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
579 declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)