1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=VI
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9
5 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 signed-max test, written as "icmp sge" feeding a select.
; Each pointer argument is indexed by the workitem id before it is used.
; Both the VI and the GFX9 check lines expect the compare/select pair to be
; emitted as one v_max_i16_e32.
6 define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
7 ; VI-LABEL: v_test_imax_sge_i16:
9 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
11 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
12 ; VI-NEXT: s_waitcnt lgkmcnt(0)
13 ; VI-NEXT: v_mov_b32_e32 v1, s7
14 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
15 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
16 ; VI-NEXT: v_mov_b32_e32 v3, s1
17 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
18 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
19 ; VI-NEXT: flat_load_ushort v5, v[0:1]
20 ; VI-NEXT: flat_load_ushort v2, v[2:3]
21 ; VI-NEXT: v_mov_b32_e32 v1, s5
22 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
23 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
24 ; VI-NEXT: s_waitcnt vmcnt(0)
25 ; VI-NEXT: v_max_i16_e32 v2, v5, v2
26 ; VI-NEXT: flat_store_short v[0:1], v2
29 ; GFX9-LABEL: v_test_imax_sge_i16:
31 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
32 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
33 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
35 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
36 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
37 ; GFX9-NEXT: s_waitcnt vmcnt(0)
38 ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2
39 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; The workitem id is the element index for both inputs and for the output.
41 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
42 %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
43 %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
44 %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
45 %a = load i16, ptr addrspace(1) %gep0, align 4
46 %b = load i16, ptr addrspace(1) %gep1, align 4
; Pattern under test: select(a >= b, a, b) with a signed comparison.
47 %cmp = icmp sge i16 %a, %b
48 %val = select i1 %cmp, i16 %a, i16 %b
49 store i16 %val, ptr addrspace(1) %outgep, align 4
53 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; <2 x i16> signed-max test, written as a vector "icmp sge" feeding a select.
; The VI check lines expect the low half from v_max_i16_e32 and the high half
; from v_max_i16_sdwa, recombined with v_or_b32; the GFX9 check lines expect a
; single v_pk_max_i16.
54 define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
55 ; VI-LABEL: v_test_imax_sge_v2i16:
57 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
58 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
59 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
60 ; VI-NEXT: s_waitcnt lgkmcnt(0)
61 ; VI-NEXT: v_mov_b32_e32 v1, s7
62 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
63 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
64 ; VI-NEXT: v_mov_b32_e32 v3, s1
65 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
66 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
67 ; VI-NEXT: flat_load_dword v5, v[0:1]
68 ; VI-NEXT: flat_load_dword v2, v[2:3]
69 ; VI-NEXT: v_mov_b32_e32 v1, s5
70 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
71 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
72 ; VI-NEXT: s_waitcnt vmcnt(0)
73 ; VI-NEXT: v_max_i16_e32 v3, v5, v2
74 ; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
75 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
76 ; VI-NEXT: flat_store_dword v[0:1], v2
79 ; GFX9-LABEL: v_test_imax_sge_v2i16:
81 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
82 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
83 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
84 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
86 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
87 ; GFX9-NEXT: s_waitcnt vmcnt(0)
88 ; GFX9-NEXT: v_pk_max_i16 v1, v1, v2
89 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; The workitem id is the element index for both inputs and for the output.
91 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
92 %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
93 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
94 %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
95 %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
96 %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
; Pattern under test: per-element select(a >= b, a, b), signed comparison.
97 %cmp = icmp sge <2 x i16> %a, %b
98 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
99 store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
103 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; <3 x i16> signed-max test, written as a vector "icmp sge" feeding a select.
; In both check blocks each operand is fetched as a dword plus a separate
; 16-bit load at byte offset 4, and the result is written back as a dword
; store plus a short store at offset 4.
; The VI check lines expect v_max_i16_e32 / v_max_i16_sdwa / v_or_b32 for the
; first two elements and another v_max_i16_e32 for the third; the GFX9 check
; lines expect two v_pk_max_i16 instructions.
104 define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
105 ; VI-LABEL: v_test_imax_sge_v3i16:
107 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
108 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
109 ; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
110 ; VI-NEXT: s_waitcnt lgkmcnt(0)
111 ; VI-NEXT: v_mov_b32_e32 v1, s7
112 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
113 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
114 ; VI-NEXT: v_mov_b32_e32 v3, s1
115 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
116 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
117 ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
118 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
119 ; VI-NEXT: flat_load_ushort v4, v[4:5]
120 ; VI-NEXT: flat_load_dword v5, v[0:1]
121 ; VI-NEXT: flat_load_dword v7, v[2:3]
122 ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
123 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
124 ; VI-NEXT: flat_load_ushort v8, v[0:1]
125 ; VI-NEXT: v_mov_b32_e32 v1, s5
126 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6
127 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
128 ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
129 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
130 ; VI-NEXT: s_waitcnt vmcnt(1)
131 ; VI-NEXT: v_max_i16_e32 v6, v5, v7
132 ; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
133 ; VI-NEXT: v_or_b32_e32 v5, v6, v5
134 ; VI-NEXT: s_waitcnt vmcnt(0)
135 ; VI-NEXT: v_max_i16_e32 v4, v4, v8
136 ; VI-NEXT: flat_store_short v[2:3], v4
137 ; VI-NEXT: flat_store_dword v[0:1], v5
140 ; GFX9-LABEL: v_test_imax_sge_v3i16:
142 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
143 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
144 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
145 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
146 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
147 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
148 ; GFX9-NEXT: global_load_dword v3, v0, s[2:3]
150 ; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4
152 ; GFX9-NEXT: global_load_dword v4, v0, s[6:7]
154 ; GFX9-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:4
155 ; GFX9-NEXT: s_waitcnt vmcnt(1)
156 ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3
157 ; GFX9-NEXT: s_waitcnt vmcnt(0)
158 ; GFX9-NEXT: v_pk_max_i16 v1, v2, v1
159 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4
160 ; GFX9-NEXT: global_store_dword v0, v3, s[4:5]
161 ; GFX9-NEXT: s_endpgm
; The workitem id is the element index for both inputs and for the output.
162 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
163 %gep0 = getelementptr <3 x i16>, ptr addrspace(1) %aptr, i32 %tid
164 %gep1 = getelementptr <3 x i16>, ptr addrspace(1) %bptr, i32 %tid
165 %outgep = getelementptr <3 x i16>, ptr addrspace(1) %out, i32 %tid
166 %a = load <3 x i16>, ptr addrspace(1) %gep0, align 4
167 %b = load <3 x i16>, ptr addrspace(1) %gep1, align 4
; Pattern under test: per-element select(a >= b, a, b), signed comparison.
168 %cmp = icmp sge <3 x i16> %a, %b
169 %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
170 store <3 x i16> %val, ptr addrspace(1) %outgep, align 4
174 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; <4 x i16> signed-max test, written as a vector "icmp sge" feeding a select.
; Both check blocks load each operand with one dwordx2 load and store the
; result with one dwordx2 store.
; The VI check lines expect, per 32-bit half, v_max_i16_e32 plus
; v_max_i16_sdwa recombined with v_or_b32; the GFX9 check lines expect two
; v_pk_max_i16 instructions.
175 define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
176 ; VI-LABEL: v_test_imax_sge_v4i16:
178 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
179 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
180 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
181 ; VI-NEXT: s_waitcnt lgkmcnt(0)
182 ; VI-NEXT: v_mov_b32_e32 v1, s7
183 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
184 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
185 ; VI-NEXT: v_mov_b32_e32 v3, s1
186 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
187 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
188 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
189 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
190 ; VI-NEXT: v_mov_b32_e32 v5, s5
191 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
192 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
193 ; VI-NEXT: s_waitcnt vmcnt(0)
194 ; VI-NEXT: v_max_i16_e32 v6, v1, v3
195 ; VI-NEXT: v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
196 ; VI-NEXT: v_max_i16_e32 v3, v0, v2
197 ; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
198 ; VI-NEXT: v_or_b32_e32 v1, v6, v1
199 ; VI-NEXT: v_or_b32_e32 v0, v3, v0
200 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
203 ; GFX9-LABEL: v_test_imax_sge_v4i16:
205 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
206 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
207 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
208 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
210 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
211 ; GFX9-NEXT: s_waitcnt vmcnt(0)
212 ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3
213 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2
214 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
215 ; GFX9-NEXT: s_endpgm
; The workitem id is the element index for both inputs and for the output.
216 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
217 %gep0 = getelementptr <4 x i16>, ptr addrspace(1) %aptr, i32 %tid
218 %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %bptr, i32 %tid
219 %outgep = getelementptr <4 x i16>, ptr addrspace(1) %out, i32 %tid
220 %a = load <4 x i16>, ptr addrspace(1) %gep0, align 4
221 %b = load <4 x i16>, ptr addrspace(1) %gep1, align 4
; Pattern under test: per-element select(a >= b, a, b), signed comparison.
222 %cmp = icmp sge <4 x i16> %a, %b
223 %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
224 store <4 x i16> %val, ptr addrspace(1) %outgep, align 4
228 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 signed-max test using the strict form: "icmp sgt" feeding a
; select. Both the VI and the GFX9 check lines expect the same single
; v_max_i16_e32 that the sge variant of this test produces.
229 define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
230 ; VI-LABEL: v_test_imax_sgt_i16:
232 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
233 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
234 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
235 ; VI-NEXT: s_waitcnt lgkmcnt(0)
236 ; VI-NEXT: v_mov_b32_e32 v1, s7
237 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
238 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
239 ; VI-NEXT: v_mov_b32_e32 v3, s1
240 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
241 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
242 ; VI-NEXT: flat_load_ushort v5, v[0:1]
243 ; VI-NEXT: flat_load_ushort v2, v[2:3]
244 ; VI-NEXT: v_mov_b32_e32 v1, s5
245 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
246 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
247 ; VI-NEXT: s_waitcnt vmcnt(0)
248 ; VI-NEXT: v_max_i16_e32 v2, v5, v2
249 ; VI-NEXT: flat_store_short v[0:1], v2
252 ; GFX9-LABEL: v_test_imax_sgt_i16:
254 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
255 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
256 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
257 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
258 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
259 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
261 ; GFX9-NEXT: v_max_i16_e32 v1, v1, v2
262 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
263 ; GFX9-NEXT: s_endpgm
; The workitem id is the element index for both inputs and for the output.
264 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
265 %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
266 %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
267 %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
268 %a = load i16, ptr addrspace(1) %gep0, align 4
269 %b = load i16, ptr addrspace(1) %gep1, align 4
; Pattern under test: select(a > b, a, b) with a signed comparison.
270 %cmp = icmp sgt i16 %a, %b
271 %val = select i1 %cmp, i16 %a, i16 %b
272 store i16 %val, ptr addrspace(1) %outgep, align 4
276 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 unsigned-max test, written as "icmp uge" feeding a select.
; Both the VI and the GFX9 check lines expect the compare/select pair to be
; emitted as one v_max_u16_e32 (the unsigned counterpart of v_max_i16_e32).
277 define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
278 ; VI-LABEL: v_test_umax_uge_i16:
280 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
281 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
282 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
283 ; VI-NEXT: s_waitcnt lgkmcnt(0)
284 ; VI-NEXT: v_mov_b32_e32 v1, s7
285 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
286 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
287 ; VI-NEXT: v_mov_b32_e32 v3, s1
288 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
289 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
290 ; VI-NEXT: flat_load_ushort v5, v[0:1]
291 ; VI-NEXT: flat_load_ushort v2, v[2:3]
292 ; VI-NEXT: v_mov_b32_e32 v1, s5
293 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
294 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
295 ; VI-NEXT: s_waitcnt vmcnt(0)
296 ; VI-NEXT: v_max_u16_e32 v2, v5, v2
297 ; VI-NEXT: flat_store_short v[0:1], v2
300 ; GFX9-LABEL: v_test_umax_uge_i16:
302 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
303 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
304 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
305 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
307 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
308 ; GFX9-NEXT: s_waitcnt vmcnt(0)
309 ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2
310 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
311 ; GFX9-NEXT: s_endpgm
; The workitem id is the element index for both inputs and for the output.
312 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
313 %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
314 %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
315 %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
316 %a = load i16, ptr addrspace(1) %gep0, align 4
317 %b = load i16, ptr addrspace(1) %gep1, align 4
; Pattern under test: select(a >= b, a, b) with an unsigned comparison.
318 %cmp = icmp uge i16 %a, %b
319 %val = select i1 %cmp, i16 %a, i16 %b
320 store i16 %val, ptr addrspace(1) %outgep, align 4
324 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 unsigned-max test using the strict form: "icmp ugt" feeding a
; select. Both the VI and the GFX9 check lines expect the same single
; v_max_u16_e32 that the uge variant of this test produces.
325 define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
326 ; VI-LABEL: v_test_umax_ugt_i16:
328 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
329 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
330 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
331 ; VI-NEXT: s_waitcnt lgkmcnt(0)
332 ; VI-NEXT: v_mov_b32_e32 v1, s7
333 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
334 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
335 ; VI-NEXT: v_mov_b32_e32 v3, s1
336 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
337 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
338 ; VI-NEXT: flat_load_ushort v5, v[0:1]
339 ; VI-NEXT: flat_load_ushort v2, v[2:3]
340 ; VI-NEXT: v_mov_b32_e32 v1, s5
341 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
342 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
343 ; VI-NEXT: s_waitcnt vmcnt(0)
344 ; VI-NEXT: v_max_u16_e32 v2, v5, v2
345 ; VI-NEXT: flat_store_short v[0:1], v2
348 ; GFX9-LABEL: v_test_umax_ugt_i16:
350 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
351 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
352 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
353 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
354 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
355 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
356 ; GFX9-NEXT: s_waitcnt vmcnt(0)
357 ; GFX9-NEXT: v_max_u16_e32 v1, v1, v2
358 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
359 ; GFX9-NEXT: s_endpgm
; The workitem id is the element index for both inputs and for the output.
360 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
361 %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
362 %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
363 %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
364 %a = load i16, ptr addrspace(1) %gep0, align 4
365 %b = load i16, ptr addrspace(1) %gep1, align 4
; Pattern under test: select(a > b, a, b) with an unsigned comparison.
366 %cmp = icmp ugt i16 %a, %b
367 %val = select i1 %cmp, i16 %a, i16 %b
368 store i16 %val, ptr addrspace(1) %outgep, align 4
; <2 x i16> unsigned-max test, written as a vector "icmp ugt" feeding a
; select.
; The VI check lines expect the low half from v_max_u16_e32 and the high half
; from v_max_u16_sdwa, recombined with v_or_b32; the GFX9 check lines expect a
; single v_pk_max_u16.
372 define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
373 ; VI-LABEL: v_test_umax_ugt_v2i16:
375 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
376 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
377 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
378 ; VI-NEXT: s_waitcnt lgkmcnt(0)
379 ; VI-NEXT: v_mov_b32_e32 v1, s7
380 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
381 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
382 ; VI-NEXT: v_mov_b32_e32 v3, s1
383 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
384 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
385 ; VI-NEXT: flat_load_dword v5, v[0:1]
386 ; VI-NEXT: flat_load_dword v2, v[2:3]
387 ; VI-NEXT: v_mov_b32_e32 v1, s5
388 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
389 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
390 ; VI-NEXT: s_waitcnt vmcnt(0)
391 ; VI-NEXT: v_max_u16_e32 v3, v5, v2
392 ; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
393 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
394 ; VI-NEXT: flat_store_dword v[0:1], v2
397 ; GFX9-LABEL: v_test_umax_ugt_v2i16:
399 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
400 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
401 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
402 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
403 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
404 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
405 ; GFX9-NEXT: s_waitcnt vmcnt(0)
406 ; GFX9-NEXT: v_pk_max_u16 v1, v1, v2
407 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
408 ; GFX9-NEXT: s_endpgm
; The workitem id is the element index for both inputs and for the output.
409 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
410 %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
411 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
412 %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
413 %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
414 %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
; Pattern under test: per-element select(a > b, a, b), unsigned comparison.
415 %cmp = icmp ugt <2 x i16> %a, %b
416 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
417 store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
; Intrinsic returning an i32 workitem id; the kernels in this file call it to
; obtain the %tid index used in their getelementptr instructions.
421 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone