1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
; udiv_i32: unsigned 32-bit division where both operands come from memory.
; %a is loaded from %in, %b from the i32 element that follows it, and the
; quotient is stored to %out.
;
; The expected output for the GCN-family runs shows the division expanded
; inline: the divisor is converted to float, v_rcp_iflag_f32 is applied,
; the result is scaled by 0x4f7ffffe and converted back to an integer, the
; estimate is adjusted with a mul_lo / mul_hi / add sequence, and the
; quotient is then corrected by two compare-and-select steps. The gfx1030
; run moves the loaded values into SGPRs with v_readfirstlane_b32 and does
; the integer part with scalar instructions; the r600 run starts from
; RECIP_UINT instead of the float reciprocal sequence.
8 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
11 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
12 ; SI-NEXT: s_mov_b32 s7, 0xf000
13 ; SI-NEXT: s_mov_b32 s6, -1
14 ; SI-NEXT: s_mov_b32 s10, s6
15 ; SI-NEXT: s_mov_b32 s11, s7
16 ; SI-NEXT: s_waitcnt lgkmcnt(0)
17 ; SI-NEXT: s_mov_b32 s8, s2
18 ; SI-NEXT: s_mov_b32 s9, s3
19 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
20 ; SI-NEXT: s_mov_b32 s4, s0
21 ; SI-NEXT: s_mov_b32 s5, s1
22 ; SI-NEXT: s_waitcnt vmcnt(0)
23 ; SI-NEXT: v_cvt_f32_u32_e32 v2, v1
24 ; SI-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
25 ; SI-NEXT: v_rcp_iflag_f32_e32 v2, v2
26 ; SI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
27 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v2
28 ; SI-NEXT: v_mul_lo_u32 v3, v3, v2
29 ; SI-NEXT: v_mul_hi_u32 v3, v2, v3
30 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
31 ; SI-NEXT: v_mul_hi_u32 v2, v0, v2
32 ; SI-NEXT: v_mul_lo_u32 v3, v2, v1
33 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2
34 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
35 ; SI-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
36 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
37 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
38 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v2
40 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
41 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
42 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
47 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
48 ; VI-NEXT: s_mov_b32 s7, 0xf000
49 ; VI-NEXT: s_mov_b32 s6, -1
50 ; VI-NEXT: s_mov_b32 s10, s6
51 ; VI-NEXT: s_mov_b32 s11, s7
52 ; VI-NEXT: s_waitcnt lgkmcnt(0)
53 ; VI-NEXT: s_mov_b32 s8, s2
54 ; VI-NEXT: s_mov_b32 s9, s3
55 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
56 ; VI-NEXT: s_mov_b32 s4, s0
57 ; VI-NEXT: s_mov_b32 s5, s1
58 ; VI-NEXT: s_waitcnt vmcnt(0)
59 ; VI-NEXT: v_cvt_f32_u32_e32 v2, v1
60 ; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
61 ; VI-NEXT: v_rcp_iflag_f32_e32 v2, v2
62 ; VI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
63 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
64 ; VI-NEXT: v_mul_lo_u32 v3, v3, v2
65 ; VI-NEXT: v_mul_hi_u32 v3, v2, v3
66 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3
67 ; VI-NEXT: v_mul_hi_u32 v2, v0, v2
68 ; VI-NEXT: v_mul_lo_u32 v3, v2, v1
69 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2
70 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
71 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v0, v1
72 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
73 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
74 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
75 ; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2
76 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
77 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
78 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
81 ; GCN-LABEL: udiv_i32:
83 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
84 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
85 ; GCN-NEXT: v_mov_b32_e32 v0, s2
86 ; GCN-NEXT: v_mov_b32_e32 v1, s3
87 ; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
88 ; GCN-NEXT: s_waitcnt vmcnt(0)
89 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
90 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
91 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
92 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
93 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
94 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2
95 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v3
96 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3
97 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
98 ; GCN-NEXT: v_mov_b32_e32 v2, s0
99 ; GCN-NEXT: v_mov_b32_e32 v3, s1
100 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v1
101 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4
102 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v5
103 ; GCN-NEXT: v_sub_u32_e32 v5, vcc, v0, v1
104 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
105 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
106 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
107 ; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4
108 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
109 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
110 ; GCN-NEXT: flat_store_dword v[2:3], v0
113 ; GFX1030-LABEL: udiv_i32:
115 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
116 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0
117 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
118 ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
119 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
120 ; GFX1030-NEXT: v_readfirstlane_b32 s2, v1
121 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v0
122 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2
123 ; GFX1030-NEXT: s_sub_i32 s4, 0, s2
124 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1
125 ; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
126 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1
127 ; GFX1030-NEXT: v_readfirstlane_b32 s3, v1
128 ; GFX1030-NEXT: s_mul_i32 s4, s4, s3
129 ; GFX1030-NEXT: s_mul_hi_u32 s4, s3, s4
130 ; GFX1030-NEXT: s_add_i32 s3, s3, s4
131 ; GFX1030-NEXT: s_mul_hi_u32 s3, s5, s3
132 ; GFX1030-NEXT: s_mul_i32 s4, s3, s2
133 ; GFX1030-NEXT: s_sub_i32 s4, s5, s4
134 ; GFX1030-NEXT: s_add_i32 s5, s3, 1
135 ; GFX1030-NEXT: s_sub_i32 s6, s4, s2
136 ; GFX1030-NEXT: s_cmp_ge_u32 s4, s2
137 ; GFX1030-NEXT: s_cselect_b32 s3, s5, s3
138 ; GFX1030-NEXT: s_cselect_b32 s4, s6, s4
139 ; GFX1030-NEXT: s_add_i32 s5, s3, 1
140 ; GFX1030-NEXT: s_cmp_ge_u32 s4, s2
141 ; GFX1030-NEXT: s_cselect_b32 s2, s5, s3
142 ; GFX1030-NEXT: v_mov_b32_e32 v0, s2
143 ; GFX1030-NEXT: global_store_dword v2, v0, s[0:1]
144 ; GFX1030-NEXT: s_endpgm
146 ; EG-LABEL: udiv_i32:
148 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
150 ; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
151 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
154 ; EG-NEXT: Fetch clause starting at 6:
155 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
156 ; EG-NEXT: ALU clause starting at 8:
157 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
158 ; EG-NEXT: ALU clause starting at 9:
159 ; EG-NEXT: SUB_INT T0.W, 0.0, T0.Y,
160 ; EG-NEXT: RECIP_UINT * T0.Z, T0.Y,
161 ; EG-NEXT: MULLO_INT * T0.W, PV.W, PS,
162 ; EG-NEXT: MULHI * T0.W, T0.Z, PS,
163 ; EG-NEXT: ADD_INT * T0.W, T0.Z, PS,
164 ; EG-NEXT: MULHI * T0.Z, T0.X, PV.W,
165 ; EG-NEXT: MULLO_INT * T0.W, PS, T0.Y,
166 ; EG-NEXT: SUB_INT * T0.W, T0.X, PS,
167 ; EG-NEXT: ADD_INT T1.Z, T0.Z, 1,
168 ; EG-NEXT: SETGE_UINT T1.W, PV.W, T0.Y,
169 ; EG-NEXT: SUB_INT * T2.W, PV.W, T0.Y,
170 ; EG-NEXT: CNDE_INT T0.W, PV.W, T0.W, PS,
171 ; EG-NEXT: CNDE_INT * T1.W, PV.W, T0.Z, PV.Z,
172 ; EG-NEXT: ADD_INT T2.W, PS, 1,
173 ; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.Y,
174 ; EG-NEXT: CNDE_INT T0.X, PS, T1.W, PV.W,
175 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
176 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: the dividend is in[0], the divisor is in[1].
177 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
178 %a = load i32, ptr addrspace(1) %in
179 %b = load i32, ptr addrspace(1) %b_ptr
180 %result = udiv i32 %a, %b
181 store i32 %result, ptr addrspace(1) %out
; s_udiv_i32: unsigned 32-bit division where both operands are i32 kernel
; arguments (%a, %b) rather than values loaded from memory. The quotient is
; stored to %out.
;
; In the expected output the operands are read straight from SGPRs (s2 is
; the dividend, s3 the divisor). The verde, tonga and fiji runs keep the
; reciprocal estimate and the quotient in VGPRs but compute the remainder
; with scalar instructions (s_mul_i32 / s_sub_i32 / s_cmp_ge_u32). The
; gfx1030 run performs all of the integer work on the scalar unit after a
; single v_readfirstlane_b32. The r600 run reads the operands directly from
; the constant buffer (KC0[2].Z and KC0[2].W).
185 define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
186 ; SI-LABEL: s_udiv_i32:
188 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
189 ; SI-NEXT: s_mov_b32 s7, 0xf000
190 ; SI-NEXT: s_mov_b32 s6, -1
191 ; SI-NEXT: s_waitcnt lgkmcnt(0)
192 ; SI-NEXT: v_cvt_f32_u32_e32 v0, s3
193 ; SI-NEXT: s_sub_i32 s4, 0, s3
194 ; SI-NEXT: s_mov_b32 s5, s1
195 ; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0
196 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
197 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
198 ; SI-NEXT: v_mul_lo_u32 v1, s4, v0
199 ; SI-NEXT: s_mov_b32 s4, s0
200 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1
201 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
202 ; SI-NEXT: v_mul_hi_u32 v0, s2, v0
203 ; SI-NEXT: v_readfirstlane_b32 s0, v0
204 ; SI-NEXT: s_mul_i32 s0, s0, s3
205 ; SI-NEXT: s_sub_i32 s0, s2, s0
206 ; SI-NEXT: s_sub_i32 s1, s0, s3
207 ; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0
208 ; SI-NEXT: s_cmp_ge_u32 s0, s3
209 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
210 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
211 ; SI-NEXT: s_cselect_b32 s0, s1, s0
212 ; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0
213 ; SI-NEXT: s_cmp_ge_u32 s0, s3
214 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
215 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
216 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
219 ; VI-LABEL: s_udiv_i32:
221 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
222 ; VI-NEXT: s_mov_b32 s7, 0xf000
223 ; VI-NEXT: s_mov_b32 s6, -1
224 ; VI-NEXT: s_waitcnt lgkmcnt(0)
225 ; VI-NEXT: v_cvt_f32_u32_e32 v0, s3
226 ; VI-NEXT: s_sub_i32 s4, 0, s3
227 ; VI-NEXT: s_mov_b32 s5, s1
228 ; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0
229 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
230 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
231 ; VI-NEXT: v_mul_lo_u32 v1, s4, v0
232 ; VI-NEXT: s_mov_b32 s4, s0
233 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1
234 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
235 ; VI-NEXT: v_mul_hi_u32 v0, s2, v0
236 ; VI-NEXT: v_readfirstlane_b32 s0, v0
237 ; VI-NEXT: s_mul_i32 s0, s0, s3
238 ; VI-NEXT: s_sub_i32 s0, s2, s0
239 ; VI-NEXT: s_sub_i32 s1, s0, s3
240 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0
241 ; VI-NEXT: s_cmp_ge_u32 s0, s3
242 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
243 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
244 ; VI-NEXT: s_cselect_b32 s0, s1, s0
245 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0
246 ; VI-NEXT: s_cmp_ge_u32 s0, s3
247 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
248 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
249 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
252 ; GCN-LABEL: s_udiv_i32:
254 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
255 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
256 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
257 ; GCN-NEXT: s_sub_i32 s4, 0, s3
258 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
259 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
260 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
261 ; GCN-NEXT: v_mul_lo_u32 v1, s4, v0
262 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1
263 ; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
264 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
265 ; GCN-NEXT: v_readfirstlane_b32 s4, v0
266 ; GCN-NEXT: s_mul_i32 s4, s4, s3
267 ; GCN-NEXT: s_sub_i32 s2, s2, s4
268 ; GCN-NEXT: s_sub_i32 s4, s2, s3
269 ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
270 ; GCN-NEXT: s_cmp_ge_u32 s2, s3
271 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0
272 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
273 ; GCN-NEXT: s_cselect_b32 s2, s4, s2
274 ; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
275 ; GCN-NEXT: s_cmp_ge_u32 s2, s3
276 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0
277 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
278 ; GCN-NEXT: v_mov_b32_e32 v0, s0
279 ; GCN-NEXT: v_mov_b32_e32 v1, s1
280 ; GCN-NEXT: flat_store_dword v[0:1], v2
283 ; GFX1030-LABEL: s_udiv_i32:
285 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
286 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
287 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3
288 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3
289 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0
290 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
291 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0
292 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v0
293 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
294 ; GFX1030-NEXT: s_mul_i32 s5, s5, s4
295 ; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s5
296 ; GFX1030-NEXT: s_add_i32 s4, s4, s5
297 ; GFX1030-NEXT: s_mul_hi_u32 s4, s2, s4
298 ; GFX1030-NEXT: s_mul_i32 s5, s4, s3
299 ; GFX1030-NEXT: s_sub_i32 s2, s2, s5
300 ; GFX1030-NEXT: s_add_i32 s5, s4, 1
301 ; GFX1030-NEXT: s_sub_i32 s6, s2, s3
302 ; GFX1030-NEXT: s_cmp_ge_u32 s2, s3
303 ; GFX1030-NEXT: s_cselect_b32 s4, s5, s4
304 ; GFX1030-NEXT: s_cselect_b32 s2, s6, s2
305 ; GFX1030-NEXT: s_add_i32 s5, s4, 1
306 ; GFX1030-NEXT: s_cmp_ge_u32 s2, s3
307 ; GFX1030-NEXT: s_cselect_b32 s2, s5, s4
308 ; GFX1030-NEXT: v_mov_b32_e32 v1, s2
309 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
310 ; GFX1030-NEXT: s_endpgm
312 ; EG-LABEL: s_udiv_i32:
314 ; EG-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
315 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
318 ; EG-NEXT: ALU clause starting at 4:
319 ; EG-NEXT: SUB_INT T0.W, 0.0, KC0[2].W,
320 ; EG-NEXT: RECIP_UINT * T0.X, KC0[2].W,
321 ; EG-NEXT: MULLO_INT * T0.Y, PV.W, PS,
322 ; EG-NEXT: MULHI * T0.Y, T0.X, PS,
323 ; EG-NEXT: ADD_INT * T0.W, T0.X, PS,
324 ; EG-NEXT: MULHI * T0.X, KC0[2].Z, PV.W,
325 ; EG-NEXT: MULLO_INT * T0.Y, PS, KC0[2].W,
326 ; EG-NEXT: SUB_INT * T0.W, KC0[2].Z, PS,
327 ; EG-NEXT: SUB_INT T0.Z, PV.W, KC0[2].W,
328 ; EG-NEXT: SETGE_UINT T1.W, PV.W, KC0[2].W,
329 ; EG-NEXT: ADD_INT * T2.W, T0.X, 1,
330 ; EG-NEXT: CNDE_INT T2.W, PV.W, T0.X, PS,
331 ; EG-NEXT: CNDE_INT * T0.W, PV.W, T0.W, PV.Z,
332 ; EG-NEXT: SETGE_UINT T0.W, PS, KC0[2].W,
333 ; EG-NEXT: ADD_INT * T1.W, PV.W, 1,
334 ; EG-NEXT: CNDE_INT T0.X, PV.W, T2.W, PS,
335 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
336 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: divide the two kernel arguments and store the quotient.
337 %result = udiv i32 %a, %b
338 store i32 %result, ptr addrspace(1) %out
343 ; The code generated by udiv is long and complex and may change
344 ; frequently. The goal of these tests is to make sure that ISel doesn't
345 ; fail when it is given a vector (v2i32 or v4i32) udiv.
; udiv_v2i32: element-wise unsigned division of two <2 x i32> vectors.
; %a is loaded from %in, %b from the <2 x i32> element that follows it, and
; the two quotients are stored to %out as one <2 x i32>.
;
; In the expected output both vectors arrive in a single 128-bit load
; (dividends in the low two dwords, divisors in the high two), and the
; 32-bit udiv expansion (float reciprocal estimate, mul_lo / mul_hi
; adjustment, two compare-and-select corrections) appears once per element.
; The verde, tonga and fiji runs interleave the two element computations in
; VGPRs; the gfx1030 run does the integer work for each element with scalar
; instructions after v_readfirstlane_b32.
346 define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
347 ; SI-LABEL: udiv_v2i32:
349 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
350 ; SI-NEXT: s_mov_b32 s7, 0xf000
351 ; SI-NEXT: s_mov_b32 s6, -1
352 ; SI-NEXT: s_mov_b32 s10, s6
353 ; SI-NEXT: s_mov_b32 s11, s7
354 ; SI-NEXT: s_waitcnt lgkmcnt(0)
355 ; SI-NEXT: s_mov_b32 s8, s2
356 ; SI-NEXT: s_mov_b32 s9, s3
357 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
358 ; SI-NEXT: s_mov_b32 s4, s0
359 ; SI-NEXT: s_mov_b32 s5, s1
360 ; SI-NEXT: s_waitcnt vmcnt(0)
361 ; SI-NEXT: v_cvt_f32_u32_e32 v4, v2
362 ; SI-NEXT: v_cvt_f32_u32_e32 v5, v3
363 ; SI-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
364 ; SI-NEXT: v_rcp_iflag_f32_e32 v4, v4
365 ; SI-NEXT: v_rcp_iflag_f32_e32 v5, v5
366 ; SI-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
367 ; SI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
368 ; SI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
369 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v4
370 ; SI-NEXT: v_cvt_u32_f32_e32 v5, v5
371 ; SI-NEXT: v_mul_lo_u32 v6, v6, v4
372 ; SI-NEXT: v_mul_lo_u32 v7, v7, v5
373 ; SI-NEXT: v_mul_hi_u32 v6, v4, v6
374 ; SI-NEXT: v_mul_hi_u32 v7, v5, v7
375 ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6
376 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7
377 ; SI-NEXT: v_mul_hi_u32 v4, v0, v4
378 ; SI-NEXT: v_mul_hi_u32 v5, v1, v5
379 ; SI-NEXT: v_mul_lo_u32 v6, v4, v2
380 ; SI-NEXT: v_mul_lo_u32 v8, v5, v3
381 ; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v4
382 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
383 ; SI-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
384 ; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5
385 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
386 ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
387 ; SI-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
388 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
389 ; SI-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
390 ; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3]
391 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
392 ; SI-NEXT: v_add_i32_e32 v6, vcc, 1, v4
393 ; SI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
394 ; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v5
395 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
396 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
397 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
398 ; SI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
399 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
402 ; VI-LABEL: udiv_v2i32:
404 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
405 ; VI-NEXT: s_mov_b32 s7, 0xf000
406 ; VI-NEXT: s_mov_b32 s6, -1
407 ; VI-NEXT: s_mov_b32 s10, s6
408 ; VI-NEXT: s_mov_b32 s11, s7
409 ; VI-NEXT: s_waitcnt lgkmcnt(0)
410 ; VI-NEXT: s_mov_b32 s8, s2
411 ; VI-NEXT: s_mov_b32 s9, s3
412 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
413 ; VI-NEXT: s_mov_b32 s4, s0
414 ; VI-NEXT: s_mov_b32 s5, s1
415 ; VI-NEXT: s_waitcnt vmcnt(0)
416 ; VI-NEXT: v_cvt_f32_u32_e32 v4, v2
417 ; VI-NEXT: v_cvt_f32_u32_e32 v5, v3
418 ; VI-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
419 ; VI-NEXT: v_rcp_iflag_f32_e32 v4, v4
420 ; VI-NEXT: v_rcp_iflag_f32_e32 v5, v5
421 ; VI-NEXT: v_sub_u32_e32 v7, vcc, 0, v3
422 ; VI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
423 ; VI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
424 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v4
425 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v5
426 ; VI-NEXT: v_mul_lo_u32 v6, v6, v4
427 ; VI-NEXT: v_mul_lo_u32 v7, v7, v5
428 ; VI-NEXT: v_mul_hi_u32 v6, v4, v6
429 ; VI-NEXT: v_mul_hi_u32 v7, v5, v7
430 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
431 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7
432 ; VI-NEXT: v_mul_hi_u32 v4, v0, v4
433 ; VI-NEXT: v_mul_hi_u32 v5, v1, v5
434 ; VI-NEXT: v_mul_lo_u32 v6, v4, v2
435 ; VI-NEXT: v_mul_lo_u32 v8, v5, v3
436 ; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4
437 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v6
438 ; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v8
439 ; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5
440 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
441 ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
442 ; VI-NEXT: v_sub_u32_e32 v6, vcc, v0, v2
443 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
444 ; VI-NEXT: v_sub_u32_e32 v7, vcc, v1, v3
445 ; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3]
446 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
447 ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v4
448 ; VI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
449 ; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v5
450 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
451 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
452 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
453 ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
454 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
457 ; GCN-LABEL: udiv_v2i32:
459 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
460 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
461 ; GCN-NEXT: v_mov_b32_e32 v0, s2
462 ; GCN-NEXT: v_mov_b32_e32 v1, s3
463 ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
464 ; GCN-NEXT: s_waitcnt vmcnt(0)
465 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
466 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
467 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4
468 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5
469 ; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
470 ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4
471 ; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
472 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v5
473 ; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
474 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v6
475 ; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
476 ; GCN-NEXT: v_mul_lo_u32 v8, v4, v7
477 ; GCN-NEXT: v_mul_hi_u32 v9, v6, v5
478 ; GCN-NEXT: v_mov_b32_e32 v4, s0
479 ; GCN-NEXT: v_mov_b32_e32 v5, s1
480 ; GCN-NEXT: v_mul_hi_u32 v8, v7, v8
481 ; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v9
482 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v6
483 ; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v8
484 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v7
485 ; GCN-NEXT: v_mul_lo_u32 v8, v6, v2
486 ; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v6
487 ; GCN-NEXT: v_mul_lo_u32 v10, v7, v3
488 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
489 ; GCN-NEXT: v_add_u32_e32 v11, vcc, 1, v7
490 ; GCN-NEXT: v_sub_u32_e32 v1, vcc, v1, v10
491 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
492 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
493 ; GCN-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
494 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
495 ; GCN-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
496 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3]
497 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
498 ; GCN-NEXT: v_add_u32_e32 v8, vcc, 1, v6
499 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
500 ; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v7
501 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
502 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
503 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
504 ; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc
505 ; GCN-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
508 ; GFX1030-LABEL: udiv_v2i32:
510 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
511 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0
512 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
513 ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
514 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
515 ; GFX1030-NEXT: v_readfirstlane_b32 s2, v2
516 ; GFX1030-NEXT: v_readfirstlane_b32 s3, v3
517 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v0
518 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, s2
519 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v3, s3
520 ; GFX1030-NEXT: s_sub_i32 s5, 0, s2
521 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v2, v2
522 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v3
523 ; GFX1030-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
524 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v3
525 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v2, v2
526 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0
527 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v2
528 ; GFX1030-NEXT: v_readfirstlane_b32 s8, v0
529 ; GFX1030-NEXT: s_mul_i32 s5, s5, s4
530 ; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s5
531 ; GFX1030-NEXT: s_add_i32 s4, s4, s5
532 ; GFX1030-NEXT: s_mul_hi_u32 s4, s6, s4
533 ; GFX1030-NEXT: s_mul_i32 s5, s4, s2
534 ; GFX1030-NEXT: s_sub_i32 s5, s6, s5
535 ; GFX1030-NEXT: s_add_i32 s6, s4, 1
536 ; GFX1030-NEXT: s_sub_i32 s7, s5, s2
537 ; GFX1030-NEXT: s_cmp_ge_u32 s5, s2
538 ; GFX1030-NEXT: s_cselect_b32 s4, s6, s4
539 ; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
540 ; GFX1030-NEXT: s_add_i32 s6, s4, 1
541 ; GFX1030-NEXT: s_cmp_ge_u32 s5, s2
542 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v1
543 ; GFX1030-NEXT: s_cselect_b32 s2, s6, s4
544 ; GFX1030-NEXT: s_sub_i32 s4, 0, s3
545 ; GFX1030-NEXT: v_mov_b32_e32 v0, s2
546 ; GFX1030-NEXT: s_mul_i32 s4, s4, s8
547 ; GFX1030-NEXT: s_mul_hi_u32 s4, s8, s4
548 ; GFX1030-NEXT: s_add_i32 s8, s8, s4
549 ; GFX1030-NEXT: s_mul_hi_u32 s4, s5, s8
550 ; GFX1030-NEXT: s_mul_i32 s6, s4, s3
551 ; GFX1030-NEXT: s_sub_i32 s5, s5, s6
552 ; GFX1030-NEXT: s_add_i32 s6, s4, 1
553 ; GFX1030-NEXT: s_sub_i32 s7, s5, s3
554 ; GFX1030-NEXT: s_cmp_ge_u32 s5, s3
555 ; GFX1030-NEXT: s_cselect_b32 s4, s6, s4
556 ; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
557 ; GFX1030-NEXT: s_add_i32 s6, s4, 1
558 ; GFX1030-NEXT: s_cmp_ge_u32 s5, s3
559 ; GFX1030-NEXT: s_cselect_b32 s3, s6, s4
560 ; GFX1030-NEXT: v_mov_b32_e32 v1, s3
561 ; GFX1030-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
562 ; GFX1030-NEXT: s_endpgm
564 ; EG-LABEL: udiv_v2i32:
566 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
568 ; EG-NEXT: ALU 33, @9, KC0[CB0:0-32], KC1[]
569 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
572 ; EG-NEXT: Fetch clause starting at 6:
573 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
574 ; EG-NEXT: ALU clause starting at 8:
575 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
576 ; EG-NEXT: ALU clause starting at 9:
577 ; EG-NEXT: SUB_INT T1.W, 0.0, T0.W,
578 ; EG-NEXT: RECIP_UINT * T1.X, T0.W,
579 ; EG-NEXT: MULLO_INT * T1.Y, PV.W, PS,
580 ; EG-NEXT: SUB_INT T1.W, 0.0, T0.Z,
581 ; EG-NEXT: RECIP_UINT * T1.Z, T0.Z,
582 ; EG-NEXT: MULLO_INT * T1.W, PV.W, PS,
583 ; EG-NEXT: MULHI * T1.W, T1.Z, PS,
584 ; EG-NEXT: ADD_INT T1.W, T1.Z, PS,
585 ; EG-NEXT: MULHI * T1.Y, T1.X, T1.Y,
586 ; EG-NEXT: ADD_INT T2.W, T1.X, PS,
587 ; EG-NEXT: MULHI * T1.X, T0.X, PV.W,
588 ; EG-NEXT: MULHI * T1.Y, T0.Y, PV.W,
589 ; EG-NEXT: MULLO_INT * T1.Z, PS, T0.W,
590 ; EG-NEXT: SUB_INT T1.W, T0.Y, PS,
591 ; EG-NEXT: MULLO_INT * T0.Y, T1.X, T0.Z,
592 ; EG-NEXT: SUB_INT T0.Y, T0.X, PS,
593 ; EG-NEXT: ADD_INT T1.Z, T1.Y, 1,
594 ; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W,
595 ; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W,
596 ; EG-NEXT: CNDE_INT T0.X, PV.W, T1.W, PS,
597 ; EG-NEXT: CNDE_INT T1.Y, PV.W, T1.Y, PV.Z,
598 ; EG-NEXT: ADD_INT T1.Z, T1.X, 1,
599 ; EG-NEXT: SETGE_UINT T1.W, PV.Y, T0.Z,
600 ; EG-NEXT: SUB_INT * T2.W, PV.Y, T0.Z,
601 ; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, PS,
602 ; EG-NEXT: CNDE_INT T1.Z, PV.W, T1.X, PV.Z,
603 ; EG-NEXT: ADD_INT T1.W, PV.Y, 1,
604 ; EG-NEXT: SETGE_UINT * T0.W, PV.X, T0.W,
605 ; EG-NEXT: CNDE_INT T1.Y, PS, T1.Y, PV.W,
606 ; EG-NEXT: ADD_INT T0.W, PV.Z, 1,
607 ; EG-NEXT: SETGE_UINT * T1.W, PV.Y, T0.Z,
608 ; EG-NEXT: CNDE_INT T1.X, PS, T1.Z, PV.W,
609 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
610 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: the dividend vector is in[0], the divisor vector is in[1]
; (indexed as <2 x i32> elements).
611 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
612 %a = load <2 x i32>, ptr addrspace(1) %in
613 %b = load <2 x i32>, ptr addrspace(1) %b_ptr
614 %result = udiv <2 x i32> %a, %b
615 store <2 x i32> %result, ptr addrspace(1) %out
619 define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
620 ; SI-LABEL: udiv_v4i32:
622 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
623 ; SI-NEXT: s_mov_b32 s11, 0xf000
624 ; SI-NEXT: s_mov_b32 s10, -1
625 ; SI-NEXT: s_mov_b32 s6, s10
626 ; SI-NEXT: s_mov_b32 s7, s11
627 ; SI-NEXT: s_waitcnt lgkmcnt(0)
628 ; SI-NEXT: s_mov_b32 s4, s2
629 ; SI-NEXT: s_mov_b32 s5, s3
630 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
631 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
632 ; SI-NEXT: s_mov_b32 s8, s0
633 ; SI-NEXT: s_mov_b32 s9, s1
634 ; SI-NEXT: s_waitcnt vmcnt(1)
635 ; SI-NEXT: v_cvt_f32_u32_e32 v8, v0
636 ; SI-NEXT: v_cvt_f32_u32_e32 v10, v1
637 ; SI-NEXT: v_cvt_f32_u32_e32 v12, v2
638 ; SI-NEXT: v_cvt_f32_u32_e32 v14, v3
639 ; SI-NEXT: v_rcp_iflag_f32_e32 v8, v8
640 ; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10
641 ; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12
642 ; SI-NEXT: v_rcp_iflag_f32_e32 v14, v14
643 ; SI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
644 ; SI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
645 ; SI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12
646 ; SI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14
647 ; SI-NEXT: v_cvt_u32_f32_e32 v8, v8
648 ; SI-NEXT: v_cvt_u32_f32_e32 v10, v10
649 ; SI-NEXT: v_cvt_u32_f32_e32 v12, v12
650 ; SI-NEXT: v_cvt_u32_f32_e32 v14, v14
651 ; SI-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
652 ; SI-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
653 ; SI-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
654 ; SI-NEXT: v_sub_i32_e32 v15, vcc, 0, v3
655 ; SI-NEXT: v_mul_lo_u32 v9, v9, v8
656 ; SI-NEXT: v_mul_lo_u32 v11, v11, v10
657 ; SI-NEXT: v_mul_lo_u32 v13, v13, v12
658 ; SI-NEXT: v_mul_lo_u32 v15, v15, v14
659 ; SI-NEXT: v_mul_hi_u32 v9, v8, v9
660 ; SI-NEXT: v_mul_hi_u32 v11, v10, v11
661 ; SI-NEXT: v_mul_hi_u32 v13, v12, v13
662 ; SI-NEXT: v_mul_hi_u32 v15, v14, v15
663 ; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9
664 ; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v11
665 ; SI-NEXT: v_add_i32_e32 v10, vcc, v12, v13
666 ; SI-NEXT: v_add_i32_e32 v11, vcc, v14, v15
667 ; SI-NEXT: s_waitcnt vmcnt(0)
668 ; SI-NEXT: v_mul_hi_u32 v8, v4, v8
669 ; SI-NEXT: v_mul_hi_u32 v9, v5, v9
670 ; SI-NEXT: v_mul_hi_u32 v10, v6, v10
671 ; SI-NEXT: v_mul_hi_u32 v11, v7, v11
672 ; SI-NEXT: v_mul_lo_u32 v12, v8, v0
673 ; SI-NEXT: v_mul_lo_u32 v14, v9, v1
674 ; SI-NEXT: v_mul_lo_u32 v16, v10, v2
675 ; SI-NEXT: v_mul_lo_u32 v18, v11, v3
676 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v12
677 ; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v14
678 ; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v16
679 ; SI-NEXT: v_sub_i32_e32 v7, vcc, v7, v18
680 ; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v8
681 ; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v9
682 ; SI-NEXT: v_add_i32_e32 v17, vcc, 1, v10
683 ; SI-NEXT: v_add_i32_e32 v19, vcc, 1, v11
684 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0
685 ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1
686 ; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
687 ; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
688 ; SI-NEXT: v_sub_i32_e32 v12, vcc, v4, v0
689 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1]
690 ; SI-NEXT: v_sub_i32_e32 v13, vcc, v5, v1
691 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3]
692 ; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2
693 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5]
694 ; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3
695 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7]
696 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1]
697 ; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8
698 ; SI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3]
699 ; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v9
700 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5]
701 ; SI-NEXT: v_add_i32_e32 v14, vcc, 1, v10
702 ; SI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7]
703 ; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v11
704 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0
705 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc
706 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
707 ; SI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc
708 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2
709 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc
710 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
711 ; SI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc
712 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
715 ; VI-LABEL: udiv_v4i32:
717 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
718 ; VI-NEXT: s_mov_b32 s11, 0xf000
719 ; VI-NEXT: s_mov_b32 s10, -1
720 ; VI-NEXT: s_mov_b32 s6, s10
721 ; VI-NEXT: s_mov_b32 s7, s11
722 ; VI-NEXT: s_waitcnt lgkmcnt(0)
723 ; VI-NEXT: s_mov_b32 s4, s2
724 ; VI-NEXT: s_mov_b32 s5, s3
725 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
726 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
727 ; VI-NEXT: s_mov_b32 s8, s0
728 ; VI-NEXT: s_mov_b32 s9, s1
729 ; VI-NEXT: s_waitcnt vmcnt(1)
730 ; VI-NEXT: v_cvt_f32_u32_e32 v8, v0
731 ; VI-NEXT: v_cvt_f32_u32_e32 v10, v1
732 ; VI-NEXT: v_cvt_f32_u32_e32 v12, v2
733 ; VI-NEXT: v_cvt_f32_u32_e32 v14, v3
734 ; VI-NEXT: v_rcp_iflag_f32_e32 v8, v8
735 ; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10
736 ; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12
737 ; VI-NEXT: v_rcp_iflag_f32_e32 v14, v14
738 ; VI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
739 ; VI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
740 ; VI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12
741 ; VI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14
742 ; VI-NEXT: v_cvt_u32_f32_e32 v8, v8
743 ; VI-NEXT: v_cvt_u32_f32_e32 v10, v10
744 ; VI-NEXT: v_cvt_u32_f32_e32 v12, v12
745 ; VI-NEXT: v_cvt_u32_f32_e32 v14, v14
746 ; VI-NEXT: v_sub_u32_e32 v9, vcc, 0, v0
747 ; VI-NEXT: v_sub_u32_e32 v11, vcc, 0, v1
748 ; VI-NEXT: v_sub_u32_e32 v13, vcc, 0, v2
749 ; VI-NEXT: v_sub_u32_e32 v15, vcc, 0, v3
750 ; VI-NEXT: v_mul_lo_u32 v9, v9, v8
751 ; VI-NEXT: v_mul_lo_u32 v11, v11, v10
752 ; VI-NEXT: v_mul_lo_u32 v13, v13, v12
753 ; VI-NEXT: v_mul_lo_u32 v15, v15, v14
754 ; VI-NEXT: v_mul_hi_u32 v9, v8, v9
755 ; VI-NEXT: v_mul_hi_u32 v11, v10, v11
756 ; VI-NEXT: v_mul_hi_u32 v13, v12, v13
757 ; VI-NEXT: v_mul_hi_u32 v15, v14, v15
758 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9
759 ; VI-NEXT: v_add_u32_e32 v9, vcc, v10, v11
760 ; VI-NEXT: v_add_u32_e32 v10, vcc, v12, v13
761 ; VI-NEXT: v_add_u32_e32 v11, vcc, v14, v15
762 ; VI-NEXT: s_waitcnt vmcnt(0)
763 ; VI-NEXT: v_mul_hi_u32 v8, v4, v8
764 ; VI-NEXT: v_mul_hi_u32 v9, v5, v9
765 ; VI-NEXT: v_mul_hi_u32 v10, v6, v10
766 ; VI-NEXT: v_mul_hi_u32 v11, v7, v11
767 ; VI-NEXT: v_mul_lo_u32 v12, v8, v0
768 ; VI-NEXT: v_mul_lo_u32 v14, v9, v1
769 ; VI-NEXT: v_mul_lo_u32 v16, v10, v2
770 ; VI-NEXT: v_mul_lo_u32 v18, v11, v3
771 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v12
772 ; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v14
773 ; VI-NEXT: v_sub_u32_e32 v6, vcc, v6, v16
774 ; VI-NEXT: v_sub_u32_e32 v7, vcc, v7, v18
775 ; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v8
776 ; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v9
777 ; VI-NEXT: v_add_u32_e32 v17, vcc, 1, v10
778 ; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v11
779 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0
780 ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1
781 ; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
782 ; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
783 ; VI-NEXT: v_sub_u32_e32 v12, vcc, v4, v0
784 ; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1]
785 ; VI-NEXT: v_sub_u32_e32 v13, vcc, v5, v1
786 ; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3]
787 ; VI-NEXT: v_sub_u32_e32 v14, vcc, v6, v2
788 ; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5]
789 ; VI-NEXT: v_sub_u32_e32 v15, vcc, v7, v3
790 ; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7]
791 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1]
792 ; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8
793 ; VI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3]
794 ; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v9
795 ; VI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5]
796 ; VI-NEXT: v_add_u32_e32 v14, vcc, 1, v10
797 ; VI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7]
798 ; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v11
799 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0
800 ; VI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc
801 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
802 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc
803 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2
804 ; VI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc
805 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
806 ; VI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc
807 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
810 ; GCN-LABEL: udiv_v4i32:
812 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
813 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
814 ; GCN-NEXT: s_add_u32 s4, s2, 16
815 ; GCN-NEXT: s_addc_u32 s5, s3, 0
816 ; GCN-NEXT: v_mov_b32_e32 v0, s4
817 ; GCN-NEXT: v_mov_b32_e32 v1, s5
818 ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
819 ; GCN-NEXT: v_mov_b32_e32 v5, s3
820 ; GCN-NEXT: v_mov_b32_e32 v4, s2
821 ; GCN-NEXT: flat_load_dwordx4 v[6:9], v[4:5]
822 ; GCN-NEXT: v_mov_b32_e32 v4, s0
823 ; GCN-NEXT: v_mov_b32_e32 v5, s1
824 ; GCN-NEXT: s_waitcnt vmcnt(1)
825 ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0
826 ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1
827 ; GCN-NEXT: v_cvt_f32_u32_e32 v14, v2
828 ; GCN-NEXT: v_cvt_f32_u32_e32 v16, v3
829 ; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10
830 ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12
831 ; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14
832 ; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16
833 ; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
834 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12
835 ; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14
836 ; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16
837 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
838 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12
839 ; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14
840 ; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16
841 ; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v0
842 ; GCN-NEXT: v_sub_u32_e32 v13, vcc, 0, v1
843 ; GCN-NEXT: v_sub_u32_e32 v15, vcc, 0, v2
844 ; GCN-NEXT: v_sub_u32_e32 v17, vcc, 0, v3
845 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v10
846 ; GCN-NEXT: v_mul_lo_u32 v13, v13, v12
847 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v14
848 ; GCN-NEXT: v_mul_lo_u32 v17, v17, v16
849 ; GCN-NEXT: v_mul_hi_u32 v11, v10, v11
850 ; GCN-NEXT: v_mul_hi_u32 v13, v12, v13
851 ; GCN-NEXT: v_mul_hi_u32 v15, v14, v15
852 ; GCN-NEXT: v_mul_hi_u32 v17, v16, v17
853 ; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11
854 ; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13
855 ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15
856 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17
857 ; GCN-NEXT: s_waitcnt vmcnt(0)
858 ; GCN-NEXT: v_mul_hi_u32 v10, v6, v10
859 ; GCN-NEXT: v_mul_hi_u32 v11, v7, v11
860 ; GCN-NEXT: v_mul_hi_u32 v12, v8, v12
861 ; GCN-NEXT: v_mul_hi_u32 v13, v9, v13
862 ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0
863 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1
864 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2
865 ; GCN-NEXT: v_mul_lo_u32 v20, v13, v3
866 ; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14
867 ; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16
868 ; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18
869 ; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v20
870 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10
871 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11
872 ; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12
873 ; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13
874 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0
875 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1
876 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
877 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3
878 ; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0
879 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1]
880 ; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1
881 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3]
882 ; GCN-NEXT: v_sub_u32_e32 v16, vcc, v8, v2
883 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5]
884 ; GCN-NEXT: v_sub_u32_e32 v17, vcc, v9, v3
885 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7]
886 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1]
887 ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10
888 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3]
889 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11
890 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5]
891 ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12
892 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7]
893 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13
894 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0
895 ; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc
896 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1
897 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc
898 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2
899 ; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc
900 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3
901 ; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
902 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
905 ; GFX1030-LABEL: udiv_v4i32:
907 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
908 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0
909 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
910 ; GFX1030-NEXT: s_clause 0x1
911 ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
912 ; GFX1030-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
913 ; GFX1030-NEXT: s_waitcnt vmcnt(1)
914 ; GFX1030-NEXT: v_readfirstlane_b32 s2, v0
915 ; GFX1030-NEXT: v_readfirstlane_b32 s3, v1
916 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
917 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v4
918 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v2
919 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s2
920 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s3
921 ; GFX1030-NEXT: s_sub_i32 s6, 0, s2
922 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0
923 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1
924 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
925 ; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
926 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0
927 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1
928 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v0
929 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s5
930 ; GFX1030-NEXT: v_readfirstlane_b32 s9, v1
931 ; GFX1030-NEXT: s_mul_i32 s6, s6, s4
932 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0
933 ; GFX1030-NEXT: s_mul_hi_u32 s6, s4, s6
934 ; GFX1030-NEXT: s_add_i32 s4, s4, s6
935 ; GFX1030-NEXT: s_mul_hi_u32 s4, s7, s4
936 ; GFX1030-NEXT: s_mul_i32 s6, s4, s2
937 ; GFX1030-NEXT: s_sub_i32 s6, s7, s6
938 ; GFX1030-NEXT: s_add_i32 s7, s4, 1
939 ; GFX1030-NEXT: s_sub_i32 s8, s6, s2
940 ; GFX1030-NEXT: s_cmp_ge_u32 s6, s2
941 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
942 ; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
943 ; GFX1030-NEXT: s_cselect_b32 s6, s8, s6
944 ; GFX1030-NEXT: s_add_i32 s7, s4, 1
945 ; GFX1030-NEXT: s_cmp_ge_u32 s6, s2
946 ; GFX1030-NEXT: v_readfirstlane_b32 s2, v3
947 ; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
948 ; GFX1030-NEXT: s_sub_i32 s6, 0, s3
949 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v5
950 ; GFX1030-NEXT: s_mul_i32 s6, s6, s9
951 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0
952 ; GFX1030-NEXT: s_mul_hi_u32 s6, s9, s6
953 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2
954 ; GFX1030-NEXT: s_add_i32 s9, s9, s6
955 ; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s9
956 ; GFX1030-NEXT: v_readfirstlane_b32 s10, v0
957 ; GFX1030-NEXT: s_mul_i32 s8, s6, s3
958 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1
959 ; GFX1030-NEXT: s_sub_i32 s7, s7, s8
960 ; GFX1030-NEXT: s_add_i32 s8, s6, 1
961 ; GFX1030-NEXT: s_sub_i32 s9, s7, s3
962 ; GFX1030-NEXT: s_cmp_ge_u32 s7, s3
963 ; GFX1030-NEXT: s_cselect_b32 s6, s8, s6
964 ; GFX1030-NEXT: s_cselect_b32 s7, s9, s7
965 ; GFX1030-NEXT: s_add_i32 s8, s6, 1
966 ; GFX1030-NEXT: s_cmp_ge_u32 s7, s3
967 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v6
968 ; GFX1030-NEXT: s_cselect_b32 s3, s8, s6
969 ; GFX1030-NEXT: s_sub_i32 s6, 0, s5
970 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1
971 ; GFX1030-NEXT: s_mul_i32 s6, s6, s10
972 ; GFX1030-NEXT: v_mov_b32_e32 v1, s3
973 ; GFX1030-NEXT: s_mul_hi_u32 s6, s10, s6
974 ; GFX1030-NEXT: s_add_i32 s10, s10, s6
975 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0
976 ; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s10
977 ; GFX1030-NEXT: s_mul_i32 s8, s6, s5
978 ; GFX1030-NEXT: s_sub_i32 s7, s7, s8
979 ; GFX1030-NEXT: s_add_i32 s8, s6, 1
980 ; GFX1030-NEXT: s_sub_i32 s9, s7, s5
981 ; GFX1030-NEXT: s_cmp_ge_u32 s7, s5
982 ; GFX1030-NEXT: v_readfirstlane_b32 s10, v0
983 ; GFX1030-NEXT: s_cselect_b32 s6, s8, s6
984 ; GFX1030-NEXT: s_cselect_b32 s7, s9, s7
985 ; GFX1030-NEXT: s_add_i32 s8, s6, 1
986 ; GFX1030-NEXT: s_cmp_ge_u32 s7, s5
987 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v7
988 ; GFX1030-NEXT: s_cselect_b32 s5, s8, s6
989 ; GFX1030-NEXT: s_sub_i32 s6, 0, s2
990 ; GFX1030-NEXT: v_mov_b32_e32 v0, s4
991 ; GFX1030-NEXT: s_mul_i32 s6, s6, s10
992 ; GFX1030-NEXT: v_mov_b32_e32 v2, s5
993 ; GFX1030-NEXT: s_mul_hi_u32 s6, s10, s6
994 ; GFX1030-NEXT: s_add_i32 s10, s10, s6
995 ; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s10
996 ; GFX1030-NEXT: s_mul_i32 s8, s6, s2
997 ; GFX1030-NEXT: s_sub_i32 s7, s7, s8
998 ; GFX1030-NEXT: s_add_i32 s8, s6, 1
999 ; GFX1030-NEXT: s_sub_i32 s9, s7, s2
1000 ; GFX1030-NEXT: s_cmp_ge_u32 s7, s2
1001 ; GFX1030-NEXT: s_cselect_b32 s6, s8, s6
1002 ; GFX1030-NEXT: s_cselect_b32 s7, s9, s7
1003 ; GFX1030-NEXT: s_add_i32 s8, s6, 1
1004 ; GFX1030-NEXT: s_cmp_ge_u32 s7, s2
1005 ; GFX1030-NEXT: s_cselect_b32 s2, s8, s6
1006 ; GFX1030-NEXT: v_mov_b32_e32 v3, s2
1007 ; GFX1030-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
1008 ; GFX1030-NEXT: s_endpgm
1010 ; EG-LABEL: udiv_v4i32:
1012 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1014 ; EG-NEXT: ALU 65, @11, KC0[CB0:0-32], KC1[]
1015 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
1018 ; EG-NEXT: Fetch clause starting at 6:
1019 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
1020 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1021 ; EG-NEXT: ALU clause starting at 10:
1022 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1023 ; EG-NEXT: ALU clause starting at 11:
1024 ; EG-NEXT: SUB_INT T2.W, 0.0, T1.W,
1025 ; EG-NEXT: RECIP_UINT * T2.X, T1.W,
1026 ; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS,
1027 ; EG-NEXT: MULHI * T2.Y, T2.X, PS,
1028 ; EG-NEXT: ADD_INT * T2.W, T2.X, PS,
1029 ; EG-NEXT: MULHI * T2.X, T0.W, PV.W,
1030 ; EG-NEXT: MULLO_INT * T2.Y, PS, T1.W,
1031 ; EG-NEXT: SUB_INT T2.W, 0.0, T1.X,
1032 ; EG-NEXT: RECIP_UINT * T2.Z, T1.X,
1033 ; EG-NEXT: MULLO_INT * T2.W, PV.W, PS,
1034 ; EG-NEXT: SUB_INT T3.W, 0.0, T1.Y,
1035 ; EG-NEXT: RECIP_UINT * T3.X, T1.Y,
1036 ; EG-NEXT: MULLO_INT * T3.Y, PV.W, PS,
1037 ; EG-NEXT: SUB_INT T3.W, 0.0, T1.Z,
1038 ; EG-NEXT: RECIP_UINT * T3.Z, T1.Z,
1039 ; EG-NEXT: MULLO_INT * T3.W, PV.W, PS,
1040 ; EG-NEXT: MULHI * T3.W, T3.Z, PS,
1041 ; EG-NEXT: ADD_INT T3.W, T3.Z, PS,
1042 ; EG-NEXT: MULHI * T3.Y, T3.X, T3.Y,
1043 ; EG-NEXT: ADD_INT T4.W, T3.X, PS,
1044 ; EG-NEXT: MULHI * T3.X, T0.Z, PV.W,
1045 ; EG-NEXT: MULHI * T3.Y, T0.Y, PV.W,
1046 ; EG-NEXT: MULLO_INT * T3.Z, PS, T1.Y,
1047 ; EG-NEXT: SUB_INT T3.W, T0.Y, PS,
1048 ; EG-NEXT: MULLO_INT * T0.Y, T3.X, T1.Z,
1049 ; EG-NEXT: SUB_INT T4.X, T0.Z, PS,
1050 ; EG-NEXT: ADD_INT T0.Y, T3.Y, 1,
1051 ; EG-NEXT: SETGE_UINT T0.Z, PV.W, T1.Y,
1052 ; EG-NEXT: SUB_INT T4.W, PV.W, T1.Y,
1053 ; EG-NEXT: MULHI * T2.W, T2.Z, T2.W,
1054 ; EG-NEXT: CNDE_INT T5.X, PV.Z, T3.W, PV.W,
1055 ; EG-NEXT: CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122
1056 ; EG-NEXT: SETGE_UINT T0.Z, PV.X, T1.Z,
1057 ; EG-NEXT: ADD_INT T2.W, T2.Z, PS,
1058 ; EG-NEXT: SUB_INT * T0.W, T0.W, T2.Y,
1059 ; EG-NEXT: ADD_INT T6.X, T3.X, 1,
1060 ; EG-NEXT: ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212
1061 ; EG-NEXT: SETGE_UINT T2.Z, PS, T1.W,
1062 ; EG-NEXT: SUB_INT T3.W, PS, T1.W,
1063 ; EG-NEXT: MULHI * T2.W, T0.X, PV.W,
1064 ; EG-NEXT: SUB_INT T7.X, T4.X, T1.Z,
1065 ; EG-NEXT: CNDE_INT T3.Y, PV.Z, T0.W, PV.W,
1066 ; EG-NEXT: CNDE_INT T2.Z, PV.Z, T2.X, PV.Y,
1067 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122
1068 ; EG-NEXT: MULLO_INT * T2.X, T2.W, T1.X,
1069 ; EG-NEXT: ADD_INT T3.X, T0.W, 1,
1070 ; EG-NEXT: ADD_INT T2.Y, T2.Z, 1,
1071 ; EG-NEXT: SETGE_UINT T3.Z, T3.Y, T1.W,
1072 ; EG-NEXT: SUB_INT T1.W, T0.X, PS, BS:VEC_201
1073 ; EG-NEXT: CNDE_INT * T3.W, T0.Z, T4.X, T7.X,
1074 ; EG-NEXT: SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122
1075 ; EG-NEXT: ADD_INT T3.Y, T2.W, 1,
1076 ; EG-NEXT: SETGE_UINT T0.Z, PV.W, T1.X,
1077 ; EG-NEXT: SUB_INT T3.W, PV.W, T1.X,
1078 ; EG-NEXT: CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y,
1079 ; EG-NEXT: CNDE_INT T2.X, PV.Z, T1.W, PV.W,
1080 ; EG-NEXT: CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122
1081 ; EG-NEXT: CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201
1082 ; EG-NEXT: ADD_INT T0.W, T0.Y, 1,
1083 ; EG-NEXT: SETGE_UINT * T1.W, T5.X, T1.Y,
1084 ; EG-NEXT: CNDE_INT T4.Y, PS, T0.Y, PV.W,
1085 ; EG-NEXT: ADD_INT T0.W, PV.Y, 1,
1086 ; EG-NEXT: SETGE_UINT * T1.W, PV.X, T1.X,
1087 ; EG-NEXT: CNDE_INT T4.X, PS, T2.Y, PV.W,
1088 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1089 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1090 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
1091 %a = load <4 x i32>, ptr addrspace(1) %in
1092 %b = load <4 x i32>, ptr addrspace(1) %b_ptr
1093 %result = udiv <4 x i32> %a, %b
1094 store <4 x i32> %result, ptr addrspace(1) %out
; udiv_i32_div_pow2 loads one i32 from %in, divides it (unsigned) by the
; constant 16 and stores the quotient to %out.
; The expected code below contains no division expansion: on every checked
; target the divide shows up as a single logical right shift by 4.
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script rather than editing them by hand.
1098 define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1099 ; SI-LABEL: udiv_i32_div_pow2:
1101 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1102 ; SI-NEXT: s_mov_b32 s7, 0xf000
1103 ; SI-NEXT: s_mov_b32 s6, -1
1104 ; SI-NEXT: s_mov_b32 s10, s6
1105 ; SI-NEXT: s_mov_b32 s11, s7
1106 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1107 ; SI-NEXT: s_mov_b32 s8, s2
1108 ; SI-NEXT: s_mov_b32 s9, s3
1109 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1110 ; SI-NEXT: s_mov_b32 s4, s0
1111 ; SI-NEXT: s_mov_b32 s5, s1
1112 ; SI-NEXT: s_waitcnt vmcnt(0)
1113 ; SI-NEXT: v_lshrrev_b32_e32 v0, 4, v0
1114 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1117 ; VI-LABEL: udiv_i32_div_pow2:
1119 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1120 ; VI-NEXT: s_mov_b32 s7, 0xf000
1121 ; VI-NEXT: s_mov_b32 s6, -1
1122 ; VI-NEXT: s_mov_b32 s10, s6
1123 ; VI-NEXT: s_mov_b32 s11, s7
1124 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1125 ; VI-NEXT: s_mov_b32 s8, s2
1126 ; VI-NEXT: s_mov_b32 s9, s3
1127 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1128 ; VI-NEXT: s_mov_b32 s4, s0
1129 ; VI-NEXT: s_mov_b32 s5, s1
1130 ; VI-NEXT: s_waitcnt vmcnt(0)
1131 ; VI-NEXT: v_lshrrev_b32_e32 v0, 4, v0
1132 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1135 ; GCN-LABEL: udiv_i32_div_pow2:
1137 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1138 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1139 ; GCN-NEXT: v_mov_b32_e32 v0, s2
1140 ; GCN-NEXT: v_mov_b32_e32 v1, s3
1141 ; GCN-NEXT: flat_load_dword v2, v[0:1]
1142 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1143 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1144 ; GCN-NEXT: s_waitcnt vmcnt(0)
1145 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 4, v2
1146 ; GCN-NEXT: flat_store_dword v[0:1], v2
1147 ; GCN-NEXT: s_endpgm
1149 ; GFX1030-LABEL: udiv_i32_div_pow2:
1151 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1152 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
1153 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
1154 ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
1155 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
1156 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 4, v1
1157 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
1158 ; GFX1030-NEXT: s_endpgm
1160 ; EG-LABEL: udiv_i32_div_pow2:
1162 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1164 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
1165 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1168 ; EG-NEXT: Fetch clause starting at 6:
1169 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1170 ; EG-NEXT: ALU clause starting at 8:
1171 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1172 ; EG-NEXT: ALU clause starting at 9:
1173 ; EG-NEXT: LSHR T0.X, T0.X, literal.x,
1174 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1175 ; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
; NOTE(review): %b_ptr is not used by the load, udiv or store that follow;
; it appears to be dead - confirm before removing.
1176 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1177 %a = load i32, ptr addrspace(1) %in
; Divisor is the power-of-two constant 16.
1178 %result = udiv i32 %a, 16
1179 store i32 %result, ptr addrspace(1) %out
; udiv_i32_div_k_even loads one i32 from %in, divides it (unsigned) by the
; even constant 34259182 and stores the quotient to %out.
; The expected code on every checked target is a multiply-high by the constant
; 0xfabbd9c1 followed by a logical right shift by 25; no runtime division
; expansion appears.
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script rather than editing them by hand.
1183 define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1184 ; SI-LABEL: udiv_i32_div_k_even:
1186 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1187 ; SI-NEXT: s_mov_b32 s7, 0xf000
1188 ; SI-NEXT: s_mov_b32 s6, -1
1189 ; SI-NEXT: s_mov_b32 s10, s6
1190 ; SI-NEXT: s_mov_b32 s11, s7
1191 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1192 ; SI-NEXT: s_mov_b32 s8, s2
1193 ; SI-NEXT: s_mov_b32 s9, s3
1194 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1195 ; SI-NEXT: s_mov_b32 s2, 0xfabbd9c1
1196 ; SI-NEXT: s_mov_b32 s4, s0
1197 ; SI-NEXT: s_mov_b32 s5, s1
1198 ; SI-NEXT: s_waitcnt vmcnt(0)
1199 ; SI-NEXT: v_mul_hi_u32 v0, v0, s2
1200 ; SI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
1201 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1204 ; VI-LABEL: udiv_i32_div_k_even:
1206 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1207 ; VI-NEXT: s_mov_b32 s7, 0xf000
1208 ; VI-NEXT: s_mov_b32 s6, -1
1209 ; VI-NEXT: s_mov_b32 s10, s6
1210 ; VI-NEXT: s_mov_b32 s11, s7
1211 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1212 ; VI-NEXT: s_mov_b32 s8, s2
1213 ; VI-NEXT: s_mov_b32 s9, s3
1214 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1215 ; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1
1216 ; VI-NEXT: s_mov_b32 s4, s0
1217 ; VI-NEXT: s_mov_b32 s5, s1
1218 ; VI-NEXT: s_waitcnt vmcnt(0)
1219 ; VI-NEXT: v_mul_hi_u32 v0, v0, s2
1220 ; VI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
1221 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1224 ; GCN-LABEL: udiv_i32_div_k_even:
1226 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1227 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1228 ; GCN-NEXT: v_mov_b32_e32 v0, s2
1229 ; GCN-NEXT: v_mov_b32_e32 v1, s3
1230 ; GCN-NEXT: flat_load_dword v0, v[0:1]
1231 ; GCN-NEXT: s_mov_b32 s2, 0xfabbd9c1
1232 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1233 ; GCN-NEXT: s_waitcnt vmcnt(0)
1234 ; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
1235 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1236 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 25, v2
1237 ; GCN-NEXT: flat_store_dword v[0:1], v2
1238 ; GCN-NEXT: s_endpgm
1240 ; GFX1030-LABEL: udiv_i32_div_k_even:
1242 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1243 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
1244 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
1245 ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
1246 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
1247 ; GFX1030-NEXT: v_mul_hi_u32 v1, 0xfabbd9c1, v1
1248 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 25, v1
1249 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
1250 ; GFX1030-NEXT: s_endpgm
1252 ; EG-LABEL: udiv_i32_div_k_even:
1254 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1256 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
1257 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1260 ; EG-NEXT: Fetch clause starting at 6:
1261 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1262 ; EG-NEXT: ALU clause starting at 8:
1263 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1264 ; EG-NEXT: ALU clause starting at 9:
1265 ; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
1266 ; EG-NEXT: -88352319(-4.876880e+35), 0(0.000000e+00)
1267 ; EG-NEXT: LSHR T0.X, PS, literal.x,
1268 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1269 ; EG-NEXT: 25(3.503246e-44), 2(2.802597e-45)
; NOTE(review): %b_ptr is not used by the load, udiv or store that follow;
; it appears to be dead - confirm before removing.
1270 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1271 %a = load i32, ptr addrspace(1) %in
; Divisor is an even constant that is not a power of two.
1272 %result = udiv i32 %a, 34259182
1273 store i32 %result, ptr addrspace(1) %out
; udiv_i32_div_k_odd loads one i32 from %in, divides it (unsigned) by the odd
; constant 34259183 and stores the quotient to %out.
; The expected code on every checked target is a multiply-high by the constant
; 0x7d5deca3 followed by a logical right shift by 24; no runtime division
; expansion appears.
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script rather than editing them by hand.
1277 define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1278 ; SI-LABEL: udiv_i32_div_k_odd:
1280 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1281 ; SI-NEXT: s_mov_b32 s7, 0xf000
1282 ; SI-NEXT: s_mov_b32 s6, -1
1283 ; SI-NEXT: s_mov_b32 s10, s6
1284 ; SI-NEXT: s_mov_b32 s11, s7
1285 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1286 ; SI-NEXT: s_mov_b32 s8, s2
1287 ; SI-NEXT: s_mov_b32 s9, s3
1288 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1289 ; SI-NEXT: s_mov_b32 s2, 0x7d5deca3
1290 ; SI-NEXT: s_mov_b32 s4, s0
1291 ; SI-NEXT: s_mov_b32 s5, s1
1292 ; SI-NEXT: s_waitcnt vmcnt(0)
1293 ; SI-NEXT: v_mul_hi_u32 v0, v0, s2
1294 ; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1295 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1298 ; VI-LABEL: udiv_i32_div_k_odd:
1300 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1301 ; VI-NEXT: s_mov_b32 s7, 0xf000
1302 ; VI-NEXT: s_mov_b32 s6, -1
1303 ; VI-NEXT: s_mov_b32 s10, s6
1304 ; VI-NEXT: s_mov_b32 s11, s7
1305 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1306 ; VI-NEXT: s_mov_b32 s8, s2
1307 ; VI-NEXT: s_mov_b32 s9, s3
1308 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
1309 ; VI-NEXT: s_mov_b32 s2, 0x7d5deca3
1310 ; VI-NEXT: s_mov_b32 s4, s0
1311 ; VI-NEXT: s_mov_b32 s5, s1
1312 ; VI-NEXT: s_waitcnt vmcnt(0)
1313 ; VI-NEXT: v_mul_hi_u32 v0, v0, s2
1314 ; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
1315 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1318 ; GCN-LABEL: udiv_i32_div_k_odd:
1320 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1321 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1322 ; GCN-NEXT: v_mov_b32_e32 v0, s2
1323 ; GCN-NEXT: v_mov_b32_e32 v1, s3
1324 ; GCN-NEXT: flat_load_dword v0, v[0:1]
1325 ; GCN-NEXT: s_mov_b32 s2, 0x7d5deca3
1326 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1327 ; GCN-NEXT: s_waitcnt vmcnt(0)
1328 ; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
1329 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1330 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 24, v2
1331 ; GCN-NEXT: flat_store_dword v[0:1], v2
1332 ; GCN-NEXT: s_endpgm
1334 ; GFX1030-LABEL: udiv_i32_div_k_odd:
1336 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1337 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
1338 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
1339 ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
1340 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
1341 ; GFX1030-NEXT: v_mul_hi_u32 v1, 0x7d5deca3, v1
1342 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 24, v1
1343 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
1344 ; GFX1030-NEXT: s_endpgm
1346 ; EG-LABEL: udiv_i32_div_k_odd:
1348 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1350 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
1351 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1354 ; EG-NEXT: Fetch clause starting at 6:
1355 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1356 ; EG-NEXT: ALU clause starting at 8:
1357 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1358 ; EG-NEXT: ALU clause starting at 9:
1359 ; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
1360 ; EG-NEXT: 2103307427(1.843675e+37), 0(0.000000e+00)
1361 ; EG-NEXT: LSHR T0.X, PS, literal.x,
1362 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1363 ; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
; NOTE(review): %b_ptr is not used by the load, udiv or store that follow;
; it appears to be dead - confirm before removing.
1364 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1365 %a = load i32, ptr addrspace(1) %in
; Divisor is an odd constant.
1366 %result = udiv i32 %a, 34259183
1367 store i32 %result, ptr addrspace(1) %out
; v_udiv_i8 loads an i8 numerator from %in and an i8 denominator from the
; following byte (%in + 1), performs an unsigned i8 divide, zero-extends the
; quotient to i32 and stores it to %out.
; The expected code on every checked target is a floating-point sequence
; (convert both operands to f32, reciprocal, multiply, truncate, multiply-add,
; compare, add) followed by a mask with 0xff (255).
; For the GCN and GFX1030 prefixes both bytes come from a single 16-bit load
; and are unpacked with v_cvt_f32_ubyte0 / v_cvt_f32_ubyte1; the other
; prefixes show two separate byte loads.
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script rather than editing them by hand.
1371 define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1372 ; SI-LABEL: v_udiv_i8:
1374 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1375 ; SI-NEXT: s_mov_b32 s7, 0xf000
1376 ; SI-NEXT: s_mov_b32 s6, -1
1377 ; SI-NEXT: s_mov_b32 s10, s6
1378 ; SI-NEXT: s_mov_b32 s11, s7
1379 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1380 ; SI-NEXT: s_mov_b32 s8, s2
1381 ; SI-NEXT: s_mov_b32 s9, s3
1382 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1383 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
1384 ; SI-NEXT: s_mov_b32 s4, s0
1385 ; SI-NEXT: s_mov_b32 s5, s1
1386 ; SI-NEXT: s_waitcnt vmcnt(1)
1387 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1388 ; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0
1389 ; SI-NEXT: s_waitcnt vmcnt(0)
1390 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1391 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2
1392 ; SI-NEXT: v_trunc_f32_e32 v2, v2
1393 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2
1394 ; SI-NEXT: v_mad_f32 v1, -v2, v0, v1
1395 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
1396 ; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
1397 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
1398 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1401 ; VI-LABEL: v_udiv_i8:
1403 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1404 ; VI-NEXT: s_mov_b32 s7, 0xf000
1405 ; VI-NEXT: s_mov_b32 s6, -1
1406 ; VI-NEXT: s_mov_b32 s10, s6
1407 ; VI-NEXT: s_mov_b32 s11, s7
1408 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1409 ; VI-NEXT: s_mov_b32 s8, s2
1410 ; VI-NEXT: s_mov_b32 s9, s3
1411 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1412 ; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
1413 ; VI-NEXT: s_mov_b32 s4, s0
1414 ; VI-NEXT: s_mov_b32 s5, s1
1415 ; VI-NEXT: s_waitcnt vmcnt(1)
1416 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1417 ; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
1418 ; VI-NEXT: s_waitcnt vmcnt(0)
1419 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1420 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2
1421 ; VI-NEXT: v_trunc_f32_e32 v2, v2
1422 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2
1423 ; VI-NEXT: v_mad_f32 v1, -v2, v0, v1
1424 ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
1425 ; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
1426 ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
1427 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1430 ; GCN-LABEL: v_udiv_i8:
1432 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1433 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1434 ; GCN-NEXT: v_mov_b32_e32 v0, s2
1435 ; GCN-NEXT: v_mov_b32_e32 v1, s3
1436 ; GCN-NEXT: flat_load_ushort v2, v[0:1]
1437 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1438 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1439 ; GCN-NEXT: s_waitcnt vmcnt(0)
1440 ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v3, v2
1441 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3
1442 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
1443 ; GCN-NEXT: v_mul_f32_e32 v4, v2, v4
1444 ; GCN-NEXT: v_trunc_f32_e32 v4, v4
1445 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4
1446 ; GCN-NEXT: v_mad_f32 v2, -v4, v3, v2
1447 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
1448 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
1449 ; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
1450 ; GCN-NEXT: flat_store_dword v[0:1], v2
1451 ; GCN-NEXT: s_endpgm
1453 ; GFX1030-LABEL: v_udiv_i8:
1455 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1456 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
1457 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
1458 ; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3]
1459 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
1460 ; GFX1030-NEXT: v_cvt_f32_ubyte1_e32 v2, v1
1461 ; GFX1030-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
1462 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2
1463 ; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3
1464 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3
1465 ; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1
1466 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
1467 ; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
1468 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1469 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xff, v1
1470 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
1471 ; GFX1030-NEXT: s_endpgm
1473 ; EG-LABEL: v_udiv_i8:
1475 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1477 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
1478 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1481 ; EG-NEXT: Fetch clause starting at 6:
1482 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1
1483 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1484 ; EG-NEXT: ALU clause starting at 10:
1485 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1486 ; EG-NEXT: ALU clause starting at 11:
1487 ; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X,
1488 ; EG-NEXT: RECIP_IEEE * T0.Z, PS,
1489 ; EG-NEXT: UINT_TO_FLT * T0.X, T0.X,
1490 ; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z,
1491 ; EG-NEXT: TRUNC * T0.W, PV.W,
1492 ; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1493 ; EG-NEXT: TRUNC * T0.W, PV.W,
1494 ; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|,
1495 ; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
1496 ; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
1497 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
1498 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
1499 ; EG-NEXT: AND_INT T0.X, PV.W, literal.x,
1500 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1501 ; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
; The denominator lives one i8 element past the numerator.
1502 %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
1503 %num = load i8, ptr addrspace(1) %in
1504 %den = load i8, ptr addrspace(1) %den_ptr
1505 %result = udiv i8 %num, %den
; Widen the i8 quotient so that a full dword is stored.
1506 %result.ext = zext i8 %result to i32
1507 store i32 %result.ext, ptr addrspace(1) %out
; v_udiv_i16 loads an i16 numerator from %in and an i16 denominator from the
; following i16 element (%in + 2 bytes), performs an unsigned i16 divide,
; zero-extends the quotient to i32 and stores it to %out.
; The expected code on every checked target is a floating-point sequence
; (convert both operands to f32, reciprocal, multiply, truncate, multiply-add,
; compare, add) followed by a mask with 0xffff (65535).
; For the GCN and GFX1030 prefixes both halves come from a single dword load
; and are selected with SDWA WORD_0 / WORD_1 source selects; the other
; prefixes show two separate 16-bit loads.
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script rather than editing them by hand.
1511 define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1512 ; SI-LABEL: v_udiv_i16:
1514 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1515 ; SI-NEXT: s_mov_b32 s7, 0xf000
1516 ; SI-NEXT: s_mov_b32 s6, -1
1517 ; SI-NEXT: s_mov_b32 s10, s6
1518 ; SI-NEXT: s_mov_b32 s11, s7
1519 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1520 ; SI-NEXT: s_mov_b32 s8, s2
1521 ; SI-NEXT: s_mov_b32 s9, s3
1522 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
1523 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
1524 ; SI-NEXT: s_mov_b32 s4, s0
1525 ; SI-NEXT: s_mov_b32 s5, s1
1526 ; SI-NEXT: s_waitcnt vmcnt(1)
1527 ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
1528 ; SI-NEXT: s_waitcnt vmcnt(0)
1529 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1
1530 ; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0
1531 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2
1532 ; SI-NEXT: v_trunc_f32_e32 v2, v2
1533 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2
1534 ; SI-NEXT: v_mad_f32 v1, -v2, v0, v1
1535 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
1536 ; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
1537 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1538 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1541 ; VI-LABEL: v_udiv_i16:
1543 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1544 ; VI-NEXT: s_mov_b32 s7, 0xf000
1545 ; VI-NEXT: s_mov_b32 s6, -1
1546 ; VI-NEXT: s_mov_b32 s10, s6
1547 ; VI-NEXT: s_mov_b32 s11, s7
1548 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1549 ; VI-NEXT: s_mov_b32 s8, s2
1550 ; VI-NEXT: s_mov_b32 s9, s3
1551 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
1552 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
1553 ; VI-NEXT: s_mov_b32 s4, s0
1554 ; VI-NEXT: s_mov_b32 s5, s1
1555 ; VI-NEXT: s_waitcnt vmcnt(1)
1556 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
1557 ; VI-NEXT: s_waitcnt vmcnt(0)
1558 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
1559 ; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
1560 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2
1561 ; VI-NEXT: v_trunc_f32_e32 v2, v2
1562 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2
1563 ; VI-NEXT: v_mad_f32 v1, -v2, v0, v1
1564 ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
1565 ; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
1566 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1567 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1570 ; GCN-LABEL: v_udiv_i16:
1572 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1573 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1574 ; GCN-NEXT: v_mov_b32_e32 v0, s2
1575 ; GCN-NEXT: v_mov_b32_e32 v1, s3
1576 ; GCN-NEXT: flat_load_dword v0, v[0:1]
1577 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1578 ; GCN-NEXT: s_waitcnt vmcnt(0)
1579 ; GCN-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1580 ; GCN-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1581 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1582 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
1583 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4
1584 ; GCN-NEXT: v_trunc_f32_e32 v4, v4
1585 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4
1586 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3
1587 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
1588 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
1589 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
1590 ; GCN-NEXT: flat_store_dword v[0:1], v2
1591 ; GCN-NEXT: s_endpgm
1593 ; GFX1030-LABEL: v_udiv_i16:
1595 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1596 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
1597 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
1598 ; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
1599 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
1600 ; GFX1030-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1601 ; GFX1030-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1602 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2
1603 ; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3
1604 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3
1605 ; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1
1606 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
1607 ; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
1608 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1609 ; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v1
1610 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
1611 ; GFX1030-NEXT: s_endpgm
1613 ; EG-LABEL: v_udiv_i16:
1615 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1617 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
1618 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1621 ; EG-NEXT: Fetch clause starting at 6:
1622 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
1623 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1624 ; EG-NEXT: ALU clause starting at 10:
1625 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1626 ; EG-NEXT: ALU clause starting at 11:
1627 ; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X,
1628 ; EG-NEXT: RECIP_IEEE * T0.Z, PS,
1629 ; EG-NEXT: UINT_TO_FLT * T0.X, T0.X,
1630 ; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z,
1631 ; EG-NEXT: TRUNC * T0.W, PV.W,
1632 ; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
1633 ; EG-NEXT: TRUNC * T0.W, PV.W,
1634 ; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|,
1635 ; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
1636 ; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
1637 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
1638 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
1639 ; EG-NEXT: AND_INT T0.X, PV.W, literal.x,
1640 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1641 ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
; The denominator lives one i16 element past the numerator.
1642 %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
1643 %num = load i16, ptr addrspace(1) %in
1644 %den = load i16, ptr addrspace(1) %den_ptr
1645 %result = udiv i16 %num, %den
; Widen the i16 quotient so that a full dword is stored.
1646 %result.ext = zext i16 %result to i32
1647 store i32 %result.ext, ptr addrspace(1) %out
; v_udiv_i23: unsigned division of two i23 values. The numerator is loaded from
; %in and the denominator from the i23 element that follows it; the quotient is
; zero-extended to i32 and stored to %out.
; The expected code for every run line converts both operands to f32,
; multiplies the numerator by the reciprocal of the denominator, truncates the
; product, and adds one when the magnitude of the float remainder is at least
; the denominator. The amdgcn outputs then mask the result with 0x7fffff and
; the r600 output masks it with 8388607.
1651 define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1652 ; SI-LABEL: v_udiv_i23:
1654 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1655 ; SI-NEXT: s_mov_b32 s7, 0xf000
1656 ; SI-NEXT: s_mov_b32 s6, -1
1657 ; SI-NEXT: s_mov_b32 s10, s6
1658 ; SI-NEXT: s_mov_b32 s11, s7
1659 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1660 ; SI-NEXT: s_mov_b32 s8, s2
1661 ; SI-NEXT: s_mov_b32 s9, s3
1662 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1663 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
1664 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1665 ; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
1666 ; SI-NEXT: s_mov_b32 s4, s0
1667 ; SI-NEXT: s_mov_b32 s5, s1
1668 ; SI-NEXT: s_waitcnt vmcnt(3)
1669 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1670 ; SI-NEXT: s_waitcnt vmcnt(2)
1671 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1672 ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
1673 ; SI-NEXT: s_waitcnt vmcnt(1)
1674 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
1675 ; SI-NEXT: s_waitcnt vmcnt(0)
1676 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
1677 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1
1678 ; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0
1679 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2
1680 ; SI-NEXT: v_trunc_f32_e32 v2, v2
1681 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2
1682 ; SI-NEXT: v_mad_f32 v1, -v2, v0, v1
1683 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
1684 ; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
1685 ; SI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
1686 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1689 ; VI-LABEL: v_udiv_i23:
1691 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1692 ; VI-NEXT: s_mov_b32 s7, 0xf000
1693 ; VI-NEXT: s_mov_b32 s6, -1
1694 ; VI-NEXT: s_mov_b32 s10, s6
1695 ; VI-NEXT: s_mov_b32 s11, s7
1696 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1697 ; VI-NEXT: s_mov_b32 s8, s2
1698 ; VI-NEXT: s_mov_b32 s9, s3
1699 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1700 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
1701 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1702 ; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
1703 ; VI-NEXT: s_mov_b32 s4, s0
1704 ; VI-NEXT: s_mov_b32 s5, s1
1705 ; VI-NEXT: s_waitcnt vmcnt(3)
1706 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1707 ; VI-NEXT: s_waitcnt vmcnt(2)
1708 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1709 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
1710 ; VI-NEXT: s_waitcnt vmcnt(1)
1711 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
1712 ; VI-NEXT: s_waitcnt vmcnt(0)
1713 ; VI-NEXT: v_or_b32_e32 v1, v3, v1
1714 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
1715 ; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
1716 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2
1717 ; VI-NEXT: v_trunc_f32_e32 v2, v2
1718 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2
1719 ; VI-NEXT: v_mad_f32 v1, -v2, v0, v1
1720 ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
1721 ; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
1722 ; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
1723 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1726 ; GCN-LABEL: v_udiv_i23:
1728 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1729 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1730 ; GCN-NEXT: s_add_u32 s4, s2, 4
1731 ; GCN-NEXT: s_addc_u32 s5, s3, 0
1732 ; GCN-NEXT: s_add_u32 s6, s2, 2
1733 ; GCN-NEXT: s_addc_u32 s7, s3, 0
1734 ; GCN-NEXT: v_mov_b32_e32 v0, s6
1735 ; GCN-NEXT: v_mov_b32_e32 v1, s7
1736 ; GCN-NEXT: s_add_u32 s6, s2, 6
1737 ; GCN-NEXT: s_addc_u32 s7, s3, 0
1738 ; GCN-NEXT: v_mov_b32_e32 v2, s6
1739 ; GCN-NEXT: v_mov_b32_e32 v3, s7
1740 ; GCN-NEXT: v_mov_b32_e32 v4, s4
1741 ; GCN-NEXT: v_mov_b32_e32 v5, s5
1742 ; GCN-NEXT: flat_load_ubyte v6, v[2:3]
1743 ; GCN-NEXT: flat_load_ushort v4, v[4:5]
1744 ; GCN-NEXT: v_mov_b32_e32 v2, s2
1745 ; GCN-NEXT: v_mov_b32_e32 v3, s3
1746 ; GCN-NEXT: flat_load_ubyte v0, v[0:1]
1747 ; GCN-NEXT: flat_load_ushort v1, v[2:3]
1748 ; GCN-NEXT: s_waitcnt vmcnt(3)
1749 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6
1750 ; GCN-NEXT: s_waitcnt vmcnt(2)
1751 ; GCN-NEXT: v_or_b32_e32 v2, v4, v2
1752 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
1753 ; GCN-NEXT: s_waitcnt vmcnt(1)
1754 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1755 ; GCN-NEXT: s_waitcnt vmcnt(0)
1756 ; GCN-NEXT: v_or_b32_e32 v0, v1, v0
1757 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0
1758 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
1759 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1760 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1761 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4
1762 ; GCN-NEXT: v_trunc_f32_e32 v4, v4
1763 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4
1764 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3
1765 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
1766 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
1767 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2
1768 ; GCN-NEXT: flat_store_dword v[0:1], v2
1769 ; GCN-NEXT: s_endpgm
1771 ; GFX1030-LABEL: v_udiv_i23:
1773 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1774 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
1775 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
1776 ; GFX1030-NEXT: s_clause 0x3
1777 ; GFX1030-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6
1778 ; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3] offset:4
1779 ; GFX1030-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
1780 ; GFX1030-NEXT: global_load_ushort v4, v0, s[2:3]
1781 ; GFX1030-NEXT: s_waitcnt vmcnt(3)
1782 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1783 ; GFX1030-NEXT: s_waitcnt vmcnt(2)
1784 ; GFX1030-NEXT: v_or_b32_e32 v1, v2, v1
1785 ; GFX1030-NEXT: s_waitcnt vmcnt(1)
1786 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v3
1787 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1
1788 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
1789 ; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2
1790 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1
1791 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2
1792 ; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3
1793 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3
1794 ; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2
1795 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
1796 ; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
1797 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1798 ; GFX1030-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
1799 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
1800 ; GFX1030-NEXT: s_endpgm
1802 ; EG-LABEL: v_udiv_i23:
1804 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1806 ; EG-NEXT: ALU 20, @15, KC0[CB0:0-32], KC1[]
1807 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1810 ; EG-NEXT: Fetch clause starting at 6:
1811 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
1812 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
1813 ; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
1814 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
1815 ; EG-NEXT: ALU clause starting at 14:
1816 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1817 ; EG-NEXT: ALU clause starting at 15:
1818 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1819 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1820 ; EG-NEXT: OR_INT T0.W, T0.X, PV.W,
1821 ; EG-NEXT: LSHL * T1.W, T3.X, literal.x,
1822 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1823 ; EG-NEXT: UINT_TO_FLT * T0.X, PV.W,
1824 ; EG-NEXT: OR_INT T0.W, T2.X, T1.W,
1825 ; EG-NEXT: RECIP_IEEE * T0.Y, PS,
1826 ; EG-NEXT: UINT_TO_FLT * T0.Z, PV.W,
1827 ; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Y,
1828 ; EG-NEXT: TRUNC * T0.W, PV.W,
1829 ; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
1830 ; EG-NEXT: TRUNC * T0.W, PV.W,
1831 ; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.X|,
1832 ; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
1833 ; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
1834 ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
1835 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
1836 ; EG-NEXT: AND_INT T0.X, PV.W, literal.x,
1837 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1838 ; EG-NEXT: 8388607(1.175494e-38), 2(2.802597e-45)
; IR under test: %den is the i23 element directly after %num in %in.
1839 %den_ptr = getelementptr i23, ptr addrspace(1) %in, i23 1
1840 %num = load i23, ptr addrspace(1) %in
1841 %den = load i23, ptr addrspace(1) %den_ptr
1842 %result = udiv i23 %num, %den
1843 %result.ext = zext i23 %result to i32
1844 store i32 %result.ext, ptr addrspace(1) %out
; v_udiv_i24: unsigned division of two i24 values. The numerator is loaded from
; %in and the denominator from the i24 element that follows it; the quotient is
; zero-extended to i32 and stored to %out.
; The expected code for every run line is an integer expansion: a reciprocal
; estimate of the denominator (f32 reciprocal scaled by 0x4f7ffffe on amdgcn,
; RECIP_UINT on r600), a multiply-low / multiply-high / add sequence that
; adjusts the estimate, and two compare-and-select steps that each may add one
; to the quotient. The amdgcn outputs finish with an AND against 0xffffff; the
; r600 output listed here has no final AND. The gfx1030 output moves the loaded
; values into scalar registers with v_readfirstlane_b32 and does the integer
; work with scalar instructions.
1848 define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1849 ; SI-LABEL: v_udiv_i24:
1851 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1852 ; SI-NEXT: s_mov_b32 s3, 0xf000
1853 ; SI-NEXT: s_mov_b32 s2, -1
1854 ; SI-NEXT: s_mov_b32 s10, s2
1855 ; SI-NEXT: s_mov_b32 s11, s3
1856 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1857 ; SI-NEXT: s_mov_b32 s8, s6
1858 ; SI-NEXT: s_mov_b32 s9, s7
1859 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1860 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
1861 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1862 ; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
1863 ; SI-NEXT: s_mov_b32 s0, s4
1864 ; SI-NEXT: s_mov_b32 s1, s5
1865 ; SI-NEXT: s_waitcnt vmcnt(3)
1866 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1867 ; SI-NEXT: s_waitcnt vmcnt(2)
1868 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1869 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v0
1870 ; SI-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
1871 ; SI-NEXT: s_waitcnt vmcnt(1)
1872 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1873 ; SI-NEXT: v_rcp_iflag_f32_e32 v1, v1
1874 ; SI-NEXT: s_waitcnt vmcnt(0)
1875 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
1876 ; SI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1877 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
1878 ; SI-NEXT: v_mul_lo_u32 v4, v4, v1
1879 ; SI-NEXT: v_mul_hi_u32 v4, v1, v4
1880 ; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4
1881 ; SI-NEXT: v_mul_hi_u32 v1, v2, v1
1882 ; SI-NEXT: v_mul_lo_u32 v3, v1, v0
1883 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1
1884 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
1885 ; SI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0
1886 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
1887 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1888 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1889 ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v1
1890 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
1891 ; SI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
1892 ; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0
1893 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1896 ; VI-LABEL: v_udiv_i24:
1898 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1899 ; VI-NEXT: s_mov_b32 s3, 0xf000
1900 ; VI-NEXT: s_mov_b32 s2, -1
1901 ; VI-NEXT: s_mov_b32 s10, s2
1902 ; VI-NEXT: s_mov_b32 s11, s3
1903 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1904 ; VI-NEXT: s_mov_b32 s8, s6
1905 ; VI-NEXT: s_mov_b32 s9, s7
1906 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
1907 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
1908 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
1909 ; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
1910 ; VI-NEXT: s_mov_b32 s0, s4
1911 ; VI-NEXT: s_mov_b32 s1, s5
1912 ; VI-NEXT: s_waitcnt vmcnt(3)
1913 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1914 ; VI-NEXT: s_waitcnt vmcnt(2)
1915 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
1916 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v0
1917 ; VI-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
1918 ; VI-NEXT: s_waitcnt vmcnt(1)
1919 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1920 ; VI-NEXT: v_rcp_iflag_f32_e32 v1, v1
1921 ; VI-NEXT: s_waitcnt vmcnt(0)
1922 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
1923 ; VI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1924 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
1925 ; VI-NEXT: v_mul_lo_u32 v4, v4, v1
1926 ; VI-NEXT: v_mul_hi_u32 v4, v1, v4
1927 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v4
1928 ; VI-NEXT: v_mul_hi_u32 v1, v2, v1
1929 ; VI-NEXT: v_mul_lo_u32 v3, v1, v0
1930 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1
1931 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
1932 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0
1933 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
1934 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1935 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1936 ; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v1
1937 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
1938 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
1939 ; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0
1940 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1943 ; GCN-LABEL: v_udiv_i24:
1945 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1946 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1947 ; GCN-NEXT: s_add_u32 s4, s2, 4
1948 ; GCN-NEXT: s_addc_u32 s5, s3, 0
1949 ; GCN-NEXT: s_add_u32 s6, s2, 2
1950 ; GCN-NEXT: v_mov_b32_e32 v0, s4
1951 ; GCN-NEXT: s_addc_u32 s7, s3, 0
1952 ; GCN-NEXT: v_mov_b32_e32 v1, s5
1953 ; GCN-NEXT: s_add_u32 s4, s2, 6
1954 ; GCN-NEXT: s_addc_u32 s5, s3, 0
1955 ; GCN-NEXT: v_mov_b32_e32 v2, s4
1956 ; GCN-NEXT: v_mov_b32_e32 v3, s5
1957 ; GCN-NEXT: flat_load_ubyte v4, v[2:3]
1958 ; GCN-NEXT: flat_load_ushort v5, v[0:1]
1959 ; GCN-NEXT: v_mov_b32_e32 v2, s6
1960 ; GCN-NEXT: v_mov_b32_e32 v0, s2
1961 ; GCN-NEXT: v_mov_b32_e32 v3, s7
1962 ; GCN-NEXT: v_mov_b32_e32 v1, s3
1963 ; GCN-NEXT: flat_load_ubyte v2, v[2:3]
1964 ; GCN-NEXT: flat_load_ushort v0, v[0:1]
1965 ; GCN-NEXT: s_waitcnt vmcnt(3)
1966 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4
1967 ; GCN-NEXT: s_waitcnt vmcnt(2)
1968 ; GCN-NEXT: v_or_b32_e32 v3, v5, v1
1969 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v3
1970 ; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
1971 ; GCN-NEXT: s_waitcnt vmcnt(1)
1972 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1973 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1
1974 ; GCN-NEXT: s_waitcnt vmcnt(0)
1975 ; GCN-NEXT: v_or_b32_e32 v2, v0, v2
1976 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1977 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
1978 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v1
1979 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4
1980 ; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v4
1981 ; GCN-NEXT: v_mul_hi_u32 v4, v2, v0
1982 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1983 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1984 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v3
1985 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4
1986 ; GCN-NEXT: v_sub_u32_e32 v2, vcc, v2, v5
1987 ; GCN-NEXT: v_sub_u32_e32 v5, vcc, v2, v3
1988 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
1989 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
1990 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
1991 ; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4
1992 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
1993 ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
1994 ; GCN-NEXT: v_and_b32_e32 v2, 0xffffff, v2
1995 ; GCN-NEXT: flat_store_dword v[0:1], v2
1996 ; GCN-NEXT: s_endpgm
1998 ; GFX1030-LABEL: v_udiv_i24:
2000 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2001 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
2002 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
2003 ; GFX1030-NEXT: s_clause 0x3
2004 ; GFX1030-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6
2005 ; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3] offset:4
2006 ; GFX1030-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
2007 ; GFX1030-NEXT: global_load_ushort v4, v0, s[2:3]
2008 ; GFX1030-NEXT: s_waitcnt vmcnt(3)
2009 ; GFX1030-NEXT: v_readfirstlane_b32 s2, v1
2010 ; GFX1030-NEXT: s_waitcnt vmcnt(2)
2011 ; GFX1030-NEXT: v_readfirstlane_b32 s3, v2
2012 ; GFX1030-NEXT: s_waitcnt vmcnt(1)
2013 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v3
2014 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
2015 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v4
2016 ; GFX1030-NEXT: s_lshl_b32 s2, s2, 16
2017 ; GFX1030-NEXT: s_or_b32 s2, s3, s2
2018 ; GFX1030-NEXT: s_lshl_b32 s4, s4, 16
2019 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2
2020 ; GFX1030-NEXT: s_sub_i32 s6, 0, s2
2021 ; GFX1030-NEXT: s_or_b32 s4, s5, s4
2022 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1
2023 ; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
2024 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1
2025 ; GFX1030-NEXT: v_readfirstlane_b32 s3, v1
2026 ; GFX1030-NEXT: s_mul_i32 s6, s6, s3
2027 ; GFX1030-NEXT: s_mul_hi_u32 s6, s3, s6
2028 ; GFX1030-NEXT: s_add_i32 s3, s3, s6
2029 ; GFX1030-NEXT: s_mul_hi_u32 s3, s4, s3
2030 ; GFX1030-NEXT: s_mul_i32 s5, s3, s2
2031 ; GFX1030-NEXT: s_sub_i32 s4, s4, s5
2032 ; GFX1030-NEXT: s_add_i32 s5, s3, 1
2033 ; GFX1030-NEXT: s_sub_i32 s6, s4, s2
2034 ; GFX1030-NEXT: s_cmp_ge_u32 s4, s2
2035 ; GFX1030-NEXT: s_cselect_b32 s3, s5, s3
2036 ; GFX1030-NEXT: s_cselect_b32 s4, s6, s4
2037 ; GFX1030-NEXT: s_add_i32 s5, s3, 1
2038 ; GFX1030-NEXT: s_cmp_ge_u32 s4, s2
2039 ; GFX1030-NEXT: s_cselect_b32 s2, s5, s3
2040 ; GFX1030-NEXT: s_and_b32 s2, s2, 0xffffff
2041 ; GFX1030-NEXT: v_mov_b32_e32 v1, s2
2042 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
2043 ; GFX1030-NEXT: s_endpgm
2045 ; EG-LABEL: v_udiv_i24:
2047 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
2049 ; EG-NEXT: ALU 23, @15, KC0[CB0:0-32], KC1[]
2050 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2053 ; EG-NEXT: Fetch clause starting at 6:
2054 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
2055 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
2056 ; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
2057 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
2058 ; EG-NEXT: ALU clause starting at 14:
2059 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2060 ; EG-NEXT: ALU clause starting at 15:
2061 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
2062 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2063 ; EG-NEXT: OR_INT * T0.W, T0.X, PV.W,
2064 ; EG-NEXT: SUB_INT T1.W, 0.0, PV.W,
2065 ; EG-NEXT: RECIP_UINT * T0.X, PV.W,
2066 ; EG-NEXT: MULLO_INT * T0.Y, PV.W, PS,
2067 ; EG-NEXT: LSHL T1.W, T3.X, literal.x,
2068 ; EG-NEXT: MULHI * T0.Y, T0.X, PS,
2069 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2070 ; EG-NEXT: ADD_INT T2.W, T0.X, PS,
2071 ; EG-NEXT: OR_INT * T1.W, T2.X, PV.W,
2072 ; EG-NEXT: MULHI * T0.X, PS, PV.W,
2073 ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
2074 ; EG-NEXT: SUB_INT * T1.W, T1.W, PS,
2075 ; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
2076 ; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W,
2077 ; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W,
2078 ; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS,
2079 ; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z,
2080 ; EG-NEXT: ADD_INT T3.W, PS, 1,
2081 ; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W,
2082 ; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W,
2083 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2084 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: %den is the i24 element directly after %num in %in.
2085 %den_ptr = getelementptr i24, ptr addrspace(1) %in, i24 1
2086 %num = load i24, ptr addrspace(1) %in
2087 %den = load i24, ptr addrspace(1) %den_ptr
2088 %result = udiv i24 %num, %den
2089 %result.ext = zext i24 %result to i32
2090 store i32 %result.ext, ptr addrspace(1) %out
; scalarize_mulhu_4xi32: loads a <4 x i32> from %in, divides every lane
; (unsigned) by the constant 53668, and stores the vector to %out.
; The expected code contains no division sequence. For each of the four lanes
; it is a logical shift right by 2, a multiply-high by 0x1389c755 (printed as
; 327796565 in the r600 literals), and a logical shift right by 10, applied to
; one lane at a time.
2094 define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
2095 ; SI-LABEL: scalarize_mulhu_4xi32:
2097 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2098 ; SI-NEXT: s_mov_b32 s7, 0xf000
2099 ; SI-NEXT: s_mov_b32 s6, -1
2100 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2101 ; SI-NEXT: s_mov_b32 s4, s0
2102 ; SI-NEXT: s_mov_b32 s5, s1
2103 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2104 ; SI-NEXT: s_mov_b32 s0, 0x1389c755
2105 ; SI-NEXT: s_mov_b32 s4, s2
2106 ; SI-NEXT: s_mov_b32 s5, s3
2107 ; SI-NEXT: s_waitcnt vmcnt(0)
2108 ; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
2109 ; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
2110 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
2111 ; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
2112 ; SI-NEXT: v_mul_hi_u32 v0, v0, s0
2113 ; SI-NEXT: v_mul_hi_u32 v1, v1, s0
2114 ; SI-NEXT: v_mul_hi_u32 v2, v2, s0
2115 ; SI-NEXT: v_mul_hi_u32 v3, v3, s0
2116 ; SI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
2117 ; SI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
2118 ; SI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
2119 ; SI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
2120 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2123 ; VI-LABEL: scalarize_mulhu_4xi32:
2125 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2126 ; VI-NEXT: s_mov_b32 s7, 0xf000
2127 ; VI-NEXT: s_mov_b32 s6, -1
2128 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2129 ; VI-NEXT: s_mov_b32 s4, s0
2130 ; VI-NEXT: s_mov_b32 s5, s1
2131 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2132 ; VI-NEXT: s_mov_b32 s0, 0x1389c755
2133 ; VI-NEXT: s_mov_b32 s4, s2
2134 ; VI-NEXT: s_mov_b32 s5, s3
2135 ; VI-NEXT: s_waitcnt vmcnt(0)
2136 ; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
2137 ; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
2138 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
2139 ; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
2140 ; VI-NEXT: v_mul_hi_u32 v0, v0, s0
2141 ; VI-NEXT: v_mul_hi_u32 v1, v1, s0
2142 ; VI-NEXT: v_mul_hi_u32 v2, v2, s0
2143 ; VI-NEXT: v_mul_hi_u32 v3, v3, s0
2144 ; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
2145 ; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
2146 ; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
2147 ; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
2148 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2151 ; GCN-LABEL: scalarize_mulhu_4xi32:
2153 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2154 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
2155 ; GCN-NEXT: v_mov_b32_e32 v0, s0
2156 ; GCN-NEXT: v_mov_b32_e32 v1, s1
2157 ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2158 ; GCN-NEXT: s_mov_b32 s0, 0x1389c755
2159 ; GCN-NEXT: v_mov_b32_e32 v4, s2
2160 ; GCN-NEXT: v_mov_b32_e32 v5, s3
2161 ; GCN-NEXT: s_waitcnt vmcnt(0)
2162 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v0
2163 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v1
2164 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 2, v2
2165 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 2, v3
2166 ; GCN-NEXT: v_mul_hi_u32 v0, v0, s0
2167 ; GCN-NEXT: v_mul_hi_u32 v1, v1, s0
2168 ; GCN-NEXT: v_mul_hi_u32 v2, v2, s0
2169 ; GCN-NEXT: v_mul_hi_u32 v3, v3, s0
2170 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 10, v0
2171 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 10, v1
2172 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 10, v2
2173 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 10, v3
2174 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2175 ; GCN-NEXT: s_endpgm
2177 ; GFX1030-LABEL: scalarize_mulhu_4xi32:
2179 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2180 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0
2181 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
2182 ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
2183 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
2184 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0
2185 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1
2186 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v2
2187 ; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3
2188 ; GFX1030-NEXT: v_mul_hi_u32 v0, 0x1389c755, v0
2189 ; GFX1030-NEXT: v_mul_hi_u32 v1, 0x1389c755, v1
2190 ; GFX1030-NEXT: v_mul_hi_u32 v2, 0x1389c755, v2
2191 ; GFX1030-NEXT: v_mul_hi_u32 v3, 0x1389c755, v3
2192 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0
2193 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1
2194 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2
2195 ; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 10, v3
2196 ; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
2197 ; GFX1030-NEXT: s_endpgm
2199 ; EG-LABEL: scalarize_mulhu_4xi32:
2201 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
2203 ; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
2204 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2207 ; EG-NEXT: Fetch clause starting at 6:
2208 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2209 ; EG-NEXT: ALU clause starting at 8:
2210 ; EG-NEXT: MOV * T0.X, KC0[2].Y,
2211 ; EG-NEXT: ALU clause starting at 9:
2212 ; EG-NEXT: LSHR T0.W, T0.W, literal.x,
2213 ; EG-NEXT: LSHR * T1.W, T0.Z, literal.x,
2214 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2215 ; EG-NEXT: MULHI * T0.Z, PV.W, literal.x,
2216 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2217 ; EG-NEXT: LSHR T1.Z, T0.Y, literal.x,
2218 ; EG-NEXT: LSHR T0.W, PS, literal.y,
2219 ; EG-NEXT: MULHI * T0.Y, T1.W, literal.z,
2220 ; EG-NEXT: 2(2.802597e-45), 10(1.401298e-44)
2221 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2222 ; EG-NEXT: LSHR T0.Z, PS, literal.x,
2223 ; EG-NEXT: LSHR T1.W, T0.X, literal.y,
2224 ; EG-NEXT: MULHI * T0.X, PV.Z, literal.z,
2225 ; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45)
2226 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2227 ; EG-NEXT: LSHR T0.Y, PS, literal.x,
2228 ; EG-NEXT: MULHI * T0.X, PV.W, literal.y,
2229 ; EG-NEXT: 10(1.401298e-44), 327796565(3.478022e-27)
2230 ; EG-NEXT: LSHR T0.X, PS, literal.x,
2231 ; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.y,
2232 ; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45)
; IR under test: every lane is divided by the same constant, 53668.
2233 %1 = load <4 x i32>, ptr addrspace(1) %in, align 16
2234 %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2235 store <4 x i32> %2, ptr addrspace(1) %out, align 16
; test_udiv2: kernel with a single i32 argument %p. The value %i is written
; with a volatile store to an undef global pointer.
; The expected code for every run line loads the argument and applies one
; logical shift right by 1 before the store; no multiply or division sequence
; is expected.
2239 define amdgpu_kernel void @test_udiv2(i32 %p) {
2240 ; SI-LABEL: test_udiv2:
2242 ; SI-NEXT: s_load_dword s0, s[0:1], 0x9
2243 ; SI-NEXT: s_mov_b32 s3, 0xf000
2244 ; SI-NEXT: s_mov_b32 s2, -1
2245 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2246 ; SI-NEXT: s_lshr_b32 s0, s0, 1
2247 ; SI-NEXT: v_mov_b32_e32 v0, s0
2248 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2249 ; SI-NEXT: s_waitcnt vmcnt(0)
2252 ; VI-LABEL: test_udiv2:
2254 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
2255 ; VI-NEXT: s_mov_b32 s3, 0xf000
2256 ; VI-NEXT: s_mov_b32 s2, -1
2257 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2258 ; VI-NEXT: s_lshr_b32 s0, s0, 1
2259 ; VI-NEXT: v_mov_b32_e32 v0, s0
2260 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2261 ; VI-NEXT: s_waitcnt vmcnt(0)
2264 ; GCN-LABEL: test_udiv2:
2266 ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
2267 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
2268 ; GCN-NEXT: s_lshr_b32 s0, s0, 1
2269 ; GCN-NEXT: v_mov_b32_e32 v0, s0
2270 ; GCN-NEXT: flat_store_dword v[0:1], v0
2271 ; GCN-NEXT: s_waitcnt vmcnt(0)
2272 ; GCN-NEXT: s_endpgm
2274 ; GFX1030-LABEL: test_udiv2:
2276 ; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0
2277 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
2278 ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1
2279 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0
2280 ; GFX1030-NEXT: global_store_dword v[0:1], v0, off
2281 ; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0
2282 ; GFX1030-NEXT: s_endpgm
2284 ; EG-LABEL: test_udiv2:
2286 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2287 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2290 ; EG-NEXT: ALU clause starting at 4:
2291 ; EG-NEXT: MOV T0.X, literal.x,
2292 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, 1,
2293 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; The store is volatile and its address is undef.
2295 store volatile i32 %i, ptr addrspace(1) undef
; test_udiv_3_mulhu: kernel with a single i32 argument %p. The value %i is
; written with a volatile store to an undef global pointer.
; The expected code for every run line is a multiply-high of the loaded
; argument by 0xaaaaaaab (printed as -1431655765 in the r600 literal) followed
; by a logical shift right by 1. The gfx1030 output does this with scalar
; instructions (s_mul_hi_u32 / s_lshr_b32); the other amdgcn outputs use
; v_mul_hi_u32 / v_lshrrev_b32.
2299 define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
2300 ; SI-LABEL: test_udiv_3_mulhu:
2302 ; SI-NEXT: s_load_dword s0, s[0:1], 0x9
2303 ; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
2304 ; SI-NEXT: s_mov_b32 s3, 0xf000
2305 ; SI-NEXT: s_mov_b32 s2, -1
2306 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2307 ; SI-NEXT: v_mul_hi_u32 v0, s0, v0
2308 ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
2309 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2310 ; SI-NEXT: s_waitcnt vmcnt(0)
2313 ; VI-LABEL: test_udiv_3_mulhu:
2315 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
2316 ; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
2317 ; VI-NEXT: s_mov_b32 s3, 0xf000
2318 ; VI-NEXT: s_mov_b32 s2, -1
2319 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2320 ; VI-NEXT: v_mul_hi_u32 v0, s0, v0
2321 ; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
2322 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2323 ; VI-NEXT: s_waitcnt vmcnt(0)
2326 ; GCN-LABEL: test_udiv_3_mulhu:
2328 ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
2329 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
2330 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
2331 ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
2332 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
2333 ; GCN-NEXT: flat_store_dword v[0:1], v0
2334 ; GCN-NEXT: s_waitcnt vmcnt(0)
2335 ; GCN-NEXT: s_endpgm
2337 ; GFX1030-LABEL: test_udiv_3_mulhu:
2339 ; GFX1030-NEXT: s_load_dword s0, s[4:5], 0x0
2340 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
2341 ; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab
2342 ; GFX1030-NEXT: s_lshr_b32 s0, s0, 1
2343 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0
2344 ; GFX1030-NEXT: global_store_dword v[0:1], v0, off
2345 ; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0
2346 ; GFX1030-NEXT: s_endpgm
2348 ; EG-LABEL: test_udiv_3_mulhu:
2350 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
2351 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2354 ; EG-NEXT: ALU clause starting at 4:
2355 ; EG-NEXT: MULHI * T0.X, KC0[2].Y, literal.x,
2356 ; EG-NEXT: -1431655765(-3.031649e-13), 0(0.000000e+00)
2357 ; EG-NEXT: LSHR T0.X, PS, 1,
2358 ; EG-NEXT: MOV * T1.X, literal.x,
2359 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; The store is volatile and its address is undef.
2361 store volatile i32 %i, ptr addrspace(1) undef
2365 define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) {
2366 ; SI-LABEL: fdiv_test_denormals:
2367 ; SI: ; %bb.0: ; %bb
2368 ; SI-NEXT: s_mov_b32 s0, 0
2369 ; SI-NEXT: s_mov_b32 s3, 0xf000
2370 ; SI-NEXT: s_mov_b32 s2, -1
2371 ; SI-NEXT: s_mov_b32 s1, s0
2372 ; SI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0
2373 ; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0
2374 ; SI-NEXT: s_waitcnt vmcnt(1)
2375 ; SI-NEXT: v_cvt_f32_i32_e32 v2, v0
2376 ; SI-NEXT: s_waitcnt vmcnt(0)
2377 ; SI-NEXT: v_cvt_f32_i32_e32 v3, v1
2378 ; SI-NEXT: v_xor_b32_e32 v0, v1, v0
2379 ; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0
2380 ; SI-NEXT: v_rcp_iflag_f32_e32 v4, v2
2381 ; SI-NEXT: v_or_b32_e32 v0, 1, v0
2382 ; SI-NEXT: v_mul_f32_e32 v1, v3, v4
2383 ; SI-NEXT: v_trunc_f32_e32 v1, v1
2384 ; SI-NEXT: v_mad_f32 v3, -v1, v2, v3
2385 ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1
2386 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2387 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
2388 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2389 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
2392 ; VI-LABEL: fdiv_test_denormals:
2393 ; VI: ; %bb.0: ; %bb
2394 ; VI-NEXT: s_mov_b32 s0, 0
2395 ; VI-NEXT: s_mov_b32 s3, 0xf000
2396 ; VI-NEXT: s_mov_b32 s2, -1
2397 ; VI-NEXT: s_mov_b32 s1, s0
2398 ; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0
2399 ; VI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0
2400 ; VI-NEXT: s_waitcnt vmcnt(1)
2401 ; VI-NEXT: v_cvt_f32_i32_e32 v2, v0
2402 ; VI-NEXT: s_waitcnt vmcnt(0)
2403 ; VI-NEXT: v_cvt_f32_i32_e32 v3, v1
2404 ; VI-NEXT: v_xor_b32_e32 v0, v1, v0
2405 ; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0
2406 ; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2
2407 ; VI-NEXT: v_or_b32_e32 v0, 1, v0
2408 ; VI-NEXT: v_mul_f32_e32 v1, v3, v4
2409 ; VI-NEXT: v_trunc_f32_e32 v1, v1
2410 ; VI-NEXT: v_mad_f32 v3, -v1, v2, v3
2411 ; VI-NEXT: v_cvt_i32_f32_e32 v1, v1
2412 ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
2413 ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
2414 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
2415 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
2418 ; GCN-LABEL: fdiv_test_denormals:
2419 ; GCN: ; %bb.0: ; %bb
2420 ; GCN-NEXT: flat_load_sbyte v2, v[0:1]
2421 ; GCN-NEXT: v_mov_b32_e32 v0, 0
2422 ; GCN-NEXT: v_mov_b32_e32 v1, 0
2423 ; GCN-NEXT: flat_load_sbyte v3, v[0:1]
2424 ; GCN-NEXT: s_waitcnt vmcnt(1)
2425 ; GCN-NEXT: v_cvt_f32_i32_e32 v4, v2
2426 ; GCN-NEXT: s_waitcnt vmcnt(0)
2427 ; GCN-NEXT: v_cvt_f32_i32_e32 v5, v3
2428 ; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4
2429 ; GCN-NEXT: v_xor_b32_e32 v2, v3, v2
2430 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2
2431 ; GCN-NEXT: v_or_b32_e32 v2, 1, v2
2432 ; GCN-NEXT: v_mul_f32_e32 v3, v5, v6
2433 ; GCN-NEXT: v_trunc_f32_e32 v3, v3
2434 ; GCN-NEXT: v_mad_f32 v5, -v3, v4, v5
2435 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
2436 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4|
2437 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
2438 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3
2439 ; GCN-NEXT: flat_store_byte v[0:1], v2
2440 ; GCN-NEXT: s_endpgm
2442 ; GFX1030-LABEL: fdiv_test_denormals:
2443 ; GFX1030: ; %bb.0: ; %bb
2444 ; GFX1030-NEXT: global_load_sbyte v2, v[0:1], off
2445 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
2446 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0
2447 ; GFX1030-NEXT: global_load_sbyte v3, v[0:1], off
2448 ; GFX1030-NEXT: s_waitcnt vmcnt(1)
2449 ; GFX1030-NEXT: v_cvt_f32_i32_e32 v4, v2
2450 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v4
2451 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
2452 ; GFX1030-NEXT: v_cvt_f32_i32_e32 v6, v3
2453 ; GFX1030-NEXT: v_xor_b32_e32 v2, v3, v2
2454 ; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 30, v2
2455 ; GFX1030-NEXT: v_mul_f32_e32 v5, v6, v5
2456 ; GFX1030-NEXT: v_or_b32_e32 v2, 1, v2
2457 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v5
2458 ; GFX1030-NEXT: v_fma_f32 v5, -v3, v4, v6
2459 ; GFX1030-NEXT: v_cvt_i32_f32_e32 v3, v3
2460 ; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4|
2461 ; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
2462 ; GFX1030-NEXT: v_add_nc_u32_e32 v2, v3, v2
2463 ; GFX1030-NEXT: global_store_byte v[0:1], v2, off
2464 ; GFX1030-NEXT: s_endpgm
2466 ; EG-LABEL: fdiv_test_denormals:
2467 ; EG: ; %bb.0: ; %bb
2469 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
2471 ; EG-NEXT: ALU 25, @11, KC0[], KC1[]
2472 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
2474 ; EG-NEXT: Fetch clause starting at 6:
2475 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
2476 ; EG-NEXT: Fetch clause starting at 8:
2477 ; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
2478 ; EG-NEXT: ALU clause starting at 10:
2479 ; EG-NEXT: MOV * T1.X, 0.0,
2480 ; EG-NEXT: ALU clause starting at 11:
2481 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
2482 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2483 ; EG-NEXT: INT_TO_FLT * T0.X, PV.W,
2484 ; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x,
2485 ; EG-NEXT: RECIP_IEEE * T0.Y, PS,
2486 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2487 ; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
2488 ; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y,
2489 ; EG-NEXT: TRUNC T2.W, PV.W,
2490 ; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
2491 ; EG-NEXT: ASHR T0.W, PS, literal.x,
2492 ; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
2493 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
2494 ; EG-NEXT: TRUNC T0.Z, T2.W,
2495 ; EG-NEXT: SETGE T1.W, |PS|, |T0.X|,
2496 ; EG-NEXT: OR_INT * T0.W, PV.W, 1,
2497 ; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
2498 ; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
2499 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
2500 ; EG-NEXT: AND_INT T0.X, PV.W, literal.x,
2501 ; EG-NEXT: MOV * T0.W, literal.x,
2502 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2503 ; EG-NEXT: MOV T0.Y, 0.0,
2504 ; EG-NEXT: MOV * T0.Z, 0.0,
2505 ; EG-NEXT: MOV * T1.X, literal.x,
2506 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
2508 %tmp = load i8, ptr addrspace(1) null, align 1
2509 %tmp1 = sext i8 %tmp to i32
2510 %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 undef
2511 %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
2512 %tmp4 = sext i8 %tmp3 to i32
2513 %tmp5 = sdiv i32 %tmp1, %tmp4
2514 %tmp6 = trunc i32 %tmp5 to i8
2515 store i8 %tmp6, ptr addrspace(1) null, align 1
2519 define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
2520 ; SI-LABEL: v_test_udiv64_mulhi_fold:
2522 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2523 ; SI-NEXT: s_mov_b32 s4, 0x346d900
2524 ; SI-NEXT: s_add_u32 s4, 0x4237, s4
2525 ; SI-NEXT: v_mov_b32_e32 v2, 0xa9000000
2526 ; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v2
2527 ; SI-NEXT: s_addc_u32 s5, 0, 0
2528 ; SI-NEXT: s_or_b32 s4, vcc_lo, vcc_hi
2529 ; SI-NEXT: s_cmp_lg_u32 s4, 0
2530 ; SI-NEXT: s_mov_b32 s4, 0xfffe7960
2531 ; SI-NEXT: v_mul_hi_u32 v3, v2, s4
2532 ; SI-NEXT: v_mul_lo_u32 v4, v2, s4
2533 ; SI-NEXT: s_addc_u32 s5, s5, 0xa7c5
2534 ; SI-NEXT: s_mul_i32 s6, s5, 0xfffe7960
2535 ; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
2536 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3
2537 ; SI-NEXT: v_mul_lo_u32 v5, v2, v3
2538 ; SI-NEXT: v_mul_hi_u32 v6, v2, v4
2539 ; SI-NEXT: v_mul_hi_u32 v7, v2, v3
2540 ; SI-NEXT: v_mul_hi_u32 v8, s5, v3
2541 ; SI-NEXT: v_mul_lo_u32 v3, s5, v3
2542 ; SI-NEXT: v_add_i32_e32 v5, vcc, v6, v5
2543 ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc
2544 ; SI-NEXT: v_mul_lo_u32 v7, s5, v4
2545 ; SI-NEXT: v_mul_hi_u32 v4, s5, v4
2546 ; SI-NEXT: s_mov_b32 s4, 0x186a0
2547 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7
2548 ; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
2549 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
2550 ; SI-NEXT: v_add_i32_e32 v3, vcc, v4, v3
2551 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
2552 ; SI-NEXT: v_mov_b32_e32 v5, s5
2553 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
2554 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
2555 ; SI-NEXT: v_mul_lo_u32 v4, v0, v3
2556 ; SI-NEXT: v_mul_hi_u32 v5, v0, v2
2557 ; SI-NEXT: v_mul_hi_u32 v6, v0, v3
2558 ; SI-NEXT: v_mul_hi_u32 v7, v1, v3
2559 ; SI-NEXT: v_mul_lo_u32 v3, v1, v3
2560 ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
2561 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
2562 ; SI-NEXT: v_mul_lo_u32 v6, v1, v2
2563 ; SI-NEXT: v_mul_hi_u32 v2, v1, v2
2564 ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6
2565 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
2566 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
2567 ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
2568 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
2569 ; SI-NEXT: v_mul_lo_u32 v4, v3, s4
2570 ; SI-NEXT: v_mul_hi_u32 v5, v2, s4
2571 ; SI-NEXT: v_mul_lo_u32 v6, v2, s4
2572 ; SI-NEXT: s_mov_b32 s4, 0x1869f
2573 ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
2574 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
2575 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
2576 ; SI-NEXT: v_subrev_i32_e32 v4, vcc, 0x186a0, v0
2577 ; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
2578 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4
2579 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
2580 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
2581 ; SI-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2582 ; SI-NEXT: v_add_i32_e32 v5, vcc, 2, v2
2583 ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc
2584 ; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v2
2585 ; SI-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
2586 ; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
2587 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2588 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
2589 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
2590 ; SI-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2591 ; SI-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
2592 ; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
2593 ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
2594 ; SI-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5]
2595 ; SI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
2596 ; SI-NEXT: s_setpc_b64 s[30:31]
2598 ; VI-LABEL: v_test_udiv64_mulhi_fold:
2600 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2601 ; VI-NEXT: s_mov_b32 s4, 0x346d900
2602 ; VI-NEXT: s_add_u32 s4, 0x4237, s4
2603 ; VI-NEXT: v_mov_b32_e32 v2, 0xa9000000
2604 ; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v2
2605 ; VI-NEXT: s_mov_b32 s4, 0xfffe7960
2606 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, 0
2607 ; VI-NEXT: s_addc_u32 s6, 0, 0
2608 ; VI-NEXT: s_cmp_lg_u64 vcc, 0
2609 ; VI-NEXT: s_addc_u32 s6, s6, 0xa7c5
2610 ; VI-NEXT: s_mul_i32 s4, s6, 0xfffe7960
2611 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
2612 ; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v3
2613 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2614 ; VI-NEXT: v_mul_hi_u32 v7, v6, v2
2615 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
2616 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v2, 0
2617 ; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
2618 ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v5, 0
2619 ; VI-NEXT: v_add_u32_e32 v2, vcc, v7, v2
2620 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v8, v3, vcc
2621 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
2622 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
2623 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2624 ; VI-NEXT: v_mov_b32_e32 v4, s6
2625 ; VI-NEXT: v_add_u32_e32 v5, vcc, v6, v2
2626 ; VI-NEXT: v_addc_u32_e32 v4, vcc, v4, v3, vcc
2627 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v4, 0
2628 ; VI-NEXT: v_mul_hi_u32 v6, v0, v5
2629 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
2630 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
2631 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v5, 0
2632 ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v4, 0
2633 ; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v2
2634 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc
2635 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
2636 ; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v4
2637 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
2638 ; VI-NEXT: s_mov_b32 s4, 0x186a0
2639 ; VI-NEXT: v_mul_lo_u32 v6, v5, s4
2640 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
2641 ; VI-NEXT: s_mov_b32 s4, 0x1869f
2642 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v6
2643 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
2644 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
2645 ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0
2646 ; VI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2647 ; VI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2
2648 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
2649 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
2650 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
2651 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v4
2652 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc
2653 ; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4
2654 ; VI-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
2655 ; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc
2656 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2657 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
2658 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
2659 ; VI-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2660 ; VI-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
2661 ; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
2662 ; VI-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
2663 ; VI-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2664 ; VI-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2665 ; VI-NEXT: s_setpc_b64 s[30:31]
2667 ; GCN-LABEL: v_test_udiv64_mulhi_fold:
2669 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2670 ; GCN-NEXT: s_mov_b32 s4, 0x346d900
2671 ; GCN-NEXT: s_add_u32 s4, 0x4237, s4
2672 ; GCN-NEXT: v_mov_b32_e32 v2, 0xa9000000
2673 ; GCN-NEXT: v_add_u32_e32 v6, vcc, s4, v2
2674 ; GCN-NEXT: s_mov_b32 s4, 0xfffe7960
2675 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s4, 0
2676 ; GCN-NEXT: s_addc_u32 s6, 0, 0
2677 ; GCN-NEXT: s_cmp_lg_u64 vcc, 0
2678 ; GCN-NEXT: s_addc_u32 s6, s6, 0xa7c5
2679 ; GCN-NEXT: s_mul_i32 s4, s6, 0xfffe7960
2680 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
2681 ; GCN-NEXT: v_add_u32_e32 v5, vcc, s4, v3
2682 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
2683 ; GCN-NEXT: v_mul_hi_u32 v7, v6, v2
2684 ; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v3
2685 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v2, 0
2686 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc
2687 ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v5, 0
2688 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v7, v2
2689 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v8, v3, vcc
2690 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
2691 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4
2692 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2693 ; GCN-NEXT: v_mov_b32_e32 v4, s6
2694 ; GCN-NEXT: v_add_u32_e32 v5, vcc, v6, v2
2695 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v3, vcc
2696 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v4, 0
2697 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v5
2698 ; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v2
2699 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
2700 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v5, 0
2701 ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v4, 0
2702 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v6, v2
2703 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc
2704 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
2705 ; GCN-NEXT: v_add_u32_e32 v4, vcc, v2, v4
2706 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
2707 ; GCN-NEXT: s_mov_b32 s4, 0x186a0
2708 ; GCN-NEXT: v_mul_lo_u32 v6, v5, s4
2709 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0
2710 ; GCN-NEXT: s_mov_b32 s4, 0x1869f
2711 ; GCN-NEXT: v_add_u32_e32 v3, vcc, v3, v6
2712 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
2713 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
2714 ; GCN-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0
2715 ; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
2716 ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2
2717 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
2718 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
2719 ; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
2720 ; GCN-NEXT: v_add_u32_e32 v3, vcc, 2, v4
2721 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc
2722 ; GCN-NEXT: v_add_u32_e32 v7, vcc, 1, v4
2723 ; GCN-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
2724 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc
2725 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
2726 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
2727 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
2728 ; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5]
2729 ; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
2730 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
2731 ; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
2732 ; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5]
2733 ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
2734 ; GCN-NEXT: s_setpc_b64 s[30:31]
2736 ; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
2738 ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2739 ; GFX1030-NEXT: s_mov_b32 s4, 0x346d900
2740 ; GFX1030-NEXT: s_add_u32 s4, 0x4237, s4
2741 ; GFX1030-NEXT: s_addc_u32 s5, 0, 0
2742 ; GFX1030-NEXT: v_add_co_u32 v2, s4, 0xa9000000, s4
2743 ; GFX1030-NEXT: s_cmp_lg_u32 s4, 0
2744 ; GFX1030-NEXT: s_addc_u32 s5, s5, 0xa7c5
2745 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v2
2746 ; GFX1030-NEXT: s_mul_i32 s6, s5, 0xfffe7960
2747 ; GFX1030-NEXT: s_mul_hi_u32 s7, s4, 0xfffe7960
2748 ; GFX1030-NEXT: s_mul_i32 s8, s4, 0xfffe7960
2749 ; GFX1030-NEXT: s_sub_i32 s7, s7, s4
2750 ; GFX1030-NEXT: s_mul_hi_u32 s9, s4, s8
2751 ; GFX1030-NEXT: s_add_i32 s7, s7, s6
2752 ; GFX1030-NEXT: s_mul_hi_u32 s10, s5, s8
2753 ; GFX1030-NEXT: s_mul_i32 s6, s5, s8
2754 ; GFX1030-NEXT: s_mul_hi_u32 s8, s4, s7
2755 ; GFX1030-NEXT: s_mul_i32 s4, s4, s7
2756 ; GFX1030-NEXT: s_mul_hi_u32 s11, s5, s7
2757 ; GFX1030-NEXT: s_add_u32 s4, s9, s4
2758 ; GFX1030-NEXT: s_addc_u32 s8, 0, s8
2759 ; GFX1030-NEXT: s_add_u32 s4, s4, s6
2760 ; GFX1030-NEXT: s_mul_i32 s7, s5, s7
2761 ; GFX1030-NEXT: s_addc_u32 s4, s8, s10
2762 ; GFX1030-NEXT: s_addc_u32 s6, s11, 0
2763 ; GFX1030-NEXT: s_add_u32 s4, s4, s7
2764 ; GFX1030-NEXT: s_addc_u32 s6, 0, s6
2765 ; GFX1030-NEXT: v_add_co_u32 v4, s4, v2, s4
2766 ; GFX1030-NEXT: s_cmp_lg_u32 s4, 0
2767 ; GFX1030-NEXT: s_addc_u32 s4, s5, s6
2768 ; GFX1030-NEXT: v_mul_hi_u32 v8, v0, v4
2769 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, s4, 0
2770 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, v4, 0
2771 ; GFX1030-NEXT: v_mad_u64_u32 v[6:7], null, v1, s4, 0
2772 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2
2773 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2774 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
2775 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v5, vcc_lo
2776 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
2777 ; GFX1030-NEXT: v_add_co_u32 v5, vcc_lo, v2, v6
2778 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo
2779 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x186a0, v5, 0
2780 ; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x186a0, v6, v[3:4]
2781 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
2782 ; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2783 ; GFX1030-NEXT: v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0
2784 ; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
2785 ; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2
2786 ; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
2787 ; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
2788 ; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v5, 2
2789 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
2790 ; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0
2791 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2792 ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
2793 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, -1, v0, s4
2794 ; GFX1030-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
2795 ; GFX1030-NEXT: v_add_co_u32 v3, vcc_lo, v5, 1
2796 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
2797 ; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
2798 ; GFX1030-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
2799 ; GFX1030-NEXT: v_cndmask_b32_e32 v2, v8, v7, vcc_lo
2800 ; GFX1030-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
2801 ; GFX1030-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc_lo
2802 ; GFX1030-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
2803 ; GFX1030-NEXT: s_setpc_b64 s[30:31]
2805 ; EG-LABEL: v_test_udiv64_mulhi_fold:
2809 %d = udiv i64 %arg, 100000