1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
3 ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
4 ; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
; udivrem_i32: kernel that stores `udiv i32 %x, %y` to %out0 and
; `urem i32 %x, %y` to %out1, i.e. quotient and remainder of the same operands.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
6 define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %x, i32 %y) {
; Expected output for the GFX8 prefix (the -mcpu=tonga run line).
7 ; GFX8-LABEL: udivrem_i32:
9 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
12 ; GFX8-NEXT: s_sub_i32 s0, 0, s5
13 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
14 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
15 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
16 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
17 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
18 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
19 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
20 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
21 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
22 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
23 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
24 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5
25 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
26 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
27 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
28 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
29 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
30 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
31 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
32 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
33 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
34 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
35 ; GFX8-NEXT: flat_store_dword v[0:1], v2
36 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
37 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
38 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
39 ; GFX8-NEXT: flat_store_dword v[0:1], v3
; Expected output for the GFX9 prefix (the -mcpu=gfx900 run line).
42 ; GFX9-LABEL: udivrem_i32:
44 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
45 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
46 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
47 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
48 ; GFX9-NEXT: s_sub_i32 s0, 0, s5
49 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
50 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
51 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
52 ; GFX9-NEXT: v_mul_lo_u32 v1, s0, v0
53 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
54 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
55 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
56 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
57 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5
58 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
59 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1
60 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1
61 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
62 ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1
63 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
64 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
65 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1
66 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
67 ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1
68 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
69 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
71 ; GFX9-NEXT: global_store_dword v2, v1, s[2:3]
; Expected output for the GFX10 prefix (the -mcpu=gfx1010 run line).
74 ; GFX10-LABEL: udivrem_i32:
76 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
77 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
78 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5
79 ; GFX10-NEXT: s_sub_i32 s0, 0, s5
80 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
81 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
82 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
83 ; GFX10-NEXT: v_mul_lo_u32 v1, s0, v0
84 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
85 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
86 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
87 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0
88 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s5
89 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
90 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1
91 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1
92 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1
93 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
94 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
95 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
96 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s5, v1
97 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s5, v1
98 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
99 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
100 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
101 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
103 ; GFX10-NEXT: global_store_dword v2, v1, s[2:3]
104 ; GFX10-NEXT: s_endpgm
; IR under test: unsigned quotient goes to %out0, unsigned remainder to %out1.
105 %div = udiv i32 %x, %y
106 store i32 %div, ptr addrspace(1) %out0
107 %rem = urem i32 %x, %y
108 store i32 %rem, ptr addrspace(1) %out1
; udivrem_i64: kernel that stores `udiv i64 %x, %y` to %out0 and
; `urem i64 %x, %y` to %out1, i.e. quotient and remainder of the same operands.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
112 define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; Expected output for the GFX8 prefix (the -mcpu=tonga run line).
113 ; GFX8-LABEL: udivrem_i64:
115 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
116 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
117 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
118 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
119 ; GFX8-NEXT: s_sub_u32 s2, 0, s10
120 ; GFX8-NEXT: s_subb_u32 s3, 0, s11
121 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
122 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
123 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
124 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
125 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
126 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1
127 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
128 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
129 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
130 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
131 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
132 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
133 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
134 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
135 ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
136 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
137 ; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
138 ; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
139 ; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
140 ; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
141 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
142 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
143 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
144 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
145 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
146 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
147 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
148 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
149 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
150 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
151 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
152 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
153 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
154 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
155 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
156 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
157 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
158 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
159 ; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
160 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
161 ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
162 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
163 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
164 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
165 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
166 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
167 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
168 ; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
169 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
170 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
171 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
172 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
173 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
174 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
175 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
176 ; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
177 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
178 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
179 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
180 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
181 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
182 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
183 ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
184 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
185 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
186 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
187 ; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
188 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
189 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
190 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
191 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
192 ; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
193 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
194 ; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
195 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
196 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
197 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
198 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
199 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
200 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
201 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
202 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
203 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
204 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
205 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
206 ; GFX8-NEXT: v_mov_b32_e32 v6, s9
207 ; GFX8-NEXT: v_mov_b32_e32 v5, s11
208 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
209 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v0
210 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
211 ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
212 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
213 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
214 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
215 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
216 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
217 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
218 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1]
219 ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s10, v2
220 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
221 ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
222 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
223 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
224 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
225 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7
226 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
227 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
228 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
229 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v7
230 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
231 ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
232 ; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
233 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
234 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
235 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc
236 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc
237 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
238 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
239 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1]
240 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
241 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc
242 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
243 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1]
244 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
245 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
246 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
247 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
248 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
249 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
250 ; GFX8-NEXT: s_endpgm
; Expected output for the GFX9 prefix (the -mcpu=gfx900 run line).
252 ; GFX9-LABEL: udivrem_i64:
254 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
255 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
256 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15
257 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14
258 ; GFX9-NEXT: s_sub_u32 s2, 0, s14
259 ; GFX9-NEXT: s_subb_u32 s3, 0, s15
260 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
261 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
262 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
263 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
264 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
265 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1
266 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
267 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
268 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
269 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
270 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
271 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
272 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
273 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
274 ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
275 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
276 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
277 ; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
278 ; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
279 ; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
280 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
281 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
282 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
283 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
284 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
285 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
286 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
287 ; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
288 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
289 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
290 ; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
291 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
292 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
293 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
294 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
295 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
296 ; GFX9-NEXT: v_mov_b32_e32 v7, s15
297 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
298 ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
299 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
300 ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
301 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
302 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
303 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
304 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
305 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
306 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
307 ; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
308 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
309 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
310 ; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
311 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
312 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
313 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
314 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
315 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
316 ; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
317 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
318 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
319 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
320 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
321 ; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
322 ; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
323 ; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
324 ; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
325 ; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1
326 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
327 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
328 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
329 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
330 ; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
331 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
332 ; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
333 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
334 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
335 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
336 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
337 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
338 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0
339 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
340 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
341 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
342 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
343 ; GFX9-NEXT: v_mov_b32_e32 v6, s13
344 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
345 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2]
346 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v0
347 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
348 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6
349 ; GFX9-NEXT: v_sub_u32_e32 v0, s13, v1
350 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
351 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
352 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
353 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6
354 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
355 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1]
356 ; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v2
357 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
358 ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
359 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
360 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v9
361 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
362 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
363 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
364 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
365 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v9
366 ; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v8
367 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
368 ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
369 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
370 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
371 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
372 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc
373 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc
374 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
375 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1]
376 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1]
377 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
378 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc
379 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
380 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
381 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
382 ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
383 ; GFX9-NEXT: s_endpgm
; Expected output for the GFX10 prefix (the -mcpu=gfx1010 run line).
385 ; GFX10-LABEL: udivrem_i64:
387 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
388 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
389 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s15
390 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s14
391 ; GFX10-NEXT: s_sub_u32 s0, 0, s14
392 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
393 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
394 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
395 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
396 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
397 ; GFX10-NEXT: v_trunc_f32_e32 v2, v1
398 ; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
399 ; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2
400 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
401 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0
402 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v3, 0
403 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
404 ; GFX10-NEXT: s_subb_u32 s1, 0, s15
405 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
406 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
407 ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
408 ; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
409 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
410 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1
411 ; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1
412 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1
413 ; GFX10-NEXT: v_add_co_u32 v2, s2, v2, v5
414 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2
415 ; GFX10-NEXT: v_add_co_u32 v6, s2, v7, v6
416 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s2
417 ; GFX10-NEXT: v_add_co_u32 v0, s2, v2, v0
418 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
419 ; GFX10-NEXT: v_add_co_u32 v2, s2, v6, v8
420 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
421 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0
422 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6
423 ; GFX10-NEXT: v_add_co_u32 v0, s2, v2, v0
424 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
425 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0
426 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
427 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
428 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v3, 0
429 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s0, v4, v[1:2]
430 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
431 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s1, v3, v[1:2]
432 ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
433 ; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
434 ; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
435 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1
436 ; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1
437 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1
438 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v5
439 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
440 ; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v6
441 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
442 ; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0
443 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
444 ; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v8
445 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
446 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0
447 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6
448 ; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0
449 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
450 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0
451 ; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
452 ; GFX10-NEXT: v_mul_lo_u32 v2, s13, v0
453 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
454 ; GFX10-NEXT: v_mul_hi_u32 v4, s12, v0
455 ; GFX10-NEXT: v_mul_hi_u32 v0, s13, v0
456 ; GFX10-NEXT: v_mul_lo_u32 v3, s12, v1
457 ; GFX10-NEXT: v_mul_lo_u32 v5, s13, v1
458 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3
459 ; GFX10-NEXT: v_mul_hi_u32 v3, s12, v1
460 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
461 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v4
462 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
463 ; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0
464 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
465 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
466 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3
467 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
468 ; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2
469 ; GFX10-NEXT: v_mul_hi_u32 v2, s13, v1
470 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
471 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
472 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s14, v5, 0
473 ; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
474 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2]
475 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2]
476 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
477 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
478 ; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s12, v0
479 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s13, v1
480 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo
481 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo
482 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v7
483 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
484 ; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s14
485 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
486 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v8
487 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo
488 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
489 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v6
490 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
491 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v9
492 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
493 ; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1
494 ; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
495 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v9
496 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
497 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v8
498 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
499 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0
500 ; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s14
501 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
502 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
503 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
504 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
505 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
506 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo
507 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
508 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0
509 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v4, s0
510 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v6, s0
511 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v9, s0
512 ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[8:9]
513 ; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[10:11]
514 ; GFX10-NEXT: s_endpgm
; IR under test: unsigned quotient goes to %out0, unsigned remainder to %out1.
515 %div = udiv i64 %x, %y
516 store i64 %div, ptr addrspace(1) %out0
517 %rem = urem i64 %x, %y
518 store i64 %rem, ptr addrspace(1) %out1
; udivrem_v2i32: kernel that stores `udiv <2 x i32> %x, %y` to %out0 and
; `urem <2 x i32> %x, %y` to %out1, i.e. per-element quotient and remainder of
; the same operands.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
522 define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
; Expected output for the GFX8 prefix (the -mcpu=tonga run line).
523 ; GFX8-LABEL: udivrem_v2i32:
525 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
526 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
527 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10
528 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
529 ; GFX8-NEXT: s_sub_i32 s0, 0, s10
530 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
531 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
532 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
533 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
534 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
535 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
536 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0
537 ; GFX8-NEXT: s_sub_i32 s0, 0, s11
538 ; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1
539 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
540 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
541 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
542 ; GFX8-NEXT: v_mul_hi_u32 v0, s8, v0
543 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
544 ; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1
545 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s10
546 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
547 ; GFX8-NEXT: v_mul_lo_u32 v4, v1, s11
548 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2
549 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
550 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
551 ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s10, v2
552 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
553 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
554 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
555 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
556 ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s10, v2
557 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
558 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v4
559 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
560 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
561 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
562 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3
563 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
564 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
565 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
566 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
567 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3
568 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
569 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
570 ; GFX8-NEXT: v_mov_b32_e32 v5, s5
571 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
572 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
573 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
574 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
575 ; GFX8-NEXT: s_endpgm
; Expected output for the GFX9 prefix (the -mcpu=gfx900 run line).
577 ; GFX9-LABEL: udivrem_v2i32:
579 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
580 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
581 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14
582 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15
583 ; GFX9-NEXT: s_sub_i32 s0, 0, s14
584 ; GFX9-NEXT: s_sub_i32 s1, 0, s15
585 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
586 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
587 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
588 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
589 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
590 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
591 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0
592 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1
593 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
594 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
595 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
596 ; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0
597 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
598 ; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1
599 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14
600 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
601 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15
602 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
603 ; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2
604 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2
605 ; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
606 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
607 ; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2
608 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3
609 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
610 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
611 ; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3
612 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
613 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2
614 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
615 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
616 ; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2
617 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
618 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
619 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3
620 ; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3
621 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
622 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
623 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
624 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
625 ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
626 ; GFX9-NEXT: s_endpgm
; Expected output for the GFX10 prefix (the -mcpu=gfx1010 run line).
628 ; GFX10-LABEL: udivrem_v2i32:
630 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
631 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
632 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
633 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14
634 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15
635 ; GFX10-NEXT: s_sub_i32 s0, 0, s14
636 ; GFX10-NEXT: s_sub_i32 s1, 0, s15
637 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
638 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
639 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
640 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
641 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
642 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
643 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
644 ; GFX10-NEXT: v_mul_lo_u32 v3, s1, v1
645 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
646 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
647 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
648 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
649 ; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0
650 ; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1
651 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14
652 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15
653 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
654 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
655 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2
656 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3
657 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2
658 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3
659 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2
660 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3
661 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
662 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
663 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
664 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
665 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
666 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
667 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2
668 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3
669 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2
670 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3
671 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
672 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
673 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
674 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
675 ; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9]
676 ; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11]
677 ; GFX10-NEXT: s_endpgm
; IR under test: per-element unsigned quotient goes to %out0, remainder to %out1.
678 %div = udiv <2 x i32> %x, %y
679 store <2 x i32> %div, ptr addrspace(1) %out0
680 %rem = urem <2 x i32> %x, %y
681 store <2 x i32> %rem, ptr addrspace(1) %out1
; udivrem_v4i32: kernel that divides the <4 x i32> argument %x by %y lane-wise,
; storing the udiv result through %out0 and the urem result through %out1.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; In each target's expected output every lane follows the same visible shape:
; a float reciprocal estimate (v_cvt_f32_u32, v_rcp_iflag_f32, scale by
; 0x4f7ffffe, v_cvt_u32_f32), one mul_lo + mul_hi refinement of that estimate,
; then two compare-and-select rounds that add 1 to the quotient and subtract
; the divisor from the remainder while the remainder is still >= the divisor.
685 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
; Expected output of the tonga run. The lanes are processed mostly one after
; another, and the results leave through flat_store_dwordx4, so both
; destination addresses are first copied from SGPRs into VGPR pairs.
686 ; GFX8-LABEL: udivrem_v4i32:
688 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
689 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
690 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
691 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12
692 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13
693 ; GFX8-NEXT: s_sub_i32 s0, 0, s12
694 ; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s14
695 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
696 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
697 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
698 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
699 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
700 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
701 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0
702 ; GFX8-NEXT: s_sub_i32 s0, 0, s13
703 ; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1
704 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
705 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
706 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
707 ; GFX8-NEXT: v_mul_hi_u32 v0, s8, v0
708 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
709 ; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1
710 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s12
711 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
712 ; GFX8-NEXT: v_mul_lo_u32 v5, v1, s13
713 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2
714 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v2
715 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
716 ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s12, v2
717 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
718 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
719 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v2
720 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
721 ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s12, v2
722 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
723 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v6
724 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s9, v5
725 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1
726 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
727 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
728 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2
729 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
730 ; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2
731 ; GFX8-NEXT: s_sub_i32 s0, 0, s14
732 ; GFX8-NEXT: v_mul_lo_u32 v6, s0, v3
733 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
734 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1
735 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2
736 ; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6
737 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
738 ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s15
739 ; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6
740 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v5
741 ; GFX8-NEXT: v_mul_hi_u32 v3, s10, v3
742 ; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2
743 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
744 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
745 ; GFX8-NEXT: s_sub_i32 s0, 0, s15
746 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc
747 ; GFX8-NEXT: v_mul_lo_u32 v2, v3, s14
748 ; GFX8-NEXT: v_mul_lo_u32 v7, s0, v6
749 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v3
750 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2
751 ; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7
752 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v2
753 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
754 ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s14, v2
755 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v2, v8, vcc
756 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v7
757 ; GFX8-NEXT: v_mul_hi_u32 v7, s11, v2
758 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3
759 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v8
760 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
761 ; GFX8-NEXT: v_mul_lo_u32 v3, v7, s15
762 ; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s14, v8
763 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
764 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s11, v3
765 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7
766 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s15, v3
767 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
768 ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s15, v3
769 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc
770 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v7
771 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s15, v8
772 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
773 ; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s15, v8
774 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
775 ; GFX8-NEXT: v_mov_b32_e32 v9, s5
776 ; GFX8-NEXT: v_mov_b32_e32 v8, s4
777 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
779 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
780 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
781 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
782 ; GFX8-NEXT: s_endpgm
; Expected output of the gfx900 run. Same per-lane sequence, but the adds and
; subtracts no longer name a carry operand, and the results are written with
; global_store_dwordx4 using an SGPR base pair plus a zero offset held in v8.
784 ; GFX9-LABEL: udivrem_v4i32:
786 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
787 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
788 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12
789 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13
790 ; GFX9-NEXT: s_sub_i32 s0, 0, s12
791 ; GFX9-NEXT: s_sub_i32 s1, 0, s13
792 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
793 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
794 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14
795 ; GFX9-NEXT: s_sub_i32 s4, 0, s14
796 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
797 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
798 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
799 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
800 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4
801 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0
802 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1
803 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
804 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
805 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
806 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
807 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
808 ; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
809 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1
810 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v4
811 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
812 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s12
813 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s13
814 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0
815 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1
816 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3
817 ; GFX9-NEXT: v_sub_u32_e32 v7, s9, v4
818 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3
819 ; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3
820 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
821 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
822 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
823 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3
824 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
825 ; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3
826 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
827 ; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2
828 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s15
829 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v7
830 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
831 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3
832 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5
833 ; GFX9-NEXT: v_subrev_u32_e32 v6, s13, v7
834 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
835 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
836 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5
837 ; GFX9-NEXT: v_mul_hi_u32 v2, s10, v2
838 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
839 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v1
840 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v6
841 ; GFX9-NEXT: s_sub_i32 s4, 0, s15
842 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
843 ; GFX9-NEXT: v_mul_lo_u32 v7, v2, s14
844 ; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3
845 ; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v6
846 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
847 ; GFX9-NEXT: v_sub_u32_e32 v6, s10, v7
848 ; GFX9-NEXT: v_mul_hi_u32 v7, v3, v8
849 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v2
850 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6
851 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
852 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
853 ; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3
854 ; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6
855 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
856 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v2
857 ; GFX9-NEXT: v_mul_lo_u32 v8, v3, s15
858 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6
859 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
860 ; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6
861 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
862 ; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8
863 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3
864 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
865 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
866 ; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7
867 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
868 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3
869 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
870 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
871 ; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7
872 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
873 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
874 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
875 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
876 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3]
877 ; GFX9-NEXT: s_endpgm
; Expected output of the gfx1010 run. The four lanes are interleaved step by
; step, and the per-lane compare results are kept in vcc_lo, s0, s1 and s2 so
; each condition can feed both the quotient and the remainder select.
879 ; GFX10-LABEL: udivrem_v4i32:
881 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
882 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
883 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
884 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
885 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12
886 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13
887 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s14
888 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s15
889 ; GFX10-NEXT: s_sub_i32 s0, 0, s12
890 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
891 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
892 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2
893 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
894 ; GFX10-NEXT: s_sub_i32 s1, 0, s13
895 ; GFX10-NEXT: s_sub_i32 s2, 0, s14
896 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
897 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
898 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
899 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
900 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
901 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
902 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
903 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
904 ; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0
905 ; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1
906 ; GFX10-NEXT: v_mul_lo_u32 v6, s2, v2
907 ; GFX10-NEXT: s_sub_i32 s0, 0, s15
908 ; GFX10-NEXT: v_mul_lo_u32 v7, s0, v3
909 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4
910 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5
911 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6
912 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7
913 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4
914 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5
915 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6
916 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7
917 ; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0
918 ; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1
919 ; GFX10-NEXT: v_mul_hi_u32 v2, s10, v2
920 ; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3
921 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s12
922 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s13
923 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s14
924 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v0
925 ; GFX10-NEXT: v_mul_lo_u32 v7, v3, s15
926 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v1
927 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v2
928 ; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v3
929 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, s8, v4
930 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, s9, v5
931 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s10, v6
932 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, s11, v7
933 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4
934 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5
935 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6
936 ; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7
937 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
938 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s12, v4
939 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s0
940 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s13, v5
941 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s1
942 ; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s14, v6
943 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s2
944 ; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s15, v7
945 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
946 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s0
947 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1
948 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v0
949 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v12, s2
950 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v1
951 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v2
952 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s12, v4
953 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v5
954 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v6
955 ; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v3
956 ; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v7
957 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
958 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s12, v4
959 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s0
960 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s13, v5
961 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s1
962 ; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s14, v6
963 ; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s15, v7
964 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s2
965 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
966 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s0
967 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1
968 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s2
969 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
970 ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7]
971 ; GFX10-NEXT: s_endpgm
; IR under test: udiv and urem of the same operand pair, each result stored
; through its own output pointer.
972 %div = udiv <4 x i32> %x, %y
973 store <4 x i32> %div, ptr addrspace(1) %out0
974 %rem = urem <4 x i32> %x, %y
975 store <4 x i32> %rem, ptr addrspace(1) %out1
979 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
980 ; GFX8-LABEL: udivrem_v2i64:
982 ; GFX8-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x20
983 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0
984 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
985 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13
986 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12
987 ; GFX8-NEXT: s_sub_u32 s2, 0, s12
988 ; GFX8-NEXT: s_subb_u32 s3, 0, s13
989 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
990 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
991 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
992 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
993 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
994 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1
995 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
996 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
997 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
998 ; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
999 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
1000 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
1001 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
1002 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
1003 ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
1004 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
1005 ; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
1006 ; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
1007 ; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
1008 ; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
1009 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
1010 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1011 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
1012 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1013 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
1014 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1015 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
1016 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1017 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
1018 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
1019 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1020 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1021 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
1022 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
1023 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
1024 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
1025 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
1026 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
1027 ; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
1028 ; GFX8-NEXT: s_sub_u32 s2, 0, s14
1029 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
1030 ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
1031 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
1032 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
1033 ; GFX8-NEXT: s_subb_u32 s3, 0, s15
1034 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
1035 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1036 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
1037 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1038 ; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
1039 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
1040 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
1041 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
1042 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1043 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
1044 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1045 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
1046 ; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
1047 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1048 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1049 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
1050 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
1051 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
1052 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
1053 ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
1054 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
1055 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
1056 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
1057 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
1058 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1059 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
1060 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1061 ; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
1062 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
1063 ; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
1064 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
1065 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
1066 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
1067 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1068 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
1069 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
1070 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
1071 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0
1072 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1073 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
1074 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
1075 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2]
1076 ; GFX8-NEXT: v_mov_b32_e32 v3, s9
1077 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0
1078 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2]
1079 ; GFX8-NEXT: v_mov_b32_e32 v4, s13
1080 ; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc
1081 ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1
1082 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
1083 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
1084 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8
1085 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
1086 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
1087 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
1088 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15
1089 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc
1090 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
1091 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
1092 ; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8
1093 ; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
1094 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
1095 ; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc
1096 ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v6
1097 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
1098 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
1099 ; GFX8-NEXT: v_trunc_f32_e32 v14, v2
1100 ; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v14
1101 ; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
1102 ; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1
1103 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1]
1104 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
1105 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
1106 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
1107 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
1108 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
1109 ; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14
1110 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
1111 ; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1]
1112 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3]
1113 ; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12
1114 ; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1]
1115 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3]
1116 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc
1117 ; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1
1118 ; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2
1119 ; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
1120 ; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc
1121 ; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1
1122 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
1123 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1124 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
1125 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1126 ; GFX8-NEXT: v_mul_lo_u32 v4, v14, v2
1127 ; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1
1128 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
1129 ; GFX8-NEXT: v_mul_hi_u32 v5, v15, v2
1130 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1
1131 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
1132 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5
1133 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1134 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
1135 ; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2
1136 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
1137 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1138 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
1139 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
1140 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1
1141 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0
1142 ; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc
1143 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
1144 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc
1145 ; GFX8-NEXT: v_mov_b32_e32 v1, v4
1146 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2]
1147 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc
1148 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
1149 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v15, v[4:5]
1150 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
1151 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v12, s[0:1]
1152 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc
1153 ; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3
1154 ; GFX8-NEXT: v_mul_lo_u32 v9, v15, v4
1155 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1]
1156 ; GFX8-NEXT: v_mul_hi_u32 v8, v15, v3
1157 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc
1158 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
1159 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1160 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
1161 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1162 ; GFX8-NEXT: v_mul_lo_u32 v8, v14, v4
1163 ; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3
1164 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
1165 ; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4
1166 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3
1167 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1168 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9
1169 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1170 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
1171 ; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4
1172 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
1173 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1174 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
1175 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
1176 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3
1177 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
1178 ; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3
1179 ; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4
1180 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
1181 ; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3
1182 ; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3
1183 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
1184 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1185 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
1186 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1187 ; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4
1188 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0
1189 ; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4
1190 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3
1191 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1192 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
1193 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1194 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
1195 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0
1196 ; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4
1197 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0
1198 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1199 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
1200 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0
1201 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
1202 ; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1]
1203 ; GFX8-NEXT: v_mov_b32_e32 v4, s11
1204 ; GFX8-NEXT: v_mov_b32_e32 v0, s15
1205 ; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8]
1206 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3
1207 ; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc
1208 ; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7
1209 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11
1210 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
1211 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
1212 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
1213 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11
1214 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
1215 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
1216 ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8
1217 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc
1218 ; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9
1219 ; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1]
1220 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12
1221 ; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1]
1222 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
1223 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc
1224 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
1225 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12
1226 ; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7
1227 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1]
1228 ; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13
1229 ; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
1230 ; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1]
1231 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
1232 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc
1233 ; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc
1234 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
1235 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
1236 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1]
1237 ; GFX8-NEXT: v_mov_b32_e32 v10, s5
1238 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc
1239 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
1240 ; GFX8-NEXT: v_mov_b32_e32 v9, s4
1241 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
1242 ; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1]
1243 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4]
1244 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
1245 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1246 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
1247 ; GFX8-NEXT: s_endpgm
1249 ; GFX9-LABEL: udivrem_v2i64:
1251 ; GFX9-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x20
1252 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
1253 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1254 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s17
1255 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s16
1256 ; GFX9-NEXT: s_sub_u32 s2, 0, s16
1257 ; GFX9-NEXT: s_subb_u32 s3, 0, s17
1258 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
1259 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
1260 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1261 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
1262 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
1263 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1
1264 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
1265 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
1266 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
1267 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
1268 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
1269 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
1270 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
1271 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
1272 ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
1273 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
1274 ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
1275 ; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
1276 ; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
1277 ; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
1278 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
1279 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1280 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
1281 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1282 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
1283 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1284 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
1285 ; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
1286 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1287 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
1288 ; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
1289 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1290 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
1291 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
1292 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
1293 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
1294 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
1295 ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
1296 ; GFX9-NEXT: s_sub_u32 s2, 0, s18
1297 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
1298 ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
1299 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
1300 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
1301 ; GFX9-NEXT: s_subb_u32 s3, 0, s19
1302 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
1303 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1304 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
1305 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1306 ; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
1307 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
1308 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
1309 ; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
1310 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
1311 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1312 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
1313 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1314 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
1315 ; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
1316 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1317 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
1318 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
1319 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
1320 ; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
1321 ; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
1322 ; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
1323 ; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
1324 ; GFX9-NEXT: v_mul_hi_u32 v5, s13, v1
1325 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
1326 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1327 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
1328 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1329 ; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
1330 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
1331 ; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
1332 ; GFX9-NEXT: v_mov_b32_e32 v6, s17
1333 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
1334 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
1335 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
1336 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
1337 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
1338 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0
1339 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1340 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
1341 ; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5
1342 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
1343 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1]
1344 ; GFX9-NEXT: v_mov_b32_e32 v5, s13
1345 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1346 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3]
1347 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v1
1348 ; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
1349 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v1
1350 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
1351 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v2
1352 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1353 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v1
1354 ; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
1355 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1]
1356 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s19
1357 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
1358 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s18
1359 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
1360 ; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s16, v2
1361 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
1362 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
1363 ; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
1364 ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v8
1365 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
1366 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
1367 ; GFX9-NEXT: v_trunc_f32_e32 v15, v4
1368 ; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v15
1369 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
1370 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3
1371 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
1372 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v12
1373 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1374 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v11
1375 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
1376 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
1377 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
1378 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v12
1379 ; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1]
1380 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
1381 ; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13
1382 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1]
1383 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5]
1384 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
1385 ; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3
1386 ; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4
1387 ; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s16, v11
1388 ; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
1389 ; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3
1390 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
1391 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1392 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5
1393 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1394 ; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4
1395 ; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3
1396 ; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
1397 ; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4
1398 ; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
1399 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3
1400 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
1401 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
1402 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
1403 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
1404 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v7
1405 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
1406 ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v3
1407 ; GFX9-NEXT: v_add3_u32 v4, v6, v5, v4
1408 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0
1409 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v4, vcc
1410 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
1411 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v18, vcc
1412 ; GFX9-NEXT: v_mov_b32_e32 v3, v6
1413 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4]
1414 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc
1415 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
1416 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s3, v16, v[6:7]
1417 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v4, s[0:1]
1418 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v13, s[0:1]
1419 ; GFX9-NEXT: v_mul_lo_u32 v8, v15, v5
1420 ; GFX9-NEXT: v_mul_lo_u32 v9, v16, v6
1421 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v20, vcc
1422 ; GFX9-NEXT: v_mul_hi_u32 v11, v16, v5
1423 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v21, vcc
1424 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
1425 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1426 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v11
1427 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1428 ; GFX9-NEXT: v_mul_lo_u32 v11, v15, v6
1429 ; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5
1430 ; GFX9-NEXT: v_add_u32_e32 v8, v9, v8
1431 ; GFX9-NEXT: v_mul_hi_u32 v9, v16, v6
1432 ; GFX9-NEXT: v_mul_hi_u32 v6, v15, v6
1433 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v11, v5
1434 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
1435 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
1436 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1437 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
1438 ; GFX9-NEXT: v_add_u32_e32 v9, v11, v9
1439 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
1440 ; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6
1441 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5
1442 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
1443 ; GFX9-NEXT: v_mul_lo_u32 v8, s15, v5
1444 ; GFX9-NEXT: v_mul_lo_u32 v9, s14, v6
1445 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1]
1446 ; GFX9-NEXT: v_mul_hi_u32 v2, s14, v5
1447 ; GFX9-NEXT: v_mul_hi_u32 v5, s15, v5
1448 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
1449 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1450 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
1451 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1452 ; GFX9-NEXT: v_mul_lo_u32 v8, s15, v6
1453 ; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
1454 ; GFX9-NEXT: v_mul_hi_u32 v9, s14, v6
1455 ; GFX9-NEXT: v_mul_hi_u32 v13, s15, v6
1456 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5
1457 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
1458 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
1459 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
1460 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2
1461 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0
1462 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1463 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1]
1464 ; GFX9-NEXT: v_add_u32_e32 v1, v11, v9
1465 ; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13
1466 ; GFX9-NEXT: v_mov_b32_e32 v1, v6
1467 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2]
1468 ; GFX9-NEXT: v_mov_b32_e32 v10, s15
1469 ; GFX9-NEXT: v_mov_b32_e32 v6, s19
1470 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2]
1471 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v5
1472 ; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc
1473 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v10
1474 ; GFX9-NEXT: v_sub_u32_e32 v1, s15, v1
1475 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1476 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
1477 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1478 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v10
1479 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
1480 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1]
1481 ; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s18, v2
1482 ; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc
1483 ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12
1484 ; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
1485 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v13
1486 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
1487 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v11
1488 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
1489 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
1490 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v13
1491 ; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s18, v11
1492 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
1493 ; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14
1494 ; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
1495 ; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1]
1496 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
1497 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
1498 ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc
1499 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5
1500 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1]
1501 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v14, s[0:1]
1502 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v19, vcc
1503 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
1504 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
1505 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1]
1506 ; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[8:9]
1507 ; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[10:11]
1508 ; GFX9-NEXT: s_endpgm
1510 ; GFX10-LABEL: udivrem_v2i64:
1512 ; GFX10-NEXT: s_clause 0x1
1513 ; GFX10-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x20
1514 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
1515 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1516 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s17
1517 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19
1518 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s16
1519 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s18
1520 ; GFX10-NEXT: s_sub_u32 s0, 0, s16
1521 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
1522 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
1523 ; GFX10-NEXT: s_subb_u32 s1, 0, s17
1524 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
1525 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
1526 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
1527 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
1528 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
1529 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
1530 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
1531 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1
1532 ; GFX10-NEXT: v_trunc_f32_e32 v4, v2
1533 ; GFX10-NEXT: v_trunc_f32_e32 v5, v3
1534 ; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4
1535 ; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5
1536 ; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4
1537 ; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5
1538 ; GFX10-NEXT: v_add_f32_e32 v0, v2, v0
1539 ; GFX10-NEXT: v_add_f32_e32 v1, v3, v1
1540 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0
1541 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1
1542 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v7, 0
1543 ; GFX10-NEXT: s_sub_u32 s2, 0, s18
1544 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s3, s2, v8, 0
1545 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
1546 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
1547 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
1548 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0
1549 ; GFX10-NEXT: s_subb_u32 s3, 0, s19
1550 ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5]
1551 ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0
1552 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6]
1553 ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2
1554 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2
1555 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2
1556 ; GFX10-NEXT: v_mul_lo_u32 v12, v7, v3
1557 ; GFX10-NEXT: v_mul_lo_u32 v13, v9, v3
1558 ; GFX10-NEXT: v_mul_hi_u32 v14, v7, v3
1559 ; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0
1560 ; GFX10-NEXT: v_mul_lo_u32 v16, v10, v0
1561 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0
1562 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3
1563 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0
1564 ; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v12
1565 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
1566 ; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11
1567 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
1568 ; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v15
1569 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s4
1570 ; GFX10-NEXT: v_add_co_u32 v2, s4, v16, v2
1571 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4
1572 ; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4
1573 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
1574 ; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v14
1575 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
1576 ; GFX10-NEXT: v_add_co_u32 v1, s4, v1, v5
1577 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4
1578 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
1579 ; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v17
1580 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
1581 ; GFX10-NEXT: v_add_co_u32 v4, s4, v6, v4
1582 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1
1583 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11
1584 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
1585 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5
1586 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4
1587 ; GFX10-NEXT: v_add_co_u32 v1, s4, v2, v1
1588 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3
1589 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
1590 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo
1591 ; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0
1592 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v1
1593 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, s0, v7, 0
1594 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo
1595 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, s2, v8, 0
1596 ; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
1597 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2]
1598 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4]
1599 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0
1600 ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s1, v7, v[4:5]
1601 ; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0
1602 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s3, v8, v[5:6]
1603 ; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2
1604 ; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2
1605 ; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2
1606 ; GFX10-NEXT: v_mul_lo_u32 v12, v7, v3
1607 ; GFX10-NEXT: v_mul_lo_u32 v13, v9, v3
1608 ; GFX10-NEXT: v_mul_hi_u32 v14, v7, v3
1609 ; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0
1610 ; GFX10-NEXT: v_mul_lo_u32 v16, v10, v0
1611 ; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0
1612 ; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3
1613 ; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0
1614 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v12
1615 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0
1616 ; GFX10-NEXT: v_add_co_u32 v11, s0, v13, v11
1617 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0
1618 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v15
1619 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0
1620 ; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2
1621 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0
1622 ; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4
1623 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
1624 ; GFX10-NEXT: v_add_co_u32 v6, s0, v11, v14
1625 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0
1626 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5
1627 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
1628 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4
1629 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v17
1630 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
1631 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1
1632 ; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4
1633 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11
1634 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
1635 ; GFX10-NEXT: v_add_co_u32 v1, s0, v2, v1
1636 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5
1637 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
1638 ; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3
1639 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, v4
1640 ; GFX10-NEXT: v_add3_u32 v0, v5, v2, v0
1641 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo
1642 ; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v8, v1
1643 ; GFX10-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo
1644 ; GFX10-NEXT: v_mul_lo_u32 v3, s13, v4
1645 ; GFX10-NEXT: v_mul_lo_u32 v8, s12, v2
1646 ; GFX10-NEXT: v_mul_hi_u32 v5, s12, v4
1647 ; GFX10-NEXT: v_mul_hi_u32 v4, s13, v4
1648 ; GFX10-NEXT: v_mul_lo_u32 v9, s13, v2
1649 ; GFX10-NEXT: v_mul_lo_u32 v6, s15, v1
1650 ; GFX10-NEXT: v_mul_hi_u32 v10, s12, v2
1651 ; GFX10-NEXT: v_mul_hi_u32 v11, s13, v2
1652 ; GFX10-NEXT: v_mul_lo_u32 v2, s14, v0
1653 ; GFX10-NEXT: v_mul_hi_u32 v7, s14, v1
1654 ; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1
1655 ; GFX10-NEXT: v_mul_lo_u32 v12, s15, v0
1656 ; GFX10-NEXT: v_mul_hi_u32 v13, s14, v0
1657 ; GFX10-NEXT: v_mul_hi_u32 v14, s15, v0
1658 ; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8
1659 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
1660 ; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4
1661 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
1662 ; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2
1663 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
1664 ; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1
1665 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
1666 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5
1667 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
1668 ; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10
1669 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
1670 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7
1671 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
1672 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0
1673 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13
1674 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
1675 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
1676 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5
1677 ; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0
1678 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
1679 ; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2
1680 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s16, v8, 0
1681 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
1682 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s18, v10, 0
1683 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7
1684 ; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11
1685 ; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1
1686 ; GFX10-NEXT: v_mov_b32_e32 v11, 0
1687 ; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14
1688 ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2]
1689 ; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
1690 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4]
1691 ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5]
1692 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1
1693 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
1694 ; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s12, v0
1695 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6]
1696 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo
1697 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s16, v14
1698 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s13, v3
1699 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0
1700 ; GFX10-NEXT: v_sub_co_u32 v15, s0, s14, v2
1701 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
1702 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0
1703 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v15
1704 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s15, v0
1705 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
1706 ; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s16
1707 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
1708 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v5
1709 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0
1710 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v18
1711 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
1712 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1
1713 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v17
1714 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1
1715 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v18
1716 ; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1
1717 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s19, v16
1718 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0
1719 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v5
1720 ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1
1721 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
1722 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0
1723 ; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s16
1724 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
1725 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo
1726 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
1727 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
1728 ; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s18
1729 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo
1730 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
1731 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0
1732 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0
1733 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo
1734 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v16
1735 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0
1736 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0
1737 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo
1738 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v12
1739 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
1740 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v6
1741 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo
1742 ; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1
1743 ; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
1744 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v12
1745 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
1746 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1
1747 ; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
1748 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1
1749 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
1750 ; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s18
1751 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
1752 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo
1753 ; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo
1754 ; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2
1755 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
1756 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo
1757 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1
1758 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1
1759 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1
1760 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1
1761 ; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[8:9]
1762 ; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[10:11]
1763 ; GFX10-NEXT: s_endpgm
1764 %div = udiv <2 x i64> %x, %y
1765 store <2 x i64> %div, ptr addrspace(1) %out0
1766 %rem = urem <2 x i64> %x, %y
1767 store <2 x i64> %rem, ptr addrspace(1) %out1
; Kernel under test: scalar i8 unsigned division. Despite the "udiv" name it
; checks both results: the quotient (udiv) of %x by %y is stored to %out0 and
; the remainder (urem) of the same operands is stored to %out1.
; The per-target assertion blocks below are autogenerated (see the NOTE on the
; first line of this file); regenerate them with the update script rather than
; editing them by hand.
1771 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i8 %x, i8 %y) {
1772 ; GFX8-LABEL: udiv_i8:
1774 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10
1775 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1776 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008
1777 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
1778 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
1779 ; GFX8-NEXT: s_sub_i32 s0, 0, s5
1780 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff
1781 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1782 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
1783 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
1784 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1785 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
1786 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
1787 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
1788 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1789 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1790 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1791 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5
1792 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
1793 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
1794 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
1795 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1796 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
1797 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1798 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
1799 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
1800 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1801 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
1802 ; GFX8-NEXT: flat_store_byte v[0:1], v2
1803 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1804 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1805 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1806 ; GFX8-NEXT: flat_store_byte v[0:1], v3
1807 ; GFX8-NEXT: s_endpgm
1809 ; GFX9-LABEL: udiv_i8:
1811 ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10
1812 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1813 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1814 ; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008
1815 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
1816 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1817 ; GFX9-NEXT: s_sub_i32 s1, 0, s4
1818 ; GFX9-NEXT: s_and_b32 s5, s0, 0xff
1819 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1820 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1821 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0
1822 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1823 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
1824 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
1825 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0
1826 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4
1827 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
1828 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1
1829 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
1830 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
1831 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
1832 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
1833 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
1834 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
1835 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
1836 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
1837 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
1838 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1839 ; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
1840 ; GFX9-NEXT: global_store_byte v2, v1, s[2:3]
1841 ; GFX9-NEXT: s_endpgm
1843 ; GFX10-LABEL: udiv_i8:
1845 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10
1846 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1847 ; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008
1848 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff
1849 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
1850 ; GFX10-NEXT: s_sub_i32 s1, 0, s4
1851 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
1852 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1853 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
1854 ; GFX10-NEXT: v_mul_lo_u32 v1, s1, v0
1855 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
1856 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
1857 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
1858 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4
1859 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
1860 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1
1861 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1862 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
1863 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
1864 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
1865 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
1866 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
1867 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
1868 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
1869 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
1870 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1871 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
1872 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1873 ; GFX10-NEXT: global_store_byte v2, v0, s[0:1]
1874 ; GFX10-NEXT: global_store_byte v2, v1, s[2:3]
1875 ; GFX10-NEXT: s_endpgm
; Input IR: quotient and remainder are taken from the same (%x, %y) pair and
; each is written through its own output pointer.
1876 %div = udiv i8 %x, %y
1877 store i8 %div, ptr addrspace(1) %out0
1878 %rem = urem i8 %x, %y
1879 store i8 %rem, ptr addrspace(1) %out1
; Kernel under test: <2 x i8> unsigned division and remainder. The vector
; quotient (udiv) of %x by %y is stored to %out0 and the vector remainder
; (urem) of the same operands is stored to %out1.
; The per-target assertion blocks below are autogenerated (see the NOTE on the
; first line of this file); regenerate them with the update script rather than
; editing them by hand.
1883 define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i8> %x, <2 x i8> %y) {
1884 ; GFX8-LABEL: udivrem_v2i8:
1886 ; GFX8-NEXT: s_load_dword s0, s[6:7], 0x10
1887 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
1888 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1889 ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010
1890 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
1891 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
1892 ; GFX8-NEXT: v_cvt_f32_ubyte3_e32 v1, s0
1893 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
1894 ; GFX8-NEXT: s_sub_i32 s1, 0, s2
1895 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1896 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
1897 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1898 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
1899 ; GFX8-NEXT: s_lshr_b32 s3, s0, 24
1900 ; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0
1901 ; GFX8-NEXT: s_sub_i32 s1, 0, s3
1902 ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1
1903 ; GFX8-NEXT: s_and_b32 s1, s0, 0xff
1904 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
1905 ; GFX8-NEXT: s_bfe_u32 s8, s0, 0x80008
1906 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
1907 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1908 ; GFX8-NEXT: v_mul_hi_u32 v0, s1, v0
1909 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
1910 ; GFX8-NEXT: v_mul_hi_u32 v1, s8, v1
1911 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2
1912 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
1913 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v2
1914 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
1915 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
1916 ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2
1917 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1918 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
1919 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
1920 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
1921 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3
1922 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2
1923 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1924 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v3
1925 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
1926 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
1927 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1928 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
1929 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1930 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
1931 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
1932 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1933 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
1934 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
1935 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1936 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1937 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1938 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1939 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
1940 ; GFX8-NEXT: flat_store_short v[0:1], v4
1941 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3
1942 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1943 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1944 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
1945 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
1946 ; GFX8-NEXT: flat_store_short v[0:1], v2
1947 ; GFX8-NEXT: s_endpgm
1949 ; GFX9-LABEL: udivrem_v2i8:
1951 ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10
1952 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1953 ; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80010
1954 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
1955 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
1956 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, s0
1957 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1958 ; GFX9-NEXT: s_sub_i32 s1, 0, s4
1959 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1960 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
1961 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1962 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1963 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24
1964 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1
1965 ; GFX9-NEXT: s_sub_i32 s2, 0, s5
1966 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0
1967 ; GFX9-NEXT: s_and_b32 s8, s0, 0xff
1968 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
1969 ; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008
1970 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
1971 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1972 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
1973 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1
1974 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
1975 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
1976 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s4
1977 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
1978 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s5
1979 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3
1980 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
1981 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1982 ; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3
1983 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1984 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
1985 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
1986 ; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2
1987 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1988 ; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3
1989 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1990 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
1991 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2
1992 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1993 ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2
1994 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1995 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
1996 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2
1997 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1998 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
1999 ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v2
2000 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
2001 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2002 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2003 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2004 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2005 ; GFX9-NEXT: global_store_short v1, v0, s[0:1]
2006 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v2
2007 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
2008 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2009 ; GFX9-NEXT: global_store_short v1, v0, s[2:3]
2010 ; GFX9-NEXT: s_endpgm
2012 ; GFX10-LABEL: udivrem_v2i8:
2014 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10
2015 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2016 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0
2017 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010
2018 ; GFX10-NEXT: s_lshr_b32 s2, s0, 24
2019 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1
2020 ; GFX10-NEXT: s_sub_i32 s3, 0, s2
2021 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
2022 ; GFX10-NEXT: s_sub_i32 s4, 0, s1
2023 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
2024 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2025 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
2026 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
2027 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
2028 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
2029 ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008
2030 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff
2031 ; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1
2032 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
2033 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
2034 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
2035 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
2036 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
2037 ; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0
2038 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1
2039 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
2040 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
2041 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
2042 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
2043 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2
2044 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
2045 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
2046 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
2047 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
2048 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
2049 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
2050 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
2051 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
2052 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0
2053 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
2054 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
2055 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
2056 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
2057 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
2058 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
2059 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
2060 ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
2061 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
2062 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
2063 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0
2064 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2065 ; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2066 ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2067 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2068 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2069 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2070 ; GFX10-NEXT: global_store_short v1, v0, s[4:5]
2071 ; GFX10-NEXT: global_store_short v1, v2, s[6:7]
2072 ; GFX10-NEXT: s_endpgm
; Input IR: vector quotient and remainder are taken from the same (%x, %y)
; pair and each is written through its own output pointer.
2073 %div = udiv <2 x i8> %x, %y
2074 store <2 x i8> %div, ptr addrspace(1) %out0
2075 %rem = urem <2 x i8> %x, %y
2076 store <2 x i8> %rem, ptr addrspace(1) %out1
; Scalar i16 unsigned division and remainder of the kernel arguments %x and %y.
; Despite the "udiv" name, the IR body computes both udiv and urem and stores
; the quotient through %out0 and the remainder through %out1.
;
; Per the RUN lines at the top of the file, the function is compiled with
; -global-isel for tonga (GFX8), gfx900 (GFX9) and gfx1010 (GFX10). In all
; three expected sequences:
;   - a single s_load_dword at offset 0x10 fetches both i16 arguments; the low
;     half (s_and_b32 ..., 0xffff) is used as the dividend and the high half
;     (s_lshr_b32 ..., 16) as the divisor,
;   - the quotient estimate is built from v_rcp_iflag_f32 scaled by the
;     constant 0x4f7ffffe, followed by two v_cmp_le_u32 / v_cndmask_b32
;     correction rounds that adjust quotient and remainder together,
;   - the results are written with 16-bit stores (flat_store_short on GFX8,
;     global_store_short on GFX9 and GFX10).
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
2080 define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i16 %x, i16 %y) {
2081 ; GFX8-LABEL: udiv_i16:
2083 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10
2084 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2085 ; GFX8-NEXT: s_lshr_b32 s5, s4, 16
2086 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
2087 ; GFX8-NEXT: s_sub_i32 s0, 0, s5
2088 ; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
2089 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
2090 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2091 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
2092 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
2093 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2094 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
2095 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
2096 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
2097 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2098 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2099 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2100 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5
2101 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
2102 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
2103 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
2104 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2105 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
2106 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2107 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
2108 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
2109 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2110 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
2111 ; GFX8-NEXT: flat_store_short v[0:1], v2
2112 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
2113 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2114 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2115 ; GFX8-NEXT: flat_store_short v[0:1], v3
2116 ; GFX8-NEXT: s_endpgm
2118 ; GFX9-LABEL: udiv_i16:
2120 ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10
2121 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2122 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2123 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
2124 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
2125 ; GFX9-NEXT: s_sub_i32 s1, 0, s4
2126 ; GFX9-NEXT: s_and_b32 s5, s0, 0xffff
2127 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2128 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2129 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2130 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0
2131 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2132 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
2133 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
2134 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0
2135 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4
2136 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
2137 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1
2138 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
2139 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2140 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
2141 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2142 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
2143 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
2144 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2145 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
2146 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2147 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2148 ; GFX9-NEXT: global_store_short v2, v0, s[0:1]
2149 ; GFX9-NEXT: global_store_short v2, v1, s[2:3]
2150 ; GFX9-NEXT: s_endpgm
2152 ; GFX10-LABEL: udiv_i16:
2154 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10
2155 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2156 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16
2157 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
2158 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
2159 ; GFX10-NEXT: s_sub_i32 s1, 0, s4
2160 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
2161 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2162 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
2163 ; GFX10-NEXT: v_mul_lo_u32 v1, s1, v0
2164 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
2165 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
2166 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
2167 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4
2168 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
2169 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1
2170 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2171 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
2172 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
2173 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2174 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2175 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
2176 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
2177 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
2178 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2179 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2180 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2181 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2182 ; GFX10-NEXT: global_store_short v2, v0, s[0:1]
2183 ; GFX10-NEXT: global_store_short v2, v1, s[2:3]
2184 ; GFX10-NEXT: s_endpgm
; IR under test: quotient and remainder of the same operands, each kept live
; by its own store.
2185 %div = udiv i16 %x, %y
2186 store i16 %div, ptr addrspace(1) %out0
2187 %rem = urem i16 %x, %y
2188 store i16 %rem, ptr addrspace(1) %out1
; <2 x i16> unsigned division and remainder of the kernel arguments %x and %y.
; The quotient vector is stored through %out0 and the remainder vector
; through %out1.
;
; Per the RUN lines at the top of the file, the function is compiled with
; -global-isel for tonga (GFX8), gfx900 (GFX9) and gfx1010 (GFX10). In all
; three expected sequences:
;   - s_load_dwordx2 at offset 0x10 fetches both packed vectors; each one is
;     split into its low element (s_and_b32 ..., 0xffff) and high element
;     (s_lshr_b32 ..., 16),
;   - every element pair gets its own reciprocal-based estimate
;     (v_rcp_iflag_f32 scaled by 0x4f7ffffe) and two v_cmp_le_u32 /
;     v_cndmask_b32 correction rounds,
;   - the 16-bit results are repacked into one dword per vector before the
;     store: v_lshlrev_b32 + v_or_b32_sdwa on GFX8, v_lshl_or_b32 on GFX9 and
;     GFX10.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
2192 define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %x, <2 x i16> %y) {
2193 ; GFX8-LABEL: udivrem_v2i16:
2195 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10
2196 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
2197 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2198 ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff
2199 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
2200 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16
2201 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3
2202 ; GFX8-NEXT: s_sub_i32 s1, 0, s2
2203 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
2204 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16
2205 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
2206 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
2207 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2208 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
2209 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
2210 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
2211 ; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0
2212 ; GFX8-NEXT: s_sub_i32 s1, 0, s3
2213 ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1
2214 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
2215 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
2216 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
2217 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
2218 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
2219 ; GFX8-NEXT: v_mul_hi_u32 v1, s8, v1
2220 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2
2221 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
2222 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2
2223 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
2224 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2225 ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2
2226 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2227 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
2228 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
2229 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2230 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3
2231 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2
2232 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2233 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v3
2234 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
2235 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
2236 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2237 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
2238 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2239 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
2240 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
2241 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2242 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
2243 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
2244 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2245 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2246 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2247 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3
2248 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2249 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2250 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
2251 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
2252 ; GFX8-NEXT: flat_store_dword v[0:1], v4
2253 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
2254 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
2255 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2256 ; GFX8-NEXT: s_endpgm
2258 ; GFX9-LABEL: udivrem_v2i16:
2260 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10
2261 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2262 ; GFX9-NEXT: s_and_b32 s3, s1, 0xffff
2263 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
2264 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16
2265 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
2266 ; GFX9-NEXT: s_sub_i32 s1, 0, s3
2267 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2268 ; GFX9-NEXT: s_sub_i32 s4, 0, s2
2269 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
2270 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2271 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2272 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
2273 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
2274 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0
2275 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
2276 ; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1
2277 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
2278 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
2279 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
2280 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
2281 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
2282 ; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0
2283 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
2284 ; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1
2285 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s3
2286 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
2287 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2
2288 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
2289 ; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2
2290 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
2291 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
2292 ; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2
2293 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2294 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
2295 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
2296 ; GFX9-NEXT: v_sub_u32_e32 v3, s1, v3
2297 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
2298 ; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2
2299 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3
2300 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2301 ; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
2302 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
2303 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2304 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
2305 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
2306 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2307 ; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
2308 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2309 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2310 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2311 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
2312 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2313 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
2314 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2315 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
2316 ; GFX9-NEXT: global_store_dword v2, v1, s[6:7]
2317 ; GFX9-NEXT: s_endpgm
2319 ; GFX10-LABEL: udivrem_v2i16:
2321 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10
2322 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2323 ; GFX10-NEXT: s_and_b32 s2, s1, 0xffff
2324 ; GFX10-NEXT: s_lshr_b32 s1, s1, 16
2325 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
2326 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1
2327 ; GFX10-NEXT: s_sub_i32 s3, 0, s2
2328 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
2329 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
2330 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
2331 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2332 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
2333 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
2334 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
2335 ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
2336 ; GFX10-NEXT: s_sub_i32 s3, 0, s1
2337 ; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1
2338 ; GFX10-NEXT: s_and_b32 s3, s0, 0xffff
2339 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16
2340 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
2341 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
2342 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
2343 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
2344 ; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0
2345 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1
2346 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
2347 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
2348 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
2349 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1
2350 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2
2351 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
2352 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2
2353 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
2354 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
2355 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
2356 ; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3
2357 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
2358 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
2359 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0
2360 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
2361 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
2362 ; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2
2363 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
2364 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
2365 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
2366 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v3
2367 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
2368 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
2369 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
2370 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0
2371 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
2372 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2373 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2374 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
2375 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2376 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2377 ; GFX10-NEXT: global_store_dword v1, v2, s[6:7]
2378 ; GFX10-NEXT: s_endpgm
; IR under test: element-wise quotient and remainder of the same operands,
; each kept live by its own store.
2379 %div = udiv <2 x i16> %x, %y
2380 store <2 x i16> %div, ptr addrspace(1) %out0
2381 %rem = urem <2 x i16> %x, %y
2382 store <2 x i16> %rem, ptr addrspace(1) %out1
; i3 unsigned division and remainder of the kernel arguments %x and %y,
; covering an integer width that is narrower than a byte. The quotient is
; stored through %out0 and the remainder through %out1.
;
; Per the RUN lines at the top of the file, the function is compiled with
; -global-isel for tonga (GFX8), gfx900 (GFX9) and gfx1010 (GFX10). In all
; three expected sequences:
;   - a single s_load_dword at offset 0x10 fetches both arguments; the dividend
;     is isolated with s_and_b32 ..., 7 and the divisor with
;     s_bfe_u32 ..., 0x30008,
;   - the divisor is converted with v_cvt_f32_ubyte0 before the
;     v_rcp_iflag_f32 / 0x4f7ffffe estimate and the two v_cmp_le_u32 /
;     v_cndmask_b32 correction rounds,
;   - both results are masked with v_and_b32 ..., 7 and written with byte
;     stores (flat_store_byte on GFX8, global_store_byte on GFX9 and GFX10).
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
2386 define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i3 %x, i3 %y) {
2387 ; GFX8-LABEL: udivrem_i3:
2389 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x10
2390 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2391 ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008
2392 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
2393 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
2394 ; GFX8-NEXT: s_sub_i32 s0, 0, s5
2395 ; GFX8-NEXT: s_and_b32 s4, s4, 7
2396 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2397 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
2398 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
2399 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2400 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
2401 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
2402 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
2403 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2404 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2405 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2406 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5
2407 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
2408 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
2409 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
2410 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2411 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
2412 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2413 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
2414 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
2415 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2416 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
2417 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
2418 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2419 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2420 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
2421 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3
2422 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2423 ; GFX8-NEXT: flat_store_byte v[0:1], v2
2424 ; GFX8-NEXT: s_endpgm
2426 ; GFX9-LABEL: udivrem_i3:
2428 ; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10
2429 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2430 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2431 ; GFX9-NEXT: s_bfe_u32 s4, s0, 0x30008
2432 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
2433 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2434 ; GFX9-NEXT: s_sub_i32 s1, 0, s4
2435 ; GFX9-NEXT: s_and_b32 s5, s0, 7
2436 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2437 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2438 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0
2439 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2440 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
2441 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
2442 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0
2443 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4
2444 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
2445 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1
2446 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
2447 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2448 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
2449 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2450 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
2451 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
2452 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2453 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
2454 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2455 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
2456 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2457 ; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
2458 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v1
2459 ; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
2460 ; GFX9-NEXT: s_endpgm
2462 ; GFX10-LABEL: udivrem_i3:
2464 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10
2465 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2466 ; GFX10-NEXT: s_bfe_u32 s4, s0, 0x30008
2467 ; GFX10-NEXT: s_and_b32 s0, s0, 7
2468 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
2469 ; GFX10-NEXT: s_sub_i32 s1, 0, s4
2470 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
2471 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2472 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
2473 ; GFX10-NEXT: v_mul_lo_u32 v1, s1, v0
2474 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
2475 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
2476 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
2477 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4
2478 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
2479 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1
2480 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2481 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
2482 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
2483 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2484 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2485 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
2486 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
2487 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
2488 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2489 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2490 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2491 ; GFX10-NEXT: v_and_b32_e32 v0, 7, v0
2492 ; GFX10-NEXT: v_and_b32_e32 v1, 7, v1
2493 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2494 ; GFX10-NEXT: global_store_byte v2, v0, s[0:1]
2495 ; GFX10-NEXT: global_store_byte v2, v1, s[2:3]
2496 ; GFX10-NEXT: s_endpgm
; IR under test: quotient and remainder of the same operands, each kept live
; by its own store.
2497 %div = udiv i3 %x, %y
2498 store i3 %div, ptr addrspace(1) %out0
2499 %rem = urem i3 %x, %y
2500 store i3 %rem, ptr addrspace(1) %out1
; i27 unsigned division and remainder of the kernel arguments %x and %y,
; covering an integer width between 16 and 32 bits. The quotient is stored
; through %out0 and the remainder through %out1.
;
; Per the RUN lines at the top of the file, the function is compiled with
; -global-isel for tonga (GFX8), gfx900 (GFX9) and gfx1010 (GFX10). In all
; three expected sequences:
;   - s_load_dwordx2 at offset 0x10 fetches the two arguments, and each is
;     masked with s_and_b32 ..., 0x7ffffff before use,
;   - the quotient estimate is built from v_rcp_iflag_f32 scaled by the
;     constant 0x4f7ffffe, followed by two v_cmp_le_u32 / v_cndmask_b32
;     correction rounds that adjust quotient and remainder together,
;   - both results are masked with v_and_b32 ..., 0x7ffffff and written with
;     dword stores (flat_store_dword on GFX8, global_store_dword on GFX9 and
;     GFX10).
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
2504 define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i27 %x, i27 %y) {
2505 ; GFX8-LABEL: udivrem_i27:
2507 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
2508 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2509 ; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff
2510 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
2511 ; GFX8-NEXT: s_sub_i32 s0, 0, s5
2512 ; GFX8-NEXT: s_and_b32 s4, s4, 0x7ffffff
2513 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
2514 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2515 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
2516 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
2517 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2518 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
2519 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
2520 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
2521 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2522 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2523 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2524 ; GFX8-NEXT: v_mul_lo_u32 v3, v2, s5
2525 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
2526 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
2527 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
2528 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2529 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
2530 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2531 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2
2532 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
2533 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
2534 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3
2535 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2
2536 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2537 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2538 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
2539 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3
2540 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
2541 ; GFX8-NEXT: flat_store_dword v[0:1], v2
2542 ; GFX8-NEXT: s_endpgm
2544 ; GFX9-LABEL: udivrem_i27:
2546 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10
2547 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2548 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2549 ; GFX9-NEXT: s_and_b32 s4, s1, 0x7ffffff
2550 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
2551 ; GFX9-NEXT: s_sub_i32 s1, 0, s4
2552 ; GFX9-NEXT: s_and_b32 s5, s0, 0x7ffffff
2553 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2554 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2555 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2556 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0
2557 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2558 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
2559 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
2560 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0
2561 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4
2562 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
2563 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1
2564 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
2565 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2566 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
2567 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2568 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
2569 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
2570 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
2571 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v1
2572 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2573 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0
2574 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2575 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
2576 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v1
2577 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
2578 ; GFX9-NEXT: s_endpgm
2580 ; GFX10-LABEL: udivrem_i27:
2582 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x10
2583 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2584 ; GFX10-NEXT: s_and_b32 s4, s1, 0x7ffffff
2585 ; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff
2586 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
2587 ; GFX10-NEXT: s_sub_i32 s1, 0, s4
2588 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
2589 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2590 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
2591 ; GFX10-NEXT: v_mul_lo_u32 v1, s1, v0
2592 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
2593 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
2594 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
2595 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s4
2596 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
2597 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1
2598 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2599 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
2600 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
2601 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2602 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2603 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
2604 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1
2605 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v1
2606 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2607 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2608 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2609 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0
2610 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1
2611 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2612 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
2613 ; GFX10-NEXT: global_store_dword v2, v1, s[2:3]
2614 ; GFX10-NEXT: s_endpgm
; IR under test: quotient and remainder of the same operands, each kept live
; by its own store.
2615 %div = udiv i27 %x, %y
2616 store i27 %div, ptr addrspace(1) %out0
2617 %rem = urem i27 %x, %y
2618 store i27 %rem, ptr addrspace(1) %out1