1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn | FileCheck %s -check-prefixes=FUNC,SI,GCN
3 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,TONGA
4 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,GFX9
5 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s -check-prefixes=FUNC,EG
7 ; The code generated by sdiv is long and complex and may frequently change.
8 ; The goal of this test is to make sure the ISel doesn't fail.
10 ; This program was previously failing to compile when one of the selectcc
11 ; opcodes generated by the sdiv lowering was being legalized and optimized to:
12 ; selectcc Remainder -1, 0, -1, SETGT
13 ; This was fixed by adding an additional pattern in R600Instructions.td to
14 ; match this pattern with a CNDGE_INT.
; sdiv_i32: signed 32-bit division where both operands are runtime values.
; The kernel loads two adjacent i32 values through %in (element 0 is the
; numerator, element 1 the denominator), divides them, and stores the quotient
; through %out.
; The autogenerated check blocks below pin the full expansion on each target:
; take the absolute value of both operands with a sign mask (ashr 31, add, xor),
; form a reciprocal estimate of the divisor, refine it with multiply-high steps,
; correct the quotient by +1 / -1 using unsigned compares, and re-apply the
; sign (xor, sub) before the store.
16 define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
17 ; GCN-LABEL: sdiv_i32:
19 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
20 ; GCN-NEXT: s_mov_b32 s7, 0xf000
21 ; GCN-NEXT: s_mov_b32 s6, -1
22 ; GCN-NEXT: s_mov_b32 s10, s6
23 ; GCN-NEXT: s_mov_b32 s11, s7
24 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
25 ; GCN-NEXT: s_mov_b32 s8, s2
26 ; GCN-NEXT: s_mov_b32 s9, s3
27 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
28 ; GCN-NEXT: s_mov_b32 s4, s0
29 ; GCN-NEXT: s_mov_b32 s5, s1
30 ; GCN-NEXT: s_waitcnt vmcnt(0)
31 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
32 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1
33 ; GCN-NEXT: v_xor_b32_e32 v4, v2, v3
34 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
35 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
36 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
37 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v3
38 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
39 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
40 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
41 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
42 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v1
43 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v1
44 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
45 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
46 ; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
47 ; GCN-NEXT: v_mul_hi_u32 v3, v3, v2
48 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v2
49 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2
50 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
51 ; GCN-NEXT: v_mul_hi_u32 v2, v2, v0
52 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v1
53 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2
54 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v2
55 ; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v0
56 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
57 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1
58 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc
59 ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v5, s[0:1]
60 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
61 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
62 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
63 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
66 ; TONGA-LABEL: sdiv_i32:
68 ; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
69 ; TONGA-NEXT: s_mov_b32 s7, 0xf000
70 ; TONGA-NEXT: s_mov_b32 s6, -1
71 ; TONGA-NEXT: s_mov_b32 s2, s6
72 ; TONGA-NEXT: s_mov_b32 s3, s7
73 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
74 ; TONGA-NEXT: s_mov_b32 s0, s10
75 ; TONGA-NEXT: s_mov_b32 s1, s11
76 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
77 ; TONGA-NEXT: s_mov_b32 s4, s8
78 ; TONGA-NEXT: s_mov_b32 s5, s9
79 ; TONGA-NEXT: s_waitcnt vmcnt(0)
80 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1
81 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1
82 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2
83 ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1
84 ; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v0
85 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v0
86 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v6
87 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
88 ; TONGA-NEXT: v_xor_b32_e32 v2, v6, v2
89 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
90 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
91 ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1
92 ; TONGA-NEXT: v_mul_hi_u32 v5, v3, v1
93 ; TONGA-NEXT: v_sub_u32_e32 v7, vcc, 0, v4
94 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
95 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
96 ; TONGA-NEXT: v_mul_hi_u32 v4, v4, v3
97 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v4, v3
98 ; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v4, v3
99 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
100 ; TONGA-NEXT: v_mul_hi_u32 v3, v3, v0
101 ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1
102 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
103 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, -1, v3
104 ; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v4, v0
105 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
106 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1
107 ; TONGA-NEXT: s_and_b64 s[0:1], s[0:1], vcc
108 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
109 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
110 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2
111 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
112 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
113 ; TONGA-NEXT: s_endpgm
115 ; GFX9-LABEL: sdiv_i32:
117 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
118 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
119 ; GFX9-NEXT: s_mov_b32 s6, -1
120 ; GFX9-NEXT: s_mov_b32 s10, s6
121 ; GFX9-NEXT: s_mov_b32 s11, s7
122 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX9-NEXT: s_mov_b32 s8, s2
124 ; GFX9-NEXT: s_mov_b32 s9, s3
125 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
126 ; GFX9-NEXT: s_mov_b32 s4, s0
127 ; GFX9-NEXT: s_mov_b32 s5, s1
128 ; GFX9-NEXT: s_waitcnt vmcnt(0)
129 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
130 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
131 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
132 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1
133 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
134 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
135 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
136 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
137 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
138 ; GFX9-NEXT: v_sub_u32_e32 v6, 0, v4
139 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
140 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
141 ; GFX9-NEXT: v_mul_hi_u32 v4, v4, v3
142 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0
143 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v5
144 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
145 ; GFX9-NEXT: v_add_u32_e32 v6, v3, v4
146 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
147 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
148 ; GFX9-NEXT: v_mul_hi_u32 v3, v3, v0
149 ; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2
150 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
151 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
152 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
153 ; GFX9-NEXT: v_sub_u32_e32 v7, v0, v4
154 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
155 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1
156 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
157 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
158 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
159 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2
160 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
161 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
162 ; GFX9-NEXT: s_endpgm
164 ; EG-LABEL: sdiv_i32:
166 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
168 ; EG-NEXT: ALU 30, @9, KC0[CB0:0-32], KC1[]
169 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
172 ; EG-NEXT: Fetch clause starting at 6:
173 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
174 ; EG-NEXT: ALU clause starting at 8:
175 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
176 ; EG-NEXT: ALU clause starting at 9:
177 ; EG-NEXT: SETGT_INT * T0.W, 0.0, T0.Y,
178 ; EG-NEXT: ADD_INT * T1.W, T0.Y, PV.W,
179 ; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W,
180 ; EG-NEXT: RECIP_UINT * T0.Y, PV.W,
181 ; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W,
182 ; EG-NEXT: SUB_INT T2.W, 0.0, PS,
183 ; EG-NEXT: MULHI * T1.X, T0.Y, T1.W,
184 ; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Z,
185 ; EG-NEXT: SETGT_INT * T3.W, 0.0, T0.X,
186 ; EG-NEXT: MULHI * T0.Z, PV.W, T0.Y,
187 ; EG-NEXT: ADD_INT T1.Z, T0.X, T3.W,
188 ; EG-NEXT: ADD_INT T2.W, T0.Y, PS,
189 ; EG-NEXT: SUB_INT * T4.W, T0.Y, PS,
190 ; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS,
191 ; EG-NEXT: XOR_INT * T4.W, PV.Z, T3.W,
192 ; EG-NEXT: MULHI * T0.X, PV.W, PS,
193 ; EG-NEXT: MULLO_INT * T0.Y, PS, T1.W,
194 ; EG-NEXT: SUB_INT * T2.W, T4.W, PS,
195 ; EG-NEXT: SETGE_UINT T1.W, PV.W, T1.W,
196 ; EG-NEXT: SETGE_UINT * T2.W, T4.W, T0.Y,
197 ; EG-NEXT: AND_INT T1.W, PV.W, PS,
198 ; EG-NEXT: ADD_INT * T4.W, T0.X, 1,
199 ; EG-NEXT: CNDE_INT T1.W, PV.W, T0.X, PS,
200 ; EG-NEXT: ADD_INT * T4.W, T0.X, literal.x,
201 ; EG-NEXT: -1(nan), 0(0.000000e+00)
202 ; EG-NEXT: CNDE_INT T1.W, T2.W, PS, PV.W,
203 ; EG-NEXT: XOR_INT * T0.W, T3.W, T0.W,
204 ; EG-NEXT: XOR_INT * T1.W, PV.W, PS,
205 ; EG-NEXT: SUB_INT T0.X, PV.W, T0.W,
206 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
207 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Address of the i32 element that follows %in[0]; it holds the denominator.
208 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
; Numerator is read from %in[0].
209 %num = load i32, i32 addrspace(1) * %in
; Denominator is read from %in[1].
210 %den = load i32, i32 addrspace(1) * %den_ptr
; Signed division with a divisor that is unknown at compile time, so no
; constant-divisor shortcut applies.
211 %result = sdiv i32 %num, %den
; Write the quotient through %out.
212 store i32 %result, i32 addrspace(1)* %out
; sdiv_i32_4: signed division of a loaded i32 by the constant 4.
; The check blocks below expect a short shift-based sequence on every target
; instead of the general division expansion: build a bias from the sign bit
; (arithmetic shift right by 31, then logical shift right by 30), add it to the
; input, and arithmetic-shift the sum right by 2.
216 define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
217 ; GCN-LABEL: sdiv_i32_4:
219 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
220 ; GCN-NEXT: s_mov_b32 s7, 0xf000
221 ; GCN-NEXT: s_mov_b32 s6, -1
222 ; GCN-NEXT: s_mov_b32 s10, s6
223 ; GCN-NEXT: s_mov_b32 s11, s7
224 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
225 ; GCN-NEXT: s_mov_b32 s8, s2
226 ; GCN-NEXT: s_mov_b32 s9, s3
227 ; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
228 ; GCN-NEXT: s_mov_b32 s4, s0
229 ; GCN-NEXT: s_mov_b32 s5, s1
230 ; GCN-NEXT: s_waitcnt vmcnt(0)
231 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
232 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1
233 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
234 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
235 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
238 ; TONGA-LABEL: sdiv_i32_4:
240 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
241 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
242 ; TONGA-NEXT: s_mov_b32 s2, -1
243 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
244 ; TONGA-NEXT: s_mov_b32 s0, s4
245 ; TONGA-NEXT: s_mov_b32 s1, s5
246 ; TONGA-NEXT: s_mov_b32 s4, s6
247 ; TONGA-NEXT: s_mov_b32 s5, s7
248 ; TONGA-NEXT: s_mov_b32 s6, s2
249 ; TONGA-NEXT: s_mov_b32 s7, s3
250 ; TONGA-NEXT: buffer_load_dword v0, off, s[4:7], 0
251 ; TONGA-NEXT: s_waitcnt vmcnt(0)
252 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
253 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
254 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
255 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
256 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
257 ; TONGA-NEXT: s_endpgm
259 ; GFX9-LABEL: sdiv_i32_4:
261 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
262 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
263 ; GFX9-NEXT: s_mov_b32 s2, -1
264 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
265 ; GFX9-NEXT: s_mov_b32 s0, s4
266 ; GFX9-NEXT: s_mov_b32 s1, s5
267 ; GFX9-NEXT: s_mov_b32 s4, s6
268 ; GFX9-NEXT: s_mov_b32 s5, s7
269 ; GFX9-NEXT: s_mov_b32 s6, s2
270 ; GFX9-NEXT: s_mov_b32 s7, s3
271 ; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
272 ; GFX9-NEXT: s_waitcnt vmcnt(0)
273 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
274 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1
275 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
276 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
277 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
278 ; GFX9-NEXT: s_endpgm
280 ; EG-LABEL: sdiv_i32_4:
282 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
284 ; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
285 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
288 ; EG-NEXT: Fetch clause starting at 6:
289 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
290 ; EG-NEXT: ALU clause starting at 8:
291 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
292 ; EG-NEXT: ALU clause starting at 9:
293 ; EG-NEXT: ASHR * T0.W, T0.X, literal.x,
294 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
295 ; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
296 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
297 ; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
298 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
299 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
300 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Numerator is the single i32 at %in.
301 %num = load i32, i32 addrspace(1) * %in
; Divisor is the compile-time constant 4 (a power of two).
302 %result = sdiv i32 %num, 4
; Write the quotient through %out.
303 store i32 %result, i32 addrspace(1)* %out
307 ; Divide by a weird (non-power-of-two) constant to make sure setIntDivIsCheap is working.
; slow_sdiv_i32_3435: signed division of a loaded i32 by the constant 3435,
; which is not a power of two.
; The check blocks below expect a multiply-based sequence rather than the full
; runtime division expansion: signed multiply-high by 0x98a1930b (printed as
; -1734241525 in the EG block), add the original input, then add the
; arithmetic shift right by 11 of that sum to its sign bit (logical shift
; right by 31).
310 define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
311 ; GCN-LABEL: slow_sdiv_i32_3435:
313 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
314 ; GCN-NEXT: s_mov_b32 s7, 0xf000
315 ; GCN-NEXT: s_mov_b32 s6, -1
316 ; GCN-NEXT: s_mov_b32 s10, s6
317 ; GCN-NEXT: s_mov_b32 s11, s7
318 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
319 ; GCN-NEXT: s_mov_b32 s8, s2
320 ; GCN-NEXT: s_mov_b32 s9, s3
321 ; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
322 ; GCN-NEXT: s_mov_b32 s2, 0x98a1930b
323 ; GCN-NEXT: s_mov_b32 s4, s0
324 ; GCN-NEXT: s_mov_b32 s5, s1
325 ; GCN-NEXT: s_waitcnt vmcnt(0)
326 ; GCN-NEXT: v_mul_hi_i32 v1, v0, s2
327 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
328 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
329 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0
330 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
331 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
334 ; TONGA-LABEL: slow_sdiv_i32_3435:
336 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
337 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
338 ; TONGA-NEXT: s_mov_b32 s2, -1
339 ; TONGA-NEXT: s_mov_b32 s10, s2
340 ; TONGA-NEXT: s_mov_b32 s11, s3
341 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
342 ; TONGA-NEXT: s_mov_b32 s8, s6
343 ; TONGA-NEXT: s_mov_b32 s9, s7
344 ; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
345 ; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b
346 ; TONGA-NEXT: s_mov_b32 s1, s5
347 ; TONGA-NEXT: s_waitcnt vmcnt(0)
348 ; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0
349 ; TONGA-NEXT: s_mov_b32 s0, s4
350 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
351 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0
352 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0
353 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
354 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
355 ; TONGA-NEXT: s_endpgm
357 ; GFX9-LABEL: slow_sdiv_i32_3435:
359 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
360 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
361 ; GFX9-NEXT: s_mov_b32 s2, -1
362 ; GFX9-NEXT: s_mov_b32 s10, s2
363 ; GFX9-NEXT: s_mov_b32 s11, s3
364 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX9-NEXT: s_mov_b32 s8, s6
366 ; GFX9-NEXT: s_mov_b32 s9, s7
367 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
368 ; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b
369 ; GFX9-NEXT: s_mov_b32 s1, s5
370 ; GFX9-NEXT: s_waitcnt vmcnt(0)
371 ; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
372 ; GFX9-NEXT: s_mov_b32 s0, s4
373 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
374 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0
375 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0
376 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
377 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
378 ; GFX9-NEXT: s_endpgm
380 ; EG-LABEL: slow_sdiv_i32_3435:
382 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
384 ; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
385 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
388 ; EG-NEXT: Fetch clause starting at 6:
389 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
390 ; EG-NEXT: ALU clause starting at 8:
391 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
392 ; EG-NEXT: ALU clause starting at 9:
393 ; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x,
394 ; EG-NEXT: -1734241525(-4.176600e-24), 0(0.000000e+00)
395 ; EG-NEXT: ADD_INT * T0.W, PS, T0.X,
396 ; EG-NEXT: ASHR T1.W, PV.W, literal.x,
397 ; EG-NEXT: LSHR * T0.W, PV.W, literal.y,
398 ; EG-NEXT: 11(1.541428e-44), 31(4.344025e-44)
399 ; EG-NEXT: ADD_INT T0.X, PV.W, PS,
400 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
401 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Numerator is the single i32 at %in.
402 %num = load i32, i32 addrspace(1) * %in
; Divisor is the compile-time constant 3435.
403 %result = sdiv i32 %num, 3435
; Write the quotient through %out.
404 store i32 %result, i32 addrspace(1)* %out
; sdiv_v2i32: element-wise signed division of two <2 x i32> vectors whose
; values are only known at run time.
; The kernel loads the numerator vector from %in[0] and the denominator vector
; from %in[1] (the next <2 x i32> slot), divides lane by lane, and stores the
; two quotients through %out.
; The check blocks below expect the scalar runtime-division expansion to be
; emitted once per lane, with the instructions of the two lanes interleaved
; (two reciprocal estimates, two refinement chains, two sign fix-ups) and a
; single two-dword store at the end.
408 define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
409 ; GCN-LABEL: sdiv_v2i32:
411 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
412 ; GCN-NEXT: s_mov_b32 s11, 0xf000
413 ; GCN-NEXT: s_mov_b32 s10, -1
414 ; GCN-NEXT: s_mov_b32 s6, s10
415 ; GCN-NEXT: s_mov_b32 s7, s11
416 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
417 ; GCN-NEXT: s_mov_b32 s4, s2
418 ; GCN-NEXT: s_mov_b32 s5, s3
419 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
420 ; GCN-NEXT: s_mov_b32 s2, 0x4f800000
421 ; GCN-NEXT: s_mov_b32 s8, s0
422 ; GCN-NEXT: s_mov_b32 s9, s1
423 ; GCN-NEXT: s_waitcnt vmcnt(0)
424 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
425 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2
426 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
427 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3
428 ; GCN-NEXT: v_xor_b32_e32 v8, v4, v5
429 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
430 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
431 ; GCN-NEXT: v_xor_b32_e32 v9, v6, v7
432 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1
433 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3
434 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
435 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v5
436 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6
437 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
438 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
439 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3
440 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4
441 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5
442 ; GCN-NEXT: v_mul_f32_e32 v4, s2, v4
443 ; GCN-NEXT: v_mul_f32_e32 v5, s2, v5
444 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
445 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
446 ; GCN-NEXT: v_mul_hi_u32 v6, v4, v2
447 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v2
448 ; GCN-NEXT: v_mul_hi_u32 v10, v5, v3
449 ; GCN-NEXT: v_mul_lo_u32 v11, v5, v3
450 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v7
451 ; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
452 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
453 ; GCN-NEXT: v_cndmask_b32_e64 v6, v7, v12, s[0:1]
454 ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v10
455 ; GCN-NEXT: v_cndmask_b32_e64 v7, v11, v13, s[2:3]
456 ; GCN-NEXT: v_mul_hi_u32 v6, v6, v4
457 ; GCN-NEXT: v_mul_hi_u32 v7, v7, v5
458 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v6, v4
459 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v6, v4
460 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v5
461 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v7, v5
462 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
463 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[2:3]
464 ; GCN-NEXT: v_mul_hi_u32 v4, v4, v0
465 ; GCN-NEXT: v_mul_hi_u32 v5, v5, v1
466 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2
467 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
468 ; GCN-NEXT: v_add_i32_e32 v10, vcc, -1, v4
469 ; GCN-NEXT: v_mul_lo_u32 v11, v5, v3
470 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v5
471 ; GCN-NEXT: v_add_i32_e32 v13, vcc, -1, v5
472 ; GCN-NEXT: v_subrev_i32_e32 v14, vcc, v6, v0
473 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v6
474 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v11, v1
475 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11
476 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2
477 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
478 ; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1]
479 ; GCN-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[2:3]
480 ; GCN-NEXT: s_and_b64 s[2:3], s[4:5], vcc
481 ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v12, s[2:3]
482 ; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[0:1]
483 ; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
484 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
485 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v9
486 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
487 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
488 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
491 ; TONGA-LABEL: sdiv_v2i32:
493 ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
494 ; TONGA-NEXT: s_mov_b32 s11, 0xf000
495 ; TONGA-NEXT: s_mov_b32 s10, -1
496 ; TONGA-NEXT: s_mov_b32 s4, 0x4f800000
497 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
498 ; TONGA-NEXT: s_mov_b32 s8, s0
499 ; TONGA-NEXT: s_mov_b32 s9, s1
500 ; TONGA-NEXT: s_mov_b32 s0, s2
501 ; TONGA-NEXT: s_mov_b32 s1, s3
502 ; TONGA-NEXT: s_mov_b32 s2, s10
503 ; TONGA-NEXT: s_mov_b32 s3, s11
504 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
505 ; TONGA-NEXT: s_waitcnt vmcnt(0)
506 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2
507 ; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
508 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v5, v2
509 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
510 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3
511 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5
512 ; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
513 ; TONGA-NEXT: v_xor_b32_e32 v8, v4, v5
514 ; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2
515 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
516 ; TONGA-NEXT: v_xor_b32_e32 v9, v6, v7
517 ; TONGA-NEXT: v_cvt_f32_u32_e32 v7, v3
518 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5
519 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0
520 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
521 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7
522 ; TONGA-NEXT: v_mul_f32_e32 v4, s4, v5
523 ; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v4
524 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1
525 ; TONGA-NEXT: v_mul_f32_e32 v5, s4, v7
526 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
527 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6
528 ; TONGA-NEXT: v_mul_hi_u32 v6, v4, v2
529 ; TONGA-NEXT: v_mul_lo_u32 v7, v4, v2
530 ; TONGA-NEXT: v_mul_hi_u32 v10, v5, v3
531 ; TONGA-NEXT: v_mul_lo_u32 v11, v5, v3
532 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
533 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v7
534 ; TONGA-NEXT: v_cndmask_b32_e64 v6, v7, v12, s[0:1]
535 ; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v11
536 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v10
537 ; TONGA-NEXT: v_cndmask_b32_e64 v7, v11, v13, s[2:3]
538 ; TONGA-NEXT: v_mul_hi_u32 v6, v6, v4
539 ; TONGA-NEXT: v_mul_hi_u32 v7, v7, v5
540 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, v6, v4
541 ; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v6, v4
542 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
543 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, v7, v5
544 ; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v7, v5
545 ; TONGA-NEXT: v_mul_hi_u32 v4, v4, v0
546 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[2:3]
547 ; TONGA-NEXT: v_mul_hi_u32 v5, v5, v1
548 ; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2
549 ; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
550 ; TONGA-NEXT: v_mul_lo_u32 v11, v5, v3
551 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, -1, v4
552 ; TONGA-NEXT: v_subrev_u32_e32 v14, vcc, v6, v0
553 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v6
554 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2
555 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v11, v1
556 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v5
557 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, -1, v5
558 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v11
559 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
560 ; TONGA-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1]
561 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[2:3]
562 ; TONGA-NEXT: s_and_b64 s[2:3], s[4:5], vcc
563 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v5, v12, s[2:3]
564 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[0:1]
565 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
566 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
567 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9
568 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
569 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9
570 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
571 ; TONGA-NEXT: s_endpgm
573 ; GFX9-LABEL: sdiv_v2i32:
575 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
576 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
577 ; GFX9-NEXT: s_mov_b32 s10, -1
578 ; GFX9-NEXT: s_mov_b32 s4, 0x4f800000
579 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX9-NEXT: s_mov_b32 s8, s0
581 ; GFX9-NEXT: s_mov_b32 s9, s1
582 ; GFX9-NEXT: s_mov_b32 s0, s2
583 ; GFX9-NEXT: s_mov_b32 s1, s3
584 ; GFX9-NEXT: s_mov_b32 s2, s10
585 ; GFX9-NEXT: s_mov_b32 s3, s11
586 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
587 ; GFX9-NEXT: s_waitcnt vmcnt(0)
588 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v2
589 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v3
590 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
591 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
592 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v5
593 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
594 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v6
595 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v3
596 ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
597 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7
598 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
599 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8
600 ; GFX9-NEXT: v_xor_b32_e32 v5, v4, v5
601 ; GFX9-NEXT: v_mul_f32_e32 v7, s4, v7
602 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
603 ; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8
604 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
605 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
606 ; GFX9-NEXT: v_mul_lo_u32 v4, v7, v2
607 ; GFX9-NEXT: v_mul_hi_u32 v11, v7, v2
608 ; GFX9-NEXT: v_mul_lo_u32 v10, v8, v3
609 ; GFX9-NEXT: v_mul_hi_u32 v12, v8, v3
610 ; GFX9-NEXT: v_sub_u32_e32 v13, 0, v4
611 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
612 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
613 ; GFX9-NEXT: v_sub_u32_e32 v14, 0, v10
614 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v12
615 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[0:1]
616 ; GFX9-NEXT: v_mul_hi_u32 v4, v4, v7
617 ; GFX9-NEXT: v_mul_hi_u32 v10, v10, v8
618 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v1
619 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v9
620 ; GFX9-NEXT: v_xor_b32_e32 v6, v9, v6
621 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v9
622 ; GFX9-NEXT: v_add_u32_e32 v9, v7, v4
623 ; GFX9-NEXT: v_sub_u32_e32 v4, v7, v4
624 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
625 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v10
626 ; GFX9-NEXT: v_sub_u32_e32 v8, v8, v10
627 ; GFX9-NEXT: v_mul_hi_u32 v4, v4, v0
628 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
629 ; GFX9-NEXT: v_mul_hi_u32 v7, v7, v1
630 ; GFX9-NEXT: v_mul_lo_u32 v8, v4, v2
631 ; GFX9-NEXT: v_add_u32_e32 v9, 1, v4
632 ; GFX9-NEXT: v_mul_lo_u32 v11, v7, v3
633 ; GFX9-NEXT: v_add_u32_e32 v12, 1, v7
634 ; GFX9-NEXT: v_sub_u32_e32 v14, v0, v8
635 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8
636 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v2
637 ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v11
638 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v11
639 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
640 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], vcc
641 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v9, s[2:3]
642 ; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
643 ; GFX9-NEXT: v_add_u32_e32 v10, -1, v4
644 ; GFX9-NEXT: v_add_u32_e32 v13, -1, v7
645 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v12, s[2:3]
646 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
647 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[0:1]
648 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
649 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6
650 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5
651 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
652 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
653 ; GFX9-NEXT: s_endpgm
655 ; EG-LABEL: sdiv_v2i32:
657 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
659 ; EG-NEXT: ALU 59, @11, KC0[CB0:0-32], KC1[]
660 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
663 ; EG-NEXT: Fetch clause starting at 6:
664 ; EG-NEXT: VTX_READ_64 T1.XY, T0.X, 8, #1
665 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
666 ; EG-NEXT: ALU clause starting at 10:
667 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
668 ; EG-NEXT: ALU clause starting at 11:
669 ; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Y,
670 ; EG-NEXT: ADD_INT * T1.W, T1.Y, PV.W,
671 ; EG-NEXT: XOR_INT T1.W, PV.W, T0.W,
672 ; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.X,
673 ; EG-NEXT: ADD_INT T3.W, T1.X, PS,
674 ; EG-NEXT: RECIP_UINT * T0.Z, PV.W,
675 ; EG-NEXT: XOR_INT T3.W, PV.W, T2.W, BS:VEC_021/SCL_122
676 ; EG-NEXT: MULLO_INT * T1.X, PS, T1.W,
677 ; EG-NEXT: RECIP_UINT * T1.Y, PV.W,
678 ; EG-NEXT: MULLO_INT * T1.Z, PS, T3.W,
679 ; EG-NEXT: SUB_INT T4.W, 0.0, PS,
680 ; EG-NEXT: MULHI * T2.X, T1.Y, T3.W,
681 ; EG-NEXT: CNDE_INT T1.Z, PS, PV.W, T1.Z, BS:VEC_021/SCL_122
682 ; EG-NEXT: SUB_INT T4.W, 0.0, T1.X,
683 ; EG-NEXT: MULHI * T2.Y, T0.Z, T1.W,
684 ; EG-NEXT: CNDE_INT T2.Z, PS, PV.W, T1.X,
685 ; EG-NEXT: SETGT_INT T4.W, 0.0, T0.X,
686 ; EG-NEXT: MULHI * T1.X, PV.Z, T1.Y,
687 ; EG-NEXT: SETGT_INT T3.X, 0.0, T0.Y,
688 ; EG-NEXT: ADD_INT T3.Y, T0.X, PV.W,
689 ; EG-NEXT: ADD_INT T1.Z, T1.Y, PS,
690 ; EG-NEXT: SUB_INT T5.W, T1.Y, PS,
691 ; EG-NEXT: MULHI * T0.X, PV.Z, T0.Z,
692 ; EG-NEXT: CNDE_INT T1.X, T2.X, PV.Z, PV.W,
693 ; EG-NEXT: XOR_INT T1.Y, PV.Y, T4.W,
694 ; EG-NEXT: ADD_INT T1.Z, T0.Y, PV.X,
695 ; EG-NEXT: ADD_INT T5.W, T0.Z, PS,
696 ; EG-NEXT: SUB_INT * T6.W, T0.Z, PS,
697 ; EG-NEXT: CNDE_INT T0.Z, T2.Y, PV.W, PS,
698 ; EG-NEXT: XOR_INT T5.W, PV.Z, T3.X,
699 ; EG-NEXT: MULHI * T0.X, PV.X, PV.Y,
700 ; EG-NEXT: MULHI * T0.Y, PV.Z, PV.W,
701 ; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W,
702 ; EG-NEXT: SUB_INT T6.W, T5.W, PS,
703 ; EG-NEXT: MULLO_INT * T1.X, T0.X, T3.W,
704 ; EG-NEXT: SUB_INT T1.Z, T1.Y, PS,
705 ; EG-NEXT: SETGE_UINT T1.W, PV.W, T1.W,
706 ; EG-NEXT: SETGE_UINT * T5.W, T5.W, T0.Z,
707 ; EG-NEXT: AND_INT T2.Y, PV.W, PS,
708 ; EG-NEXT: ADD_INT T0.Z, T0.Y, 1,
709 ; EG-NEXT: SETGE_UINT T1.W, PV.Z, T3.W,
710 ; EG-NEXT: SETGE_UINT * T3.W, T1.Y, T1.X,
711 ; EG-NEXT: AND_INT T1.Y, PV.W, PS,
712 ; EG-NEXT: ADD_INT T1.Z, T0.X, 1,
713 ; EG-NEXT: CNDE_INT T1.W, PV.Y, T0.Y, PV.Z,
714 ; EG-NEXT: ADD_INT * T6.W, T0.Y, literal.x,
715 ; EG-NEXT: -1(nan), 0(0.000000e+00)
716 ; EG-NEXT: CNDE_INT T0.Y, T5.W, PS, PV.W,
717 ; EG-NEXT: XOR_INT T0.Z, T3.X, T0.W,
718 ; EG-NEXT: CNDE_INT T0.W, PV.Y, T0.X, PV.Z,
719 ; EG-NEXT: ADD_INT * T1.W, T0.X, literal.x,
720 ; EG-NEXT: -1(nan), 0(0.000000e+00)
721 ; EG-NEXT: CNDE_INT T1.Z, T3.W, PS, PV.W,
722 ; EG-NEXT: XOR_INT T0.W, T4.W, T2.W, BS:VEC_120/SCL_212
723 ; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.Z,
724 ; EG-NEXT: SUB_INT T0.Y, PS, T0.Z,
725 ; EG-NEXT: XOR_INT * T1.W, PV.Z, PV.W,
726 ; EG-NEXT: SUB_INT T0.X, PV.W, T0.W,
727 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
728 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Address of the <2 x i32> slot that follows %in[0]; it holds the denominators.
729 %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
; Numerator vector is read from %in[0].
730 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
; Denominator vector is read from %in[1].
731 %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
; Lane-wise signed division with divisors that are unknown at compile time.
732 %result = sdiv <2 x i32> %num, %den
; Write both quotients through %out.
733 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
; sdiv_v2i32_4: element-wise signed division of a loaded <2 x i32> vector by
; the constant vector <4, 4>.
; The check blocks below expect the shift-based power-of-two sequence to be
; applied to each lane independently (arithmetic shift right by 31, logical
; shift right by 30, add to the input, arithmetic shift right by 2), followed
; by a single two-dword store.
737 define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
738 ; GCN-LABEL: sdiv_v2i32_4:
740 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
741 ; GCN-NEXT: s_mov_b32 s7, 0xf000
742 ; GCN-NEXT: s_mov_b32 s6, -1
743 ; GCN-NEXT: s_mov_b32 s10, s6
744 ; GCN-NEXT: s_mov_b32 s11, s7
745 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
746 ; GCN-NEXT: s_mov_b32 s8, s2
747 ; GCN-NEXT: s_mov_b32 s9, s3
748 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
749 ; GCN-NEXT: s_mov_b32 s4, s0
750 ; GCN-NEXT: s_mov_b32 s5, s1
751 ; GCN-NEXT: s_waitcnt vmcnt(0)
752 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
753 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1
754 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2
755 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3
756 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
757 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
758 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
759 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1
760 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
763 ; TONGA-LABEL: sdiv_v2i32_4:
765 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
766 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
767 ; TONGA-NEXT: s_mov_b32 s2, -1
768 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
769 ; TONGA-NEXT: s_mov_b32 s0, s4
770 ; TONGA-NEXT: s_mov_b32 s1, s5
771 ; TONGA-NEXT: s_mov_b32 s4, s6
772 ; TONGA-NEXT: s_mov_b32 s5, s7
773 ; TONGA-NEXT: s_mov_b32 s6, s2
774 ; TONGA-NEXT: s_mov_b32 s7, s3
775 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
776 ; TONGA-NEXT: s_waitcnt vmcnt(0)
777 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
778 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
779 ; TONGA-NEXT: v_lshrrev_b32_e32 v2, 30, v2
780 ; TONGA-NEXT: v_lshrrev_b32_e32 v3, 30, v3
781 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0
782 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
783 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
784 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
785 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
786 ; TONGA-NEXT: s_endpgm
788 ; GFX9-LABEL: sdiv_v2i32_4:
790 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
791 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
792 ; GFX9-NEXT: s_mov_b32 s2, -1
793 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
794 ; GFX9-NEXT: s_mov_b32 s0, s4
795 ; GFX9-NEXT: s_mov_b32 s1, s5
796 ; GFX9-NEXT: s_mov_b32 s4, s6
797 ; GFX9-NEXT: s_mov_b32 s5, s7
798 ; GFX9-NEXT: s_mov_b32 s6, s2
799 ; GFX9-NEXT: s_mov_b32 s7, s3
800 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
801 ; GFX9-NEXT: s_waitcnt vmcnt(0)
802 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
803 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
804 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 30, v2
805 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 30, v3
806 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
807 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
808 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
809 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
810 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
811 ; GFX9-NEXT: s_endpgm
813 ; EG-LABEL: sdiv_v2i32_4:
815 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
817 ; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
818 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
821 ; EG-NEXT: Fetch clause starting at 6:
822 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
823 ; EG-NEXT: ALU clause starting at 8:
824 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
825 ; EG-NEXT: ALU clause starting at 9:
826 ; EG-NEXT: ASHR * T0.W, T0.Y, literal.x,
827 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
828 ; EG-NEXT: LSHR T0.W, PV.W, literal.x,
829 ; EG-NEXT: ASHR * T1.W, T0.X, literal.y,
830 ; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44)
831 ; EG-NEXT: LSHR T1.W, PS, literal.x,
832 ; EG-NEXT: ADD_INT * T0.W, T0.Y, PV.W,
833 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
834 ; EG-NEXT: ASHR T0.Y, PS, literal.x,
835 ; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
836 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
837 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
838 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
839 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Numerator vector is the single <2 x i32> at %in.
840 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
; Both lanes are divided by the compile-time constant 4 (a power of two).
841 %result = sdiv <2 x i32> %num, <i32 4, i32 4>
; Write both quotients through %out.
842 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
; sdiv_v4i32 - lane-wise signed division of two <4 x i32> vectors whose
; divisor is not a compile-time constant.
;
; The numerator is loaded from %in and the denominator from the <4 x i32>
; element that follows it (getelementptr index 1, visible as the 16 byte
; offset on the second load in the expected output). The quotient vector is
; stored to %out.
;
; Every target is expected to emit the full division expansion. The amdgcn
; runs go through v_rcp_iflag_f32 and v_mul_hi_u32, the r600 run through
; RECIP_UINT and MULHI. In each case the operands are first combined with
; their sign masks (shift right by 31 on amdgcn, SETGT_INT against zero on
; r600) using add and xor, and the quotient gets the matching xor and subtract
; at the end.
;
; The assertions are autogenerated (see the note on the first line of this
; file) and the exact schedule is expected to change over time, so regenerate
; them with the update script instead of editing by hand.
846 define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
847 ; GCN-LABEL: sdiv_v4i32:
849 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
850 ; GCN-NEXT: s_mov_b32 s19, 0xf000
851 ; GCN-NEXT: s_mov_b32 s18, -1
852 ; GCN-NEXT: s_mov_b32 s2, s18
853 ; GCN-NEXT: s_mov_b32 s3, s19
854 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
855 ; GCN-NEXT: s_mov_b32 s0, s10
856 ; GCN-NEXT: s_mov_b32 s1, s11
857 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
858 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
859 ; GCN-NEXT: s_mov_b32 s6, 0x4f800000
860 ; GCN-NEXT: s_waitcnt vmcnt(1)
861 ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0
862 ; GCN-NEXT: s_waitcnt vmcnt(0)
863 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4
864 ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1
865 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5
866 ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2
867 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6
868 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v3
869 ; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v7
870 ; GCN-NEXT: v_xor_b32_e32 v16, v8, v9
871 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0
872 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1
873 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2
874 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v14, v3
875 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
876 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5
877 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6
878 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v15, v7
879 ; GCN-NEXT: v_xor_b32_e32 v17, v10, v11
880 ; GCN-NEXT: v_xor_b32_e32 v18, v12, v13
881 ; GCN-NEXT: v_xor_b32_e32 v19, v14, v15
882 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
883 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v9
884 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v10
885 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v11
886 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12
887 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13
888 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v14
889 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v15
890 ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4
891 ; GCN-NEXT: v_cvt_f32_u32_e32 v9, v5
892 ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v6
893 ; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8
894 ; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9
895 ; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10
896 ; GCN-NEXT: v_mul_f32_e32 v8, s6, v8
897 ; GCN-NEXT: v_mul_f32_e32 v9, s6, v9
898 ; GCN-NEXT: v_mul_f32_e32 v10, s6, v10
899 ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8
900 ; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9
901 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
902 ; GCN-NEXT: v_mul_hi_u32 v11, v8, v4
903 ; GCN-NEXT: v_mul_lo_u32 v12, v8, v4
904 ; GCN-NEXT: v_mul_hi_u32 v13, v9, v5
905 ; GCN-NEXT: v_mul_lo_u32 v14, v9, v5
906 ; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
907 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11
908 ; GCN-NEXT: v_mul_hi_u32 v11, v10, v6
909 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v15, s[0:1]
910 ; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v14
911 ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v13
912 ; GCN-NEXT: v_mul_lo_u32 v13, v10, v6
913 ; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[2:3]
914 ; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
915 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
916 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, v7
917 ; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11
918 ; GCN-NEXT: v_mul_f32_e32 v11, s6, v11
919 ; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11
920 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5]
921 ; GCN-NEXT: v_mul_hi_u32 v15, v11, v7
922 ; GCN-NEXT: v_mul_lo_u32 v20, v11, v7
923 ; GCN-NEXT: v_sub_i32_e32 v21, vcc, 0, v20
924 ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15
925 ; GCN-NEXT: v_cndmask_b32_e64 v15, v20, v21, s[6:7]
926 ; GCN-NEXT: v_mul_hi_u32 v12, v12, v8
927 ; GCN-NEXT: v_add_i32_e32 v20, vcc, v12, v8
928 ; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v12, v8
929 ; GCN-NEXT: v_mul_hi_u32 v12, v14, v9
930 ; GCN-NEXT: v_add_i32_e32 v14, vcc, v12, v9
931 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, v12, v9
932 ; GCN-NEXT: v_mul_hi_u32 v12, v13, v10
933 ; GCN-NEXT: v_add_i32_e32 v13, vcc, v12, v10
934 ; GCN-NEXT: v_subrev_i32_e32 v10, vcc, v12, v10
935 ; GCN-NEXT: v_mul_hi_u32 v12, v15, v11
936 ; GCN-NEXT: v_add_i32_e32 v15, vcc, v12, v11
937 ; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v12, v11
938 ; GCN-NEXT: s_mov_b32 s16, s8
939 ; GCN-NEXT: s_mov_b32 s17, s9
940 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v20, s[0:1]
941 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[2:3]
942 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[4:5]
943 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[6:7]
944 ; GCN-NEXT: v_mul_hi_u32 v8, v8, v0
945 ; GCN-NEXT: v_mul_hi_u32 v9, v9, v1
946 ; GCN-NEXT: v_mul_hi_u32 v10, v10, v2
947 ; GCN-NEXT: v_mul_hi_u32 v11, v11, v3
948 ; GCN-NEXT: v_mul_lo_u32 v12, v8, v4
949 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v8
950 ; GCN-NEXT: v_add_i32_e32 v14, vcc, -1, v8
951 ; GCN-NEXT: v_mul_lo_u32 v15, v9, v5
952 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v12
953 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
954 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v9
955 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4
956 ; GCN-NEXT: v_add_i32_e32 v0, vcc, -1, v9
957 ; GCN-NEXT: v_mul_lo_u32 v4, v10, v6
958 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v15
959 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v15
960 ; GCN-NEXT: v_add_i32_e32 v15, vcc, 1, v10
961 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
962 ; GCN-NEXT: v_add_i32_e32 v1, vcc, -1, v10
963 ; GCN-NEXT: v_mul_lo_u32 v5, v11, v7
964 ; GCN-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4
965 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
966 ; GCN-NEXT: v_add_i32_e32 v4, vcc, -1, v11
967 ; GCN-NEXT: v_cmp_ge_u32_e64 s[10:11], v3, v5
968 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
969 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v11
970 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
971 ; GCN-NEXT: v_cmp_ge_u32_e64 s[12:13], v3, v7
972 ; GCN-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1]
973 ; GCN-NEXT: v_cndmask_b32_e64 v2, v8, v13, s[2:3]
974 ; GCN-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5]
975 ; GCN-NEXT: v_cndmask_b32_e64 v3, v9, v12, s[2:3]
976 ; GCN-NEXT: s_and_b64 vcc, vcc, s[8:9]
977 ; GCN-NEXT: v_cndmask_b32_e32 v6, v10, v15, vcc
978 ; GCN-NEXT: s_and_b64 vcc, s[12:13], s[10:11]
979 ; GCN-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
980 ; GCN-NEXT: v_cndmask_b32_e64 v2, v14, v2, s[0:1]
981 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
982 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9]
983 ; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[10:11]
984 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v16
985 ; GCN-NEXT: v_xor_b32_e32 v4, v0, v17
986 ; GCN-NEXT: v_xor_b32_e32 v5, v1, v18
987 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v19
988 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v16
989 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v4, v17
990 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v5, v18
991 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v19
992 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
995 ; TONGA-LABEL: sdiv_v4i32:
997 ; TONGA-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
998 ; TONGA-NEXT: s_mov_b32 s11, 0xf000
999 ; TONGA-NEXT: s_mov_b32 s10, -1
1000 ; TONGA-NEXT: s_mov_b32 s2, s10
1001 ; TONGA-NEXT: s_mov_b32 s3, s11
1002 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1003 ; TONGA-NEXT: s_mov_b32 s0, s14
1004 ; TONGA-NEXT: s_mov_b32 s1, s15
1005 ; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1006 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
1007 ; TONGA-NEXT: s_mov_b32 s14, 0x4f800000
1008 ; TONGA-NEXT: s_mov_b32 s8, s12
1009 ; TONGA-NEXT: s_mov_b32 s9, s13
1010 ; TONGA-NEXT: s_waitcnt vmcnt(1)
1011 ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4
1012 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
1013 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1014 ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
1015 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9
1016 ; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9
1017 ; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v4
1018 ; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5
1019 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5
1020 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0
1021 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v9
1022 ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11
1023 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
1024 ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v5
1025 ; TONGA-NEXT: v_mul_f32_e32 v9, s14, v9
1026 ; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9
1027 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
1028 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8
1029 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1
1030 ; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11
1031 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10
1032 ; TONGA-NEXT: v_mul_f32_e32 v8, s14, v8
1033 ; TONGA-NEXT: v_mul_hi_u32 v11, v9, v4
1034 ; TONGA-NEXT: v_mul_lo_u32 v10, v9, v4
1035 ; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8
1036 ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2
1037 ; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
1038 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2
1039 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11
1040 ; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13
1041 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12
1042 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v10
1043 ; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[0:1]
1044 ; TONGA-NEXT: v_mul_hi_u32 v12, v8, v5
1045 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6
1046 ; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13
1047 ; TONGA-NEXT: v_mul_lo_u32 v11, v8, v5
1048 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v12
1049 ; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v6
1050 ; TONGA-NEXT: v_mul_hi_u32 v10, v10, v9
1051 ; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v11
1052 ; TONGA-NEXT: v_cndmask_b32_e64 v11, v11, v13, s[2:3]
1053 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12
1054 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7
1055 ; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7
1056 ; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14
1057 ; TONGA-NEXT: v_mul_f32_e32 v12, s14, v12
1058 ; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12
1059 ; TONGA-NEXT: v_mul_hi_u32 v18, v12, v6
1060 ; TONGA-NEXT: v_mul_lo_u32 v13, v12, v6
1061 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
1062 ; TONGA-NEXT: v_add_u32_e32 v18, vcc, v10, v9
1063 ; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v9
1064 ; TONGA-NEXT: v_mul_hi_u32 v10, v11, v8
1065 ; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v18, s[0:1]
1066 ; TONGA-NEXT: v_mul_hi_u32 v9, v9, v0
1067 ; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v13
1068 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v10, v8
1069 ; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v10, v8
1070 ; TONGA-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[4:5]
1071 ; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[2:3]
1072 ; TONGA-NEXT: v_mul_hi_u32 v10, v13, v12
1073 ; TONGA-NEXT: v_mul_lo_u32 v11, v9, v4
1074 ; TONGA-NEXT: v_mul_hi_u32 v8, v8, v1
1075 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, v10, v12
1076 ; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, v10, v12
1077 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v11
1078 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v11
1079 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4
1080 ; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[4:5]
1081 ; TONGA-NEXT: v_mul_lo_u32 v0, v8, v5
1082 ; TONGA-NEXT: v_mul_hi_u32 v4, v10, v2
1083 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, -1, v9
1084 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, -1, v8
1085 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v0
1086 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
1087 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v5
1088 ; TONGA-NEXT: v_mul_lo_u32 v5, v4, v6
1089 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v9
1090 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, 1, v8
1091 ; TONGA-NEXT: s_and_b64 vcc, s[2:3], s[0:1]
1092 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
1093 ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v2, v5
1094 ; TONGA-NEXT: s_and_b64 vcc, s[6:7], s[4:5]
1095 ; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v7
1096 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
1097 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[0:1]
1098 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[4:5]
1099 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v15
1100 ; TONGA-NEXT: v_xor_b32_e32 v8, v0, v16
1101 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v15
1102 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v8, v16
1103 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v11
1104 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v9, v6
1105 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v2, v5
1106 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
1107 ; TONGA-NEXT: v_mul_f32_e32 v8, s14, v8
1108 ; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8
1109 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v10, v3
1110 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v10
1111 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, -1, v4
1112 ; TONGA-NEXT: v_mul_lo_u32 v5, v8, v7
1113 ; TONGA-NEXT: v_mul_hi_u32 v9, v8, v7
1114 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v4
1115 ; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v5
1116 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
1117 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
1118 ; TONGA-NEXT: v_mul_hi_u32 v5, v5, v8
1119 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v5, v8
1120 ; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v5, v8
1121 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
1122 ; TONGA-NEXT: v_mul_hi_u32 v5, v5, v3
1123 ; TONGA-NEXT: s_and_b64 vcc, s[0:1], s[2:3]
1124 ; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
1125 ; TONGA-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3]
1126 ; TONGA-NEXT: v_mul_lo_u32 v4, v5, v7
1127 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17
1128 ; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v17
1129 ; TONGA-NEXT: v_xor_b32_e32 v6, v10, v14
1130 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v3, v4
1131 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v8, v7
1132 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v3, v4
1133 ; TONGA-NEXT: v_add_u32_e32 v7, vcc, -1, v5
1134 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, 1, v5
1135 ; TONGA-NEXT: s_and_b64 vcc, s[0:1], s[2:3]
1136 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
1137 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3]
1138 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6
1139 ; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
1140 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1141 ; TONGA-NEXT: s_endpgm
1143 ; GFX9-LABEL: sdiv_v4i32:
1145 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
1146 ; GFX9-NEXT: s_mov_b32 s15, 0xf000
1147 ; GFX9-NEXT: s_mov_b32 s14, -1
1148 ; GFX9-NEXT: s_mov_b32 s2, s14
1149 ; GFX9-NEXT: s_mov_b32 s3, s15
1150 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1151 ; GFX9-NEXT: s_mov_b32 s0, s10
1152 ; GFX9-NEXT: s_mov_b32 s1, s11
1153 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1154 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
1155 ; GFX9-NEXT: s_mov_b32 s4, 0x4f800000
1156 ; GFX9-NEXT: s_mov_b32 s12, s8
1157 ; GFX9-NEXT: s_mov_b32 s13, s9
1158 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1159 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v4
1160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1161 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0
1162 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v9
1163 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v8
1164 ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v9
1165 ; GFX9-NEXT: v_xor_b32_e32 v16, v8, v9
1166 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
1167 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v4
1168 ; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v5
1169 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v11
1170 ; GFX9-NEXT: v_xor_b32_e32 v5, v5, v11
1171 ; GFX9-NEXT: v_cvt_f32_u32_e32 v9, v5
1172 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8
1173 ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v6
1174 ; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v1
1175 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v13
1176 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v10
1177 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v9
1178 ; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8
1179 ; GFX9-NEXT: v_xor_b32_e32 v6, v6, v13
1180 ; GFX9-NEXT: v_xor_b32_e32 v17, v10, v11
1181 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v10
1182 ; GFX9-NEXT: v_cvt_f32_u32_e32 v10, v6
1183 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
1184 ; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v2
1185 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v12
1186 ; GFX9-NEXT: v_mul_f32_e32 v9, s4, v9
1187 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10
1188 ; GFX9-NEXT: v_xor_b32_e32 v18, v12, v13
1189 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v12
1190 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9
1191 ; GFX9-NEXT: v_mul_hi_u32 v12, v8, v4
1192 ; GFX9-NEXT: v_mul_lo_u32 v11, v8, v4
1193 ; GFX9-NEXT: v_mul_f32_e32 v10, s4, v10
1194 ; GFX9-NEXT: v_mul_lo_u32 v13, v9, v5
1195 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
1196 ; GFX9-NEXT: v_mul_hi_u32 v12, v9, v5
1197 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10
1198 ; GFX9-NEXT: v_sub_u32_e32 v19, 0, v11
1199 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc
1200 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v12
1201 ; GFX9-NEXT: v_sub_u32_e32 v19, 0, v13
1202 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v19, s[0:1]
1203 ; GFX9-NEXT: v_mul_hi_u32 v19, v10, v6
1204 ; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v7
1205 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v15
1206 ; GFX9-NEXT: v_xor_b32_e32 v7, v7, v15
1207 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v19
1208 ; GFX9-NEXT: v_cvt_f32_u32_e32 v19, v7
1209 ; GFX9-NEXT: v_mul_hi_u32 v11, v11, v8
1210 ; GFX9-NEXT: v_mul_lo_u32 v12, v10, v6
1211 ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v3
1212 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v19
1213 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v14
1214 ; GFX9-NEXT: v_sub_u32_e32 v20, 0, v12
1215 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v20, s[2:3]
1216 ; GFX9-NEXT: v_mul_f32_e32 v19, s4, v19
1217 ; GFX9-NEXT: v_cvt_u32_f32_e32 v19, v19
1218 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v14
1219 ; GFX9-NEXT: v_mul_hi_u32 v21, v19, v7
1220 ; GFX9-NEXT: v_mul_lo_u32 v20, v19, v7
1221 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21
1222 ; GFX9-NEXT: v_add_u32_e32 v21, v8, v11
1223 ; GFX9-NEXT: v_sub_u32_e32 v8, v8, v11
1224 ; GFX9-NEXT: v_mul_hi_u32 v11, v13, v9
1225 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v21, vcc
1226 ; GFX9-NEXT: v_mul_hi_u32 v8, v8, v0
1227 ; GFX9-NEXT: v_sub_u32_e32 v22, 0, v20
1228 ; GFX9-NEXT: v_add_u32_e32 v13, v9, v11
1229 ; GFX9-NEXT: v_sub_u32_e32 v9, v9, v11
1230 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v10
1231 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[0:1]
1232 ; GFX9-NEXT: v_mul_hi_u32 v9, v9, v1
1233 ; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v22, s[4:5]
1234 ; GFX9-NEXT: v_add_u32_e32 v12, v10, v11
1235 ; GFX9-NEXT: v_sub_u32_e32 v10, v10, v11
1236 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[2:3]
1237 ; GFX9-NEXT: v_mul_lo_u32 v12, v8, v4
1238 ; GFX9-NEXT: v_mul_hi_u32 v11, v20, v19
1239 ; GFX9-NEXT: v_mul_hi_u32 v10, v10, v2
1240 ; GFX9-NEXT: v_add_u32_e32 v13, 1, v8
1241 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v12
1242 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v12
1243 ; GFX9-NEXT: v_mul_lo_u32 v12, v9, v5
1244 ; GFX9-NEXT: v_add_u32_e32 v20, v19, v11
1245 ; GFX9-NEXT: v_sub_u32_e32 v11, v19, v11
1246 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v20, s[4:5]
1247 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v12
1248 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v12
1249 ; GFX9-NEXT: v_mul_lo_u32 v12, v10, v6
1250 ; GFX9-NEXT: v_mul_hi_u32 v11, v11, v3
1251 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
1252 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
1253 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v12
1254 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v12
1255 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
1256 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6
1257 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v13, s[0:1]
1258 ; GFX9-NEXT: v_add_u32_e32 v0, 1, v9
1259 ; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3]
1260 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[0:1]
1261 ; GFX9-NEXT: v_add_u32_e32 v1, 1, v10
1262 ; GFX9-NEXT: s_and_b64 s[0:1], s[8:9], s[6:7]
1263 ; GFX9-NEXT: v_mul_lo_u32 v12, v11, v7
1264 ; GFX9-NEXT: v_add_u32_e32 v19, -1, v8
1265 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v10, v1, s[0:1]
1266 ; GFX9-NEXT: v_add_u32_e32 v5, -1, v10
1267 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc
1268 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[6:7]
1269 ; GFX9-NEXT: v_add_u32_e32 v4, -1, v9
1270 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[2:3]
1271 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v16
1272 ; GFX9-NEXT: v_xor_b32_e32 v5, v1, v18
1273 ; GFX9-NEXT: v_xor_b32_e32 v4, v0, v17
1274 ; GFX9-NEXT: v_sub_u32_e32 v0, v2, v16
1275 ; GFX9-NEXT: v_sub_u32_e32 v2, v5, v18
1276 ; GFX9-NEXT: v_sub_u32_e32 v5, v3, v12
1277 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v7
1278 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v12
1279 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v11
1280 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
1281 ; GFX9-NEXT: v_add_u32_e32 v5, -1, v11
1282 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
1283 ; GFX9-NEXT: v_sub_u32_e32 v1, v4, v17
1284 ; GFX9-NEXT: v_xor_b32_e32 v4, v14, v15
1285 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1]
1286 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4
1287 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
1288 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1289 ; GFX9-NEXT: s_endpgm
1291 ; EG-LABEL: sdiv_v4i32:
1293 ; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
1295 ; EG-NEXT: ALU 2, @13, KC0[], KC1[]
1296 ; EG-NEXT: TEX 0 @10
1297 ; EG-NEXT: ALU 114, @16, KC0[CB0:0-32], KC1[]
1298 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
1301 ; EG-NEXT: Fetch clause starting at 8:
1302 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
1303 ; EG-NEXT: Fetch clause starting at 10:
1304 ; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 0, #1
1305 ; EG-NEXT: ALU clause starting at 12:
1306 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1307 ; EG-NEXT: ALU clause starting at 13:
1308 ; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Z,
1309 ; EG-NEXT: ADD_INT * T2.W, T1.Z, PV.W,
1310 ; EG-NEXT: XOR_INT * T2.W, PV.W, T0.W,
1311 ; EG-NEXT: ALU clause starting at 16:
1312 ; EG-NEXT: RECIP_UINT * T0.X, T2.W,
1313 ; EG-NEXT: MULLO_INT * T0.Y, PS, T2.W,
1314 ; EG-NEXT: SUB_INT T4.W, 0.0, PS,
1315 ; EG-NEXT: MULHI * T0.Z, T0.X, T2.W,
1316 ; EG-NEXT: CNDE_INT T4.W, PS, PV.W, T0.Y,
1317 ; EG-NEXT: SETGT_INT * T5.W, 0.0, T3.Z,
1318 ; EG-NEXT: MULHI * T0.Y, PV.W, T0.X,
1319 ; EG-NEXT: SETGT_INT T2.Y, 0.0, T1.W,
1320 ; EG-NEXT: ADD_INT T1.Z, T3.Z, T5.W, BS:VEC_021/SCL_122
1321 ; EG-NEXT: ADD_INT T4.W, T0.X, PS,
1322 ; EG-NEXT: SUB_INT * T6.W, T0.X, PS,
1323 ; EG-NEXT: CNDE_INT T0.Z, T0.Z, PV.W, PS,
1324 ; EG-NEXT: XOR_INT T4.W, PV.Z, T5.W,
1325 ; EG-NEXT: ADD_INT * T1.W, T1.W, PV.Y,
1326 ; EG-NEXT: XOR_INT T1.W, PS, T2.Y,
1327 ; EG-NEXT: MULHI * T0.X, PV.Z, PV.W,
1328 ; EG-NEXT: SETGT_INT T6.W, 0.0, T1.Y,
1329 ; EG-NEXT: RECIP_UINT * T0.Y, PV.W,
1330 ; EG-NEXT: ADD_INT T7.W, T1.Y, PV.W,
1331 ; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W,
1332 ; EG-NEXT: XOR_INT T1.Z, PV.W, T6.W, BS:VEC_021/SCL_122
1333 ; EG-NEXT: SUB_INT T7.W, 0.0, PS,
1334 ; EG-NEXT: MULHI * T1.Y, T0.Y, T1.W,
1335 ; EG-NEXT: CNDE_INT T7.W, PS, PV.W, T0.Z,
1336 ; EG-NEXT: RECIP_UINT * T0.Z, PV.Z,
1337 ; EG-NEXT: SETGT_INT T8.W, 0.0, T3.W,
1338 ; EG-NEXT: MULHI * T2.X, PV.W, T0.Y,
1339 ; EG-NEXT: ADD_INT T4.Y, T3.W, PV.W,
1340 ; EG-NEXT: ADD_INT T2.Z, T0.Y, PS,
1341 ; EG-NEXT: SUB_INT T3.W, T0.Y, PS,
1342 ; EG-NEXT: MULLO_INT * T0.Y, T0.Z, T1.Z,
1343 ; EG-NEXT: CNDE_INT T2.X, T1.Y, PV.Z, PV.W,
1344 ; EG-NEXT: XOR_INT T1.Y, PV.Y, T8.W,
1345 ; EG-NEXT: SETGT_INT T2.Z, 0.0, T1.X,
1346 ; EG-NEXT: SUB_INT T3.W, 0.0, PS,
1347 ; EG-NEXT: MULHI * T3.Z, T0.Z, T1.Z,
1348 ; EG-NEXT: CNDE_INT T4.Z, PS, PV.W, T0.Y,
1349 ; EG-NEXT: ADD_INT T3.W, T1.X, PV.Z,
1350 ; EG-NEXT: MULHI * T0.Y, PV.X, PV.Y,
1351 ; EG-NEXT: XOR_INT T3.W, PV.W, T2.Z, BS:VEC_021/SCL_122
1352 ; EG-NEXT: MULHI * T1.X, PV.Z, T0.Z,
1353 ; EG-NEXT: RECIP_UINT * T2.X, PV.W,
1354 ; EG-NEXT: MULLO_INT * T4.X, PS, T3.W,
1355 ; EG-NEXT: SETGT_INT T4.Z, 0.0, T3.Y,
1356 ; EG-NEXT: SUB_INT T7.W, 0.0, PS,
1357 ; EG-NEXT: MULHI * T4.Y, T2.X, T3.W,
1358 ; EG-NEXT: CNDE_INT T4.X, PS, PV.W, T4.X,
1359 ; EG-NEXT: ADD_INT T3.Y, T3.Y, PV.Z,
1360 ; EG-NEXT: ADD_INT T5.Z, T0.Z, T1.X,
1361 ; EG-NEXT: SUB_INT T7.W, T0.Z, T1.X,
1362 ; EG-NEXT: MULLO_INT * T0.Z, T0.Y, T1.W,
1363 ; EG-NEXT: CNDE_INT T5.Y, T3.Z, PV.Z, PV.W,
1364 ; EG-NEXT: XOR_INT T3.Z, PV.Y, T4.Z,
1365 ; EG-NEXT: SUB_INT T7.W, T1.Y, PS,
1366 ; EG-NEXT: MULHI * T1.X, PV.X, T2.X,
1367 ; EG-NEXT: SETGE_UINT T5.Z, PV.W, T1.W,
1368 ; EG-NEXT: SETGE_UINT T1.W, T1.Y, T0.Z,
1369 ; EG-NEXT: MULHI * T0.Z, PV.Y, PV.Z,
1370 ; EG-NEXT: AND_INT T1.Y, PV.Z, PV.W,
1371 ; EG-NEXT: ADD_INT T5.Z, T0.Y, 1,
1372 ; EG-NEXT: SETGT_INT T7.W, 0.0, T3.X,
1373 ; EG-NEXT: MULLO_INT * T3.Y, PS, T1.Z,
1374 ; EG-NEXT: SUB_INT T4.X, T3.Z, PS,
1375 ; EG-NEXT: ADD_INT T5.Y, T3.X, PV.W,
1376 ; EG-NEXT: ADD_INT T6.Z, T2.X, T1.X, BS:VEC_120/SCL_212
1377 ; EG-NEXT: SUB_INT * T9.W, T2.X, T1.X, BS:VEC_120/SCL_212
1378 ; EG-NEXT: MULLO_INT * T1.X, T0.X, T2.W,
1379 ; EG-NEXT: CNDE_INT T2.X, T4.Y, T6.Z, T9.W,
1380 ; EG-NEXT: XOR_INT T4.Y, T5.Y, T7.W, BS:VEC_201
1381 ; EG-NEXT: SUB_INT T6.Z, T4.W, PS, BS:VEC_120/SCL_212
1382 ; EG-NEXT: SETGE_UINT T9.W, T4.X, T1.Z, BS:VEC_102/SCL_221
1383 ; EG-NEXT: SETGE_UINT * T10.W, T3.Z, T3.Y,
1384 ; EG-NEXT: AND_INT T3.X, PV.W, PS,
1385 ; EG-NEXT: ADD_INT T3.Y, T0.Z, 1,
1386 ; EG-NEXT: SETGE_UINT T1.Z, PV.Z, T2.W,
1387 ; EG-NEXT: SETGE_UINT T2.W, T4.W, T1.X,
1388 ; EG-NEXT: MULHI * T1.X, PV.X, PV.Y,
1389 ; EG-NEXT: AND_INT T2.X, PV.Z, PV.W,
1390 ; EG-NEXT: ADD_INT T5.Y, T0.X, 1,
1391 ; EG-NEXT: CNDE_INT T1.Z, PV.X, T0.Z, PV.Y,
1392 ; EG-NEXT: ADD_INT T4.W, T0.Z, literal.x,
1393 ; EG-NEXT: MULLO_INT * T0.Z, PS, T3.W,
1394 ; EG-NEXT: -1(nan), 0(0.000000e+00)
1395 ; EG-NEXT: CNDE_INT T3.X, T10.W, PV.W, PV.Z,
1396 ; EG-NEXT: CNDE_INT T3.Y, PV.X, T0.X, PV.Y,
1397 ; EG-NEXT: CNDE_INT T1.Z, T1.Y, T0.Y, T5.Z,
1398 ; EG-NEXT: ADD_INT T4.W, T0.Y, literal.x, BS:VEC_120/SCL_212
1399 ; EG-NEXT: SUB_INT * T9.W, T4.Y, PS,
1400 ; EG-NEXT: -1(nan), 0(0.000000e+00)
1401 ; EG-NEXT: ADD_INT T0.X, T0.X, literal.x,
1402 ; EG-NEXT: SETGE_UINT T0.Y, PS, T3.W,
1403 ; EG-NEXT: SETGE_UINT T0.Z, T4.Y, T0.Z,
1404 ; EG-NEXT: CNDE_INT T1.W, T1.W, PV.W, PV.Z,
1405 ; EG-NEXT: XOR_INT * T3.W, T8.W, T2.Y,
1406 ; EG-NEXT: -1(nan), 0(0.000000e+00)
1407 ; EG-NEXT: XOR_INT T2.X, PV.W, PS,
1408 ; EG-NEXT: AND_INT T0.Y, PV.Y, PV.Z,
1409 ; EG-NEXT: ADD_INT T1.Z, T1.X, 1,
1410 ; EG-NEXT: CNDE_INT T1.W, T2.W, PV.X, T3.Y,
1411 ; EG-NEXT: XOR_INT * T0.W, T5.W, T0.W,
1412 ; EG-NEXT: XOR_INT T0.X, T4.Z, T6.W, BS:VEC_021/SCL_122
1413 ; EG-NEXT: XOR_INT T1.Y, PV.W, PS,
1414 ; EG-NEXT: CNDE_INT T1.Z, PV.Y, T1.X, PV.Z,
1415 ; EG-NEXT: ADD_INT T1.W, T1.X, literal.x,
1416 ; EG-NEXT: SUB_INT * T3.W, PV.X, T3.W,
1417 ; EG-NEXT: -1(nan), 0(0.000000e+00)
1418 ; EG-NEXT: CNDE_INT T0.Y, T0.Z, PV.W, PV.Z,
1419 ; EG-NEXT: SUB_INT T3.Z, PV.Y, T0.W,
1420 ; EG-NEXT: XOR_INT T0.W, T7.W, T2.Z,
1421 ; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X,
1422 ; EG-NEXT: SUB_INT T3.Y, PS, T0.X,
1423 ; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.W,
1424 ; EG-NEXT: SUB_INT T3.X, PV.W, T0.W,
1425 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1426 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test - the denominator is the <4 x i32> element right after the
; numerator in the same buffer.
1427 %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1428 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
1429 %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
1430 %result = sdiv <4 x i32> %num, %den
1431 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1435 define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
1436 ; GCN-LABEL: sdiv_v4i32_4:
1438 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1439 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1440 ; GCN-NEXT: s_mov_b32 s6, -1
1441 ; GCN-NEXT: s_mov_b32 s10, s6
1442 ; GCN-NEXT: s_mov_b32 s11, s7
1443 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1444 ; GCN-NEXT: s_mov_b32 s8, s2
1445 ; GCN-NEXT: s_mov_b32 s9, s3
1446 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1447 ; GCN-NEXT: s_mov_b32 s4, s0
1448 ; GCN-NEXT: s_mov_b32 s5, s1
1449 ; GCN-NEXT: s_waitcnt vmcnt(0)
1450 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1451 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1
1452 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v2
1453 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3
1454 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 30, v4
1455 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5
1456 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6
1457 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7
1458 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
1459 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1
1460 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2
1461 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3
1462 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
1463 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1
1464 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2
1465 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3
1466 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1467 ; GCN-NEXT: s_endpgm
1469 ; TONGA-LABEL: sdiv_v4i32_4:
1471 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1472 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1473 ; TONGA-NEXT: s_mov_b32 s2, -1
1474 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1475 ; TONGA-NEXT: s_mov_b32 s0, s4
1476 ; TONGA-NEXT: s_mov_b32 s1, s5
1477 ; TONGA-NEXT: s_mov_b32 s4, s6
1478 ; TONGA-NEXT: s_mov_b32 s5, s7
1479 ; TONGA-NEXT: s_mov_b32 s6, s2
1480 ; TONGA-NEXT: s_mov_b32 s7, s3
1481 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1482 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1483 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1484 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1
1485 ; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v2
1486 ; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
1487 ; TONGA-NEXT: v_lshrrev_b32_e32 v4, 30, v4
1488 ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 30, v5
1489 ; TONGA-NEXT: v_lshrrev_b32_e32 v6, 30, v6
1490 ; TONGA-NEXT: v_lshrrev_b32_e32 v7, 30, v7
1491 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0
1492 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1
1493 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2
1494 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3
1495 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
1496 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
1497 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2
1498 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3
1499 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1500 ; TONGA-NEXT: s_endpgm
1502 ; GFX9-LABEL: sdiv_v4i32_4:
1504 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1505 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1506 ; GFX9-NEXT: s_mov_b32 s2, -1
1507 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1508 ; GFX9-NEXT: s_mov_b32 s0, s4
1509 ; GFX9-NEXT: s_mov_b32 s1, s5
1510 ; GFX9-NEXT: s_mov_b32 s4, s6
1511 ; GFX9-NEXT: s_mov_b32 s5, s7
1512 ; GFX9-NEXT: s_mov_b32 s6, s2
1513 ; GFX9-NEXT: s_mov_b32 s7, s3
1514 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1515 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1516 ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1517 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
1518 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v2
1519 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v3
1520 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 30, v4
1521 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 30, v5
1522 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 30, v6
1523 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 30, v7
1524 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
1525 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
1526 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
1527 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
1528 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
1529 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
1530 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2
1531 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3
1532 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1533 ; GFX9-NEXT: s_endpgm
1535 ; EG-LABEL: sdiv_v4i32_4:
1537 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1539 ; EG-NEXT: ALU 24, @9, KC0[CB0:0-32], KC1[]
1540 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1543 ; EG-NEXT: Fetch clause starting at 6:
1544 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1545 ; EG-NEXT: ALU clause starting at 8:
1546 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1547 ; EG-NEXT: ALU clause starting at 9:
1548 ; EG-NEXT: ASHR T1.W, T0.W, literal.x,
1549 ; EG-NEXT: ASHR * T2.W, T0.Z, literal.x,
1550 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1551 ; EG-NEXT: LSHR * T1.W, PV.W, literal.x,
1552 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
1553 ; EG-NEXT: ADD_INT T1.Z, T0.W, PV.W,
1554 ; EG-NEXT: LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212
1555 ; EG-NEXT: ASHR * T1.W, T0.Y, literal.y,
1556 ; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44)
1557 ; EG-NEXT: LSHR T1.Y, PS, literal.x,
1558 ; EG-NEXT: ASHR T2.Z, T0.X, literal.y,
1559 ; EG-NEXT: ADD_INT T0.W, T0.Z, PV.W,
1560 ; EG-NEXT: ASHR * T1.W, PV.Z, literal.z,
1561 ; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44)
1562 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1563 ; EG-NEXT: ASHR T1.Z, PV.W, literal.x,
1564 ; EG-NEXT: LSHR T0.W, PV.Z, literal.y,
1565 ; EG-NEXT: ADD_INT * T2.W, T0.Y, PV.Y,
1566 ; EG-NEXT: 2(2.802597e-45), 30(4.203895e-44)
1567 ; EG-NEXT: ASHR T1.Y, PS, literal.x,
1568 ; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
1569 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1570 ; EG-NEXT: ASHR T1.X, PV.W, literal.x,
1571 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1572 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1573 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
1574 %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
1575 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
; v_sdiv_i8 - signed division of two i8 values read from consecutive bytes of
; %in (dividend at byte 0, divisor at byte 1). The i8 quotient is sign-extended
; to i32 and stored to %out.
; The expected output for every checked target goes through a floating-point
; sequence (int-to-float conversion, reciprocal, multiply, truncate,
; multiply-add, magnitude compare, conditional select, integer add) and ends
; with a sign-extending 8-bit field extract before the store.
1579 define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
1580 ; GCN-LABEL: v_sdiv_i8:
1582 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1583 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1584 ; GCN-NEXT: s_mov_b32 s6, -1
1585 ; GCN-NEXT: s_mov_b32 s10, s6
1586 ; GCN-NEXT: s_mov_b32 s11, s7
1587 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1588 ; GCN-NEXT: s_mov_b32 s8, s2
1589 ; GCN-NEXT: s_mov_b32 s9, s3
1590 ; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
1591 ; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1
1592 ; GCN-NEXT: s_mov_b32 s4, s0
1593 ; GCN-NEXT: s_mov_b32 s5, s1
1594 ; GCN-NEXT: s_waitcnt vmcnt(0)
1595 ; GCN-NEXT: v_xor_b32_e32 v2, v0, v1
1596 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
1597 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
1598 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2
1599 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1
1600 ; GCN-NEXT: v_or_b32_e32 v2, 1, v2
1601 ; GCN-NEXT: v_mul_f32_e32 v3, v0, v3
1602 ; GCN-NEXT: v_trunc_f32_e32 v3, v3
1603 ; GCN-NEXT: v_mad_f32 v0, -v3, v1, v0
1604 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
1605 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1|
1606 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
1607 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
1608 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8
1609 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
1610 ; GCN-NEXT: s_endpgm
1612 ; TONGA-LABEL: v_sdiv_i8:
1614 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1615 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1616 ; TONGA-NEXT: s_mov_b32 s2, -1
1617 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1618 ; TONGA-NEXT: s_mov_b32 s0, s4
1619 ; TONGA-NEXT: s_mov_b32 s1, s5
1620 ; TONGA-NEXT: s_mov_b32 s4, s6
1621 ; TONGA-NEXT: s_mov_b32 s5, s7
1622 ; TONGA-NEXT: s_mov_b32 s6, s2
1623 ; TONGA-NEXT: s_mov_b32 s7, s3
1624 ; TONGA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:1
1625 ; TONGA-NEXT: buffer_load_sbyte v2, off, s[4:7], 0
1626 ; TONGA-NEXT: s_waitcnt vmcnt(1)
1627 ; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v0
1628 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1629 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v2
1630 ; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0
1631 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1632 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1
1633 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
1634 ; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4
1635 ; TONGA-NEXT: v_trunc_f32_e32 v2, v2
1636 ; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3
1637 ; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2
1638 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1639 ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1640 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1641 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8
1642 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
1643 ; TONGA-NEXT: s_endpgm
1645 ; GFX9-LABEL: v_sdiv_i8:
1647 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1648 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1649 ; GFX9-NEXT: s_mov_b32 s2, -1
1650 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1651 ; GFX9-NEXT: s_mov_b32 s0, s4
1652 ; GFX9-NEXT: s_mov_b32 s1, s5
1653 ; GFX9-NEXT: s_mov_b32 s4, s6
1654 ; GFX9-NEXT: s_mov_b32 s5, s7
1655 ; GFX9-NEXT: s_mov_b32 s6, s2
1656 ; GFX9-NEXT: s_mov_b32 s7, s3
1657 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:1
1658 ; GFX9-NEXT: buffer_load_sbyte v2, off, s[4:7], 0
1659 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1660 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0
1661 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1662 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v2
1663 ; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0
1664 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1665 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1
1666 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
1667 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v4
1668 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
1669 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
1670 ; GFX9-NEXT: v_mad_f32 v2, -v2, v1, v3
1671 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1|
1672 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1673 ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
1674 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
1675 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
1676 ; GFX9-NEXT: s_endpgm
1678 ; EG-LABEL: v_sdiv_i8:
1680 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1682 ; EG-NEXT: ALU 21, @11, KC0[CB0:0-32], KC1[]
1683 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1686 ; EG-NEXT: Fetch clause starting at 6:
1687 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1
1688 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1689 ; EG-NEXT: ALU clause starting at 10:
1690 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1691 ; EG-NEXT: ALU clause starting at 11:
1692 ; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x,
1693 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1694 ; EG-NEXT: INT_TO_FLT * T0.Y, PV.W,
1695 ; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, literal.x,
1696 ; EG-NEXT: RECIP_IEEE * T0.X, PS,
1697 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1698 ; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
1699 ; EG-NEXT: MUL_IEEE * T2.W, PS, T0.X,
1700 ; EG-NEXT: TRUNC T2.W, PV.W,
1701 ; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
1702 ; EG-NEXT: ASHR T0.W, PS, literal.x,
1703 ; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z,
1704 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
1705 ; EG-NEXT: TRUNC T0.Z, T2.W,
1706 ; EG-NEXT: SETGE T1.W, |PS|, |T0.Y|,
1707 ; EG-NEXT: OR_INT * T0.W, PV.W, 1,
1708 ; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
1709 ; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
1710 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
1711 ; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
1712 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1713 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
; IR being compiled - load dividend and divisor, divide, widen to i32, store.
1714 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
1715 %num = load i8, i8 addrspace(1) * %in
1716 %den = load i8, i8 addrspace(1) * %den_ptr
1717 %result = sdiv i8 %num, %den
1718 %result.ext = sext i8 %result to i32
1719 store i32 %result.ext, i32 addrspace(1)* %out
; v_sdiv_i23 - signed division of two i23 values, the dividend at %in and the
; divisor at the next i23 element. The quotient is sign-extended to i32 and
; stored to %out.
; In the expected output each operand is assembled from a 16-bit load plus an
; 8-bit load (the second element is read at byte offsets 4 and 6) and then
; sign-extended from 23 bits. The division itself uses the floating-point
; sequence (int-to-float, reciprocal, multiply, truncate, multiply-add,
; magnitude compare, conditional select) on all checked targets, followed by a
; final 23-bit sign extension of the result.
1723 define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
1724 ; GCN-LABEL: v_sdiv_i23:
1726 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1727 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1728 ; GCN-NEXT: s_mov_b32 s6, -1
1729 ; GCN-NEXT: s_mov_b32 s10, s6
1730 ; GCN-NEXT: s_mov_b32 s11, s7
1731 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1732 ; GCN-NEXT: s_mov_b32 s4, s0
1733 ; GCN-NEXT: s_mov_b32 s5, s1
1734 ; GCN-NEXT: s_mov_b32 s8, s2
1735 ; GCN-NEXT: s_mov_b32 s9, s3
1736 ; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
1737 ; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
1738 ; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1739 ; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
1740 ; GCN-NEXT: s_waitcnt vmcnt(3)
1741 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1742 ; GCN-NEXT: s_waitcnt vmcnt(2)
1743 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1744 ; GCN-NEXT: s_waitcnt vmcnt(1)
1745 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
1746 ; GCN-NEXT: s_waitcnt vmcnt(0)
1747 ; GCN-NEXT: v_or_b32_e32 v1, v2, v3
1748 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
1749 ; GCN-NEXT: v_bfe_i32 v1, v1, 0, 23
1750 ; GCN-NEXT: v_xor_b32_e32 v2, v0, v1
1751 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
1752 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
1753 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2
1754 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1
1755 ; GCN-NEXT: v_or_b32_e32 v2, 1, v2
1756 ; GCN-NEXT: v_mul_f32_e32 v3, v0, v3
1757 ; GCN-NEXT: v_trunc_f32_e32 v3, v3
1758 ; GCN-NEXT: v_mad_f32 v0, -v3, v1, v0
1759 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
1760 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v1|
1761 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
1762 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
1763 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
1764 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
1765 ; GCN-NEXT: s_endpgm
1767 ; TONGA-LABEL: v_sdiv_i23:
1769 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1770 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1771 ; TONGA-NEXT: s_mov_b32 s2, -1
1772 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1773 ; TONGA-NEXT: s_mov_b32 s0, s4
1774 ; TONGA-NEXT: s_mov_b32 s1, s5
1775 ; TONGA-NEXT: s_mov_b32 s4, s6
1776 ; TONGA-NEXT: s_mov_b32 s5, s7
1777 ; TONGA-NEXT: s_mov_b32 s6, s2
1778 ; TONGA-NEXT: s_mov_b32 s7, s3
1779 ; TONGA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2
1780 ; TONGA-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4
1781 ; TONGA-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6
1782 ; TONGA-NEXT: buffer_load_ushort v3, off, s[4:7], 0
1783 ; TONGA-NEXT: s_waitcnt vmcnt(3)
1784 ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1785 ; TONGA-NEXT: s_waitcnt vmcnt(1)
1786 ; TONGA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1787 ; TONGA-NEXT: v_or_b32_e32 v1, v1, v2
1788 ; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 23
1789 ; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1
1790 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1791 ; TONGA-NEXT: v_or_b32_e32 v0, v3, v0
1792 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
1793 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0
1794 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2
1795 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
1796 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1797 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
1798 ; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4
1799 ; TONGA-NEXT: v_trunc_f32_e32 v1, v1
1800 ; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3
1801 ; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1
1802 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1803 ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1804 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
1805 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
1806 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
1807 ; TONGA-NEXT: s_endpgm
1809 ; GFX9-LABEL: v_sdiv_i23:
1811 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1812 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1813 ; GFX9-NEXT: s_mov_b32 s2, -1
1814 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1815 ; GFX9-NEXT: s_mov_b32 s0, s4
1816 ; GFX9-NEXT: s_mov_b32 s1, s5
1817 ; GFX9-NEXT: s_mov_b32 s4, s6
1818 ; GFX9-NEXT: s_mov_b32 s5, s7
1819 ; GFX9-NEXT: s_mov_b32 s6, s2
1820 ; GFX9-NEXT: s_mov_b32 s7, s3
1821 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2
1822 ; GFX9-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4
1823 ; GFX9-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:6
1824 ; GFX9-NEXT: buffer_load_ushort v3, off, s[4:7], 0
1825 ; GFX9-NEXT: s_waitcnt vmcnt(3)
1826 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1827 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1828 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1829 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
1830 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 23
1831 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1
1832 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1833 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
1834 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
1835 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0
1836 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
1837 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
1838 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1839 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
1840 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4
1841 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
1842 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1
1843 ; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3
1844 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2|
1845 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1846 ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
1847 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
1848 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
1849 ; GFX9-NEXT: s_endpgm
1851 ; EG-LABEL: v_sdiv_i23:
1853 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1855 ; EG-NEXT: ALU 33, @15, KC0[CB0:0-32], KC1[]
1856 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1859 ; EG-NEXT: Fetch clause starting at 6:
1860 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
1861 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
1862 ; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
1863 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
1864 ; EG-NEXT: ALU clause starting at 14:
1865 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1866 ; EG-NEXT: ALU clause starting at 15:
1867 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1868 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1869 ; EG-NEXT: OR_INT T0.W, T0.X, PV.W,
1870 ; EG-NEXT: LSHL * T1.W, T3.X, literal.x,
1871 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1872 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1873 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1874 ; EG-NEXT: ASHR T0.W, PV.W, literal.x,
1875 ; EG-NEXT: OR_INT * T1.W, T2.X, T1.W,
1876 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1877 ; EG-NEXT: LSHL T1.W, PS, literal.x,
1878 ; EG-NEXT: INT_TO_FLT * T0.X, PV.W,
1879 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1880 ; EG-NEXT: ASHR T1.W, PV.W, literal.x,
1881 ; EG-NEXT: RECIP_IEEE * T0.Y, PS,
1882 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1883 ; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
1884 ; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y,
1885 ; EG-NEXT: TRUNC T2.W, PV.W,
1886 ; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
1887 ; EG-NEXT: ASHR T0.W, PS, literal.x,
1888 ; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
1889 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
1890 ; EG-NEXT: TRUNC T0.Z, T2.W,
1891 ; EG-NEXT: SETGE T1.W, |PS|, |T0.X|,
1892 ; EG-NEXT: OR_INT * T0.W, PV.W, 1,
1893 ; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
1894 ; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
1895 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
1896 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1897 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1898 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
1899 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1900 ; EG-NEXT: 9(1.261169e-44), 2(2.802597e-45)
; IR being compiled - load dividend and divisor, divide, widen to i32, store.
1901 %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
1902 %num = load i23, i23 addrspace(1) * %in
1903 %den = load i23, i23 addrspace(1) * %den_ptr
1904 %result = sdiv i23 %num, %den
1905 %result.ext = sext i23 %result to i32
1906 store i32 %result.ext, i32 addrspace(1)* %out
; v_sdiv_i24 - signed division of two i24 values, the dividend at %in and the
; divisor at the next i24 element. The quotient is sign-extended to i32 and
; stored to %out.
; In the expected output for the GCN, TONGA and GFX9 runs the top byte of each
; operand is fetched with a sign-extending byte load and the division uses the
; floating-point sequence (int-to-float, reciprocal, multiply, truncate,
; multiply-add, magnitude compare, conditional select), ending with a 24-bit
; sign-extending field extract.
; The expected EG output differs: it uses an integer reciprocal expansion
; (RECIP_UINT, MULLO_INT, MULHI and unsigned compares) instead of the float
; sequence seen in the EG checks of the i8 and i23 tests in this file.
1910 define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
1911 ; GCN-LABEL: v_sdiv_i24:
1913 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1914 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1915 ; GCN-NEXT: s_mov_b32 s6, -1
1916 ; GCN-NEXT: s_mov_b32 s10, s6
1917 ; GCN-NEXT: s_mov_b32 s11, s7
1918 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1919 ; GCN-NEXT: s_mov_b32 s4, s0
1920 ; GCN-NEXT: s_mov_b32 s5, s1
1921 ; GCN-NEXT: s_mov_b32 s8, s2
1922 ; GCN-NEXT: s_mov_b32 s9, s3
1923 ; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2
1924 ; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6
1925 ; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1926 ; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
1927 ; GCN-NEXT: s_waitcnt vmcnt(3)
1928 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1929 ; GCN-NEXT: s_waitcnt vmcnt(2)
1930 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1931 ; GCN-NEXT: s_waitcnt vmcnt(1)
1932 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
1933 ; GCN-NEXT: s_waitcnt vmcnt(0)
1934 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3
1935 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v3
1936 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1937 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
1938 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
1939 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1
1940 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2
1941 ; GCN-NEXT: v_mul_f32_e32 v3, v0, v3
1942 ; GCN-NEXT: v_trunc_f32_e32 v3, v3
1943 ; GCN-NEXT: v_mad_f32 v0, -v3, v2, v0
1944 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
1945 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2|
1946 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1947 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
1948 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
1949 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
1950 ; GCN-NEXT: s_endpgm
1952 ; TONGA-LABEL: v_sdiv_i24:
1954 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1955 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1956 ; TONGA-NEXT: s_mov_b32 s2, -1
1957 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1958 ; TONGA-NEXT: s_mov_b32 s0, s4
1959 ; TONGA-NEXT: s_mov_b32 s1, s5
1960 ; TONGA-NEXT: s_mov_b32 s4, s6
1961 ; TONGA-NEXT: s_mov_b32 s5, s7
1962 ; TONGA-NEXT: s_mov_b32 s6, s2
1963 ; TONGA-NEXT: s_mov_b32 s7, s3
1964 ; TONGA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:2
1965 ; TONGA-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4
1966 ; TONGA-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 offset:6
1967 ; TONGA-NEXT: buffer_load_ushort v3, off, s[4:7], 0
1968 ; TONGA-NEXT: s_waitcnt vmcnt(3)
1969 ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1970 ; TONGA-NEXT: s_waitcnt vmcnt(1)
1971 ; TONGA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1972 ; TONGA-NEXT: v_or_b32_e32 v1, v1, v2
1973 ; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v1
1974 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1975 ; TONGA-NEXT: v_or_b32_e32 v3, v3, v0
1976 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3
1977 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2
1978 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1
1979 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1980 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
1981 ; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4
1982 ; TONGA-NEXT: v_trunc_f32_e32 v2, v2
1983 ; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3
1984 ; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2
1985 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1986 ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1987 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1988 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24
1989 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
1990 ; TONGA-NEXT: s_endpgm
1992 ; GFX9-LABEL: v_sdiv_i24:
1994 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1995 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1996 ; GFX9-NEXT: s_mov_b32 s2, -1
1997 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1998 ; GFX9-NEXT: s_mov_b32 s0, s4
1999 ; GFX9-NEXT: s_mov_b32 s1, s5
2000 ; GFX9-NEXT: s_mov_b32 s4, s6
2001 ; GFX9-NEXT: s_mov_b32 s5, s7
2002 ; GFX9-NEXT: s_mov_b32 s6, s2
2003 ; GFX9-NEXT: s_mov_b32 s7, s3
2004 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
2005 ; GFX9-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2
2006 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
2007 ; GFX9-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6
2008 ; GFX9-NEXT: s_waitcnt vmcnt(2)
2009 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2010 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
2011 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2012 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2013 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
2014 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
2015 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
2016 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v3
2017 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1
2018 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
2019 ; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
2020 ; GFX9-NEXT: v_mul_f32_e32 v3, v0, v4
2021 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
2022 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v3
2023 ; GFX9-NEXT: v_mad_f32 v0, -v3, v2, v0
2024 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2|
2025 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
2026 ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
2027 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24
2028 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
2029 ; GFX9-NEXT: s_endpgm
2031 ; EG-LABEL: v_sdiv_i24:
2033 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
2035 ; EG-NEXT: ALU 43, @15, KC0[CB0:0-32], KC1[]
2036 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2039 ; EG-NEXT: Fetch clause starting at 6:
2040 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
2041 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
2042 ; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
2043 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
2044 ; EG-NEXT: ALU clause starting at 14:
2045 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2046 ; EG-NEXT: ALU clause starting at 15:
2047 ; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x,
2048 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2049 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2050 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2051 ; EG-NEXT: OR_INT * T0.W, T0.X, PV.W,
2052 ; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W,
2053 ; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W,
2054 ; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W,
2055 ; EG-NEXT: RECIP_UINT * T0.X, PV.W,
2056 ; EG-NEXT: BFE_INT T2.W, T3.X, 0.0, literal.x,
2057 ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
2058 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2059 ; EG-NEXT: LSHL T0.Z, PV.W, literal.x,
2060 ; EG-NEXT: SUB_INT T2.W, 0.0, PS,
2061 ; EG-NEXT: MULHI * T1.X, T0.X, T0.W,
2062 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2063 ; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Y,
2064 ; EG-NEXT: OR_INT * T3.W, T2.X, PV.Z,
2065 ; EG-NEXT: SETGT_INT T4.W, 0.0, PS,
2066 ; EG-NEXT: MULHI * T0.Y, PV.W, T0.X,
2067 ; EG-NEXT: ADD_INT T0.Z, T3.W, PV.W,
2068 ; EG-NEXT: ADD_INT T2.W, T0.X, PS,
2069 ; EG-NEXT: SUB_INT * T3.W, T0.X, PS,
2070 ; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS,
2071 ; EG-NEXT: XOR_INT * T3.W, PV.Z, T4.W,
2072 ; EG-NEXT: MULHI * T0.X, PV.W, PS,
2073 ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
2074 ; EG-NEXT: SUB_INT * T2.W, T3.W, PS,
2075 ; EG-NEXT: SETGE_UINT T0.W, PV.W, T0.W,
2076 ; EG-NEXT: SETGE_UINT * T2.W, T3.W, T0.Y,
2077 ; EG-NEXT: AND_INT T0.W, PV.W, PS,
2078 ; EG-NEXT: ADD_INT * T3.W, T0.X, 1,
2079 ; EG-NEXT: CNDE_INT T0.W, PV.W, T0.X, PS,
2080 ; EG-NEXT: ADD_INT * T3.W, T0.X, literal.x,
2081 ; EG-NEXT: -1(nan), 0(0.000000e+00)
2082 ; EG-NEXT: CNDE_INT T0.W, T2.W, PS, PV.W,
2083 ; EG-NEXT: XOR_INT * T1.W, T4.W, T1.W,
2084 ; EG-NEXT: XOR_INT * T0.W, PV.W, PS,
2085 ; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W,
2086 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2087 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
2088 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
2089 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
2090 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
; IR being compiled - load dividend and divisor, divide, widen to i32, store.
2091 %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
2092 %num = load i24, i24 addrspace(1) * %in
2093 %den = load i24, i24 addrspace(1) * %den_ptr
2094 %result = sdiv i24 %num, %den
2095 %result.ext = sext i24 %result to i32
2096 store i32 %result.ext, i32 addrspace(1)* %out
; v_sdiv_i25 - signed division of two i25 values, the dividend at %in and the
; divisor at the next i25 element. The quotient is sign-extended to i32 and
; stored to %out.
; In the expected output both operands arrive in a single two-dword load and
; are sign-extended from 25 bits; bit 24 is also extracted on its own as the
; sign mask. Unlike the i8 and i23 tests in this file, every checked target
; here uses the integer reciprocal expansion (unsigned int-to-float,
; reciprocal, scale by 0x4f800000, mul_hi/mul_lo refinement, quotient
; correction by +1/-1) and then re-applies the sign with xor/sub.
; NOTE(review): presumably 25-bit operands are too wide for the float-based
; path used by the narrower tests - confirm against the sdiv lowering code.
2100 define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
2101 ; GCN-LABEL: v_sdiv_i25:
2103 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2104 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2105 ; GCN-NEXT: s_mov_b32 s6, -1
2106 ; GCN-NEXT: s_mov_b32 s10, s6
2107 ; GCN-NEXT: s_mov_b32 s11, s7
2108 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
2109 ; GCN-NEXT: s_mov_b32 s8, s2
2110 ; GCN-NEXT: s_mov_b32 s9, s3
2111 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2112 ; GCN-NEXT: s_mov_b32 s4, s0
2113 ; GCN-NEXT: s_mov_b32 s5, s1
2114 ; GCN-NEXT: s_waitcnt vmcnt(0)
2115 ; GCN-NEXT: v_bfe_i32 v2, v0, 0, 25
2116 ; GCN-NEXT: v_bfe_i32 v3, v1, 0, 25
2117 ; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1
2118 ; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1
2119 ; GCN-NEXT: v_xor_b32_e32 v4, v0, v1
2120 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2
2121 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v3
2122 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
2123 ; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
2124 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
2125 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
2126 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
2127 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
2128 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v1
2129 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v1
2130 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
2131 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
2132 ; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
2133 ; GCN-NEXT: v_mul_hi_u32 v3, v3, v2
2134 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v3, v2
2135 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2
2136 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2137 ; GCN-NEXT: v_mul_hi_u32 v2, v2, v0
2138 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v1
2139 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2
2140 ; GCN-NEXT: v_add_i32_e32 v6, vcc, -1, v2
2141 ; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v0
2142 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
2143 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v1
2144 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc
2145 ; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v5, s[0:1]
2146 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
2147 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
2148 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
2149 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
2150 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
2151 ; GCN-NEXT: s_endpgm
2153 ; TONGA-LABEL: v_sdiv_i25:
2155 ; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
2156 ; TONGA-NEXT: s_mov_b32 s7, 0xf000
2157 ; TONGA-NEXT: s_mov_b32 s6, -1
2158 ; TONGA-NEXT: s_mov_b32 s2, s6
2159 ; TONGA-NEXT: s_mov_b32 s3, s7
2160 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
2161 ; TONGA-NEXT: s_mov_b32 s0, s10
2162 ; TONGA-NEXT: s_mov_b32 s1, s11
2163 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2164 ; TONGA-NEXT: s_mov_b32 s4, s8
2165 ; TONGA-NEXT: s_mov_b32 s5, s9
2166 ; TONGA-NEXT: s_waitcnt vmcnt(0)
2167 ; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25
2168 ; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1
2169 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v1, v2
2170 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v1
2171 ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2
2172 ; TONGA-NEXT: v_bfe_i32 v4, v0, 0, 25
2173 ; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1
2174 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v0, v4
2175 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
2176 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v0
2177 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
2178 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
2179 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
2180 ; TONGA-NEXT: v_mul_lo_u32 v5, v3, v2
2181 ; TONGA-NEXT: v_mul_hi_u32 v6, v3, v2
2182 ; TONGA-NEXT: v_sub_u32_e32 v7, vcc, 0, v5
2183 ; TONGA-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
2184 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1]
2185 ; TONGA-NEXT: v_mul_hi_u32 v5, v5, v3
2186 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, v5, v3
2187 ; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v5, v3
2188 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2189 ; TONGA-NEXT: v_mul_hi_u32 v3, v3, v4
2190 ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
2191 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
2192 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, -1, v3
2193 ; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v1, v4
2194 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
2195 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v2
2196 ; TONGA-NEXT: s_and_b64 s[0:1], s[0:1], vcc
2197 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v3, v5, s[0:1]
2198 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
2199 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
2200 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
2201 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
2202 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
2203 ; TONGA-NEXT: s_endpgm
2205 ; GFX9-LABEL: v_sdiv_i25:
2207 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2208 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
2209 ; GFX9-NEXT: s_mov_b32 s6, -1
2210 ; GFX9-NEXT: s_mov_b32 s10, s6
2211 ; GFX9-NEXT: s_mov_b32 s11, s7
2212 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2213 ; GFX9-NEXT: s_mov_b32 s8, s2
2214 ; GFX9-NEXT: s_mov_b32 s9, s3
2215 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2216 ; GFX9-NEXT: s_mov_b32 s4, s0
2217 ; GFX9-NEXT: s_mov_b32 s5, s1
2218 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2219 ; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 25
2220 ; GFX9-NEXT: v_bfe_i32 v1, v1, 24, 1
2221 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
2222 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1
2223 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
2224 ; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 25
2225 ; GFX9-NEXT: v_bfe_i32 v0, v0, 24, 1
2226 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
2227 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
2228 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
2229 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2
2230 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v2
2231 ; GFX9-NEXT: v_sub_u32_e32 v7, 0, v4
2232 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
2233 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
2234 ; GFX9-NEXT: v_mul_hi_u32 v4, v4, v3
2235 ; GFX9-NEXT: v_add_u32_e32 v5, v6, v0
2236 ; GFX9-NEXT: v_xor_b32_e32 v5, v5, v0
2237 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
2238 ; GFX9-NEXT: v_add_u32_e32 v6, v3, v4
2239 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
2240 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
2241 ; GFX9-NEXT: v_mul_hi_u32 v3, v3, v5
2242 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2
2243 ; GFX9-NEXT: v_add_u32_e32 v1, 1, v3
2244 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
2245 ; GFX9-NEXT: v_sub_u32_e32 v7, v5, v4
2246 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
2247 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v7, v2
2248 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
2249 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
2250 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
2251 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0
2252 ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
2253 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 25
2254 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
2255 ; GFX9-NEXT: s_endpgm
2257 ; EG-LABEL: v_sdiv_i25:
2259 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2261 ; EG-NEXT: ALU 41, @12, KC0[CB0:0-32], KC1[]
2262 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2265 ; EG-NEXT: Fetch clause starting at 6:
2266 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
2267 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2268 ; EG-NEXT: ALU clause starting at 10:
2269 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2270 ; EG-NEXT: MOV * T1.X, PV.X,
2271 ; EG-NEXT: ALU clause starting at 12:
2272 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2273 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2274 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
2275 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2276 ; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W,
2277 ; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W,
2278 ; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W,
2279 ; EG-NEXT: RECIP_UINT * T0.X, PV.W,
2280 ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
2281 ; EG-NEXT: LSHL T0.Z, T1.X, literal.x,
2282 ; EG-NEXT: SUB_INT T2.W, 0.0, PS,
2283 ; EG-NEXT: MULHI * T1.X, T0.X, T0.W,
2284 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2285 ; EG-NEXT: CNDE_INT T2.W, PS, PV.W, T0.Y,
2286 ; EG-NEXT: ASHR * T3.W, PV.Z, literal.x,
2287 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2288 ; EG-NEXT: SETGT_INT T4.W, 0.0, PS,
2289 ; EG-NEXT: MULHI * T0.Y, PV.W, T0.X,
2290 ; EG-NEXT: ADD_INT T0.Z, T3.W, PV.W,
2291 ; EG-NEXT: ADD_INT T2.W, T0.X, PS,
2292 ; EG-NEXT: SUB_INT * T3.W, T0.X, PS,
2293 ; EG-NEXT: CNDE_INT T2.W, T1.X, PV.W, PS,
2294 ; EG-NEXT: XOR_INT * T3.W, PV.Z, T4.W,
2295 ; EG-NEXT: MULHI * T0.X, PV.W, PS,
2296 ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
2297 ; EG-NEXT: SUB_INT * T2.W, T3.W, PS,
2298 ; EG-NEXT: SETGE_UINT T0.W, PV.W, T0.W,
2299 ; EG-NEXT: SETGE_UINT * T2.W, T3.W, T0.Y,
2300 ; EG-NEXT: AND_INT T0.W, PV.W, PS,
2301 ; EG-NEXT: ADD_INT * T3.W, T0.X, 1,
2302 ; EG-NEXT: CNDE_INT T0.W, PV.W, T0.X, PS,
2303 ; EG-NEXT: ADD_INT * T3.W, T0.X, literal.x,
2304 ; EG-NEXT: -1(nan), 0(0.000000e+00)
2305 ; EG-NEXT: CNDE_INT T0.W, T2.W, PS, PV.W,
2306 ; EG-NEXT: XOR_INT * T1.W, T4.W, T1.W,
2307 ; EG-NEXT: XOR_INT * T0.W, PV.W, PS,
2308 ; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W,
2309 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2310 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2311 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
2312 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
2313 ; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
; IR being compiled - load dividend and divisor, divide, widen to i32, store.
2314 %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
2315 %num = load i25, i25 addrspace(1) * %in
2316 %den = load i25, i25 addrspace(1) * %den_ptr
2317 %result = sdiv i25 %num, %den
2318 %result.ext = sext i25 %result to i32
2319 store i32 %result.ext, i32 addrspace(1)* %out
2323 ; Tests for 64-bit divide bypass (currently disabled; the kernel definitions below are commented out).
2324 ; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2325 ; %result = sdiv i64 %a, %b
2326 ; store i64 %result, i64 addrspace(1)* %out, align 8
2330 ; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2331 ; %result = srem i64 %a, %b
2332 ; store i64 %result, i64 addrspace(1)* %out, align 8
2336 ; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2337 ; %resultdiv = sdiv i64 %a, %b
2338 ; %resultrem = srem i64 %a, %b
2339 ; %result = add i64 %resultdiv, %resultrem
2340 ; store i64 %result, i64 addrspace(1)* %out, align 8
; Vector signed division by a constant splat. The kernel loads one <4 x i32>
; from %in, divides every lane by 53668 and stores the <4 x i32> quotient to
; %out. For each target, the assertions below expect every lane to be lowered
; to a signed multiply-high by 0x1389c755 (327796565 in the Evergreen
; literals), an arithmetic shift right by 12, and an add of the product's sign
; bit (obtained with a logical shift right by 31).
2344 define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
2345 ; GCN-LABEL: scalarize_mulhs_4xi32:
2347 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2348 ; GCN-NEXT: s_mov_b32 s7, 0xf000
2349 ; GCN-NEXT: s_mov_b32 s6, -1
2350 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
2351 ; GCN-NEXT: s_mov_b32 s4, s0
2352 ; GCN-NEXT: s_mov_b32 s5, s1
2353 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2354 ; GCN-NEXT: s_mov_b32 s0, 0x1389c755
2355 ; GCN-NEXT: s_mov_b32 s4, s2
2356 ; GCN-NEXT: s_mov_b32 s5, s3
2357 ; GCN-NEXT: s_waitcnt vmcnt(0)
2358 ; GCN-NEXT: v_mul_hi_i32 v0, v0, s0
2359 ; GCN-NEXT: v_mul_hi_i32 v1, v1, s0
2360 ; GCN-NEXT: v_mul_hi_i32 v2, v2, s0
2361 ; GCN-NEXT: v_mul_hi_i32 v3, v3, s0
2362 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0
2363 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0
2364 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1
2365 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 12, v1
2366 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 31, v2
2367 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 12, v2
2368 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 31, v3
2369 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3
2370 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
2371 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
2372 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
2373 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
2374 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2375 ; GCN-NEXT: s_endpgm
2377 ; TONGA-LABEL: scalarize_mulhs_4xi32:
2379 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2380 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
2381 ; TONGA-NEXT: s_mov_b32 s2, -1
2382 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
2383 ; TONGA-NEXT: s_mov_b32 s0, s4
2384 ; TONGA-NEXT: s_mov_b32 s1, s5
2385 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
2386 ; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
2387 ; TONGA-NEXT: s_mov_b32 s4, s6
2388 ; TONGA-NEXT: s_mov_b32 s5, s7
2389 ; TONGA-NEXT: s_mov_b32 s6, s2
2390 ; TONGA-NEXT: s_mov_b32 s7, s3
2391 ; TONGA-NEXT: s_waitcnt vmcnt(0)
2392 ; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
2393 ; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
2394 ; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
2395 ; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
2396 ; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
2397 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
2398 ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
2399 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 12, v1
2400 ; TONGA-NEXT: v_lshrrev_b32_e32 v6, 31, v2
2401 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 12, v2
2402 ; TONGA-NEXT: v_lshrrev_b32_e32 v7, 31, v3
2403 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3
2404 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
2405 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
2406 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
2407 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
2408 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2409 ; TONGA-NEXT: s_endpgm
2411 ; GFX9-LABEL: scalarize_mulhs_4xi32:
2413 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2414 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
2415 ; GFX9-NEXT: s_mov_b32 s2, -1
2416 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2417 ; GFX9-NEXT: s_mov_b32 s0, s4
2418 ; GFX9-NEXT: s_mov_b32 s1, s5
2419 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
2420 ; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
2421 ; GFX9-NEXT: s_mov_b32 s4, s6
2422 ; GFX9-NEXT: s_mov_b32 s5, s7
2423 ; GFX9-NEXT: s_mov_b32 s6, s2
2424 ; GFX9-NEXT: s_mov_b32 s7, s3
2425 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2426 ; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
2427 ; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
2428 ; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
2429 ; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
2430 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
2431 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
2432 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
2433 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 12, v1
2434 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v2
2435 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 12, v2
2436 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 31, v3
2437 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 12, v3
2438 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
2439 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
2440 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
2441 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
2442 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2443 ; GFX9-NEXT: s_endpgm
2445 ; EG-LABEL: scalarize_mulhs_4xi32:
2447 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
2449 ; EG-NEXT: ALU 25, @9, KC0[CB0:0-32], KC1[]
2450 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2453 ; EG-NEXT: Fetch clause starting at 6:
2454 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2455 ; EG-NEXT: ALU clause starting at 8:
2456 ; EG-NEXT: MOV * T0.X, KC0[2].Y,
2457 ; EG-NEXT: ALU clause starting at 9:
2458 ; EG-NEXT: MULHI_INT * T0.W, T0.W, literal.x,
2459 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2460 ; EG-NEXT: ASHR T1.Z, PS, literal.x,
2461 ; EG-NEXT: LSHR T0.W, PS, literal.y,
2462 ; EG-NEXT: MULHI_INT * T0.Z, T0.Z, literal.z,
2463 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2464 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2465 ; EG-NEXT: ASHR T1.Y, PS, literal.x,
2466 ; EG-NEXT: LSHR T0.Z, PS, literal.y,
2467 ; EG-NEXT: ADD_INT T0.W, PV.Z, PV.W,
2468 ; EG-NEXT: MULHI_INT * T0.Y, T0.Y, literal.z,
2469 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2470 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2471 ; EG-NEXT: ASHR T2.Y, PS, literal.x,
2472 ; EG-NEXT: ADD_INT T0.Z, PV.Y, PV.Z,
2473 ; EG-NEXT: LSHR T1.W, PS, literal.y,
2474 ; EG-NEXT: MULHI_INT * T0.X, T0.X, literal.z,
2475 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2476 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2477 ; EG-NEXT: ADD_INT T0.Y, PV.Y, PV.W,
2478 ; EG-NEXT: ASHR T1.W, PS, literal.x,
2479 ; EG-NEXT: LSHR * T2.W, PS, literal.y,
2480 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2481 ; EG-NEXT: ADD_INT T0.X, PV.W, PS,
2482 ; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
2483 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; All four dividend lanes come from a single vector load of %in.
2484 %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
; The divisor is the same constant (53668) in every lane, which is why each
; target's expected output repeats one multiply-high constant four times.
2485 %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2486 store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16