1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx600 | FileCheck %s --check-prefix=GCN
3 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=TONGA
4 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s --check-prefix=GFX9
5 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG
7 ; The code generated by sdiv is long and complex and may frequently change.
8 ; The goal of this test is to make sure the ISel doesn't fail.
10 ; This program was previously failing to compile when one of the selectcc
11 ; opcodes generated by the sdiv lowering was being legalized and optimized to:
12 ; selectcc Remainder -1, 0, -1, SETGT
13 ; This was fixed by adding an additional pattern in R600Instructions.td to
14 ; match this pattern with a CNDGE_INT.
; sdiv_i32: signed 32-bit division with both operands loaded from memory.
; The kernel loads the numerator from %in[0] and the denominator from %in[1],
; computes `sdiv i32` on them and stores the quotient to %out.
; Expected output is listed once per llc invocation at the top of the file,
; using the check prefixes GCN (gfx600), TONGA (tonga), GFX9 (gfx900) and
; EG (r600 / redwood). The check lines are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
16 define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
17 ; GCN-LABEL: sdiv_i32:
19 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
20 ; GCN-NEXT: s_mov_b32 s7, 0xf000
21 ; GCN-NEXT: s_mov_b32 s6, -1
22 ; GCN-NEXT: s_mov_b32 s10, s6
23 ; GCN-NEXT: s_mov_b32 s11, s7
24 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
25 ; GCN-NEXT: s_mov_b32 s8, s2
26 ; GCN-NEXT: s_mov_b32 s9, s3
27 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
28 ; GCN-NEXT: s_mov_b32 s4, s0
29 ; GCN-NEXT: s_mov_b32 s5, s1
30 ; GCN-NEXT: s_waitcnt vmcnt(0)
31 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
32 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
33 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
34 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1
35 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
36 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0
37 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0
38 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
39 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v5
40 ; GCN-NEXT: v_xor_b32_e32 v2, v5, v2
41 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
42 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
43 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3
44 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4
45 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
46 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v3
47 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v1
48 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
49 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v4, v0
50 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1
51 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0
52 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
53 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
54 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
55 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
56 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
57 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
58 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
59 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
62 ; TONGA-LABEL: sdiv_i32:
64 ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
65 ; TONGA-NEXT: s_mov_b32 s7, 0xf000
66 ; TONGA-NEXT: s_mov_b32 s6, -1
67 ; TONGA-NEXT: s_mov_b32 s10, s6
68 ; TONGA-NEXT: s_mov_b32 s11, s7
69 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
70 ; TONGA-NEXT: s_mov_b32 s8, s2
71 ; TONGA-NEXT: s_mov_b32 s9, s3
72 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
73 ; TONGA-NEXT: s_mov_b32 s4, s0
74 ; TONGA-NEXT: s_mov_b32 s5, s1
75 ; TONGA-NEXT: s_waitcnt vmcnt(0)
76 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1
77 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1
78 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2
79 ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1
80 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
81 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
82 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v5, v0
83 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
84 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v5
85 ; TONGA-NEXT: v_xor_b32_e32 v2, v5, v2
86 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
87 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
88 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3
89 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4
90 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3
91 ; TONGA-NEXT: v_mul_hi_u32 v3, v0, v3
92 ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1
93 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
94 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v4, v0
95 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1
96 ; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0
97 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
98 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
99 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
100 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
101 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
102 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2
103 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
104 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
105 ; TONGA-NEXT: s_endpgm
107 ; GFX9-LABEL: sdiv_i32:
109 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
110 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
111 ; GFX9-NEXT: s_mov_b32 s2, -1
112 ; GFX9-NEXT: s_mov_b32 s10, s2
113 ; GFX9-NEXT: s_mov_b32 s11, s3
114 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX9-NEXT: s_mov_b32 s8, s6
116 ; GFX9-NEXT: s_mov_b32 s9, s7
117 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
118 ; GFX9-NEXT: s_mov_b32 s0, s4
119 ; GFX9-NEXT: s_mov_b32 s1, s5
120 ; GFX9-NEXT: s_waitcnt vmcnt(0)
121 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
122 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
123 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
124 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1
125 ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v1
126 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0
127 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v5
128 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
129 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
130 ; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2
131 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
132 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
133 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3
134 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4
135 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
136 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3
137 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
138 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
139 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4
140 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
141 ; GFX9-NEXT: v_sub_u32_e32 v4, v0, v1
142 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
143 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
144 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v3
145 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
146 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
147 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2
148 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
149 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
150 ; GFX9-NEXT: s_endpgm
152 ; EG-LABEL: sdiv_i32:
154 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
156 ; EG-NEXT: ALU 26, @9, KC0[CB0:0-32], KC1[]
157 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
160 ; EG-NEXT: Fetch clause starting at 6:
161 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
162 ; EG-NEXT: ALU clause starting at 8:
163 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
164 ; EG-NEXT: ALU clause starting at 9:
165 ; EG-NEXT: SETGT_INT * T0.W, 0.0, T0.Y,
166 ; EG-NEXT: ADD_INT * T1.W, T0.Y, PV.W,
167 ; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W,
168 ; EG-NEXT: SUB_INT T2.W, 0.0, PV.W,
169 ; EG-NEXT: RECIP_UINT * T0.Y, PV.W,
170 ; EG-NEXT: SETGT_INT T3.W, 0.0, T0.X,
171 ; EG-NEXT: MULLO_INT * T0.Z, PV.W, PS,
172 ; EG-NEXT: ADD_INT T2.W, T0.X, PV.W,
173 ; EG-NEXT: MULHI * T0.X, T0.Y, PS,
174 ; EG-NEXT: ADD_INT T4.W, T0.Y, PS,
175 ; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W,
176 ; EG-NEXT: MULHI * T0.X, PS, PV.W,
177 ; EG-NEXT: MULLO_INT * T0.Y, PS, T1.W,
178 ; EG-NEXT: SUB_INT * T2.W, T2.W, PS,
179 ; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
180 ; EG-NEXT: SETGE_UINT T4.W, PV.W, T1.W,
181 ; EG-NEXT: SUB_INT * T5.W, PV.W, T1.W,
182 ; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS,
183 ; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
184 ; EG-NEXT: ADD_INT T5.W, PS, 1,
185 ; EG-NEXT: SETGE_UINT * T1.W, PV.W, T1.W,
186 ; EG-NEXT: CNDE_INT T1.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
187 ; EG-NEXT: XOR_INT * T0.W, T3.W, T0.W,
188 ; EG-NEXT: XOR_INT * T1.W, PV.W, PS,
189 ; EG-NEXT: SUB_INT T0.X, PV.W, T0.W,
190 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
191 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; The denominator is the i32 element immediately after the numerator in %in.
192 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
193 %num = load i32, i32 addrspace(1) * %in
194 %den = load i32, i32 addrspace(1) * %den_ptr
195 %result = sdiv i32 %num, %den
196 store i32 %result, i32 addrspace(1)* %out
; sdiv_i32_4: signed 32-bit division of a loaded value by the constant 4.
; The expected code on every target contains no division expansion: the
; sign is spread with an arithmetic shift right by 31, shifted right logically
; by 30 (giving a bias of 3 for negative inputs and 0 otherwise), added to the
; numerator, and the sum is arithmetically shifted right by 2.
; The check lines are autogenerated by utils/update_llc_test_checks.py.
200 define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
201 ; GCN-LABEL: sdiv_i32_4:
203 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
204 ; GCN-NEXT: s_mov_b32 s3, 0xf000
205 ; GCN-NEXT: s_mov_b32 s2, -1
206 ; GCN-NEXT: s_mov_b32 s10, s2
207 ; GCN-NEXT: s_mov_b32 s11, s3
208 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
209 ; GCN-NEXT: s_mov_b32 s8, s6
210 ; GCN-NEXT: s_mov_b32 s9, s7
211 ; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
212 ; GCN-NEXT: s_mov_b32 s0, s4
213 ; GCN-NEXT: s_mov_b32 s1, s5
214 ; GCN-NEXT: s_waitcnt vmcnt(0)
215 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
216 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1
217 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
218 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
219 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
222 ; TONGA-LABEL: sdiv_i32_4:
224 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
225 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
226 ; TONGA-NEXT: s_mov_b32 s2, -1
227 ; TONGA-NEXT: s_mov_b32 s10, s2
228 ; TONGA-NEXT: s_mov_b32 s11, s3
229 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
230 ; TONGA-NEXT: s_mov_b32 s8, s6
231 ; TONGA-NEXT: s_mov_b32 s9, s7
232 ; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
233 ; TONGA-NEXT: s_mov_b32 s0, s4
234 ; TONGA-NEXT: s_mov_b32 s1, s5
235 ; TONGA-NEXT: s_waitcnt vmcnt(0)
236 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
237 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
238 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
239 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
240 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
241 ; TONGA-NEXT: s_endpgm
243 ; GFX9-LABEL: sdiv_i32_4:
245 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
246 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
247 ; GFX9-NEXT: s_mov_b32 s2, -1
248 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX9-NEXT: s_mov_b32 s0, s4
250 ; GFX9-NEXT: s_mov_b32 s1, s5
251 ; GFX9-NEXT: s_mov_b32 s4, s6
252 ; GFX9-NEXT: s_mov_b32 s5, s7
253 ; GFX9-NEXT: s_mov_b32 s6, s2
254 ; GFX9-NEXT: s_mov_b32 s7, s3
255 ; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
256 ; GFX9-NEXT: s_waitcnt vmcnt(0)
257 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
258 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1
259 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
260 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
261 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
262 ; GFX9-NEXT: s_endpgm
264 ; EG-LABEL: sdiv_i32_4:
266 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
268 ; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
269 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
272 ; EG-NEXT: Fetch clause starting at 6:
273 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
274 ; EG-NEXT: ALU clause starting at 8:
275 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
276 ; EG-NEXT: ALU clause starting at 9:
277 ; EG-NEXT: ASHR * T0.W, T0.X, literal.x,
278 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
279 ; EG-NEXT: LSHR * T0.W, PV.W, literal.x,
280 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
281 ; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
282 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
283 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
284 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Single load from %in, divided by the power-of-two constant 4.
285 %num = load i32, i32 addrspace(1) * %in
286 %result = sdiv i32 %num, 4
287 store i32 %result, i32 addrspace(1)* %out
291 ; Divide by a weird (non-power-of-two) constant to exercise the setIntDivIsCheap setting.
; slow_sdiv_i32_3435: signed 32-bit division of a loaded value by the
; constant 3435. The expected code multiplies instead of dividing: a signed
; multiply-high by 0x98a1930b (printed as -1734241525 in the EG checks), an
; add of the numerator, an arithmetic shift right by 11, and a final add of
; the sign bit obtained with a logical shift right by 31.
; The check lines are autogenerated by utils/update_llc_test_checks.py.
294 define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
295 ; GCN-LABEL: slow_sdiv_i32_3435:
297 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
298 ; GCN-NEXT: s_mov_b32 s3, 0xf000
299 ; GCN-NEXT: s_mov_b32 s2, -1
300 ; GCN-NEXT: s_mov_b32 s10, s2
301 ; GCN-NEXT: s_mov_b32 s11, s3
302 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
303 ; GCN-NEXT: s_mov_b32 s8, s6
304 ; GCN-NEXT: s_mov_b32 s9, s7
305 ; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
306 ; GCN-NEXT: s_mov_b32 s0, 0x98a1930b
307 ; GCN-NEXT: s_mov_b32 s1, s5
308 ; GCN-NEXT: s_waitcnt vmcnt(0)
309 ; GCN-NEXT: v_mul_hi_i32 v1, v0, s0
310 ; GCN-NEXT: s_mov_b32 s0, s4
311 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
312 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
313 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0
314 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
315 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
318 ; TONGA-LABEL: slow_sdiv_i32_3435:
320 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
321 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
322 ; TONGA-NEXT: s_mov_b32 s2, -1
323 ; TONGA-NEXT: s_mov_b32 s10, s2
324 ; TONGA-NEXT: s_mov_b32 s11, s3
325 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
326 ; TONGA-NEXT: s_mov_b32 s8, s6
327 ; TONGA-NEXT: s_mov_b32 s9, s7
328 ; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
329 ; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b
330 ; TONGA-NEXT: s_mov_b32 s1, s5
331 ; TONGA-NEXT: s_waitcnt vmcnt(0)
332 ; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0
333 ; TONGA-NEXT: s_mov_b32 s0, s4
334 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
335 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0
336 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0
337 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
338 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
339 ; TONGA-NEXT: s_endpgm
341 ; GFX9-LABEL: slow_sdiv_i32_3435:
343 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
344 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
345 ; GFX9-NEXT: s_mov_b32 s2, -1
346 ; GFX9-NEXT: s_mov_b32 s10, s2
347 ; GFX9-NEXT: s_mov_b32 s11, s3
348 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
349 ; GFX9-NEXT: s_mov_b32 s8, s6
350 ; GFX9-NEXT: s_mov_b32 s9, s7
351 ; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
352 ; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b
353 ; GFX9-NEXT: s_mov_b32 s1, s5
354 ; GFX9-NEXT: s_waitcnt vmcnt(0)
355 ; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
356 ; GFX9-NEXT: s_mov_b32 s0, s4
357 ; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
358 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0
359 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0
360 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
361 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
362 ; GFX9-NEXT: s_endpgm
364 ; EG-LABEL: slow_sdiv_i32_3435:
366 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
368 ; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
369 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
372 ; EG-NEXT: Fetch clause starting at 6:
373 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
374 ; EG-NEXT: ALU clause starting at 8:
375 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
376 ; EG-NEXT: ALU clause starting at 9:
377 ; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x,
378 ; EG-NEXT: -1734241525(-4.176600e-24), 0(0.000000e+00)
379 ; EG-NEXT: ADD_INT * T0.W, PS, T0.X,
380 ; EG-NEXT: ASHR T1.W, PV.W, literal.x,
381 ; EG-NEXT: LSHR * T0.W, PV.W, literal.y,
382 ; EG-NEXT: 11(1.541428e-44), 31(4.344025e-44)
383 ; EG-NEXT: ADD_INT T0.X, PV.W, PS,
384 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
385 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Single load from %in, divided by the non-power-of-two constant 3435.
386 %num = load i32, i32 addrspace(1) * %in
387 %result = sdiv i32 %num, 3435
388 store i32 %result, i32 addrspace(1)* %out
; sdiv_v2i32: element-wise signed division of two <2 x i32> vectors.
; The kernel loads the numerator vector from %in[0] and the denominator
; vector from %in[1], computes `sdiv <2 x i32>` and stores the result vector
; to %out. The expected code for each target holds one division sequence per
; lane; the instruction interleaving differs between targets and is pinned
; exactly by the check lines, which are autogenerated by
; utils/update_llc_test_checks.py.
392 define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
393 ; GCN-LABEL: sdiv_v2i32:
395 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
396 ; GCN-NEXT: s_mov_b32 s7, 0xf000
397 ; GCN-NEXT: s_mov_b32 s6, -1
398 ; GCN-NEXT: s_mov_b32 s10, s6
399 ; GCN-NEXT: s_mov_b32 s11, s7
400 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
401 ; GCN-NEXT: s_mov_b32 s8, s2
402 ; GCN-NEXT: s_mov_b32 s9, s3
403 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
404 ; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe
405 ; GCN-NEXT: s_mov_b32 s4, s0
406 ; GCN-NEXT: s_mov_b32 s5, s1
407 ; GCN-NEXT: s_waitcnt vmcnt(0)
408 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2
409 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3
410 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
411 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3
412 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
413 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1
414 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v5
415 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
416 ; GCN-NEXT: v_xor_b32_e32 v8, v4, v5
417 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2
418 ; GCN-NEXT: v_xor_b32_e32 v9, v6, v7
419 ; GCN-NEXT: v_cvt_f32_u32_e32 v7, v3
420 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
421 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5
422 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v3
423 ; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v7
424 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
425 ; GCN-NEXT: v_mul_f32_e32 v5, s2, v5
426 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
427 ; GCN-NEXT: v_mul_f32_e32 v7, s2, v7
428 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7
429 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1
430 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v5
431 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
432 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v7
433 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6
434 ; GCN-NEXT: v_mul_hi_u32 v4, v5, v10
435 ; GCN-NEXT: v_mul_hi_u32 v6, v7, v11
436 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5
437 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v4
438 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7
439 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
440 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2
441 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
442 ; GCN-NEXT: v_mul_lo_u32 v10, v5, v3
443 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5
444 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v6, v0
445 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
446 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v10, v1
447 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
448 ; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0
449 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
450 ; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1
451 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
452 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1]
453 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
454 ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4
455 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v5
456 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
457 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
458 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
459 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
460 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
461 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v9
462 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
463 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
464 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
467 ; TONGA-LABEL: sdiv_v2i32:
469 ; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
470 ; TONGA-NEXT: s_mov_b32 s7, 0xf000
471 ; TONGA-NEXT: s_mov_b32 s6, -1
472 ; TONGA-NEXT: s_mov_b32 s2, s6
473 ; TONGA-NEXT: s_mov_b32 s3, s7
474 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
475 ; TONGA-NEXT: s_mov_b32 s0, s10
476 ; TONGA-NEXT: s_mov_b32 s1, s11
477 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
478 ; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe
479 ; TONGA-NEXT: s_mov_b32 s4, s8
480 ; TONGA-NEXT: s_mov_b32 s5, s9
481 ; TONGA-NEXT: s_waitcnt vmcnt(0)
482 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v2
483 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v4, v2
484 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v4
485 ; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2
486 ; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
487 ; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v0
488 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v7, v0
489 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5
490 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v7
491 ; TONGA-NEXT: v_xor_b32_e32 v4, v7, v4
492 ; TONGA-NEXT: v_mul_f32_e32 v5, s2, v5
493 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
494 ; TONGA-NEXT: v_mul_lo_u32 v6, v6, v5
495 ; TONGA-NEXT: v_mul_hi_u32 v6, v5, v6
496 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v5
497 ; TONGA-NEXT: v_mul_hi_u32 v5, v0, v5
498 ; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v3
499 ; TONGA-NEXT: v_mul_lo_u32 v8, v5, v2
500 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5
501 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0
502 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
503 ; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0
504 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[0:1]
505 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
506 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v5
507 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
508 ; TONGA-NEXT: s_mov_b64 s[0:1], vcc
509 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v3
510 ; TONGA-NEXT: v_xor_b32_e32 v2, v0, v6
511 ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v2
512 ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
513 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
514 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
515 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
516 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v3
517 ; TONGA-NEXT: v_xor_b32_e32 v6, v3, v6
518 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1]
519 ; TONGA-NEXT: v_mul_f32_e32 v0, s2, v0
520 ; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
521 ; TONGA-NEXT: v_mul_lo_u32 v9, v9, v0
522 ; TONGA-NEXT: v_mul_hi_u32 v7, v0, v9
523 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v7, v0
524 ; TONGA-NEXT: v_mul_hi_u32 v3, v1, v0
525 ; TONGA-NEXT: v_xor_b32_e32 v0, v5, v4
526 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
527 ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v2
528 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
529 ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v4, v1
530 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2
531 ; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v2, v1
532 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
533 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
534 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
535 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
536 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
537 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6
538 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v6
539 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
540 ; TONGA-NEXT: s_endpgm
542 ; GFX9-LABEL: sdiv_v2i32:
544 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
545 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
546 ; GFX9-NEXT: s_mov_b32 s6, -1
547 ; GFX9-NEXT: s_mov_b32 s10, s6
548 ; GFX9-NEXT: s_mov_b32 s11, s7
549 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX9-NEXT: s_mov_b32 s8, s2
551 ; GFX9-NEXT: s_mov_b32 s9, s3
552 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
553 ; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe
554 ; GFX9-NEXT: s_mov_b32 s4, s0
555 ; GFX9-NEXT: s_mov_b32 s5, s1
556 ; GFX9-NEXT: s_waitcnt vmcnt(0)
557 ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v2
558 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v3
559 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
560 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5
561 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4
562 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v5
563 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v2
564 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v3
565 ; GFX9-NEXT: v_sub_u32_e32 v10, 0, v2
566 ; GFX9-NEXT: v_sub_u32_e32 v11, 0, v3
567 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6
568 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7
569 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0
570 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v1
571 ; GFX9-NEXT: v_mul_f32_e32 v6, s2, v6
572 ; GFX9-NEXT: v_mul_f32_e32 v7, s2, v7
573 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
574 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
575 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v8
576 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v9
577 ; GFX9-NEXT: v_mul_lo_u32 v10, v10, v6
578 ; GFX9-NEXT: v_mul_lo_u32 v11, v11, v7
579 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
580 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v9
581 ; GFX9-NEXT: v_mul_hi_u32 v10, v6, v10
582 ; GFX9-NEXT: v_mul_hi_u32 v11, v7, v11
583 ; GFX9-NEXT: v_xor_b32_e32 v4, v8, v4
584 ; GFX9-NEXT: v_xor_b32_e32 v5, v9, v5
585 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v10
586 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v11
587 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, v6
588 ; GFX9-NEXT: v_mul_hi_u32 v7, v1, v7
589 ; GFX9-NEXT: v_mul_lo_u32 v8, v6, v2
590 ; GFX9-NEXT: v_mul_lo_u32 v9, v7, v3
591 ; GFX9-NEXT: v_add_u32_e32 v10, 1, v6
592 ; GFX9-NEXT: v_add_u32_e32 v11, 1, v7
593 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8
594 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v9
595 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
596 ; GFX9-NEXT: v_sub_u32_e32 v8, v0, v2
597 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v3
598 ; GFX9-NEXT: v_sub_u32_e32 v9, v1, v3
599 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
600 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
601 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[0:1]
602 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1]
603 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v6
604 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
605 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
606 ; GFX9-NEXT: v_add_u32_e32 v9, 1, v7
607 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
608 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc
609 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
610 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5
611 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4
612 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5
613 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
614 ; GFX9-NEXT: s_endpgm
616 ; EG-LABEL: sdiv_v2i32:
618 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
620 ; EG-NEXT: ALU 51, @11, KC0[CB0:0-32], KC1[]
621 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
624 ; EG-NEXT: Fetch clause starting at 6:
625 ; EG-NEXT: VTX_READ_64 T1.XY, T0.X, 8, #1
626 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
627 ; EG-NEXT: ALU clause starting at 10:
628 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
629 ; EG-NEXT: ALU clause starting at 11:
630 ; EG-NEXT: SETGT_INT * T0.W, 0.0, T1.Y,
631 ; EG-NEXT: ADD_INT T1.W, T1.Y, PV.W,
632 ; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.X,
633 ; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W,
634 ; EG-NEXT: SUB_INT T0.Z, 0.0, PV.W,
635 ; EG-NEXT: ADD_INT T3.W, T1.X, T2.W,
636 ; EG-NEXT: RECIP_UINT * T1.X, PV.W,
637 ; EG-NEXT: XOR_INT T3.W, PV.W, T2.W,
638 ; EG-NEXT: MULLO_INT * T0.Z, PV.Z, PS,
639 ; EG-NEXT: SUB_INT T4.W, 0.0, PV.W,
640 ; EG-NEXT: RECIP_UINT * T1.Y, PV.W,
641 ; EG-NEXT: SETGT_INT T5.W, 0.0, T0.X,
642 ; EG-NEXT: MULLO_INT * T1.Z, PV.W, PS,
643 ; EG-NEXT: SETGT_INT T2.Z, 0.0, T0.Y,
644 ; EG-NEXT: ADD_INT T4.W, T0.X, PV.W,
645 ; EG-NEXT: MULHI * T0.X, T1.Y, PS,
646 ; EG-NEXT: ADD_INT T1.Y, T1.Y, PS,
647 ; EG-NEXT: XOR_INT T1.Z, PV.W, T5.W,
648 ; EG-NEXT: ADD_INT T4.W, T0.Y, PV.Z, BS:VEC_120/SCL_212
649 ; EG-NEXT: MULHI * T0.X, T1.X, T0.Z,
650 ; EG-NEXT: ADD_INT T0.Z, T1.X, PS,
651 ; EG-NEXT: XOR_INT T4.W, PV.W, T2.Z,
652 ; EG-NEXT: MULHI * T0.X, PV.Z, PV.Y,
653 ; EG-NEXT: MULHI * T0.Y, PV.W, PV.Z,
654 ; EG-NEXT: MULLO_INT * T0.Z, PS, T1.W,
655 ; EG-NEXT: SUB_INT T4.W, T4.W, PS,
656 ; EG-NEXT: MULLO_INT * T0.Z, T0.X, T3.W,
657 ; EG-NEXT: SUB_INT T1.Y, T1.Z, PS,
658 ; EG-NEXT: ADD_INT T0.Z, T0.Y, 1,
659 ; EG-NEXT: SETGE_UINT T6.W, PV.W, T1.W,
660 ; EG-NEXT: SUB_INT * T7.W, PV.W, T1.W,
661 ; EG-NEXT: CNDE_INT T1.X, PV.W, T4.W, PS, BS:VEC_021/SCL_122
662 ; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, PV.Z,
663 ; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
664 ; EG-NEXT: SETGE_UINT T4.W, PV.Y, T3.W,
665 ; EG-NEXT: SUB_INT * T6.W, PV.Y, T3.W,
666 ; EG-NEXT: CNDE_INT T1.Y, PV.W, T1.Y, PS,
667 ; EG-NEXT: CNDE_INT T0.Z, PV.W, T0.X, PV.Z,
668 ; EG-NEXT: ADD_INT T4.W, PV.Y, 1,
669 ; EG-NEXT: SETGE_UINT * T1.W, PV.X, T1.W,
670 ; EG-NEXT: CNDE_INT T0.Y, PS, T0.Y, PV.W,
671 ; EG-NEXT: XOR_INT T1.Z, T2.Z, T0.W, BS:VEC_021/SCL_122
672 ; EG-NEXT: ADD_INT T0.W, PV.Z, 1,
673 ; EG-NEXT: SETGE_UINT * T1.W, PV.Y, T3.W,
674 ; EG-NEXT: CNDE_INT T0.Z, PS, T0.Z, PV.W,
675 ; EG-NEXT: XOR_INT T0.W, T5.W, T2.W,
676 ; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.Z,
677 ; EG-NEXT: SUB_INT T0.Y, PS, T1.Z,
678 ; EG-NEXT: XOR_INT * T1.W, PV.Z, PV.W,
679 ; EG-NEXT: SUB_INT T0.X, PV.W, T0.W,
680 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
681 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; The denominator vector is the <2 x i32> element immediately after the
; numerator vector in %in.
682 %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
683 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
684 %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
685 %result = sdiv <2 x i32> %num, %den
686 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
; sdiv_v2i32_4: element-wise signed division of a loaded <2 x i32> vector by
; the splat constant <4, 4>. The expected code applies, to each lane, the
; same sequence with no division expansion: arithmetic shift right by 31,
; logical shift right by 30, add to the numerator, arithmetic shift right by 2.
; The check lines are autogenerated by utils/update_llc_test_checks.py.
690 define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
691 ; GCN-LABEL: sdiv_v2i32_4:
693 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
694 ; GCN-NEXT: s_mov_b32 s3, 0xf000
695 ; GCN-NEXT: s_mov_b32 s2, -1
696 ; GCN-NEXT: s_mov_b32 s10, s2
697 ; GCN-NEXT: s_mov_b32 s11, s3
698 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
699 ; GCN-NEXT: s_mov_b32 s8, s6
700 ; GCN-NEXT: s_mov_b32 s9, s7
701 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
702 ; GCN-NEXT: s_mov_b32 s0, s4
703 ; GCN-NEXT: s_mov_b32 s1, s5
704 ; GCN-NEXT: s_waitcnt vmcnt(0)
705 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
706 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1
707 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2
708 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3
709 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
710 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
711 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
712 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1
713 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
716 ; TONGA-LABEL: sdiv_v2i32_4:
718 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
719 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
720 ; TONGA-NEXT: s_mov_b32 s2, -1
721 ; TONGA-NEXT: s_mov_b32 s10, s2
722 ; TONGA-NEXT: s_mov_b32 s11, s3
723 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
724 ; TONGA-NEXT: s_mov_b32 s8, s6
725 ; TONGA-NEXT: s_mov_b32 s9, s7
726 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
727 ; TONGA-NEXT: s_mov_b32 s0, s4
728 ; TONGA-NEXT: s_mov_b32 s1, s5
729 ; TONGA-NEXT: s_waitcnt vmcnt(0)
730 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
731 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
732 ; TONGA-NEXT: v_lshrrev_b32_e32 v2, 30, v2
733 ; TONGA-NEXT: v_lshrrev_b32_e32 v3, 30, v3
734 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0
735 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
736 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
737 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
738 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
739 ; TONGA-NEXT: s_endpgm
741 ; GFX9-LABEL: sdiv_v2i32_4:
743 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
744 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
745 ; GFX9-NEXT: s_mov_b32 s2, -1
746 ; GFX9-NEXT: s_mov_b32 s10, s2
747 ; GFX9-NEXT: s_mov_b32 s11, s3
748 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
749 ; GFX9-NEXT: s_mov_b32 s8, s6
750 ; GFX9-NEXT: s_mov_b32 s9, s7
751 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
752 ; GFX9-NEXT: s_mov_b32 s0, s4
753 ; GFX9-NEXT: s_mov_b32 s1, s5
754 ; GFX9-NEXT: s_waitcnt vmcnt(0)
755 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
756 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
757 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 30, v2
758 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 30, v3
759 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
760 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
761 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
762 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
763 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
764 ; GFX9-NEXT: s_endpgm
766 ; EG-LABEL: sdiv_v2i32_4:
768 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
770 ; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
771 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
774 ; EG-NEXT: Fetch clause starting at 6:
775 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
776 ; EG-NEXT: ALU clause starting at 8:
777 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
778 ; EG-NEXT: ALU clause starting at 9:
779 ; EG-NEXT: ASHR * T0.W, T0.Y, literal.x,
780 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
781 ; EG-NEXT: LSHR T0.W, PV.W, literal.x,
782 ; EG-NEXT: ASHR * T1.W, T0.X, literal.y,
783 ; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44)
784 ; EG-NEXT: LSHR T1.W, PS, literal.x,
785 ; EG-NEXT: ADD_INT * T0.W, T0.Y, PV.W,
786 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
787 ; EG-NEXT: ASHR T0.Y, PS, literal.x,
788 ; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
789 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
790 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
791 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
792 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Single vector load from %in; both lanes are divided by 4.
793 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
794 %result = sdiv <2 x i32> %num, <i32 4, i32 4>
795 store <2 x i32> %result, <2 x i32> addrspace(1)* %out
; sdiv_v4i32 - signed division of two <4 x i32> vectors with a non-constant
; divisor. The numerator is loaded from %in and the denominator from the
; following <4 x i32> element (%in + 1); the quotient is stored to %out.
; For each of the four check prefixes the expected output contains four
; reciprocal-based expansions (v_rcp_iflag_f32 / RECIP_UINT), one per lane.
; The assertions below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
799 define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
800 ; GCN-LABEL: sdiv_v4i32:
802 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
803 ; GCN-NEXT: s_mov_b32 s11, 0xf000
804 ; GCN-NEXT: s_mov_b32 s10, -1
805 ; GCN-NEXT: s_mov_b32 s6, s10
806 ; GCN-NEXT: s_mov_b32 s7, s11
807 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
808 ; GCN-NEXT: s_mov_b32 s4, s2
809 ; GCN-NEXT: s_mov_b32 s5, s3
810 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
811 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
812 ; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe
813 ; GCN-NEXT: s_mov_b32 s8, s0
814 ; GCN-NEXT: s_mov_b32 s9, s1
815 ; GCN-NEXT: s_waitcnt vmcnt(1)
816 ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0
817 ; GCN-NEXT: s_waitcnt vmcnt(0)
818 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5
819 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4
820 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5
821 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
822 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v11
823 ; GCN-NEXT: v_xor_b32_e32 v15, v8, v9
824 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v9
825 ; GCN-NEXT: v_cvt_f32_u32_e32 v9, v5
826 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0
827 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
828 ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4
829 ; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9
830 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6
831 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6
832 ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1
833 ; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8
834 ; GCN-NEXT: v_mul_f32_e32 v9, s2, v9
835 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13
836 ; GCN-NEXT: v_xor_b32_e32 v16, v10, v11
837 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, v6
838 ; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9
839 ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2
840 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2
841 ; GCN-NEXT: v_mul_f32_e32 v8, s2, v8
842 ; GCN-NEXT: v_xor_b32_e32 v17, v12, v13
843 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12
844 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5
845 ; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11
846 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v9
847 ; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8
848 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1
849 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v10
850 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
851 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v8
852 ; GCN-NEXT: v_mul_hi_u32 v12, v9, v12
853 ; GCN-NEXT: v_mul_f32_e32 v11, s2, v11
854 ; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11
855 ; GCN-NEXT: v_mul_hi_u32 v10, v8, v10
856 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9
857 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v6
858 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v11
859 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8
860 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v8
861 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7
862 ; GCN-NEXT: v_mul_hi_u32 v12, v11, v12
863 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7
864 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v14
865 ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v7
866 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11
867 ; GCN-NEXT: v_mul_lo_u32 v12, v8, v4
868 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v9
869 ; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10
870 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v11
871 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
872 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8
873 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
874 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1]
875 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4
876 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
877 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
878 ; GCN-NEXT: v_mul_lo_u32 v0, v9, v5
879 ; GCN-NEXT: v_mul_f32_e32 v10, s2, v10
880 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v10
881 ; GCN-NEXT: v_mul_lo_u32 v10, v11, v6
882 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
883 ; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v9
884 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
885 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
886 ; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3]
887 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, v0, v5
888 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v11
889 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
890 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8
891 ; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5]
892 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, v2, v6
893 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3]
894 ; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v1
895 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
896 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc
897 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1]
898 ; GCN-NEXT: v_xor_b32_e32 v1, v8, v15
899 ; GCN-NEXT: v_xor_b32_e32 v5, v0, v16
900 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15
901 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v16
902 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v7
903 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v4
904 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v3
905 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3
906 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v9
907 ; GCN-NEXT: v_mul_hi_u32 v5, v4, v5
908 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5]
909 ; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v10
910 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
911 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4
912 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
913 ; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc
914 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v17
915 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v7
916 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17
917 ; GCN-NEXT: v_xor_b32_e32 v6, v9, v14
918 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
919 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4
920 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7
921 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
922 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v7
923 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
924 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4
925 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
926 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
927 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v6
928 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
929 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
932 ; TONGA-LABEL: sdiv_v4i32:
934 ; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
935 ; TONGA-NEXT: s_mov_b32 s7, 0xf000
936 ; TONGA-NEXT: s_mov_b32 s6, -1
937 ; TONGA-NEXT: s_mov_b32 s2, s6
938 ; TONGA-NEXT: s_mov_b32 s3, s7
939 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
940 ; TONGA-NEXT: s_mov_b32 s0, s10
941 ; TONGA-NEXT: s_mov_b32 s1, s11
942 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
943 ; TONGA-NEXT: s_mov_b32 s10, 0x4f7ffffe
944 ; TONGA-NEXT: s_mov_b32 s4, s8
945 ; TONGA-NEXT: s_mov_b32 s5, s9
946 ; TONGA-NEXT: s_waitcnt vmcnt(0)
947 ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0
948 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0
949 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
950 ; TONGA-NEXT: v_cvt_f32_u32_e32 v4, v0
951 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v2
952 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v4
953 ; TONGA-NEXT: v_mul_f32_e32 v4, s10, v4
954 ; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v4
955 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
956 ; TONGA-NEXT: v_mul_lo_u32 v10, v4, v9
957 ; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
958 ; TONGA-NEXT: v_mul_hi_u32 v10, v9, v10
959 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v9
960 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1
961 ; TONGA-NEXT: s_waitcnt vmcnt(0)
962 ; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v4
963 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v11, v4
964 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v11
965 ; TONGA-NEXT: v_mul_hi_u32 v9, v4, v9
966 ; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8
967 ; TONGA-NEXT: v_mul_lo_u32 v12, v9, v0
968 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v9
969 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v12
970 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0
971 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v4, v0
972 ; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[0:1]
973 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1]
974 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v9
975 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0
976 ; TONGA-NEXT: s_mov_b64 s[0:1], vcc
977 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v10, v1
978 ; TONGA-NEXT: v_xor_b32_e32 v1, v0, v10
979 ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v1
980 ; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v1
981 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v5
982 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v4, v5
983 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
984 ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v4
985 ; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[0:1]
986 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v10
987 ; TONGA-NEXT: v_mul_f32_e32 v0, s10, v0
988 ; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
989 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v6
990 ; TONGA-NEXT: v_mul_lo_u32 v13, v13, v0
991 ; TONGA-NEXT: v_mul_hi_u32 v11, v0, v13
992 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v11, v0
993 ; TONGA-NEXT: v_mul_hi_u32 v11, v5, v0
994 ; TONGA-NEXT: v_xor_b32_e32 v0, v9, v8
995 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0
996 ; TONGA-NEXT: v_mul_lo_u32 v8, v11, v1
997 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v11
998 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v5, v8
999 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v5, v1
1000 ; TONGA-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1]
1001 ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v5, v1
1002 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[0:1]
1003 ; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v8
1004 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
1005 ; TONGA-NEXT: s_mov_b64 s[0:1], vcc
1006 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v14, v2
1007 ; TONGA-NEXT: v_xor_b32_e32 v2, v1, v14
1008 ; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v2
1009 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v2
1010 ; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
1011 ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v3
1012 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1
1013 ; TONGA-NEXT: v_mul_f32_e32 v1, s10, v1
1014 ; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1
1015 ; TONGA-NEXT: v_mul_lo_u32 v5, v5, v1
1016 ; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
1017 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1
1018 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v10, v6
1019 ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v10
1020 ; TONGA-NEXT: v_mul_hi_u32 v6, v5, v1
1021 ; TONGA-NEXT: v_xor_b32_e32 v1, v8, v4
1022 ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v4, v1
1023 ; TONGA-NEXT: v_xor_b32_e32 v10, v10, v14
1024 ; TONGA-NEXT: v_mul_lo_u32 v4, v6, v2
1025 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6
1026 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v5, v4
1027 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v2
1028 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
1029 ; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v4, v2
1030 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
1031 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5
1032 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
1033 ; TONGA-NEXT: s_mov_b64 s[0:1], vcc
1034 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v9, v3
1035 ; TONGA-NEXT: v_xor_b32_e32 v3, v2, v9
1036 ; TONGA-NEXT: v_cvt_f32_u32_e32 v2, v3
1037 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v3
1038 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1]
1039 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v7
1040 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
1041 ; TONGA-NEXT: v_add_u32_e32 v7, vcc, v4, v7
1042 ; TONGA-NEXT: v_xor_b32_e32 v9, v4, v9
1043 ; TONGA-NEXT: v_xor_b32_e32 v4, v7, v4
1044 ; TONGA-NEXT: v_mul_f32_e32 v2, s10, v2
1045 ; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
1046 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v2
1047 ; TONGA-NEXT: v_mul_hi_u32 v6, v2, v8
1048 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2
1049 ; TONGA-NEXT: v_mul_hi_u32 v6, v4, v2
1050 ; TONGA-NEXT: v_xor_b32_e32 v2, v5, v10
1051 ; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v10, v2
1052 ; TONGA-NEXT: v_mul_lo_u32 v5, v6, v3
1053 ; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v6
1054 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
1055 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v3
1056 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v7, s[0:1]
1057 ; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v4, v3
1058 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
1059 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5
1060 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3
1061 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
1062 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v9
1063 ; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v9, v3
1064 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1065 ; TONGA-NEXT: s_endpgm
1067 ; GFX9-LABEL: sdiv_v4i32:
1069 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1070 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
1071 ; GFX9-NEXT: s_mov_b32 s10, -1
1072 ; GFX9-NEXT: s_mov_b32 s6, s10
1073 ; GFX9-NEXT: s_mov_b32 s7, s11
1074 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1075 ; GFX9-NEXT: s_mov_b32 s4, s2
1076 ; GFX9-NEXT: s_mov_b32 s5, s3
1077 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1078 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1079 ; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe
1080 ; GFX9-NEXT: s_mov_b32 s8, s0
1081 ; GFX9-NEXT: s_mov_b32 s9, s1
1082 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1083 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0
1084 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1085 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v4
1086 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v9
1087 ; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v5
1088 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v8
1089 ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v9
1090 ; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v1
1091 ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v6
1092 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v11
1093 ; GFX9-NEXT: v_xor_b32_e32 v16, v8, v9
1094 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
1095 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v4
1096 ; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v2
1097 ; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v7
1098 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v10
1099 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v13
1100 ; GFX9-NEXT: v_xor_b32_e32 v5, v5, v11
1101 ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v3
1102 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v12
1103 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v15
1104 ; GFX9-NEXT: v_xor_b32_e32 v17, v10, v11
1105 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v10
1106 ; GFX9-NEXT: v_cvt_f32_u32_e32 v10, v5
1107 ; GFX9-NEXT: v_xor_b32_e32 v6, v6, v13
1108 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v14
1109 ; GFX9-NEXT: v_xor_b32_e32 v18, v12, v13
1110 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v12
1111 ; GFX9-NEXT: v_cvt_f32_u32_e32 v12, v6
1112 ; GFX9-NEXT: v_xor_b32_e32 v7, v7, v15
1113 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8
1114 ; GFX9-NEXT: v_xor_b32_e32 v19, v14, v15
1115 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v14
1116 ; GFX9-NEXT: v_cvt_f32_u32_e32 v14, v7
1117 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10
1118 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12
1119 ; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8
1120 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14
1121 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
1122 ; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10
1123 ; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12
1124 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10
1125 ; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4
1126 ; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14
1127 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
1128 ; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8
1129 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14
1130 ; GFX9-NEXT: v_sub_u32_e32 v11, 0, v5
1131 ; GFX9-NEXT: v_sub_u32_e32 v13, 0, v6
1132 ; GFX9-NEXT: v_mul_lo_u32 v11, v11, v10
1133 ; GFX9-NEXT: v_sub_u32_e32 v15, 0, v7
1134 ; GFX9-NEXT: v_mul_lo_u32 v13, v13, v12
1135 ; GFX9-NEXT: v_mul_lo_u32 v15, v15, v14
1136 ; GFX9-NEXT: v_mul_hi_u32 v9, v8, v9
1137 ; GFX9-NEXT: v_mul_hi_u32 v11, v10, v11
1138 ; GFX9-NEXT: v_mul_hi_u32 v13, v12, v13
1139 ; GFX9-NEXT: v_mul_hi_u32 v15, v14, v15
1140 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v9
1141 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v8
1142 ; GFX9-NEXT: v_add_u32_e32 v9, v10, v11
1143 ; GFX9-NEXT: v_add_u32_e32 v10, v12, v13
1144 ; GFX9-NEXT: v_mul_hi_u32 v9, v1, v9
1145 ; GFX9-NEXT: v_add_u32_e32 v11, v14, v15
1146 ; GFX9-NEXT: v_mul_hi_u32 v10, v2, v10
1147 ; GFX9-NEXT: v_mul_lo_u32 v12, v8, v4
1148 ; GFX9-NEXT: v_mul_hi_u32 v11, v3, v11
1149 ; GFX9-NEXT: v_mul_lo_u32 v14, v9, v5
1150 ; GFX9-NEXT: v_mul_lo_u32 v15, v10, v6
1151 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v12
1152 ; GFX9-NEXT: v_mul_lo_u32 v12, v11, v7
1153 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v14
1154 ; GFX9-NEXT: v_add_u32_e32 v13, 1, v8
1155 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
1156 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v15
1157 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc
1158 ; GFX9-NEXT: v_sub_u32_e32 v13, v0, v4
1159 ; GFX9-NEXT: v_add_u32_e32 v14, 1, v9
1160 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5
1161 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
1162 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v12
1163 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1]
1164 ; GFX9-NEXT: v_sub_u32_e32 v14, v1, v5
1165 ; GFX9-NEXT: v_add_u32_e32 v15, 1, v10
1166 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], v2, v6
1167 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[2:3]
1168 ; GFX9-NEXT: v_sub_u32_e32 v15, v2, v6
1169 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v14, s[0:1]
1170 ; GFX9-NEXT: v_add_u32_e32 v12, 1, v11
1171 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7
1172 ; GFX9-NEXT: v_add_u32_e32 v13, 1, v8
1173 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
1174 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc
1175 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
1176 ; GFX9-NEXT: v_sub_u32_e32 v12, v3, v7
1177 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[2:3]
1178 ; GFX9-NEXT: v_add_u32_e32 v14, 1, v9
1179 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
1180 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v14, vcc
1181 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[4:5]
1182 ; GFX9-NEXT: v_add_u32_e32 v15, 1, v10
1183 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
1184 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v15, vcc
1185 ; GFX9-NEXT: v_add_u32_e32 v12, 1, v11
1186 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
1187 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc
1188 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v16
1189 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v17
1190 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v18
1191 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v19
1192 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v16
1193 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v17
1194 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v18
1195 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v19
1196 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1197 ; GFX9-NEXT: s_endpgm
1199 ; EG-LABEL: sdiv_v4i32:
1201 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1203 ; EG-NEXT: ALU 101, @11, KC0[CB0:0-32], KC1[]
1204 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
1207 ; EG-NEXT: Fetch clause starting at 6:
1208 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
1209 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1210 ; EG-NEXT: ALU clause starting at 10:
1211 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1212 ; EG-NEXT: ALU clause starting at 11:
1213 ; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.W,
1214 ; EG-NEXT: ADD_INT * T1.W, T1.W, PV.W,
1215 ; EG-NEXT: XOR_INT * T1.W, PV.W, T2.W,
1216 ; EG-NEXT: SUB_INT T3.W, 0.0, PV.W,
1217 ; EG-NEXT: RECIP_UINT * T2.X, PV.W,
1218 ; EG-NEXT: SETGT_INT T4.W, 0.0, T0.W,
1219 ; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS,
1220 ; EG-NEXT: SETGT_INT T2.Z, 0.0, T1.Y,
1221 ; EG-NEXT: ADD_INT T0.W, T0.W, PV.W,
1222 ; EG-NEXT: MULHI * T2.Y, T2.X, PS,
1223 ; EG-NEXT: ADD_INT T3.Z, T2.X, PS,
1224 ; EG-NEXT: XOR_INT T0.W, PV.W, T4.W,
1225 ; EG-NEXT: ADD_INT * T3.W, T1.Y, PV.Z,
1226 ; EG-NEXT: XOR_INT T3.W, PS, T2.Z,
1227 ; EG-NEXT: MULHI * T1.Y, PV.W, PV.Z,
1228 ; EG-NEXT: SUB_INT T5.W, 0.0, PV.W,
1229 ; EG-NEXT: RECIP_UINT * T2.X, PV.W,
1230 ; EG-NEXT: SETGT_INT T6.W, 0.0, T0.Y,
1231 ; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS,
1232 ; EG-NEXT: ADD_INT T5.W, T0.Y, PV.W,
1233 ; EG-NEXT: MULHI * T0.Y, T2.X, PS,
1234 ; EG-NEXT: ADD_INT T0.Y, T2.X, PS,
1235 ; EG-NEXT: XOR_INT T3.Z, PV.W, T6.W, BS:VEC_021/SCL_122
1236 ; EG-NEXT: SETGT_INT T5.W, 0.0, T1.Z,
1237 ; EG-NEXT: MULLO_INT * T2.X, T1.Y, T1.W,
1238 ; EG-NEXT: ADD_INT T7.W, T1.Z, PV.W,
1239 ; EG-NEXT: MULHI * T0.Y, PV.Z, PV.Y,
1240 ; EG-NEXT: XOR_INT T7.W, PV.W, T5.W, BS:VEC_021/SCL_122
1241 ; EG-NEXT: MULLO_INT * T1.Z, PS, T3.W,
1242 ; EG-NEXT: SUB_INT T4.Z, 0.0, PV.W,
1243 ; EG-NEXT: SETGT_INT T8.W, 0.0, T1.X,
1244 ; EG-NEXT: RECIP_UINT * T2.Y, PV.W,
1245 ; EG-NEXT: ADD_INT T9.W, T1.X, PV.W,
1246 ; EG-NEXT: MULLO_INT * T1.X, PV.Z, PS,
1247 ; EG-NEXT: SETGT_INT T4.Z, 0.0, T0.Z,
1248 ; EG-NEXT: XOR_INT T9.W, PV.W, T8.W,
1249 ; EG-NEXT: MULHI * T1.X, T2.Y, PS,
1250 ; EG-NEXT: ADD_INT T1.X, T2.Y, PS,
1251 ; EG-NEXT: SUB_INT T2.Y, 0.0, PV.W,
1252 ; EG-NEXT: SUB_INT T1.Z, T3.Z, T1.Z,
1253 ; EG-NEXT: ADD_INT T10.W, T0.Z, PV.Z, BS:VEC_201
1254 ; EG-NEXT: RECIP_UINT * T0.Z, PV.W,
1255 ; EG-NEXT: XOR_INT T3.X, PV.W, T4.Z,
1256 ; EG-NEXT: ADD_INT T3.Y, T0.Y, 1,
1257 ; EG-NEXT: SETGE_UINT T3.Z, PV.Z, T3.W,
1258 ; EG-NEXT: SUB_INT T10.W, PV.Z, T3.W,
1259 ; EG-NEXT: MULLO_INT * T2.Y, PV.Y, PS,
1260 ; EG-NEXT: CNDE_INT T1.Z, PV.Z, T1.Z, PV.W,
1261 ; EG-NEXT: CNDE_INT T10.W, PV.Z, T0.Y, PV.Y,
1262 ; EG-NEXT: MULHI * T0.Y, PV.X, T1.X,
1263 ; EG-NEXT: SETGT_INT T3.Y, 0.0, T0.X,
1264 ; EG-NEXT: ADD_INT T3.Z, PV.W, 1,
1265 ; EG-NEXT: SETGE_UINT T3.W, PV.Z, T3.W, BS:VEC_021/SCL_122
1266 ; EG-NEXT: MULLO_INT * T1.X, PS, T7.W,
1267 ; EG-NEXT: CNDE_INT T4.Y, PV.W, T10.W, PV.Z,
1268 ; EG-NEXT: ADD_INT T1.Z, T0.X, PV.Y,
1269 ; EG-NEXT: SUB_INT T3.W, T3.X, PS, BS:VEC_120/SCL_212
1270 ; EG-NEXT: MULHI * T0.X, T0.Z, T2.Y,
1271 ; EG-NEXT: ADD_INT T1.X, T0.Y, 1,
1272 ; EG-NEXT: SETGE_UINT T2.Y, PV.W, T7.W,
1273 ; EG-NEXT: ADD_INT T0.Z, T0.Z, PS,
1274 ; EG-NEXT: XOR_INT T10.W, PV.Z, T3.Y,
1275 ; EG-NEXT: SUB_INT * T0.W, T0.W, T2.X,
1276 ; EG-NEXT: SUB_INT T0.X, T3.W, T7.W,
1277 ; EG-NEXT: ADD_INT T5.Y, T1.Y, 1,
1278 ; EG-NEXT: SETGE_UINT T1.Z, PS, T1.W, BS:VEC_021/SCL_122
1279 ; EG-NEXT: SUB_INT T11.W, PS, T1.W, BS:VEC_021/SCL_122
1280 ; EG-NEXT: MULHI * T0.Z, PV.W, PV.Z,
1281 ; EG-NEXT: CNDE_INT T2.X, PV.Z, T0.W, PV.W, BS:VEC_021/SCL_122
1282 ; EG-NEXT: CNDE_INT T1.Y, PV.Z, T1.Y, PV.Y,
1283 ; EG-NEXT: CNDE_INT T1.Z, T2.Y, T3.W, PV.X, BS:VEC_201
1284 ; EG-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T1.X, BS:VEC_201
1285 ; EG-NEXT: MULLO_INT * T0.X, PS, T9.W,
1286 ; EG-NEXT: ADD_INT T1.X, PV.W, 1,
1287 ; EG-NEXT: SETGE_UINT T0.Y, PV.Z, T7.W,
1288 ; EG-NEXT: ADD_INT T1.Z, PV.Y, 1,
1289 ; EG-NEXT: SETGE_UINT T1.W, PV.X, T1.W, BS:VEC_102/SCL_221
1290 ; EG-NEXT: SUB_INT * T3.W, T10.W, PS,
1291 ; EG-NEXT: ADD_INT T0.X, T0.Z, 1,
1292 ; EG-NEXT: SETGE_UINT T2.Y, PS, T9.W, BS:VEC_102/SCL_221
1293 ; EG-NEXT: SUB_INT T3.Z, PS, T9.W, BS:VEC_102/SCL_221
1294 ; EG-NEXT: CNDE_INT T1.W, PV.W, T1.Y, PV.Z,
1295 ; EG-NEXT: XOR_INT * T2.W, T4.W, T2.W,
1296 ; EG-NEXT: XOR_INT T2.X, PV.W, PS,
1297 ; EG-NEXT: CNDE_INT T1.Y, PV.Y, T3.W, PV.Z, BS:VEC_021/SCL_122
1298 ; EG-NEXT: CNDE_INT T0.Z, PV.Y, T0.Z, PV.X,
1299 ; EG-NEXT: CNDE_INT T0.W, T0.Y, T0.W, T1.X, BS:VEC_102/SCL_221
1300 ; EG-NEXT: XOR_INT * T1.W, T4.Z, T5.W,
1301 ; EG-NEXT: XOR_INT T0.X, T6.W, T2.Z,
1302 ; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
1303 ; EG-NEXT: ADD_INT T1.Z, PV.Z, 1,
1304 ; EG-NEXT: SETGE_UINT T0.W, PV.Y, T9.W, BS:VEC_021/SCL_122
1305 ; EG-NEXT: SUB_INT * T2.W, PV.X, T2.W,
1306 ; EG-NEXT: CNDE_INT T1.Y, PV.W, T0.Z, PV.Z,
1307 ; EG-NEXT: SUB_INT T2.Z, PV.Y, T1.W,
1308 ; EG-NEXT: XOR_INT T0.W, T3.Y, T8.W, BS:VEC_021/SCL_122
1309 ; EG-NEXT: XOR_INT * T1.W, T4.Y, PV.X,
1310 ; EG-NEXT: SUB_INT T2.Y, PS, T0.X,
1311 ; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.W,
1312 ; EG-NEXT: SUB_INT T2.X, PV.W, T0.W,
1313 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1314 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test - load numerator and denominator, divide lane-wise, store.
1315 %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1316 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
1317 %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
1318 %result = sdiv <4 x i32> %num, %den
1319 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
; sdiv_v4i32_4 - signed division of a <4 x i32> vector loaded from %in by
; the constant splat <4, 4, 4, 4>; the quotient is stored to %out.
; For every check prefix the expected code is the shift-based power-of-two
; sequence, applied to each lane, with no reciprocal expansion
;   sign   = x >>s 31              (0 or -1)
;   bias   = sign >>u 30           (3 when x is negative, otherwise 0)
;   result = (x + bias) >>s 2      (the bias makes the shift round toward 0)
; The assertions below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1323 define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
1324 ; GCN-LABEL: sdiv_v4i32_4:
1326 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1327 ; GCN-NEXT: s_mov_b32 s3, 0xf000
1328 ; GCN-NEXT: s_mov_b32 s2, -1
1329 ; GCN-NEXT: s_mov_b32 s10, s2
1330 ; GCN-NEXT: s_mov_b32 s11, s3
1331 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1332 ; GCN-NEXT: s_mov_b32 s8, s6
1333 ; GCN-NEXT: s_mov_b32 s9, s7
1334 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1335 ; GCN-NEXT: s_mov_b32 s0, s4
1336 ; GCN-NEXT: s_mov_b32 s1, s5
1337 ; GCN-NEXT: s_waitcnt vmcnt(0)
1338 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1339 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1
1340 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v2
1341 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3
1342 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 30, v4
1343 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5
1344 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6
1345 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7
1346 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0
1347 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1
1348 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2
1349 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3
1350 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0
1351 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1
1352 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2
1353 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3
1354 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1355 ; GCN-NEXT: s_endpgm
1357 ; TONGA-LABEL: sdiv_v4i32_4:
1359 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1360 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1361 ; TONGA-NEXT: s_mov_b32 s2, -1
1362 ; TONGA-NEXT: s_mov_b32 s10, s2
1363 ; TONGA-NEXT: s_mov_b32 s11, s3
1364 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1365 ; TONGA-NEXT: s_mov_b32 s8, s6
1366 ; TONGA-NEXT: s_mov_b32 s9, s7
1367 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1368 ; TONGA-NEXT: s_mov_b32 s0, s4
1369 ; TONGA-NEXT: s_mov_b32 s1, s5
1370 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1371 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1372 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1
1373 ; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v2
1374 ; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3
1375 ; TONGA-NEXT: v_lshrrev_b32_e32 v4, 30, v4
1376 ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 30, v5
1377 ; TONGA-NEXT: v_lshrrev_b32_e32 v6, 30, v6
1378 ; TONGA-NEXT: v_lshrrev_b32_e32 v7, 30, v7
1379 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0
1380 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1
1381 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2
1382 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3
1383 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
1384 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
1385 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2
1386 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3
1387 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1388 ; TONGA-NEXT: s_endpgm
1390 ; GFX9-LABEL: sdiv_v4i32_4:
1392 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1393 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1394 ; GFX9-NEXT: s_mov_b32 s2, -1
1395 ; GFX9-NEXT: s_mov_b32 s10, s2
1396 ; GFX9-NEXT: s_mov_b32 s11, s3
1397 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1398 ; GFX9-NEXT: s_mov_b32 s8, s6
1399 ; GFX9-NEXT: s_mov_b32 s9, s7
1400 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1401 ; GFX9-NEXT: s_mov_b32 s0, s4
1402 ; GFX9-NEXT: s_mov_b32 s1, s5
1403 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1404 ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
1405 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
1406 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v2
1407 ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v3
1408 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 30, v4
1409 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 30, v5
1410 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 30, v6
1411 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 30, v7
1412 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
1413 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
1414 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
1415 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
1416 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
1417 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
1418 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2
1419 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3
1420 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1421 ; GFX9-NEXT: s_endpgm
1423 ; EG-LABEL: sdiv_v4i32_4:
1425 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1427 ; EG-NEXT: ALU 24, @9, KC0[CB0:0-32], KC1[]
1428 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1431 ; EG-NEXT: Fetch clause starting at 6:
1432 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
1433 ; EG-NEXT: ALU clause starting at 8:
1434 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1435 ; EG-NEXT: ALU clause starting at 9:
1436 ; EG-NEXT: ASHR T1.W, T0.W, literal.x,
1437 ; EG-NEXT: ASHR * T2.W, T0.Z, literal.x,
1438 ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
1439 ; EG-NEXT: LSHR * T1.W, PV.W, literal.x,
1440 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
1441 ; EG-NEXT: ADD_INT T1.Z, T0.W, PV.W,
1442 ; EG-NEXT: LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212
1443 ; EG-NEXT: ASHR * T1.W, T0.Y, literal.y,
1444 ; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44)
1445 ; EG-NEXT: LSHR T1.Y, PS, literal.x,
1446 ; EG-NEXT: ASHR T2.Z, T0.X, literal.y,
1447 ; EG-NEXT: ADD_INT T0.W, T0.Z, PV.W,
1448 ; EG-NEXT: ASHR * T1.W, PV.Z, literal.z,
1449 ; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44)
1450 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1451 ; EG-NEXT: ASHR T1.Z, PV.W, literal.x,
1452 ; EG-NEXT: LSHR T0.W, PV.Z, literal.y,
1453 ; EG-NEXT: ADD_INT * T2.W, T0.Y, PV.Y,
1454 ; EG-NEXT: 2(2.802597e-45), 30(4.203895e-44)
1455 ; EG-NEXT: ASHR T1.Y, PS, literal.x,
1456 ; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W,
1457 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1458 ; EG-NEXT: ASHR T1.X, PV.W, literal.x,
1459 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1460 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test - load the vector, divide each lane by 4, store the result.
1461 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
1462 %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
1463 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1467 define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
1468 ; GCN-LABEL: v_sdiv_i8:
1470 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1471 ; GCN-NEXT: s_mov_b32 s3, 0xf000
1472 ; GCN-NEXT: s_mov_b32 s2, -1
1473 ; GCN-NEXT: s_mov_b32 s10, s2
1474 ; GCN-NEXT: s_mov_b32 s11, s3
1475 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1476 ; GCN-NEXT: s_mov_b32 s8, s6
1477 ; GCN-NEXT: s_mov_b32 s9, s7
1478 ; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
1479 ; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1
1480 ; GCN-NEXT: s_mov_b32 s0, s4
1481 ; GCN-NEXT: s_mov_b32 s1, s5
1482 ; GCN-NEXT: s_waitcnt vmcnt(1)
1483 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0
1484 ; GCN-NEXT: s_waitcnt vmcnt(0)
1485 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1
1486 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
1487 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1488 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0
1489 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
1490 ; GCN-NEXT: v_mul_f32_e32 v1, v3, v4
1491 ; GCN-NEXT: v_trunc_f32_e32 v1, v1
1492 ; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3
1493 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
1494 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1495 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1496 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1497 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8
1498 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
1499 ; GCN-NEXT: s_endpgm
1501 ; TONGA-LABEL: v_sdiv_i8:
1503 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1504 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1505 ; TONGA-NEXT: s_mov_b32 s2, -1
1506 ; TONGA-NEXT: s_mov_b32 s10, s2
1507 ; TONGA-NEXT: s_mov_b32 s11, s3
1508 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1509 ; TONGA-NEXT: s_mov_b32 s8, s6
1510 ; TONGA-NEXT: s_mov_b32 s9, s7
1511 ; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
1512 ; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1
1513 ; TONGA-NEXT: s_mov_b32 s0, s4
1514 ; TONGA-NEXT: s_mov_b32 s1, s5
1515 ; TONGA-NEXT: s_waitcnt vmcnt(1)
1516 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0
1517 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1518 ; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1
1519 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
1520 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1521 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
1522 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2
1523 ; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4
1524 ; TONGA-NEXT: v_trunc_f32_e32 v1, v1
1525 ; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3
1526 ; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1
1527 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1528 ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1529 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
1530 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8
1531 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
1532 ; TONGA-NEXT: s_endpgm
1534 ; GFX9-LABEL: v_sdiv_i8:
1536 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1537 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1538 ; GFX9-NEXT: s_mov_b32 s2, -1
1539 ; GFX9-NEXT: s_mov_b32 s10, s2
1540 ; GFX9-NEXT: s_mov_b32 s11, s3
1541 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1542 ; GFX9-NEXT: s_mov_b32 s8, s6
1543 ; GFX9-NEXT: s_mov_b32 s9, s7
1544 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
1545 ; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:1
1546 ; GFX9-NEXT: s_mov_b32 s0, s4
1547 ; GFX9-NEXT: s_mov_b32 s1, s5
1548 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1549 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0
1550 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1551 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1
1552 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
1553 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1554 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
1555 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
1556 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4
1557 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
1558 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1
1559 ; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3
1560 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2|
1561 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1562 ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
1563 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
1564 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
1565 ; GFX9-NEXT: s_endpgm
1567 ; EG-LABEL: v_sdiv_i8:
1569 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
1571 ; EG-NEXT: ALU 21, @11, KC0[CB0:0-32], KC1[]
1572 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1575 ; EG-NEXT: Fetch clause starting at 6:
1576 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1
1577 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
1578 ; EG-NEXT: ALU clause starting at 10:
1579 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1580 ; EG-NEXT: ALU clause starting at 11:
1581 ; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x,
1582 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1583 ; EG-NEXT: INT_TO_FLT * T0.Y, PV.W,
1584 ; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, literal.x,
1585 ; EG-NEXT: RECIP_IEEE * T0.X, PS,
1586 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1587 ; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
1588 ; EG-NEXT: MUL_IEEE * T2.W, PS, T0.X,
1589 ; EG-NEXT: TRUNC T2.W, PV.W,
1590 ; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
1591 ; EG-NEXT: ASHR T0.W, PS, literal.x,
1592 ; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z,
1593 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
1594 ; EG-NEXT: TRUNC T0.Z, T2.W,
1595 ; EG-NEXT: SETGE T1.W, |PS|, |T0.Y|,
1596 ; EG-NEXT: OR_INT * T0.W, PV.W, 1,
1597 ; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
1598 ; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
1599 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
1600 ; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
1601 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1602 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
1603 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
1604 %num = load i8, i8 addrspace(1) * %in
1605 %den = load i8, i8 addrspace(1) * %den_ptr
1606 %result = sdiv i8 %num, %den
1607 %result.ext = sext i8 %result to i32
1608 store i32 %result.ext, i32 addrspace(1)* %out
; Signed division of two i23 values read from consecutive i23 elements of %in
; (numerator at %in, denominator one element further); the quotient is
; sign-extended to i32 and stored to %out.
; In the checked GCN, TONGA and GFX9 output each operand is assembled from a
; ushort load plus a ubyte load, narrowed with v_bfe_i32 (offset 0, width 23),
; and the quotient is computed in floating point (v_cvt_f32_i32,
; v_rcp_iflag_f32, v_trunc_f32, v_mad_f32). v_cndmask_b32 then selects between
; 0 and the value built by the xor / ashr-by-30 / or-with-1 sequence, and that
; term is added to the truncated quotient. The EG output follows the same
; float-based shape (INT_TO_FLT, RECIP_IEEE, MULADD_IEEE).
1612 define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
1613 ; GCN-LABEL: v_sdiv_i23:
1615 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1616 ; GCN-NEXT: s_mov_b32 s3, 0xf000
1617 ; GCN-NEXT: s_mov_b32 s2, -1
1618 ; GCN-NEXT: s_mov_b32 s10, s2
1619 ; GCN-NEXT: s_mov_b32 s11, s3
1620 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1621 ; GCN-NEXT: s_mov_b32 s8, s6
1622 ; GCN-NEXT: s_mov_b32 s9, s7
1623 ; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1624 ; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
1625 ; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
1626 ; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
1627 ; GCN-NEXT: s_mov_b32 s0, s4
1628 ; GCN-NEXT: s_mov_b32 s1, s5
1629 ; GCN-NEXT: s_waitcnt vmcnt(2)
1630 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1631 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
1632 ; GCN-NEXT: s_waitcnt vmcnt(0)
1633 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1634 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3
1635 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 23
1636 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2
1637 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
1638 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0
1639 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
1640 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3
1641 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1642 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0
1643 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v4
1644 ; GCN-NEXT: v_trunc_f32_e32 v2, v2
1645 ; GCN-NEXT: v_mad_f32 v1, -v2, v3, v1
1646 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
1647 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3|
1648 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1649 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
1650 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
1651 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
1652 ; GCN-NEXT: s_endpgm
1654 ; TONGA-LABEL: v_sdiv_i23:
1656 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1657 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1658 ; TONGA-NEXT: s_mov_b32 s2, -1
1659 ; TONGA-NEXT: s_mov_b32 s10, s2
1660 ; TONGA-NEXT: s_mov_b32 s11, s3
1661 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1662 ; TONGA-NEXT: s_mov_b32 s8, s6
1663 ; TONGA-NEXT: s_mov_b32 s9, s7
1664 ; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1665 ; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
1666 ; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
1667 ; TONGA-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
1668 ; TONGA-NEXT: s_mov_b32 s0, s4
1669 ; TONGA-NEXT: s_mov_b32 s1, s5
1670 ; TONGA-NEXT: s_waitcnt vmcnt(2)
1671 ; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1672 ; TONGA-NEXT: v_or_b32_e32 v0, v0, v1
1673 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1674 ; TONGA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1675 ; TONGA-NEXT: v_or_b32_e32 v2, v2, v3
1676 ; TONGA-NEXT: v_bfe_i32 v2, v2, 0, 23
1677 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v2
1678 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
1679 ; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v0
1680 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2
1681 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v3
1682 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1683 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
1684 ; TONGA-NEXT: v_mul_f32_e32 v2, v1, v4
1685 ; TONGA-NEXT: v_trunc_f32_e32 v2, v2
1686 ; TONGA-NEXT: v_mad_f32 v1, -v2, v3, v1
1687 ; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2
1688 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3|
1689 ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1690 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0
1691 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
1692 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
1693 ; TONGA-NEXT: s_endpgm
1695 ; GFX9-LABEL: v_sdiv_i23:
1697 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1698 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1699 ; GFX9-NEXT: s_mov_b32 s2, -1
1700 ; GFX9-NEXT: s_mov_b32 s10, s2
1701 ; GFX9-NEXT: s_mov_b32 s11, s3
1702 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1703 ; GFX9-NEXT: s_mov_b32 s8, s6
1704 ; GFX9-NEXT: s_mov_b32 s9, s7
1705 ; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1706 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
1707 ; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
1708 ; GFX9-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
1709 ; GFX9-NEXT: s_mov_b32 s0, s4
1710 ; GFX9-NEXT: s_mov_b32 s1, s5
1711 ; GFX9-NEXT: s_waitcnt vmcnt(2)
1712 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1713 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
1714 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1715 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1716 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
1717 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 23
1718 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v2
1719 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
1720 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v0
1721 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2
1722 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v3
1723 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
1724 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
1725 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v4
1726 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
1727 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
1728 ; GFX9-NEXT: v_mad_f32 v1, -v2, v3, v1
1729 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3|
1730 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1731 ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
1732 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
1733 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
1734 ; GFX9-NEXT: s_endpgm
1736 ; EG-LABEL: v_sdiv_i23:
1738 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1740 ; EG-NEXT: ALU 33, @15, KC0[CB0:0-32], KC1[]
1741 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1744 ; EG-NEXT: Fetch clause starting at 6:
1745 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
1746 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
1747 ; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
1748 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
1749 ; EG-NEXT: ALU clause starting at 14:
1750 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1751 ; EG-NEXT: ALU clause starting at 15:
1752 ; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
1753 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1754 ; EG-NEXT: OR_INT T0.W, T0.X, PV.W,
1755 ; EG-NEXT: LSHL * T1.W, T3.X, literal.x,
1756 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1757 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1758 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1759 ; EG-NEXT: ASHR T0.W, PV.W, literal.x,
1760 ; EG-NEXT: OR_INT * T1.W, T2.X, T1.W,
1761 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1762 ; EG-NEXT: LSHL T1.W, PS, literal.x,
1763 ; EG-NEXT: INT_TO_FLT * T0.X, PV.W,
1764 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1765 ; EG-NEXT: ASHR T1.W, PV.W, literal.x,
1766 ; EG-NEXT: RECIP_IEEE * T0.Y, PS,
1767 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1768 ; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
1769 ; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y,
1770 ; EG-NEXT: TRUNC T2.W, PV.W,
1771 ; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
1772 ; EG-NEXT: ASHR T0.W, PS, literal.x,
1773 ; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
1774 ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
1775 ; EG-NEXT: TRUNC T0.Z, T2.W,
1776 ; EG-NEXT: SETGE T1.W, |PS|, |T0.X|,
1777 ; EG-NEXT: OR_INT * T0.W, PV.W, 1,
1778 ; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
1779 ; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
1780 ; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
1781 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1782 ; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
1783 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
1784 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1785 ; EG-NEXT: 9(1.261169e-44), 2(2.802597e-45)
1786 %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
1787 %num = load i23, i23 addrspace(1) * %in
1788 %den = load i23, i23 addrspace(1) * %den_ptr
1789 %result = sdiv i23 %num, %den
1790 %result.ext = sext i23 %result to i32
1791 store i32 %result.ext, i32 addrspace(1)* %out
; Signed division of two i24 values read from consecutive i24 elements of %in
; (numerator at %in, denominator one element further); the quotient is
; sign-extended to i32 and stored to %out.
; In the checked GCN, TONGA and GFX9 output each operand is assembled from a
; ushort load plus an sbyte load and the quotient is computed in floating
; point (v_cvt_f32_i32, v_rcp_iflag_f32, v_trunc_f32, v_mad_f32), finishing
; with v_bfe_i32 (offset 0, width 24). The checked EG output differs: it uses
; an integer sequence built from RECIP_UINT, MULLO_INT and MULHI.
1795 define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
1796 ; GCN-LABEL: v_sdiv_i24:
1798 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1799 ; GCN-NEXT: s_mov_b32 s3, 0xf000
1800 ; GCN-NEXT: s_mov_b32 s2, -1
1801 ; GCN-NEXT: s_mov_b32 s10, s2
1802 ; GCN-NEXT: s_mov_b32 s11, s3
1803 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1804 ; GCN-NEXT: s_mov_b32 s8, s6
1805 ; GCN-NEXT: s_mov_b32 s9, s7
1806 ; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1807 ; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2
1808 ; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
1809 ; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6
1810 ; GCN-NEXT: s_mov_b32 s0, s4
1811 ; GCN-NEXT: s_mov_b32 s1, s5
1812 ; GCN-NEXT: s_waitcnt vmcnt(0)
1813 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
1814 ; GCN-NEXT: v_or_b32_e32 v2, v2, v4
1815 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, v2
1816 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
1817 ; GCN-NEXT: v_or_b32_e32 v0, v0, v4
1818 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
1819 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
1820 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v3
1821 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1822 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1
1823 ; GCN-NEXT: v_mul_f32_e32 v3, v0, v4
1824 ; GCN-NEXT: v_trunc_f32_e32 v3, v3
1825 ; GCN-NEXT: v_mad_f32 v0, -v3, v2, v0
1826 ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3
1827 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2|
1828 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1829 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
1830 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
1831 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
1832 ; GCN-NEXT: s_endpgm
1834 ; TONGA-LABEL: v_sdiv_i24:
1836 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1837 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
1838 ; TONGA-NEXT: s_mov_b32 s2, -1
1839 ; TONGA-NEXT: s_mov_b32 s10, s2
1840 ; TONGA-NEXT: s_mov_b32 s11, s3
1841 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
1842 ; TONGA-NEXT: s_mov_b32 s8, s6
1843 ; TONGA-NEXT: s_mov_b32 s9, s7
1844 ; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0
1845 ; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2
1846 ; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
1847 ; TONGA-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6
1848 ; TONGA-NEXT: s_mov_b32 s0, s4
1849 ; TONGA-NEXT: s_mov_b32 s1, s5
1850 ; TONGA-NEXT: s_waitcnt vmcnt(0)
1851 ; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v3
1852 ; TONGA-NEXT: v_or_b32_e32 v2, v2, v4
1853 ; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v2
1854 ; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v1
1855 ; TONGA-NEXT: v_or_b32_e32 v0, v0, v4
1856 ; TONGA-NEXT: v_cvt_f32_i32_e32 v0, v0
1857 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2
1858 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v3
1859 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1860 ; TONGA-NEXT: v_or_b32_e32 v1, 1, v1
1861 ; TONGA-NEXT: v_mul_f32_e32 v3, v0, v4
1862 ; TONGA-NEXT: v_trunc_f32_e32 v3, v3
1863 ; TONGA-NEXT: v_mad_f32 v0, -v3, v2, v0
1864 ; TONGA-NEXT: v_cvt_i32_f32_e32 v3, v3
1865 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2|
1866 ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1867 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v3, v0
1868 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24
1869 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
1870 ; TONGA-NEXT: s_endpgm
1872 ; GFX9-LABEL: v_sdiv_i24:
1874 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1875 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
1876 ; GFX9-NEXT: s_mov_b32 s2, -1
1877 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1878 ; GFX9-NEXT: s_mov_b32 s0, s4
1879 ; GFX9-NEXT: s_mov_b32 s1, s5
1880 ; GFX9-NEXT: s_mov_b32 s4, s6
1881 ; GFX9-NEXT: s_mov_b32 s5, s7
1882 ; GFX9-NEXT: s_mov_b32 s6, s2
1883 ; GFX9-NEXT: s_mov_b32 s7, s3
1884 ; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
1885 ; GFX9-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2
1886 ; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
1887 ; GFX9-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6
1888 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1889 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
1890 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v4
1891 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
1892 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1
1893 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
1894 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
1895 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
1896 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v3
1897 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1
1898 ; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
1899 ; GFX9-NEXT: v_mul_f32_e32 v3, v0, v4
1900 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
1901 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v3
1902 ; GFX9-NEXT: v_mad_f32 v0, -v3, v2, v0
1903 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, |v2|
1904 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
1905 ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
1906 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24
1907 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
1908 ; GFX9-NEXT: s_endpgm
1910 ; EG-LABEL: v_sdiv_i24:
1912 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1914 ; EG-NEXT: ALU 39, @15, KC0[CB0:0-32], KC1[]
1915 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1918 ; EG-NEXT: Fetch clause starting at 6:
1919 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
1920 ; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
1921 ; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
1922 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
1923 ; EG-NEXT: ALU clause starting at 14:
1924 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1925 ; EG-NEXT: ALU clause starting at 15:
1926 ; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x,
1927 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1928 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1929 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1930 ; EG-NEXT: OR_INT * T0.W, T0.X, PV.W,
1931 ; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W,
1932 ; EG-NEXT: BFE_INT T2.W, T3.X, 0.0, literal.x,
1933 ; EG-NEXT: ADD_INT * T0.W, T0.W, PV.W,
1934 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1935 ; EG-NEXT: LSHL T2.W, PV.W, literal.x,
1936 ; EG-NEXT: XOR_INT * T0.W, PS, T1.W,
1937 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1938 ; EG-NEXT: SUB_INT T0.Z, 0.0, PS,
1939 ; EG-NEXT: OR_INT T2.W, T2.X, PV.W,
1940 ; EG-NEXT: RECIP_UINT * T0.X, PS,
1941 ; EG-NEXT: SETGT_INT T3.W, 0.0, PV.W,
1942 ; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PS,
1943 ; EG-NEXT: ADD_INT T2.W, T2.W, PV.W,
1944 ; EG-NEXT: MULHI * T0.Y, T0.X, PS,
1945 ; EG-NEXT: ADD_INT T4.W, T0.X, PS,
1946 ; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W,
1947 ; EG-NEXT: MULHI * T0.X, PS, PV.W,
1948 ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
1949 ; EG-NEXT: SUB_INT * T2.W, T2.W, PS,
1950 ; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
1951 ; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.W,
1952 ; EG-NEXT: SUB_INT * T5.W, PV.W, T0.W,
1953 ; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS,
1954 ; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
1955 ; EG-NEXT: ADD_INT T5.W, PS, 1,
1956 ; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W,
1957 ; EG-NEXT: CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
1958 ; EG-NEXT: XOR_INT * T1.W, T3.W, T1.W,
1959 ; EG-NEXT: XOR_INT * T0.W, PV.W, PS,
1960 ; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W,
1961 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
1962 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1963 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
1964 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1965 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
1966 %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
1967 %num = load i24, i24 addrspace(1) * %in
1968 %den = load i24, i24 addrspace(1) * %den_ptr
1969 %result = sdiv i24 %num, %den
1970 %result.ext = sext i24 %result to i32
1971 store i32 %result.ext, i32 addrspace(1)* %out
; Signed division of two i25 values read from consecutive i25 elements of %in
; (numerator at %in, denominator one element further); the quotient is
; sign-extended to i32 and stored to %out.
; Unlike the i23 and i24 tests in this file, the checked output for every
; target uses an unsigned integer sequence (v_mul_lo_u32 / v_mul_hi_u32 on
; GCN, TONGA and GFX9; RECIP_UINT / MULLO_INT / MULHI on EG). The operands are
; first combined with their sign masks by add + xor, and the quotient's sign
; is reapplied at the end by xor + subtract before the final 25-bit extension.
1975 define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
1976 ; GCN-LABEL: v_sdiv_i25:
1978 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1979 ; GCN-NEXT: s_mov_b32 s7, 0xf000
1980 ; GCN-NEXT: s_mov_b32 s6, -1
1981 ; GCN-NEXT: s_mov_b32 s10, s6
1982 ; GCN-NEXT: s_mov_b32 s11, s7
1983 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1984 ; GCN-NEXT: s_mov_b32 s8, s2
1985 ; GCN-NEXT: s_mov_b32 s9, s3
1986 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1987 ; GCN-NEXT: s_mov_b32 s4, s0
1988 ; GCN-NEXT: s_mov_b32 s5, s1
1989 ; GCN-NEXT: s_waitcnt vmcnt(0)
1990 ; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25
1991 ; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1
1992 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v2
1993 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v1
1994 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2
1995 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
1996 ; GCN-NEXT: v_bfe_i32 v5, v0, 0, 25
1997 ; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1
1998 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
1999 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5
2000 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v0
2001 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
2002 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
2003 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
2004 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3
2005 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4
2006 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
2007 ; GCN-NEXT: v_mul_hi_u32 v3, v5, v3
2008 ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2
2009 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
2010 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v1, v5
2011 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2
2012 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2013 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v1
2014 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2015 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
2016 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
2017 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
2018 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
2019 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
2020 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
2021 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
2022 ; GCN-NEXT: s_endpgm
2024 ; TONGA-LABEL: v_sdiv_i25:
2026 ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2027 ; TONGA-NEXT: s_mov_b32 s7, 0xf000
2028 ; TONGA-NEXT: s_mov_b32 s6, -1
2029 ; TONGA-NEXT: s_mov_b32 s10, s6
2030 ; TONGA-NEXT: s_mov_b32 s11, s7
2031 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
2032 ; TONGA-NEXT: s_mov_b32 s8, s2
2033 ; TONGA-NEXT: s_mov_b32 s9, s3
2034 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2035 ; TONGA-NEXT: s_mov_b32 s4, s0
2036 ; TONGA-NEXT: s_mov_b32 s5, s1
2037 ; TONGA-NEXT: s_waitcnt vmcnt(0)
2038 ; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25
2039 ; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1
2040 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v1, v2
2041 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v1
2042 ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2
2043 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
2044 ; TONGA-NEXT: v_bfe_i32 v5, v0, 0, 25
2045 ; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1
2046 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
2047 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v0, v5
2048 ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v0
2049 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
2050 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
2051 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
2052 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3
2053 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4
2054 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3
2055 ; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3
2056 ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
2057 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
2058 ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v1, v5
2059 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2
2060 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2061 ; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v2, v1
2062 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2063 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
2064 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
2065 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
2066 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
2067 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1
2068 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
2069 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
2070 ; TONGA-NEXT: s_endpgm
2072 ; GFX9-LABEL: v_sdiv_i25:
2074 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2075 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
2076 ; GFX9-NEXT: s_mov_b32 s2, -1
2077 ; GFX9-NEXT: s_mov_b32 s10, s2
2078 ; GFX9-NEXT: s_mov_b32 s11, s3
2079 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2080 ; GFX9-NEXT: s_mov_b32 s8, s6
2081 ; GFX9-NEXT: s_mov_b32 s9, s7
2082 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2083 ; GFX9-NEXT: s_mov_b32 s0, s4
2084 ; GFX9-NEXT: s_mov_b32 s1, s5
2085 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2086 ; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 25
2087 ; GFX9-NEXT: v_bfe_i32 v1, v1, 24, 1
2088 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
2089 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1
2090 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
2091 ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2
2092 ; GFX9-NEXT: v_bfe_i32 v5, v0, 0, 25
2093 ; GFX9-NEXT: v_bfe_i32 v0, v0, 24, 1
2094 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
2095 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v0
2096 ; GFX9-NEXT: v_xor_b32_e32 v5, v5, v0
2097 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
2098 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
2099 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
2100 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3
2101 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4
2102 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
2103 ; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3
2104 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2
2105 ; GFX9-NEXT: v_add_u32_e32 v1, 1, v3
2106 ; GFX9-NEXT: v_sub_u32_e32 v4, v5, v4
2107 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
2108 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
2109 ; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2
2110 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
2111 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
2112 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
2113 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2114 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0
2115 ; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
2116 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 25
2117 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
2118 ; GFX9-NEXT: s_endpgm
2120 ; EG-LABEL: v_sdiv_i25:
2122 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2124 ; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
2125 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2128 ; EG-NEXT: Fetch clause starting at 6:
2129 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
2130 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2131 ; EG-NEXT: ALU clause starting at 10:
2132 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
2133 ; EG-NEXT: MOV * T1.X, PV.X,
2134 ; EG-NEXT: ALU clause starting at 12:
2135 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2136 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2137 ; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
2138 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2139 ; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W,
2140 ; EG-NEXT: ADD_INT T0.W, T0.W, PV.W,
2141 ; EG-NEXT: LSHL * T2.W, T1.X, literal.x,
2142 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2143 ; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W,
2144 ; EG-NEXT: SUB_INT T0.Z, 0.0, PV.W,
2145 ; EG-NEXT: ASHR T2.W, T2.W, literal.x,
2146 ; EG-NEXT: RECIP_UINT * T0.X, PV.W,
2147 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2148 ; EG-NEXT: SETGT_INT T3.W, 0.0, PV.W,
2149 ; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PS,
2150 ; EG-NEXT: ADD_INT T2.W, T2.W, PV.W,
2151 ; EG-NEXT: MULHI * T0.Y, T0.X, PS,
2152 ; EG-NEXT: ADD_INT T4.W, T0.X, PS,
2153 ; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W,
2154 ; EG-NEXT: MULHI * T0.X, PS, PV.W,
2155 ; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
2156 ; EG-NEXT: SUB_INT * T2.W, T2.W, PS,
2157 ; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
2158 ; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.W,
2159 ; EG-NEXT: SUB_INT * T5.W, PV.W, T0.W,
2160 ; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS,
2161 ; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
2162 ; EG-NEXT: ADD_INT T5.W, PS, 1,
2163 ; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W,
2164 ; EG-NEXT: CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
2165 ; EG-NEXT: XOR_INT * T1.W, T3.W, T1.W,
2166 ; EG-NEXT: XOR_INT * T0.W, PV.W, PS,
2167 ; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W,
2168 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2169 ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
2170 ; EG-NEXT: ASHR T0.X, PV.W, literal.x,
2171 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
2172 ; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
2173 %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
2174 %num = load i25, i25 addrspace(1) * %in
2175 %den = load i25, i25 addrspace(1) * %den_ptr
2176 %result = sdiv i25 %num, %den
2177 %result.ext = sext i25 %result to i32
2178 store i32 %result.ext, i32 addrspace(1)* %out
2182 ; Tests for 64-bit divide bypass. The three definitions below are commented out, so they are not compiled or checked.
2183 ; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2184 ; %result = sdiv i64 %a, %b
2185 ; store i64 %result, i64 addrspace(1)* %out, align 8
2189 ; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2190 ; %result = srem i64 %a, %b
2191 ; store i64 %result, i64 addrspace(1)* %out, align 8
2195 ; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2196 ; %resultdiv = sdiv i64 %a, %b
2197 ; %resultrem = srem i64 %a, %b
2198 ; %result = add i64 %resultdiv, %resultrem
2199 ; store i64 %result, i64 addrspace(1)* %out, align 8
; Divides every lane of a <4 x i32> loaded from %in by the constant 53668 and
; stores the resulting vector to %out.
; The checked output contains no division sequence. Each lane is handled
; separately by one signed multiply-high with 0x1389c755 (decimal 327796565;
; v_mul_hi_i32 on GCN, TONGA and GFX9, MULHI_INT on EG), an arithmetic shift
; right by 12, and an add of the multiply-high result's top bit (logical shift
; right by 31).
2203 define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
2204 ; GCN-LABEL: scalarize_mulhs_4xi32:
2206 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2207 ; GCN-NEXT: s_mov_b32 s3, 0xf000
2208 ; GCN-NEXT: s_mov_b32 s2, -1
2209 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
2210 ; GCN-NEXT: s_mov_b32 s0, s4
2211 ; GCN-NEXT: s_mov_b32 s1, s5
2212 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
2213 ; GCN-NEXT: s_mov_b32 s4, 0x1389c755
2214 ; GCN-NEXT: s_mov_b32 s0, s6
2215 ; GCN-NEXT: s_mov_b32 s1, s7
2216 ; GCN-NEXT: s_waitcnt vmcnt(0)
2217 ; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
2218 ; GCN-NEXT: v_mul_hi_i32 v1, v1, s4
2219 ; GCN-NEXT: v_mul_hi_i32 v2, v2, s4
2220 ; GCN-NEXT: v_mul_hi_i32 v3, v3, s4
2221 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0
2222 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0
2223 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1
2224 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 12, v1
2225 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 31, v2
2226 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 12, v2
2227 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 31, v3
2228 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3
2229 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
2230 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
2231 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
2232 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
2233 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2234 ; GCN-NEXT: s_endpgm
2236 ; TONGA-LABEL: scalarize_mulhs_4xi32:
2238 ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2239 ; TONGA-NEXT: s_mov_b32 s3, 0xf000
2240 ; TONGA-NEXT: s_mov_b32 s2, -1
2241 ; TONGA-NEXT: s_waitcnt lgkmcnt(0)
2242 ; TONGA-NEXT: s_mov_b32 s0, s4
2243 ; TONGA-NEXT: s_mov_b32 s1, s5
2244 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
2245 ; TONGA-NEXT: s_mov_b32 s4, 0x1389c755
2246 ; TONGA-NEXT: s_mov_b32 s0, s6
2247 ; TONGA-NEXT: s_mov_b32 s1, s7
2248 ; TONGA-NEXT: s_waitcnt vmcnt(0)
2249 ; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4
2250 ; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4
2251 ; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4
2252 ; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4
2253 ; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
2254 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
2255 ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
2256 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 12, v1
2257 ; TONGA-NEXT: v_lshrrev_b32_e32 v6, 31, v2
2258 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 12, v2
2259 ; TONGA-NEXT: v_lshrrev_b32_e32 v7, 31, v3
2260 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3
2261 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
2262 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
2263 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
2264 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
2265 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2266 ; TONGA-NEXT: s_endpgm
2268 ; GFX9-LABEL: scalarize_mulhs_4xi32:
2270 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2271 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
2272 ; GFX9-NEXT: s_mov_b32 s2, -1
2273 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2274 ; GFX9-NEXT: s_mov_b32 s0, s4
2275 ; GFX9-NEXT: s_mov_b32 s1, s5
2276 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
2277 ; GFX9-NEXT: s_mov_b32 s4, 0x1389c755
2278 ; GFX9-NEXT: s_mov_b32 s0, s6
2279 ; GFX9-NEXT: s_mov_b32 s1, s7
2280 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2281 ; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4
2282 ; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4
2283 ; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4
2284 ; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4
2285 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
2286 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
2287 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
2288 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 12, v1
2289 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v2
2290 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 12, v2
2291 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 31, v3
2292 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 12, v3
2293 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
2294 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
2295 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
2296 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
2297 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2298 ; GFX9-NEXT: s_endpgm
2300 ; EG-LABEL: scalarize_mulhs_4xi32:
2302 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
2304 ; EG-NEXT: ALU 25, @9, KC0[CB0:0-32], KC1[]
2305 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2308 ; EG-NEXT: Fetch clause starting at 6:
2309 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
2310 ; EG-NEXT: ALU clause starting at 8:
2311 ; EG-NEXT: MOV * T0.X, KC0[2].Y,
2312 ; EG-NEXT: ALU clause starting at 9:
2313 ; EG-NEXT: MULHI_INT * T0.W, T0.W, literal.x,
2314 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2315 ; EG-NEXT: ASHR T1.Z, PS, literal.x,
2316 ; EG-NEXT: LSHR T0.W, PS, literal.y,
2317 ; EG-NEXT: MULHI_INT * T0.Z, T0.Z, literal.z,
2318 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2319 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2320 ; EG-NEXT: ASHR T1.Y, PS, literal.x,
2321 ; EG-NEXT: LSHR T0.Z, PS, literal.y,
2322 ; EG-NEXT: ADD_INT T0.W, PV.Z, PV.W,
2323 ; EG-NEXT: MULHI_INT * T0.Y, T0.Y, literal.z,
2324 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2325 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2326 ; EG-NEXT: ASHR T2.Y, PS, literal.x,
2327 ; EG-NEXT: ADD_INT T0.Z, PV.Y, PV.Z,
2328 ; EG-NEXT: LSHR T1.W, PS, literal.y,
2329 ; EG-NEXT: MULHI_INT * T0.X, T0.X, literal.z,
2330 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2331 ; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
2332 ; EG-NEXT: ADD_INT T0.Y, PV.Y, PV.W,
2333 ; EG-NEXT: ASHR T1.W, PS, literal.x,
2334 ; EG-NEXT: LSHR * T2.W, PS, literal.y,
2335 ; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
2336 ; EG-NEXT: ADD_INT T0.X, PV.W, PS,
2337 ; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
2338 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2339 %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
2340 %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2341 store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16