1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; Uniform i16 multiply: both operands are `inreg` and the checks expect scalar
; (s_*) instructions only. The gfx7 checks expect a bare s_mul_i32, while the
; gfx8-and-later checks expect both inputs to be masked with 0xffff first.
8 define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
9 ; GFX7-LABEL: s_mul_i16:
11 ; GFX7-NEXT: s_mul_i32 s0, s0, s1
12 ; GFX7-NEXT: ; return to shader part epilog
14 ; GFX8-LABEL: s_mul_i16:
16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
17 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
18 ; GFX8-NEXT: s_mul_i32 s0, s0, s1
19 ; GFX8-NEXT: ; return to shader part epilog
21 ; GFX9-LABEL: s_mul_i16:
23 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
24 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
25 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
26 ; GFX9-NEXT: ; return to shader part epilog
28 ; GFX10PLUS-LABEL: s_mul_i16:
30 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
31 ; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
32 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
33 ; GFX10PLUS-NEXT: ; return to shader part epilog
34 %result = mul i16 %num, %den
; i16 multiply with the default calling convention; the checks expect vector
; (v_*) instructions. The gfx7 checks mask both inputs with 0xffff and use
; v_mul_u32_u24; the later targets' checks expect a single v_mul_lo_u16.
38 define i16 @v_mul_i16(i16 %num, i16 %den) {
39 ; GFX7-LABEL: v_mul_i16:
41 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
43 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
44 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
45 ; GFX7-NEXT: s_setpc_b64 s[30:31]
47 ; GFX8-LABEL: v_mul_i16:
49 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50 ; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
51 ; GFX8-NEXT: s_setpc_b64 s[30:31]
53 ; GFX9-LABEL: v_mul_i16:
55 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
57 ; GFX9-NEXT: s_setpc_b64 s[30:31]
59 ; GFX10PLUS-LABEL: v_mul_i16:
61 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
63 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
64 %result = mul i16 %num, %den
; Uniform i16 multiply where both `inreg` operands and the return value are
; marked zeroext. Every target's checks end by masking the product with 0xffff;
; the gfx8-and-later checks additionally mask both inputs before multiplying.
68 define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
69 ; GFX7-LABEL: s_mul_i16_zeroext:
71 ; GFX7-NEXT: s_mul_i32 s0, s0, s1
72 ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
73 ; GFX7-NEXT: ; return to shader part epilog
75 ; GFX8-LABEL: s_mul_i16_zeroext:
77 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
78 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
79 ; GFX8-NEXT: s_mul_i32 s0, s0, s1
80 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
81 ; GFX8-NEXT: ; return to shader part epilog
83 ; GFX9-LABEL: s_mul_i16_zeroext:
85 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
86 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
87 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
88 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
89 ; GFX9-NEXT: ; return to shader part epilog
91 ; GFX10PLUS-LABEL: s_mul_i16_zeroext:
93 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
94 ; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
95 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
96 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
97 ; GFX10PLUS-NEXT: ; return to shader part epilog
98 %result = mul i16 %num, %den
; i16 multiply (default calling convention) with zeroext operands and return.
; Expected code differs per target: the gfx7 checks multiply with
; v_mul_u32_u24 and then mask the result, the gfx8/gfx9 checks expect only
; v_mul_lo_u16, and the gfx10-and-later checks expect v_mul_lo_u16 followed by
; a 0xffff mask of the result.
102 define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
103 ; GFX7-LABEL: v_mul_i16_zeroext:
105 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
107 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
108 ; GFX7-NEXT: s_setpc_b64 s[30:31]
110 ; GFX8-LABEL: v_mul_i16_zeroext:
112 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113 ; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
114 ; GFX8-NEXT: s_setpc_b64 s[30:31]
116 ; GFX9-LABEL: v_mul_i16_zeroext:
118 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
120 ; GFX9-NEXT: s_setpc_b64 s[30:31]
122 ; GFX10PLUS-LABEL: v_mul_i16_zeroext:
123 ; GFX10PLUS: ; %bb.0:
124 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125 ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
126 ; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
127 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
128 %result = mul i16 %num, %den
; Uniform i16 multiply where both `inreg` operands and the return value are
; marked signext. Every target's checks end with s_sext_i32_i16 on the product;
; the gfx8-and-later checks also mask both inputs with 0xffff before multiplying.
132 define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
133 ; GFX7-LABEL: s_mul_i16_signext:
135 ; GFX7-NEXT: s_mul_i32 s0, s0, s1
136 ; GFX7-NEXT: s_sext_i32_i16 s0, s0
137 ; GFX7-NEXT: ; return to shader part epilog
139 ; GFX8-LABEL: s_mul_i16_signext:
141 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
142 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
143 ; GFX8-NEXT: s_mul_i32 s0, s0, s1
144 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
145 ; GFX8-NEXT: ; return to shader part epilog
147 ; GFX9-LABEL: s_mul_i16_signext:
149 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
150 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
151 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
152 ; GFX9-NEXT: s_sext_i32_i16 s0, s0
153 ; GFX9-NEXT: ; return to shader part epilog
155 ; GFX10PLUS-LABEL: s_mul_i16_signext:
156 ; GFX10PLUS: ; %bb.0:
157 ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
158 ; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
159 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
160 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
161 ; GFX10PLUS-NEXT: ; return to shader part epilog
162 %result = mul i16 %num, %den
; i16 multiply (default calling convention) with signext operands and return.
; Every target's checks sign-extend the product with v_bfe_i32 (offset 0,
; width 16). The gfx7 checks mask both inputs and use v_mul_u32_u24; the later
; targets' checks use v_mul_lo_u16 directly.
166 define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
167 ; GFX7-LABEL: v_mul_i16_signext:
169 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
170 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
171 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
172 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
173 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
174 ; GFX7-NEXT: s_setpc_b64 s[30:31]
176 ; GFX8-LABEL: v_mul_i16_signext:
178 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179 ; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
180 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
181 ; GFX8-NEXT: s_setpc_b64 s[30:31]
183 ; GFX9-LABEL: v_mul_i16_signext:
185 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186 ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
187 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
188 ; GFX9-NEXT: s_setpc_b64 s[30:31]
190 ; GFX10PLUS-LABEL: v_mul_i16_signext:
191 ; GFX10PLUS: ; %bb.0:
192 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193 ; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
194 ; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 16
195 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
196 %result = mul i16 %num, %den
; Uniform i32 multiply of two `inreg` operands. The checks expect a single
; s_mul_i32 on every target; gfx7/gfx8/gfx9 share one common check prefix here.
200 define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
201 ; GCN-LABEL: s_mul_i32:
203 ; GCN-NEXT: s_mul_i32 s0, s0, s1
204 ; GCN-NEXT: ; return to shader part epilog
206 ; GFX10PLUS-LABEL: s_mul_i32:
207 ; GFX10PLUS: ; %bb.0:
208 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
209 ; GFX10PLUS-NEXT: ; return to shader part epilog
210 %result = mul i32 %num, %den
; i32 multiply with the default calling convention. The checks expect a single
; v_mul_lo_u32 on every target.
214 define i32 @v_mul_i32(i32 %num, i32 %den) {
215 ; GCN-LABEL: v_mul_i32:
217 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
219 ; GCN-NEXT: s_setpc_b64 s[30:31]
221 ; GFX10PLUS-LABEL: v_mul_i32:
222 ; GFX10PLUS: ; %bb.0:
223 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224 ; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1
225 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
226 %result = mul i32 %num, %den
; Uniform element-wise multiply of two `inreg` <2 x i32> vectors. The checks
; expect one s_mul_i32 per element (s0*s2 and s1*s3) on every target.
230 define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
231 ; GCN-LABEL: s_mul_v2i32:
233 ; GCN-NEXT: s_mul_i32 s0, s0, s2
234 ; GCN-NEXT: s_mul_i32 s1, s1, s3
235 ; GCN-NEXT: ; return to shader part epilog
237 ; GFX10PLUS-LABEL: s_mul_v2i32:
238 ; GFX10PLUS: ; %bb.0:
239 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
240 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s3
241 ; GFX10PLUS-NEXT: ; return to shader part epilog
242 %result = mul <2 x i32> %num, %den
243 ret <2 x i32> %result
; Element-wise multiply of two <2 x i32> vectors with the default calling
; convention. The checks expect one v_mul_lo_u32 per element (v0*v2 and v1*v3)
; on every target.
246 define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
247 ; GCN-LABEL: v_mul_v2i32:
249 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
251 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v3
252 ; GCN-NEXT: s_setpc_b64 s[30:31]
254 ; GFX10PLUS-LABEL: v_mul_v2i32:
255 ; GFX10PLUS: ; %bb.0:
256 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257 ; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2
258 ; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3
259 ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
260 %result = mul <2 x i32> %num, %den
261 ret <2 x i32> %result
; Uniform i33 multiply of two `inreg` operands; each operand occupies two
; SGPRs in the checks (s0/s1 and s2/s3). The expected sequences are a
; two-register multiply: low product, high half of the low product, and the
; two cross products added into the upper register. The gfx7/gfx8 checks
; compute the high half with v_mul_hi_u32 and copy it back with
; v_readfirstlane_b32; the gfx9-and-later checks use s_mul_hi_u32.
; NOTE(review): this function uses the amdgpu_cs calling convention while the
; other scalar tests visible here use amdgpu_ps - confirm this is intentional.
264 define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
265 ; GFX7-LABEL: s_mul_i33:
267 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
268 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
269 ; GFX7-NEXT: s_mul_i32 s4, s0, s2
270 ; GFX7-NEXT: s_mul_i32 s0, s0, s3
271 ; GFX7-NEXT: s_mul_i32 s1, s1, s2
272 ; GFX7-NEXT: v_readfirstlane_b32 s5, v0
273 ; GFX7-NEXT: s_add_u32 s0, s0, s5
274 ; GFX7-NEXT: s_add_u32 s1, s1, s0
275 ; GFX7-NEXT: s_mov_b32 s0, s4
276 ; GFX7-NEXT: ; return to shader part epilog
278 ; GFX8-LABEL: s_mul_i33:
280 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
281 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
282 ; GFX8-NEXT: s_mul_i32 s4, s0, s2
283 ; GFX8-NEXT: s_mul_i32 s0, s0, s3
284 ; GFX8-NEXT: s_mul_i32 s1, s1, s2
285 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0
286 ; GFX8-NEXT: s_add_u32 s0, s0, s5
287 ; GFX8-NEXT: s_add_u32 s1, s1, s0
288 ; GFX8-NEXT: s_mov_b32 s0, s4
289 ; GFX8-NEXT: ; return to shader part epilog
291 ; GFX9-LABEL: s_mul_i33:
293 ; GFX9-NEXT: s_mul_i32 s4, s0, s2
294 ; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2
295 ; GFX9-NEXT: s_mul_i32 s0, s0, s3
296 ; GFX9-NEXT: s_add_u32 s0, s0, s5
297 ; GFX9-NEXT: s_mul_i32 s1, s1, s2
298 ; GFX9-NEXT: s_add_u32 s1, s1, s0
299 ; GFX9-NEXT: s_mov_b32 s0, s4
300 ; GFX9-NEXT: ; return to shader part epilog
302 ; GFX10PLUS-LABEL: s_mul_i33:
303 ; GFX10PLUS: ; %bb.0:
304 ; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2
305 ; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3
306 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2
307 ; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3
308 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
309 ; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
310 ; GFX10PLUS-NEXT: ; return to shader part epilog
311 %result = mul i33 %num, %den
; Uniform i64 multiply of two `inreg` operands (s0/s1 times s2/s3 in the
; checks). The expected sequences form the low product, the high half of the
; low product, and the two cross products, summed into the upper register.
; The gfx7/gfx8 checks obtain the high half through v_mul_hi_u32 plus
; v_readfirstlane_b32; the gfx9-and-later checks use s_mul_hi_u32.
315 define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
316 ; GFX7-LABEL: s_mul_i64:
318 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
319 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
320 ; GFX7-NEXT: s_mul_i32 s4, s0, s2
321 ; GFX7-NEXT: s_mul_i32 s0, s0, s3
322 ; GFX7-NEXT: s_mul_i32 s1, s1, s2
323 ; GFX7-NEXT: v_readfirstlane_b32 s5, v0
324 ; GFX7-NEXT: s_add_u32 s0, s0, s5
325 ; GFX7-NEXT: s_add_u32 s1, s1, s0
326 ; GFX7-NEXT: s_mov_b32 s0, s4
327 ; GFX7-NEXT: ; return to shader part epilog
329 ; GFX8-LABEL: s_mul_i64:
331 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
332 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
333 ; GFX8-NEXT: s_mul_i32 s4, s0, s2
334 ; GFX8-NEXT: s_mul_i32 s0, s0, s3
335 ; GFX8-NEXT: s_mul_i32 s1, s1, s2
336 ; GFX8-NEXT: v_readfirstlane_b32 s5, v0
337 ; GFX8-NEXT: s_add_u32 s0, s0, s5
338 ; GFX8-NEXT: s_add_u32 s1, s1, s0
339 ; GFX8-NEXT: s_mov_b32 s0, s4
340 ; GFX8-NEXT: ; return to shader part epilog
342 ; GFX9-LABEL: s_mul_i64:
344 ; GFX9-NEXT: s_mul_i32 s4, s0, s2
345 ; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2
346 ; GFX9-NEXT: s_mul_i32 s0, s0, s3
347 ; GFX9-NEXT: s_add_u32 s0, s0, s5
348 ; GFX9-NEXT: s_mul_i32 s1, s1, s2
349 ; GFX9-NEXT: s_add_u32 s1, s1, s0
350 ; GFX9-NEXT: s_mov_b32 s0, s4
351 ; GFX9-NEXT: ; return to shader part epilog
353 ; GFX10PLUS-LABEL: s_mul_i64:
354 ; GFX10PLUS: ; %bb.0:
355 ; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2
356 ; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3
357 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2
358 ; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3
359 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
360 ; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
361 ; GFX10PLUS-NEXT: ; return to shader part epilog
362 %result = mul i64 %num, %den
; i64 multiply with the default calling convention. The gfx7/gfx8/gfx9 checks
; (shared prefix) expect three chained v_mad_u64_u32. The gfx10 and gfx11
; checks expect one v_mad_u64_u32 for the low product plus two v_mul_lo_u32
; cross products combined with v_add3_u32. gfx10 and gfx11 are checked under
; separate prefixes here: the gfx11 lines use v_dual_mov_b32 and a `null`
; carry-out operand where the gfx10 lines use two v_mov_b32 and s4.
366 define i64 @v_mul_i64(i64 %num, i64 %den) {
367 ; GCN-LABEL: v_mul_i64:
369 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370 ; GCN-NEXT: v_mov_b32_e32 v4, v0
371 ; GCN-NEXT: v_mov_b32_e32 v5, v1
372 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
373 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
374 ; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
375 ; GCN-NEXT: s_setpc_b64 s[30:31]
377 ; GFX10-LABEL: v_mul_i64:
379 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
381 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
382 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0
383 ; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3
384 ; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2
385 ; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2
386 ; GFX10-NEXT: s_setpc_b64 s[30:31]
388 ; GFX11-LABEL: v_mul_i64:
390 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391 ; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
392 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
393 ; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
394 ; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2
395 ; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2
396 ; GFX11-NEXT: s_setpc_b64 s[30:31]
397 %result = mul i64 %num, %den
; Uniform i96 multiply of two `inreg` operands; the i96 product is bitcast to
; <3 x i32> for the return. In the checks the operands occupy s0-s2 and s3-s5.
; The gfx7/gfx8 checks compute each high-half product with v_mul_hi_u32 and
; move it back with v_readfirstlane_b32; the gfx9-and-later checks use
; s_mul_hi_u32. Carries between partial sums are expected as
; s_add_u32 / s_addc_u32 pairs.
401 define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
402 ; GFX7-LABEL: s_mul_i96:
404 ; GFX7-NEXT: v_mov_b32_e32 v0, s3
405 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
406 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
407 ; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1
408 ; GFX7-NEXT: s_mul_i32 s5, s0, s5
409 ; GFX7-NEXT: v_readfirstlane_b32 s7, v0
410 ; GFX7-NEXT: s_mul_i32 s8, s1, s4
411 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
412 ; GFX7-NEXT: s_add_u32 s5, s8, s5
413 ; GFX7-NEXT: s_mul_i32 s2, s2, s3
414 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s3
415 ; GFX7-NEXT: s_mul_i32 s6, s0, s3
416 ; GFX7-NEXT: s_add_u32 s2, s2, s5
417 ; GFX7-NEXT: s_mul_i32 s0, s0, s4
418 ; GFX7-NEXT: v_readfirstlane_b32 s4, v1
419 ; GFX7-NEXT: s_add_u32 s0, s0, s7
420 ; GFX7-NEXT: s_addc_u32 s2, s4, s2
421 ; GFX7-NEXT: s_mul_i32 s1, s1, s3
422 ; GFX7-NEXT: v_readfirstlane_b32 s3, v0
423 ; GFX7-NEXT: s_add_u32 s1, s1, s0
424 ; GFX7-NEXT: s_addc_u32 s2, s3, s2
425 ; GFX7-NEXT: s_mov_b32 s0, s6
426 ; GFX7-NEXT: ; return to shader part epilog
428 ; GFX8-LABEL: s_mul_i96:
430 ; GFX8-NEXT: v_mov_b32_e32 v0, s3
431 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
432 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
433 ; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1
434 ; GFX8-NEXT: s_mul_i32 s5, s0, s5
435 ; GFX8-NEXT: v_readfirstlane_b32 s7, v0
436 ; GFX8-NEXT: s_mul_i32 s8, s1, s4
437 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
438 ; GFX8-NEXT: s_add_u32 s5, s8, s5
439 ; GFX8-NEXT: s_mul_i32 s2, s2, s3
440 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s3
441 ; GFX8-NEXT: s_mul_i32 s6, s0, s3
442 ; GFX8-NEXT: s_add_u32 s2, s2, s5
443 ; GFX8-NEXT: s_mul_i32 s0, s0, s4
444 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1
445 ; GFX8-NEXT: s_add_u32 s0, s0, s7
446 ; GFX8-NEXT: s_addc_u32 s2, s4, s2
447 ; GFX8-NEXT: s_mul_i32 s1, s1, s3
448 ; GFX8-NEXT: v_readfirstlane_b32 s3, v0
449 ; GFX8-NEXT: s_add_u32 s1, s1, s0
450 ; GFX8-NEXT: s_addc_u32 s2, s3, s2
451 ; GFX8-NEXT: s_mov_b32 s0, s6
452 ; GFX8-NEXT: ; return to shader part epilog
454 ; GFX9-LABEL: s_mul_i96:
456 ; GFX9-NEXT: s_mul_i32 s5, s0, s5
457 ; GFX9-NEXT: s_mul_i32 s8, s1, s4
458 ; GFX9-NEXT: s_add_u32 s5, s8, s5
459 ; GFX9-NEXT: s_mul_i32 s2, s2, s3
460 ; GFX9-NEXT: s_mul_hi_u32 s7, s0, s3
461 ; GFX9-NEXT: s_add_u32 s2, s2, s5
462 ; GFX9-NEXT: s_mul_i32 s5, s0, s4
463 ; GFX9-NEXT: s_mul_i32 s6, s0, s3
464 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4
465 ; GFX9-NEXT: s_add_u32 s4, s5, s7
466 ; GFX9-NEXT: s_addc_u32 s0, s0, s2
467 ; GFX9-NEXT: s_mul_i32 s2, s1, s3
468 ; GFX9-NEXT: s_mul_hi_u32 s3, s1, s3
469 ; GFX9-NEXT: s_add_u32 s1, s2, s4
470 ; GFX9-NEXT: s_addc_u32 s2, s3, s0
471 ; GFX9-NEXT: s_mov_b32 s0, s6
472 ; GFX9-NEXT: ; return to shader part epilog
474 ; GFX10PLUS-LABEL: s_mul_i96:
475 ; GFX10PLUS: ; %bb.0:
476 ; GFX10PLUS-NEXT: s_mul_i32 s6, s0, s5
477 ; GFX10PLUS-NEXT: s_mul_i32 s7, s1, s4
478 ; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s3
479 ; GFX10PLUS-NEXT: s_add_i32 s6, s6, s7
480 ; GFX10PLUS-NEXT: s_mul_hi_u32 s7, s0, s3
481 ; GFX10PLUS-NEXT: s_add_i32 s6, s6, s2
482 ; GFX10PLUS-NEXT: s_mul_i32 s2, s0, s4
483 ; GFX10PLUS-NEXT: s_mul_i32 s5, s0, s3
484 ; GFX10PLUS-NEXT: s_mul_hi_u32 s0, s0, s4
485 ; GFX10PLUS-NEXT: s_add_u32 s2, s2, s7
486 ; GFX10PLUS-NEXT: s_mul_i32 s4, s1, s3
487 ; GFX10PLUS-NEXT: s_addc_u32 s0, s0, s6
488 ; GFX10PLUS-NEXT: s_mul_hi_u32 s3, s1, s3
489 ; GFX10PLUS-NEXT: s_add_u32 s1, s4, s2
490 ; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0
491 ; GFX10PLUS-NEXT: s_mov_b32 s0, s5
492 ; GFX10PLUS-NEXT: ; return to shader part epilog
493 %result = mul i96 %num, %den
494 %cast = bitcast i96 %result to <3 x i32>
; i96 multiply with the default calling convention (operands in v0-v2 and
; v3-v5 in the checks). The gfx7/gfx8/gfx9 checks (shared prefix) expect the
; whole product to be built from chained v_mad_u64_u32. The gfx10 and gfx11
; checks expect the three products feeding only the top dword to use
; v_mul_lo_u32 combined with v_add3_u32, with v_mad_u64_u32 for the rest.
498 define i96 @v_mul_i96(i96 %num, i96 %den) {
499 ; GCN-LABEL: v_mul_i96:
501 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502 ; GCN-NEXT: v_mov_b32_e32 v6, v0
503 ; GCN-NEXT: v_mov_b32_e32 v7, v1
504 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
505 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
506 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
507 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
508 ; GCN-NEXT: v_mov_b32_e32 v2, v8
509 ; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
510 ; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
511 ; GCN-NEXT: s_setpc_b64 s[30:31]
513 ; GFX10-LABEL: v_mul_i96:
515 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
516 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
517 ; GFX10-NEXT: v_mov_b32_e32 v7, v1
518 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
519 ; GFX10-NEXT: v_mul_lo_u32 v5, v6, v5
520 ; GFX10-NEXT: v_mul_lo_u32 v8, v7, v4
521 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0
522 ; GFX10-NEXT: v_add3_u32 v2, v5, v8, v2
523 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
524 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
525 ; GFX10-NEXT: s_setpc_b64 s[30:31]
527 ; GFX11-LABEL: v_mul_i96:
529 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
530 ; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
531 ; GFX11-NEXT: v_mul_lo_u32 v2, v2, v3
532 ; GFX11-NEXT: v_mul_lo_u32 v5, v6, v5
533 ; GFX11-NEXT: v_mul_lo_u32 v8, v7, v4
534 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
535 ; GFX11-NEXT: v_add3_u32 v2, v5, v8, v2
536 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
537 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
538 ; GFX11-NEXT: s_setpc_b64 s[30:31]
539 %result = mul i96 %num, %den
; Uniform i128 multiply of two `inreg` operands; the i128 product is bitcast
; to <4 x i32> for the return. In the checks the operands occupy s0-s3 and
; s4-s7. The gfx7/gfx8 checks compute high-half products with v_mul_hi_u32
; and move them back with v_readfirstlane_b32; the gfx9-and-later checks use
; s_mul_hi_u32. One intermediate carry is expected to be saved with
; s_cselect_b32 and re-materialized later via s_cmp_lg_u32 before the
; following s_addc_u32.
543 define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
544 ; GFX7-LABEL: s_mul_i128:
546 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
547 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
548 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
549 ; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1
550 ; GFX7-NEXT: s_mul_i32 s10, s0, s6
551 ; GFX7-NEXT: v_readfirstlane_b32 s9, v0
552 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
553 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
554 ; GFX7-NEXT: v_readfirstlane_b32 s13, v2
555 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
556 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
557 ; GFX7-NEXT: s_mul_i32 s12, s1, s5
558 ; GFX7-NEXT: v_readfirstlane_b32 s11, v0
559 ; GFX7-NEXT: s_add_u32 s10, s12, s10
560 ; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1
561 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
562 ; GFX7-NEXT: s_addc_u32 s11, s13, s11
563 ; GFX7-NEXT: s_mul_i32 s12, s2, s4
564 ; GFX7-NEXT: v_readfirstlane_b32 s13, v2
565 ; GFX7-NEXT: s_add_u32 s10, s12, s10
566 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s4
567 ; GFX7-NEXT: s_addc_u32 s11, s13, s11
568 ; GFX7-NEXT: s_mul_i32 s12, s0, s5
569 ; GFX7-NEXT: v_readfirstlane_b32 s13, v1
570 ; GFX7-NEXT: s_add_u32 s9, s12, s9
571 ; GFX7-NEXT: s_addc_u32 s10, s13, s10
572 ; GFX7-NEXT: s_mul_i32 s13, s1, s4
573 ; GFX7-NEXT: s_cselect_b32 s12, 1, 0
574 ; GFX7-NEXT: v_readfirstlane_b32 s14, v0
575 ; GFX7-NEXT: s_add_u32 s9, s13, s9
576 ; GFX7-NEXT: s_mul_i32 s8, s0, s4
577 ; GFX7-NEXT: s_addc_u32 s10, s14, s10
578 ; GFX7-NEXT: s_mul_i32 s0, s0, s7
579 ; GFX7-NEXT: s_addc_u32 s0, s11, s0
580 ; GFX7-NEXT: s_mul_i32 s1, s1, s6
581 ; GFX7-NEXT: s_cmp_lg_u32 s12, 0
582 ; GFX7-NEXT: s_addc_u32 s0, s0, s1
583 ; GFX7-NEXT: s_mul_i32 s2, s2, s5
584 ; GFX7-NEXT: s_add_u32 s0, s2, s0
585 ; GFX7-NEXT: s_mul_i32 s3, s3, s4
586 ; GFX7-NEXT: s_add_u32 s3, s3, s0
587 ; GFX7-NEXT: s_mov_b32 s0, s8
588 ; GFX7-NEXT: s_mov_b32 s1, s9
589 ; GFX7-NEXT: s_mov_b32 s2, s10
590 ; GFX7-NEXT: ; return to shader part epilog
592 ; GFX8-LABEL: s_mul_i128:
594 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
595 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
596 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
597 ; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1
598 ; GFX8-NEXT: s_mul_i32 s10, s0, s6
599 ; GFX8-NEXT: v_readfirstlane_b32 s9, v0
600 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
601 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
602 ; GFX8-NEXT: v_readfirstlane_b32 s13, v2
603 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
604 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
605 ; GFX8-NEXT: s_mul_i32 s12, s1, s5
606 ; GFX8-NEXT: v_readfirstlane_b32 s11, v0
607 ; GFX8-NEXT: s_add_u32 s10, s12, s10
608 ; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1
609 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
610 ; GFX8-NEXT: s_addc_u32 s11, s13, s11
611 ; GFX8-NEXT: s_mul_i32 s12, s2, s4
612 ; GFX8-NEXT: v_readfirstlane_b32 s13, v2
613 ; GFX8-NEXT: s_add_u32 s10, s12, s10
614 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s4
615 ; GFX8-NEXT: s_addc_u32 s11, s13, s11
616 ; GFX8-NEXT: s_mul_i32 s12, s0, s5
617 ; GFX8-NEXT: v_readfirstlane_b32 s13, v1
618 ; GFX8-NEXT: s_add_u32 s9, s12, s9
619 ; GFX8-NEXT: s_addc_u32 s10, s13, s10
620 ; GFX8-NEXT: s_mul_i32 s13, s1, s4
621 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0
622 ; GFX8-NEXT: v_readfirstlane_b32 s14, v0
623 ; GFX8-NEXT: s_add_u32 s9, s13, s9
624 ; GFX8-NEXT: s_mul_i32 s8, s0, s4
625 ; GFX8-NEXT: s_addc_u32 s10, s14, s10
626 ; GFX8-NEXT: s_mul_i32 s0, s0, s7
627 ; GFX8-NEXT: s_addc_u32 s0, s11, s0
628 ; GFX8-NEXT: s_mul_i32 s1, s1, s6
629 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0
630 ; GFX8-NEXT: s_addc_u32 s0, s0, s1
631 ; GFX8-NEXT: s_mul_i32 s2, s2, s5
632 ; GFX8-NEXT: s_add_u32 s0, s2, s0
633 ; GFX8-NEXT: s_mul_i32 s3, s3, s4
634 ; GFX8-NEXT: s_add_u32 s3, s3, s0
635 ; GFX8-NEXT: s_mov_b32 s0, s8
636 ; GFX8-NEXT: s_mov_b32 s1, s9
637 ; GFX8-NEXT: s_mov_b32 s2, s10
638 ; GFX8-NEXT: ; return to shader part epilog
640 ; GFX9-LABEL: s_mul_i128:
642 ; GFX9-NEXT: s_mul_i32 s10, s0, s6
643 ; GFX9-NEXT: s_mul_i32 s12, s1, s5
644 ; GFX9-NEXT: s_mul_hi_u32 s11, s0, s6
645 ; GFX9-NEXT: s_mul_hi_u32 s13, s1, s5
646 ; GFX9-NEXT: s_add_u32 s10, s12, s10
647 ; GFX9-NEXT: s_addc_u32 s11, s13, s11
648 ; GFX9-NEXT: s_mul_i32 s12, s2, s4
649 ; GFX9-NEXT: s_mul_hi_u32 s13, s2, s4
650 ; GFX9-NEXT: s_add_u32 s10, s12, s10
651 ; GFX9-NEXT: s_mul_hi_u32 s9, s0, s4
652 ; GFX9-NEXT: s_addc_u32 s11, s13, s11
653 ; GFX9-NEXT: s_mul_i32 s12, s0, s5
654 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s5
655 ; GFX9-NEXT: s_add_u32 s9, s12, s9
656 ; GFX9-NEXT: s_addc_u32 s10, s13, s10
657 ; GFX9-NEXT: s_mul_i32 s13, s1, s4
658 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0
659 ; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
660 ; GFX9-NEXT: s_add_u32 s9, s13, s9
661 ; GFX9-NEXT: s_mul_i32 s8, s0, s4
662 ; GFX9-NEXT: s_addc_u32 s10, s14, s10
663 ; GFX9-NEXT: s_mul_i32 s0, s0, s7
664 ; GFX9-NEXT: s_addc_u32 s0, s11, s0
665 ; GFX9-NEXT: s_mul_i32 s1, s1, s6
666 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0
667 ; GFX9-NEXT: s_addc_u32 s0, s0, s1
668 ; GFX9-NEXT: s_mul_i32 s2, s2, s5
669 ; GFX9-NEXT: s_add_u32 s0, s2, s0
670 ; GFX9-NEXT: s_mul_i32 s3, s3, s4
671 ; GFX9-NEXT: s_add_u32 s3, s3, s0
672 ; GFX9-NEXT: s_mov_b32 s0, s8
673 ; GFX9-NEXT: s_mov_b32 s1, s9
674 ; GFX9-NEXT: s_mov_b32 s2, s10
675 ; GFX9-NEXT: ; return to shader part epilog
677 ; GFX10PLUS-LABEL: s_mul_i128:
678 ; GFX10PLUS: ; %bb.0:
679 ; GFX10PLUS-NEXT: s_mul_i32 s9, s0, s6
680 ; GFX10PLUS-NEXT: s_mul_i32 s11, s1, s5
681 ; GFX10PLUS-NEXT: s_mul_hi_u32 s10, s0, s6
682 ; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s1, s5
683 ; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9
684 ; GFX10PLUS-NEXT: s_mul_i32 s11, s2, s4
685 ; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10
686 ; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s2, s4
687 ; GFX10PLUS-NEXT: s_mul_hi_u32 s8, s0, s4
688 ; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9
689 ; GFX10PLUS-NEXT: s_mul_i32 s11, s0, s5
690 ; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10
691 ; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s0, s5
692 ; GFX10PLUS-NEXT: s_add_u32 s8, s11, s8
693 ; GFX10PLUS-NEXT: s_addc_u32 s9, s12, s9
694 ; GFX10PLUS-NEXT: s_mul_i32 s12, s1, s4
695 ; GFX10PLUS-NEXT: s_mul_hi_u32 s13, s1, s4
696 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
697 ; GFX10PLUS-NEXT: s_add_u32 s8, s12, s8
698 ; GFX10PLUS-NEXT: s_mul_i32 s12, s0, s7
699 ; GFX10PLUS-NEXT: s_addc_u32 s7, s13, s9
700 ; GFX10PLUS-NEXT: s_addc_u32 s9, s10, s12
701 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s6
702 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
703 ; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s5
704 ; GFX10PLUS-NEXT: s_addc_u32 s1, s9, s1
705 ; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s4
706 ; GFX10PLUS-NEXT: s_add_i32 s1, s1, s2
707 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s4
708 ; GFX10PLUS-NEXT: s_add_i32 s3, s1, s3
709 ; GFX10PLUS-NEXT: s_mov_b32 s1, s8
710 ; GFX10PLUS-NEXT: s_mov_b32 s2, s7
711 ; GFX10PLUS-NEXT: ; return to shader part epilog
712 %result = mul i128 %num, %den
713 %cast = bitcast i128 %result to <4 x i32>
; i128 multiply with the default calling convention (operands in v0-v3 and
; v4-v7 in the checks). All five targets are checked under their own prefix.
; The gfx7/gfx8/gfx9 checks finish the top dword with two further
; v_mad_u64_u32; the gfx10/gfx11 checks instead use v_mul_lo_u32 products
; folded together with v_add3_u32. Carry-out from the v_mad_u64_u32 chain is
; consumed by add-with-carry instructions whose mnemonic differs per target
; (v_addc_u32, v_addc_co_u32, v_add_co_ci_u32), and the carry register is
; s[4:5]/vcc on gfx7-gfx9, s4/vcc_lo on gfx10, and s0/vcc_lo on gfx11.
717 define i128 @v_mul_i128(i128 %num, i128 %den) {
718 ; GFX7-LABEL: v_mul_i128:
720 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
721 ; GFX7-NEXT: v_mov_b32_e32 v8, v0
722 ; GFX7-NEXT: v_mov_b32_e32 v9, v1
723 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
724 ; GFX7-NEXT: v_mov_b32_e32 v10, v2
725 ; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7
726 ; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
727 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
728 ; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
729 ; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6
730 ; GFX7-NEXT: v_mov_b32_e32 v2, v11
731 ; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
732 ; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
733 ; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
734 ; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
735 ; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
736 ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
737 ; GFX7-NEXT: s_setpc_b64 s[30:31]
739 ; GFX8-LABEL: v_mul_i128:
741 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742 ; GFX8-NEXT: v_mov_b32_e32 v8, v0
743 ; GFX8-NEXT: v_mov_b32_e32 v9, v1
744 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
745 ; GFX8-NEXT: v_mov_b32_e32 v10, v2
746 ; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7
747 ; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
748 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
749 ; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
750 ; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6
751 ; GFX8-NEXT: v_mov_b32_e32 v2, v11
752 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
753 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
754 ; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
755 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
756 ; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
757 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
758 ; GFX8-NEXT: s_setpc_b64 s[30:31]
760 ; GFX9-LABEL: v_mul_i128:
762 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
763 ; GFX9-NEXT: v_mov_b32_e32 v8, v0
764 ; GFX9-NEXT: v_mov_b32_e32 v9, v1
765 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
766 ; GFX9-NEXT: v_mov_b32_e32 v10, v2
767 ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7
768 ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
769 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
770 ; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
771 ; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6
772 ; GFX9-NEXT: v_mov_b32_e32 v2, v11
773 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
774 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
775 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
776 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
777 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
778 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
779 ; GFX9-NEXT: s_setpc_b64 s[30:31]
781 ; GFX10-LABEL: v_mul_i128:
783 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
784 ; GFX10-NEXT: v_mov_b32_e32 v8, v0
785 ; GFX10-NEXT: v_mov_b32_e32 v9, v1
786 ; GFX10-NEXT: v_mov_b32_e32 v10, v2
787 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, v4
788 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0
789 ; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7
790 ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6
791 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
792 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0
793 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
794 ; GFX10-NEXT: v_mov_b32_e32 v2, v11
795 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
796 ; GFX10-NEXT: v_mul_lo_u32 v5, v10, v5
797 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
798 ; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
799 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
800 ; GFX10-NEXT: v_add3_u32 v3, v4, v5, v3
801 ; GFX10-NEXT: s_setpc_b64 s[30:31]
803 ; GFX11-LABEL: v_mul_i128:
805 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806 ; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
807 ; GFX11-NEXT: v_mov_b32_e32 v10, v2
808 ; GFX11-NEXT: v_mul_lo_u32 v3, v3, v4
809 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0
810 ; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7
811 ; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6
812 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v9, v5, v[0:1]
813 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0
814 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v10, v4, v[11:12]
815 ; GFX11-NEXT: v_mov_b32_e32 v2, v11
816 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
817 ; GFX11-NEXT: v_mul_lo_u32 v5, v10, v5
818 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v4, v[1:2]
819 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
820 ; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
821 ; GFX11-NEXT: v_add3_u32 v3, v4, v5, v3
822 ; GFX11-NEXT: s_setpc_b64 s[30:31]
823 %result = mul i128 %num, %den
; Scalar 256-bit multiply: both i256 operands are marked inreg and the product
; is bitcast to <8 x i32>.
; In the expected code below, the GFX7 and GFX8 sequences form each high
; half-product with v_mul_hi_u32 and copy it back with v_readfirstlane_b32,
; whereas the GFX9 and GFX10PLUS sequences use s_mul_hi_u32 directly.
; Carries are captured with s_cselect_b32 and re-applied via s_cmp_lg_u32
; followed by s_addc_u32.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
827 define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
828 ; GFX7-LABEL: s_mul_i256:
830 ; GFX7-NEXT: s_mov_b32 s16, s0
831 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
832 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
833 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
834 ; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1
835 ; GFX7-NEXT: v_mul_hi_u32 v1, s16, v1
836 ; GFX7-NEXT: v_readfirstlane_b32 s17, v0
837 ; GFX7-NEXT: v_mov_b32_e32 v0, s10
838 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
839 ; GFX7-NEXT: v_readfirstlane_b32 s21, v2
840 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
841 ; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8
842 ; GFX7-NEXT: s_mul_i32 s18, s16, s10
843 ; GFX7-NEXT: s_mul_i32 s20, s1, s9
844 ; GFX7-NEXT: v_readfirstlane_b32 s19, v0
845 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
846 ; GFX7-NEXT: s_add_u32 s18, s20, s18
847 ; GFX7-NEXT: s_addc_u32 s19, s21, s19
848 ; GFX7-NEXT: s_mul_i32 s21, s2, s8
849 ; GFX7-NEXT: v_readfirstlane_b32 s23, v1
850 ; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8
851 ; GFX7-NEXT: s_cselect_b32 s20, 1, 0
852 ; GFX7-NEXT: v_readfirstlane_b32 s22, v3
853 ; GFX7-NEXT: s_add_u32 s18, s21, s18
854 ; GFX7-NEXT: s_addc_u32 s19, s22, s19
855 ; GFX7-NEXT: s_mul_i32 s22, s16, s9
856 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0
857 ; GFX7-NEXT: s_add_u32 s17, s22, s17
858 ; GFX7-NEXT: s_addc_u32 s22, s23, s18
859 ; GFX7-NEXT: v_readfirstlane_b32 s23, v1
860 ; GFX7-NEXT: v_mov_b32_e32 v1, s12
861 ; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1
862 ; GFX7-NEXT: s_mul_i32 s18, s1, s8
863 ; GFX7-NEXT: s_cselect_b32 s25, 1, 0
864 ; GFX7-NEXT: s_add_u32 s18, s18, s17
865 ; GFX7-NEXT: s_addc_u32 s17, s23, s22
866 ; GFX7-NEXT: v_mov_b32_e32 v4, s11
867 ; GFX7-NEXT: v_readfirstlane_b32 s23, v3
868 ; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10
869 ; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4
870 ; GFX7-NEXT: s_mul_i32 s22, s16, s12
871 ; GFX7-NEXT: s_mul_i32 s24, s1, s11
872 ; GFX7-NEXT: v_readfirstlane_b32 s28, v3
873 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
874 ; GFX7-NEXT: v_readfirstlane_b32 s27, v5
875 ; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9
876 ; GFX7-NEXT: s_cselect_b32 s26, 1, 0
877 ; GFX7-NEXT: s_add_u32 s24, s24, s22
878 ; GFX7-NEXT: s_addc_u32 s23, s27, s23
879 ; GFX7-NEXT: v_readfirstlane_b32 s29, v5
880 ; GFX7-NEXT: v_mov_b32_e32 v5, s4
881 ; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8
882 ; GFX7-NEXT: s_mul_i32 s27, s2, s10
883 ; GFX7-NEXT: s_cselect_b32 s22, 1, 0
884 ; GFX7-NEXT: s_add_u32 s24, s27, s24
885 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10
886 ; GFX7-NEXT: s_addc_u32 s27, s28, s23
887 ; GFX7-NEXT: s_mul_i32 s28, s3, s9
888 ; GFX7-NEXT: s_cselect_b32 s23, 1, 0
889 ; GFX7-NEXT: s_add_u32 s28, s28, s24
890 ; GFX7-NEXT: v_readfirstlane_b32 s30, v6
891 ; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4
892 ; GFX7-NEXT: s_addc_u32 s27, s29, s27
893 ; GFX7-NEXT: s_mul_i32 s29, s4, s8
894 ; GFX7-NEXT: s_cselect_b32 s24, 1, 0
895 ; GFX7-NEXT: s_add_u32 s28, s29, s28
896 ; GFX7-NEXT: v_readfirstlane_b32 s33, v0
897 ; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9
898 ; GFX7-NEXT: s_addc_u32 s27, s30, s27
899 ; GFX7-NEXT: s_mul_i32 s30, s16, s11
900 ; GFX7-NEXT: s_cselect_b32 s29, 1, 0
901 ; GFX7-NEXT: v_readfirstlane_b32 s31, v6
902 ; GFX7-NEXT: s_add_u32 s19, s30, s19
903 ; GFX7-NEXT: s_addc_u32 s28, s31, s28
904 ; GFX7-NEXT: s_mul_i32 s31, s1, s10
905 ; GFX7-NEXT: s_cselect_b32 s30, 1, 0
906 ; GFX7-NEXT: s_add_u32 s19, s31, s19
907 ; GFX7-NEXT: v_readfirstlane_b32 s34, v0
908 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8
909 ; GFX7-NEXT: s_addc_u32 s28, s33, s28
910 ; GFX7-NEXT: s_mul_i32 s33, s2, s9
911 ; GFX7-NEXT: s_cselect_b32 s31, 1, 0
912 ; GFX7-NEXT: s_add_u32 s19, s33, s19
913 ; GFX7-NEXT: s_addc_u32 s28, s34, s28
914 ; GFX7-NEXT: s_mul_i32 s34, s3, s8
915 ; GFX7-NEXT: s_cselect_b32 s33, 1, 0
916 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0
917 ; GFX7-NEXT: s_add_u32 s19, s34, s19
918 ; GFX7-NEXT: v_mov_b32_e32 v0, s14
919 ; GFX7-NEXT: s_addc_u32 s28, s35, s28
920 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
921 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0
922 ; GFX7-NEXT: s_cmp_lg_u32 s26, 0
923 ; GFX7-NEXT: s_addc_u32 s19, s25, s19
924 ; GFX7-NEXT: v_mov_b32_e32 v2, s13
925 ; GFX7-NEXT: s_cselect_b32 s25, 1, 0
926 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0
927 ; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2
928 ; GFX7-NEXT: s_addc_u32 s20, s20, 0
929 ; GFX7-NEXT: v_readfirstlane_b32 s26, v0
930 ; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1
931 ; GFX7-NEXT: s_cmp_lg_u32 s25, 0
932 ; GFX7-NEXT: s_addc_u32 s20, s20, s28
933 ; GFX7-NEXT: s_mul_i32 s25, s16, s14
934 ; GFX7-NEXT: s_mul_i32 s28, s1, s13
935 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0
936 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6
937 ; GFX7-NEXT: s_add_u32 s25, s28, s25
938 ; GFX7-NEXT: s_addc_u32 s26, s35, s26
939 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0
940 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11
941 ; GFX7-NEXT: s_mul_i32 s28, s2, s12
942 ; GFX7-NEXT: s_add_u32 s25, s28, s25
943 ; GFX7-NEXT: s_addc_u32 s26, s35, s26
944 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0
945 ; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10
946 ; GFX7-NEXT: s_mul_i32 s28, s3, s11
947 ; GFX7-NEXT: s_add_u32 s25, s28, s25
948 ; GFX7-NEXT: s_addc_u32 s26, s35, s26
949 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0
950 ; GFX7-NEXT: v_mov_b32_e32 v0, s5
951 ; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9
952 ; GFX7-NEXT: s_mul_i32 s28, s4, s10
953 ; GFX7-NEXT: s_add_u32 s25, s28, s25
954 ; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1
955 ; GFX7-NEXT: s_addc_u32 s26, s35, s26
956 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6
957 ; GFX7-NEXT: v_mov_b32_e32 v6, s6
958 ; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8
959 ; GFX7-NEXT: s_mul_i32 s28, s5, s9
960 ; GFX7-NEXT: s_add_u32 s25, s28, s25
961 ; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2
962 ; GFX7-NEXT: v_readfirstlane_b32 s36, v1
963 ; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4
964 ; GFX7-NEXT: s_addc_u32 s26, s35, s26
965 ; GFX7-NEXT: s_mul_i32 s28, s6, s8
966 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6
967 ; GFX7-NEXT: s_add_u32 s25, s28, s25
968 ; GFX7-NEXT: s_addc_u32 s26, s35, s26
969 ; GFX7-NEXT: s_mul_i32 s28, s16, s13
970 ; GFX7-NEXT: v_readfirstlane_b32 s35, v2
971 ; GFX7-NEXT: s_add_u32 s27, s28, s27
972 ; GFX7-NEXT: v_readfirstlane_b32 s37, v1
973 ; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10
974 ; GFX7-NEXT: s_addc_u32 s25, s35, s25
975 ; GFX7-NEXT: s_mul_i32 s35, s1, s12
976 ; GFX7-NEXT: s_cselect_b32 s28, 1, 0
977 ; GFX7-NEXT: s_add_u32 s27, s35, s27
978 ; GFX7-NEXT: s_addc_u32 s25, s36, s25
979 ; GFX7-NEXT: s_mul_i32 s36, s2, s11
980 ; GFX7-NEXT: s_cselect_b32 s35, 1, 0
981 ; GFX7-NEXT: s_add_u32 s27, s36, s27
982 ; GFX7-NEXT: v_readfirstlane_b32 s38, v1
983 ; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9
984 ; GFX7-NEXT: s_addc_u32 s25, s37, s25
985 ; GFX7-NEXT: s_mul_i32 s37, s3, s10
986 ; GFX7-NEXT: s_cselect_b32 s36, 1, 0
987 ; GFX7-NEXT: s_add_u32 s27, s37, s27
988 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8
989 ; GFX7-NEXT: s_addc_u32 s25, s38, s25
990 ; GFX7-NEXT: s_mul_i32 s38, s4, s9
991 ; GFX7-NEXT: s_cselect_b32 s37, 1, 0
992 ; GFX7-NEXT: v_readfirstlane_b32 s39, v1
993 ; GFX7-NEXT: s_add_u32 s27, s38, s27
994 ; GFX7-NEXT: s_addc_u32 s25, s39, s25
995 ; GFX7-NEXT: s_mul_i32 s39, s5, s8
996 ; GFX7-NEXT: s_cselect_b32 s38, 1, 0
997 ; GFX7-NEXT: v_readfirstlane_b32 s40, v0
998 ; GFX7-NEXT: s_add_u32 s27, s39, s27
999 ; GFX7-NEXT: s_addc_u32 s25, s40, s25
1000 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0
1001 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0
1002 ; GFX7-NEXT: s_addc_u32 s30, s30, 0
1003 ; GFX7-NEXT: s_cmp_lg_u32 s33, 0
1004 ; GFX7-NEXT: s_addc_u32 s30, s30, 0
1005 ; GFX7-NEXT: s_cmp_lg_u32 s34, 0
1006 ; GFX7-NEXT: s_addc_u32 s30, s30, 0
1007 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0
1008 ; GFX7-NEXT: s_addc_u32 s21, s30, s27
1009 ; GFX7-NEXT: s_cselect_b32 s27, 1, 0
1010 ; GFX7-NEXT: s_cmp_lg_u32 s23, 0
1011 ; GFX7-NEXT: s_addc_u32 s22, s22, 0
1012 ; GFX7-NEXT: s_cmp_lg_u32 s24, 0
1013 ; GFX7-NEXT: s_addc_u32 s22, s22, 0
1014 ; GFX7-NEXT: s_cmp_lg_u32 s29, 0
1015 ; GFX7-NEXT: s_addc_u32 s22, s22, 0
1016 ; GFX7-NEXT: s_cmp_lg_u32 s27, 0
1017 ; GFX7-NEXT: s_addc_u32 s22, s22, s25
1018 ; GFX7-NEXT: s_mul_i32 s16, s16, s15
1019 ; GFX7-NEXT: s_addc_u32 s15, s26, s16
1020 ; GFX7-NEXT: s_mul_i32 s1, s1, s14
1021 ; GFX7-NEXT: s_cmp_lg_u32 s39, 0
1022 ; GFX7-NEXT: s_addc_u32 s1, s15, s1
1023 ; GFX7-NEXT: s_mul_i32 s2, s2, s13
1024 ; GFX7-NEXT: s_cmp_lg_u32 s38, 0
1025 ; GFX7-NEXT: s_addc_u32 s1, s1, s2
1026 ; GFX7-NEXT: s_mul_i32 s3, s3, s12
1027 ; GFX7-NEXT: s_cmp_lg_u32 s37, 0
1028 ; GFX7-NEXT: s_addc_u32 s1, s1, s3
1029 ; GFX7-NEXT: s_mul_i32 s4, s4, s11
1030 ; GFX7-NEXT: s_cmp_lg_u32 s36, 0
1031 ; GFX7-NEXT: s_addc_u32 s1, s1, s4
1032 ; GFX7-NEXT: s_mul_i32 s5, s5, s10
1033 ; GFX7-NEXT: s_cmp_lg_u32 s35, 0
1034 ; GFX7-NEXT: s_addc_u32 s1, s1, s5
1035 ; GFX7-NEXT: s_mul_i32 s6, s6, s9
1036 ; GFX7-NEXT: s_cmp_lg_u32 s28, 0
1037 ; GFX7-NEXT: s_addc_u32 s1, s1, s6
1038 ; GFX7-NEXT: s_mul_i32 s7, s7, s8
1039 ; GFX7-NEXT: s_mul_i32 s0, s0, s8
1040 ; GFX7-NEXT: s_add_u32 s7, s7, s1
1041 ; GFX7-NEXT: s_mov_b32 s1, s18
1042 ; GFX7-NEXT: s_mov_b32 s2, s17
1043 ; GFX7-NEXT: s_mov_b32 s3, s19
1044 ; GFX7-NEXT: s_mov_b32 s4, s20
1045 ; GFX7-NEXT: s_mov_b32 s5, s21
1046 ; GFX7-NEXT: s_mov_b32 s6, s22
1047 ; GFX7-NEXT: ; return to shader part epilog
1049 ; GFX8-LABEL: s_mul_i256:
1051 ; GFX8-NEXT: s_mov_b32 s16, s0
1052 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
1053 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
1054 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
1055 ; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1
1056 ; GFX8-NEXT: v_mul_hi_u32 v1, s16, v1
1057 ; GFX8-NEXT: v_readfirstlane_b32 s17, v0
1058 ; GFX8-NEXT: v_mov_b32_e32 v0, s10
1059 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
1060 ; GFX8-NEXT: v_readfirstlane_b32 s21, v2
1061 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
1062 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8
1063 ; GFX8-NEXT: s_mul_i32 s18, s16, s10
1064 ; GFX8-NEXT: s_mul_i32 s20, s1, s9
1065 ; GFX8-NEXT: v_readfirstlane_b32 s19, v0
1066 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
1067 ; GFX8-NEXT: s_add_u32 s18, s20, s18
1068 ; GFX8-NEXT: s_addc_u32 s19, s21, s19
1069 ; GFX8-NEXT: s_mul_i32 s21, s2, s8
1070 ; GFX8-NEXT: v_readfirstlane_b32 s23, v1
1071 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8
1072 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0
1073 ; GFX8-NEXT: v_readfirstlane_b32 s22, v3
1074 ; GFX8-NEXT: s_add_u32 s18, s21, s18
1075 ; GFX8-NEXT: s_addc_u32 s19, s22, s19
1076 ; GFX8-NEXT: s_mul_i32 s22, s16, s9
1077 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0
1078 ; GFX8-NEXT: s_add_u32 s17, s22, s17
1079 ; GFX8-NEXT: s_addc_u32 s22, s23, s18
1080 ; GFX8-NEXT: v_readfirstlane_b32 s23, v1
1081 ; GFX8-NEXT: v_mov_b32_e32 v1, s12
1082 ; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1
1083 ; GFX8-NEXT: s_mul_i32 s18, s1, s8
1084 ; GFX8-NEXT: s_cselect_b32 s25, 1, 0
1085 ; GFX8-NEXT: s_add_u32 s18, s18, s17
1086 ; GFX8-NEXT: s_addc_u32 s17, s23, s22
1087 ; GFX8-NEXT: v_mov_b32_e32 v4, s11
1088 ; GFX8-NEXT: v_readfirstlane_b32 s23, v3
1089 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10
1090 ; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4
1091 ; GFX8-NEXT: s_mul_i32 s22, s16, s12
1092 ; GFX8-NEXT: s_mul_i32 s24, s1, s11
1093 ; GFX8-NEXT: v_readfirstlane_b32 s28, v3
1094 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
1095 ; GFX8-NEXT: v_readfirstlane_b32 s27, v5
1096 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9
1097 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0
1098 ; GFX8-NEXT: s_add_u32 s24, s24, s22
1099 ; GFX8-NEXT: s_addc_u32 s23, s27, s23
1100 ; GFX8-NEXT: v_readfirstlane_b32 s29, v5
1101 ; GFX8-NEXT: v_mov_b32_e32 v5, s4
1102 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8
1103 ; GFX8-NEXT: s_mul_i32 s27, s2, s10
1104 ; GFX8-NEXT: s_cselect_b32 s22, 1, 0
1105 ; GFX8-NEXT: s_add_u32 s24, s27, s24
1106 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10
1107 ; GFX8-NEXT: s_addc_u32 s27, s28, s23
1108 ; GFX8-NEXT: s_mul_i32 s28, s3, s9
1109 ; GFX8-NEXT: s_cselect_b32 s23, 1, 0
1110 ; GFX8-NEXT: s_add_u32 s28, s28, s24
1111 ; GFX8-NEXT: v_readfirstlane_b32 s30, v6
1112 ; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4
1113 ; GFX8-NEXT: s_addc_u32 s27, s29, s27
1114 ; GFX8-NEXT: s_mul_i32 s29, s4, s8
1115 ; GFX8-NEXT: s_cselect_b32 s24, 1, 0
1116 ; GFX8-NEXT: s_add_u32 s28, s29, s28
1117 ; GFX8-NEXT: v_readfirstlane_b32 s33, v0
1118 ; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9
1119 ; GFX8-NEXT: s_addc_u32 s27, s30, s27
1120 ; GFX8-NEXT: s_mul_i32 s30, s16, s11
1121 ; GFX8-NEXT: s_cselect_b32 s29, 1, 0
1122 ; GFX8-NEXT: v_readfirstlane_b32 s31, v6
1123 ; GFX8-NEXT: s_add_u32 s19, s30, s19
1124 ; GFX8-NEXT: s_addc_u32 s28, s31, s28
1125 ; GFX8-NEXT: s_mul_i32 s31, s1, s10
1126 ; GFX8-NEXT: s_cselect_b32 s30, 1, 0
1127 ; GFX8-NEXT: s_add_u32 s19, s31, s19
1128 ; GFX8-NEXT: v_readfirstlane_b32 s34, v0
1129 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8
1130 ; GFX8-NEXT: s_addc_u32 s28, s33, s28
1131 ; GFX8-NEXT: s_mul_i32 s33, s2, s9
1132 ; GFX8-NEXT: s_cselect_b32 s31, 1, 0
1133 ; GFX8-NEXT: s_add_u32 s19, s33, s19
1134 ; GFX8-NEXT: s_addc_u32 s28, s34, s28
1135 ; GFX8-NEXT: s_mul_i32 s34, s3, s8
1136 ; GFX8-NEXT: s_cselect_b32 s33, 1, 0
1137 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0
1138 ; GFX8-NEXT: s_add_u32 s19, s34, s19
1139 ; GFX8-NEXT: v_mov_b32_e32 v0, s14
1140 ; GFX8-NEXT: s_addc_u32 s28, s35, s28
1141 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
1142 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0
1143 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0
1144 ; GFX8-NEXT: s_addc_u32 s19, s25, s19
1145 ; GFX8-NEXT: v_mov_b32_e32 v2, s13
1146 ; GFX8-NEXT: s_cselect_b32 s25, 1, 0
1147 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0
1148 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2
1149 ; GFX8-NEXT: s_addc_u32 s20, s20, 0
1150 ; GFX8-NEXT: v_readfirstlane_b32 s26, v0
1151 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1
1152 ; GFX8-NEXT: s_cmp_lg_u32 s25, 0
1153 ; GFX8-NEXT: s_addc_u32 s20, s20, s28
1154 ; GFX8-NEXT: s_mul_i32 s25, s16, s14
1155 ; GFX8-NEXT: s_mul_i32 s28, s1, s13
1156 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0
1157 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6
1158 ; GFX8-NEXT: s_add_u32 s25, s28, s25
1159 ; GFX8-NEXT: s_addc_u32 s26, s35, s26
1160 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0
1161 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11
1162 ; GFX8-NEXT: s_mul_i32 s28, s2, s12
1163 ; GFX8-NEXT: s_add_u32 s25, s28, s25
1164 ; GFX8-NEXT: s_addc_u32 s26, s35, s26
1165 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0
1166 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10
1167 ; GFX8-NEXT: s_mul_i32 s28, s3, s11
1168 ; GFX8-NEXT: s_add_u32 s25, s28, s25
1169 ; GFX8-NEXT: s_addc_u32 s26, s35, s26
1170 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0
1171 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
1172 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9
1173 ; GFX8-NEXT: s_mul_i32 s28, s4, s10
1174 ; GFX8-NEXT: s_add_u32 s25, s28, s25
1175 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
1176 ; GFX8-NEXT: s_addc_u32 s26, s35, s26
1177 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6
1178 ; GFX8-NEXT: v_mov_b32_e32 v6, s6
1179 ; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8
1180 ; GFX8-NEXT: s_mul_i32 s28, s5, s9
1181 ; GFX8-NEXT: s_add_u32 s25, s28, s25
1182 ; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2
1183 ; GFX8-NEXT: v_readfirstlane_b32 s36, v1
1184 ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4
1185 ; GFX8-NEXT: s_addc_u32 s26, s35, s26
1186 ; GFX8-NEXT: s_mul_i32 s28, s6, s8
1187 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6
1188 ; GFX8-NEXT: s_add_u32 s25, s28, s25
1189 ; GFX8-NEXT: s_addc_u32 s26, s35, s26
1190 ; GFX8-NEXT: s_mul_i32 s28, s16, s13
1191 ; GFX8-NEXT: v_readfirstlane_b32 s35, v2
1192 ; GFX8-NEXT: s_add_u32 s27, s28, s27
1193 ; GFX8-NEXT: v_readfirstlane_b32 s37, v1
1194 ; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10
1195 ; GFX8-NEXT: s_addc_u32 s25, s35, s25
1196 ; GFX8-NEXT: s_mul_i32 s35, s1, s12
1197 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0
1198 ; GFX8-NEXT: s_add_u32 s27, s35, s27
1199 ; GFX8-NEXT: s_addc_u32 s25, s36, s25
1200 ; GFX8-NEXT: s_mul_i32 s36, s2, s11
1201 ; GFX8-NEXT: s_cselect_b32 s35, 1, 0
1202 ; GFX8-NEXT: s_add_u32 s27, s36, s27
1203 ; GFX8-NEXT: v_readfirstlane_b32 s38, v1
1204 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9
1205 ; GFX8-NEXT: s_addc_u32 s25, s37, s25
1206 ; GFX8-NEXT: s_mul_i32 s37, s3, s10
1207 ; GFX8-NEXT: s_cselect_b32 s36, 1, 0
1208 ; GFX8-NEXT: s_add_u32 s27, s37, s27
1209 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8
1210 ; GFX8-NEXT: s_addc_u32 s25, s38, s25
1211 ; GFX8-NEXT: s_mul_i32 s38, s4, s9
1212 ; GFX8-NEXT: s_cselect_b32 s37, 1, 0
1213 ; GFX8-NEXT: v_readfirstlane_b32 s39, v1
1214 ; GFX8-NEXT: s_add_u32 s27, s38, s27
1215 ; GFX8-NEXT: s_addc_u32 s25, s39, s25
1216 ; GFX8-NEXT: s_mul_i32 s39, s5, s8
1217 ; GFX8-NEXT: s_cselect_b32 s38, 1, 0
1218 ; GFX8-NEXT: v_readfirstlane_b32 s40, v0
1219 ; GFX8-NEXT: s_add_u32 s27, s39, s27
1220 ; GFX8-NEXT: s_addc_u32 s25, s40, s25
1221 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0
1222 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0
1223 ; GFX8-NEXT: s_addc_u32 s30, s30, 0
1224 ; GFX8-NEXT: s_cmp_lg_u32 s33, 0
1225 ; GFX8-NEXT: s_addc_u32 s30, s30, 0
1226 ; GFX8-NEXT: s_cmp_lg_u32 s34, 0
1227 ; GFX8-NEXT: s_addc_u32 s30, s30, 0
1228 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0
1229 ; GFX8-NEXT: s_addc_u32 s21, s30, s27
1230 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0
1231 ; GFX8-NEXT: s_cmp_lg_u32 s23, 0
1232 ; GFX8-NEXT: s_addc_u32 s22, s22, 0
1233 ; GFX8-NEXT: s_cmp_lg_u32 s24, 0
1234 ; GFX8-NEXT: s_addc_u32 s22, s22, 0
1235 ; GFX8-NEXT: s_cmp_lg_u32 s29, 0
1236 ; GFX8-NEXT: s_addc_u32 s22, s22, 0
1237 ; GFX8-NEXT: s_cmp_lg_u32 s27, 0
1238 ; GFX8-NEXT: s_addc_u32 s22, s22, s25
1239 ; GFX8-NEXT: s_mul_i32 s16, s16, s15
1240 ; GFX8-NEXT: s_addc_u32 s15, s26, s16
1241 ; GFX8-NEXT: s_mul_i32 s1, s1, s14
1242 ; GFX8-NEXT: s_cmp_lg_u32 s39, 0
1243 ; GFX8-NEXT: s_addc_u32 s1, s15, s1
1244 ; GFX8-NEXT: s_mul_i32 s2, s2, s13
1245 ; GFX8-NEXT: s_cmp_lg_u32 s38, 0
1246 ; GFX8-NEXT: s_addc_u32 s1, s1, s2
1247 ; GFX8-NEXT: s_mul_i32 s3, s3, s12
1248 ; GFX8-NEXT: s_cmp_lg_u32 s37, 0
1249 ; GFX8-NEXT: s_addc_u32 s1, s1, s3
1250 ; GFX8-NEXT: s_mul_i32 s4, s4, s11
1251 ; GFX8-NEXT: s_cmp_lg_u32 s36, 0
1252 ; GFX8-NEXT: s_addc_u32 s1, s1, s4
1253 ; GFX8-NEXT: s_mul_i32 s5, s5, s10
1254 ; GFX8-NEXT: s_cmp_lg_u32 s35, 0
1255 ; GFX8-NEXT: s_addc_u32 s1, s1, s5
1256 ; GFX8-NEXT: s_mul_i32 s6, s6, s9
1257 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0
1258 ; GFX8-NEXT: s_addc_u32 s1, s1, s6
1259 ; GFX8-NEXT: s_mul_i32 s7, s7, s8
1260 ; GFX8-NEXT: s_mul_i32 s0, s0, s8
1261 ; GFX8-NEXT: s_add_u32 s7, s7, s1
1262 ; GFX8-NEXT: s_mov_b32 s1, s18
1263 ; GFX8-NEXT: s_mov_b32 s2, s17
1264 ; GFX8-NEXT: s_mov_b32 s3, s19
1265 ; GFX8-NEXT: s_mov_b32 s4, s20
1266 ; GFX8-NEXT: s_mov_b32 s5, s21
1267 ; GFX8-NEXT: s_mov_b32 s6, s22
1268 ; GFX8-NEXT: ; return to shader part epilog
1270 ; GFX9-LABEL: s_mul_i256:
1272 ; GFX9-NEXT: s_mov_b32 s16, s0
1273 ; GFX9-NEXT: s_mul_i32 s18, s16, s10
1274 ; GFX9-NEXT: s_mul_i32 s20, s1, s9
1275 ; GFX9-NEXT: s_mul_hi_u32 s19, s16, s10
1276 ; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9
1277 ; GFX9-NEXT: s_add_u32 s18, s20, s18
1278 ; GFX9-NEXT: s_addc_u32 s19, s21, s19
1279 ; GFX9-NEXT: s_mul_i32 s21, s2, s8
1280 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0
1281 ; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8
1282 ; GFX9-NEXT: s_add_u32 s18, s21, s18
1283 ; GFX9-NEXT: s_mul_hi_u32 s17, s16, s8
1284 ; GFX9-NEXT: s_addc_u32 s19, s22, s19
1285 ; GFX9-NEXT: s_mul_i32 s22, s16, s9
1286 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0
1287 ; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9
1288 ; GFX9-NEXT: s_add_u32 s17, s22, s17
1289 ; GFX9-NEXT: s_addc_u32 s18, s23, s18
1290 ; GFX9-NEXT: s_mul_i32 s23, s1, s8
1291 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0
1292 ; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8
1293 ; GFX9-NEXT: s_add_u32 s17, s23, s17
1294 ; GFX9-NEXT: s_addc_u32 s18, s24, s18
1295 ; GFX9-NEXT: s_mul_i32 s24, s16, s12
1296 ; GFX9-NEXT: s_mul_i32 s26, s1, s11
1297 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0
1298 ; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12
1299 ; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11
1300 ; GFX9-NEXT: s_add_u32 s24, s26, s24
1301 ; GFX9-NEXT: s_addc_u32 s25, s27, s25
1302 ; GFX9-NEXT: s_mul_i32 s27, s2, s10
1303 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0
1304 ; GFX9-NEXT: s_mul_hi_u32 s28, s2, s10
1305 ; GFX9-NEXT: s_add_u32 s24, s27, s24
1306 ; GFX9-NEXT: s_addc_u32 s25, s28, s25
1307 ; GFX9-NEXT: s_mul_i32 s28, s3, s9
1308 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0
1309 ; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
1310 ; GFX9-NEXT: s_add_u32 s24, s28, s24
1311 ; GFX9-NEXT: s_addc_u32 s25, s29, s25
1312 ; GFX9-NEXT: s_mul_i32 s29, s4, s8
1313 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0
1314 ; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8
1315 ; GFX9-NEXT: s_add_u32 s24, s29, s24
1316 ; GFX9-NEXT: s_addc_u32 s25, s30, s25
1317 ; GFX9-NEXT: s_mul_i32 s30, s16, s11
1318 ; GFX9-NEXT: s_cselect_b32 s29, 1, 0
1319 ; GFX9-NEXT: s_mul_hi_u32 s31, s16, s11
1320 ; GFX9-NEXT: s_add_u32 s19, s30, s19
1321 ; GFX9-NEXT: s_addc_u32 s24, s31, s24
1322 ; GFX9-NEXT: s_mul_i32 s31, s1, s10
1323 ; GFX9-NEXT: s_cselect_b32 s30, 1, 0
1324 ; GFX9-NEXT: s_mul_hi_u32 s33, s1, s10
1325 ; GFX9-NEXT: s_add_u32 s19, s31, s19
1326 ; GFX9-NEXT: s_addc_u32 s24, s33, s24
1327 ; GFX9-NEXT: s_mul_i32 s33, s2, s9
1328 ; GFX9-NEXT: s_cselect_b32 s31, 1, 0
1329 ; GFX9-NEXT: s_mul_hi_u32 s34, s2, s9
1330 ; GFX9-NEXT: s_add_u32 s19, s33, s19
1331 ; GFX9-NEXT: s_addc_u32 s24, s34, s24
1332 ; GFX9-NEXT: s_mul_i32 s34, s3, s8
1333 ; GFX9-NEXT: s_cselect_b32 s33, 1, 0
1334 ; GFX9-NEXT: s_mul_hi_u32 s35, s3, s8
1335 ; GFX9-NEXT: s_add_u32 s19, s34, s19
1336 ; GFX9-NEXT: s_addc_u32 s24, s35, s24
1337 ; GFX9-NEXT: s_cselect_b32 s34, 1, 0
1338 ; GFX9-NEXT: s_cmp_lg_u32 s23, 0
1339 ; GFX9-NEXT: s_addc_u32 s19, s22, s19
1340 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0
1341 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0
1342 ; GFX9-NEXT: s_addc_u32 s20, s20, 0
1343 ; GFX9-NEXT: s_cmp_lg_u32 s22, 0
1344 ; GFX9-NEXT: s_addc_u32 s20, s20, s24
1345 ; GFX9-NEXT: s_mul_i32 s22, s16, s14
1346 ; GFX9-NEXT: s_mul_i32 s24, s1, s13
1347 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0
1348 ; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14
1349 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13
1350 ; GFX9-NEXT: s_add_u32 s22, s24, s22
1351 ; GFX9-NEXT: s_addc_u32 s23, s35, s23
1352 ; GFX9-NEXT: s_mul_i32 s24, s2, s12
1353 ; GFX9-NEXT: s_mul_hi_u32 s35, s2, s12
1354 ; GFX9-NEXT: s_add_u32 s22, s24, s22
1355 ; GFX9-NEXT: s_addc_u32 s23, s35, s23
1356 ; GFX9-NEXT: s_mul_i32 s24, s3, s11
1357 ; GFX9-NEXT: s_mul_hi_u32 s35, s3, s11
1358 ; GFX9-NEXT: s_add_u32 s22, s24, s22
1359 ; GFX9-NEXT: s_addc_u32 s23, s35, s23
1360 ; GFX9-NEXT: s_mul_i32 s24, s4, s10
1361 ; GFX9-NEXT: s_mul_hi_u32 s35, s4, s10
1362 ; GFX9-NEXT: s_add_u32 s22, s24, s22
1363 ; GFX9-NEXT: s_addc_u32 s23, s35, s23
1364 ; GFX9-NEXT: s_mul_i32 s24, s5, s9
1365 ; GFX9-NEXT: s_mul_hi_u32 s35, s5, s9
1366 ; GFX9-NEXT: s_add_u32 s22, s24, s22
1367 ; GFX9-NEXT: s_addc_u32 s23, s35, s23
1368 ; GFX9-NEXT: s_mul_i32 s24, s6, s8
1369 ; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8
1370 ; GFX9-NEXT: s_add_u32 s22, s24, s22
1371 ; GFX9-NEXT: s_addc_u32 s23, s35, s23
1372 ; GFX9-NEXT: s_mul_i32 s24, s16, s13
1373 ; GFX9-NEXT: s_mul_hi_u32 s35, s16, s13
1374 ; GFX9-NEXT: s_add_u32 s24, s24, s25
1375 ; GFX9-NEXT: s_addc_u32 s22, s35, s22
1376 ; GFX9-NEXT: s_mul_i32 s35, s1, s12
1377 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0
1378 ; GFX9-NEXT: s_mul_hi_u32 s36, s1, s12
1379 ; GFX9-NEXT: s_add_u32 s24, s35, s24
1380 ; GFX9-NEXT: s_addc_u32 s22, s36, s22
1381 ; GFX9-NEXT: s_mul_i32 s36, s2, s11
1382 ; GFX9-NEXT: s_cselect_b32 s35, 1, 0
1383 ; GFX9-NEXT: s_mul_hi_u32 s37, s2, s11
1384 ; GFX9-NEXT: s_add_u32 s24, s36, s24
1385 ; GFX9-NEXT: s_addc_u32 s22, s37, s22
1386 ; GFX9-NEXT: s_mul_i32 s37, s3, s10
1387 ; GFX9-NEXT: s_cselect_b32 s36, 1, 0
1388 ; GFX9-NEXT: s_mul_hi_u32 s38, s3, s10
1389 ; GFX9-NEXT: s_add_u32 s24, s37, s24
1390 ; GFX9-NEXT: s_addc_u32 s22, s38, s22
1391 ; GFX9-NEXT: s_mul_i32 s38, s4, s9
1392 ; GFX9-NEXT: s_cselect_b32 s37, 1, 0
1393 ; GFX9-NEXT: s_mul_hi_u32 s39, s4, s9
1394 ; GFX9-NEXT: s_add_u32 s24, s38, s24
1395 ; GFX9-NEXT: s_addc_u32 s22, s39, s22
1396 ; GFX9-NEXT: s_mul_i32 s39, s5, s8
1397 ; GFX9-NEXT: s_cselect_b32 s38, 1, 0
1398 ; GFX9-NEXT: s_mul_hi_u32 s40, s5, s8
1399 ; GFX9-NEXT: s_add_u32 s24, s39, s24
1400 ; GFX9-NEXT: s_addc_u32 s22, s40, s22
1401 ; GFX9-NEXT: s_cselect_b32 s39, 1, 0
1402 ; GFX9-NEXT: s_cmp_lg_u32 s31, 0
1403 ; GFX9-NEXT: s_addc_u32 s30, s30, 0
1404 ; GFX9-NEXT: s_cmp_lg_u32 s33, 0
1405 ; GFX9-NEXT: s_addc_u32 s30, s30, 0
1406 ; GFX9-NEXT: s_cmp_lg_u32 s34, 0
1407 ; GFX9-NEXT: s_addc_u32 s30, s30, 0
1408 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0
1409 ; GFX9-NEXT: s_addc_u32 s21, s30, s24
1410 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0
1411 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0
1412 ; GFX9-NEXT: s_addc_u32 s26, s26, 0
1413 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0
1414 ; GFX9-NEXT: s_addc_u32 s26, s26, 0
1415 ; GFX9-NEXT: s_cmp_lg_u32 s29, 0
1416 ; GFX9-NEXT: s_addc_u32 s26, s26, 0
1417 ; GFX9-NEXT: s_cmp_lg_u32 s24, 0
1418 ; GFX9-NEXT: s_addc_u32 s22, s26, s22
1419 ; GFX9-NEXT: s_mul_i32 s16, s16, s15
1420 ; GFX9-NEXT: s_addc_u32 s15, s23, s16
1421 ; GFX9-NEXT: s_mul_i32 s1, s1, s14
1422 ; GFX9-NEXT: s_cmp_lg_u32 s39, 0
1423 ; GFX9-NEXT: s_addc_u32 s1, s15, s1
1424 ; GFX9-NEXT: s_mul_i32 s2, s2, s13
1425 ; GFX9-NEXT: s_cmp_lg_u32 s38, 0
1426 ; GFX9-NEXT: s_addc_u32 s1, s1, s2
1427 ; GFX9-NEXT: s_mul_i32 s3, s3, s12
1428 ; GFX9-NEXT: s_cmp_lg_u32 s37, 0
1429 ; GFX9-NEXT: s_addc_u32 s1, s1, s3
1430 ; GFX9-NEXT: s_mul_i32 s4, s4, s11
1431 ; GFX9-NEXT: s_cmp_lg_u32 s36, 0
1432 ; GFX9-NEXT: s_addc_u32 s1, s1, s4
1433 ; GFX9-NEXT: s_mul_i32 s5, s5, s10
1434 ; GFX9-NEXT: s_cmp_lg_u32 s35, 0
1435 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
1436 ; GFX9-NEXT: s_mul_i32 s6, s6, s9
1437 ; GFX9-NEXT: s_cmp_lg_u32 s25, 0
1438 ; GFX9-NEXT: s_addc_u32 s1, s1, s6
1439 ; GFX9-NEXT: s_mul_i32 s7, s7, s8
1440 ; GFX9-NEXT: s_mul_i32 s0, s0, s8
1441 ; GFX9-NEXT: s_add_u32 s7, s7, s1
1442 ; GFX9-NEXT: s_mov_b32 s1, s17
1443 ; GFX9-NEXT: s_mov_b32 s2, s18
1444 ; GFX9-NEXT: s_mov_b32 s3, s19
1445 ; GFX9-NEXT: s_mov_b32 s4, s20
1446 ; GFX9-NEXT: s_mov_b32 s5, s21
1447 ; GFX9-NEXT: s_mov_b32 s6, s22
1448 ; GFX9-NEXT: ; return to shader part epilog
1450 ; GFX10PLUS-LABEL: s_mul_i256:
1451 ; GFX10PLUS: ; %bb.0:
1452 ; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10
1453 ; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9
1454 ; GFX10PLUS-NEXT: s_mul_hi_u32 s18, s0, s10
1455 ; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9
1456 ; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17
1457 ; GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18
1458 ; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8
1459 ; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8
1460 ; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0
1461 ; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17
1462 ; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8
1463 ; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18
1464 ; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9
1465 ; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s9
1466 ; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0
1467 ; GFX10PLUS-NEXT: s_add_u32 s16, s21, s16
1468 ; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17
1469 ; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8
1470 ; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8
1471 ; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0
1472 ; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16
1473 ; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17
1474 ; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12
1475 ; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11
1476 ; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12
1477 ; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11
1478 ; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0
1479 ; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23
1480 ; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24
1481 ; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10
1482 ; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10
1483 ; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0
1484 ; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23
1485 ; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24
1486 ; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9
1487 ; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9
1488 ; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0
1489 ; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23
1490 ; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24
1491 ; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8
1492 ; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8
1493 ; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0
1494 ; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23
1495 ; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24
1496 ; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11
1497 ; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11
1498 ; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0
1499 ; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18
1500 ; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23
1501 ; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10
1502 ; GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10
1503 ; GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0
1504 ; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18
1505 ; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23
1506 ; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9
1507 ; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9
1508 ; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0
1509 ; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18
1510 ; GFX10PLUS-NEXT: s_addc_u32 s23, s33, s23
1511 ; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8
1512 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8
1513 ; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0
1514 ; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18
1515 ; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23
1516 ; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0
1517 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0
1518 ; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14
1519 ; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18
1520 ; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0
1521 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0
1522 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13
1523 ; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0
1524 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0
1525 ; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14
1526 ; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23
1527 ; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13
1528 ; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0
1529 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
1530 ; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12
1531 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
1532 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12
1533 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
1534 ; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11
1535 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
1536 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11
1537 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
1538 ; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10
1539 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
1540 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10
1541 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
1542 ; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9
1543 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
1544 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9
1545 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
1546 ; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8
1547 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
1548 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8
1549 ; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
1550 ; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13
1551 ; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
1552 ; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13
1553 ; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24
1554 ; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21
1555 ; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12
1556 ; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12
1557 ; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0
1558 ; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23
1559 ; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21
1560 ; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11
1561 ; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11
1562 ; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0
1563 ; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23
1564 ; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21
1565 ; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10
1566 ; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10
1567 ; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0
1568 ; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23
1569 ; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21
1570 ; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9
1571 ; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9
1572 ; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0
1573 ; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23
1574 ; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21
1575 ; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8
1576 ; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8
1577 ; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0
1578 ; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23
1579 ; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21
1580 ; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0
1581 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0
1582 ; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14
1583 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
1584 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0
1585 ; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13
1586 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
1587 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0
1588 ; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12
1589 ; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
1590 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0
1591 ; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11
1592 ; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23
1593 ; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0
1594 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0
1595 ; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15
1596 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
1597 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0
1598 ; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10
1599 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
1600 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0
1601 ; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9
1602 ; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
1603 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0
1604 ; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8
1605 ; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21
1606 ; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26
1607 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0
1608 ; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8
1609 ; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1
1610 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0
1611 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2
1612 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0
1613 ; GFX10PLUS-NEXT: s_mov_b32 s2, s17
1614 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
1615 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0
1616 ; GFX10PLUS-NEXT: s_mov_b32 s3, s18
1617 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4
1618 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0
1619 ; GFX10PLUS-NEXT: s_mov_b32 s4, s19
1620 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
1621 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0
1622 ; GFX10PLUS-NEXT: s_mov_b32 s5, s20
1623 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6
1624 ; GFX10PLUS-NEXT: s_mov_b32 s6, s15
1625 ; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7
1626 ; GFX10PLUS-NEXT: s_mov_b32 s1, s16
1627 ; GFX10PLUS-NEXT: ; return to shader part epilog
1628 %result = mul i256 %num, %den
1629 %cast = bitcast i256 %result to <8 x i32>
1633 define i256 @v_mul_i256(i256 %num, i256 %den) {
1634 ; GFX7-LABEL: v_mul_i256:
1636 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1637 ; GFX7-NEXT: v_mov_b32_e32 v16, v0
1638 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
1639 ; GFX7-NEXT: v_mov_b32_e32 v17, v1
1640 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
1641 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
1642 ; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
1643 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
1644 ; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
1645 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
1646 ; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
1647 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
1648 ; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
1649 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
1650 ; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
1651 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
1652 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
1653 ; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
1654 ; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
1655 ; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
1656 ; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
1657 ; GFX7-NEXT: v_mov_b32_e32 v18, v23
1658 ; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
1659 ; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
1660 ; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
1661 ; GFX7-NEXT: v_mov_b32_e32 v0, v20
1662 ; GFX7-NEXT: v_mov_b32_e32 v1, v23
1663 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
1664 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
1665 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
1666 ; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9
1667 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
1668 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
1669 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
1670 ; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10
1671 ; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
1672 ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
1673 ; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
1674 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
1675 ; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
1676 ; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13
1677 ; GFX7-NEXT: v_mov_b32_e32 v2, v22
1678 ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
1679 ; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
1680 ; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
1681 ; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12
1682 ; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
1683 ; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
1684 ; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
1685 ; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
1686 ; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
1687 ; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
1688 ; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15
1689 ; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14
1690 ; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
1691 ; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
1692 ; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
1693 ; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
1694 ; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
1695 ; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
1696 ; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
1697 ; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
1698 ; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
1699 ; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
1700 ; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
1701 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1703 ; GFX8-LABEL: v_mul_i256:
1705 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1706 ; GFX8-NEXT: v_mov_b32_e32 v16, v0
1707 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
1708 ; GFX8-NEXT: v_mov_b32_e32 v17, v1
1709 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
1710 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
1711 ; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
1712 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
1713 ; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
1714 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
1715 ; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
1716 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
1717 ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
1718 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
1719 ; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
1720 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
1721 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
1722 ; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
1723 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
1724 ; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
1725 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
1726 ; GFX8-NEXT: v_mov_b32_e32 v18, v23
1727 ; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
1728 ; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
1729 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
1730 ; GFX8-NEXT: v_mov_b32_e32 v0, v20
1731 ; GFX8-NEXT: v_mov_b32_e32 v1, v23
1732 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
1733 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
1734 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
1735 ; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9
1736 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
1737 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
1738 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
1739 ; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10
1740 ; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
1741 ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
1742 ; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
1743 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
1744 ; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
1745 ; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13
1746 ; GFX8-NEXT: v_mov_b32_e32 v2, v22
1747 ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
1748 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
1749 ; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
1750 ; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12
1751 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
1752 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
1753 ; GFX8-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
1754 ; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
1755 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
1756 ; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
1757 ; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15
1758 ; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14
1759 ; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
1760 ; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
1761 ; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
1762 ; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
1763 ; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
1764 ; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
1765 ; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
1766 ; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
1767 ; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
1768 ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
1769 ; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
1770 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1772 ; GFX9-LABEL: v_mul_i256:
1774 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1775 ; GFX9-NEXT: v_mov_b32_e32 v16, v0
1776 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
1777 ; GFX9-NEXT: v_mov_b32_e32 v17, v1
1778 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
1779 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
1780 ; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
1781 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
1782 ; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
1783 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
1784 ; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
1785 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
1786 ; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc
1787 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
1788 ; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
1789 ; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
1790 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
1791 ; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
1792 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
1793 ; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
1794 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
1795 ; GFX9-NEXT: v_mov_b32_e32 v18, v23
1796 ; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
1797 ; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
1798 ; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
1799 ; GFX9-NEXT: v_mov_b32_e32 v0, v20
1800 ; GFX9-NEXT: v_mov_b32_e32 v1, v23
1801 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
1802 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
1803 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
1804 ; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9
1805 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
1806 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
1807 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
1808 ; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
1809 ; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
1810 ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
1811 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
1812 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
1813 ; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
1814 ; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13
1815 ; GFX9-NEXT: v_mov_b32_e32 v2, v22
1816 ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
1817 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
1818 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
1819 ; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12
1820 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
1821 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
1822 ; GFX9-NEXT: v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9]
1823 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
1824 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
1825 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9]
1826 ; GFX9-NEXT: v_mul_lo_u32 v10, v16, v15
1827 ; GFX9-NEXT: v_mul_lo_u32 v9, v17, v14
1828 ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9]
1829 ; GFX9-NEXT: v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9]
1830 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9]
1831 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9]
1832 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15]
1833 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13]
1834 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11]
1835 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7]
1836 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5]
1837 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
1838 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
1839 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1841 ; GFX10-LABEL: v_mul_i256:
1843 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1844 ; GFX10-NEXT: v_mov_b32_e32 v16, v0
1845 ; GFX10-NEXT: v_mov_b32_e32 v17, v1
1846 ; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9
1847 ; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10
1848 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
1849 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0
1850 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0
1851 ; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14
1852 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
1853 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
1854 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
1855 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
1856 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
1857 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
1858 ; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
1859 ; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0
1860 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
1861 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
1862 ; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
1863 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
1864 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
1865 ; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
1866 ; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
1867 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
1868 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
1869 ; GFX10-NEXT: v_mov_b32_e32 v20, v22
1870 ; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
1871 ; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
1872 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
1873 ; GFX10-NEXT: v_mov_b32_e32 v20, v18
1874 ; GFX10-NEXT: v_mov_b32_e32 v19, v22
1875 ; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15
1876 ; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
1877 ; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
1878 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0
1879 ; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11
1880 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
1881 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
1882 ; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12
1883 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
1884 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
1885 ; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13
1886 ; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
1887 ; GFX10-NEXT: v_mov_b32_e32 v13, v1
1888 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
1889 ; GFX10-NEXT: v_mov_b32_e32 v14, v21
1890 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
1891 ; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
1892 ; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
1893 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8
1894 ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
1895 ; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
1896 ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
1897 ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
1898 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
1899 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
1900 ; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
1901 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
1902 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
1903 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
1904 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
1905 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
1906 ; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
1907 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
1908 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4
1909 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v7
1910 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1912 ; GFX11-LABEL: v_mul_i256:
1914 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1915 ; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
1916 ; GFX11-NEXT: v_mul_lo_u32 v7, v7, v8
1917 ; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9
1918 ; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10
1919 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0
1920 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], null, v16, v12, 0
1921 ; GFX11-NEXT: v_mul_lo_u32 v30, v17, v14
1922 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
1923 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19]
1924 ; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
1925 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
1926 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
1927 ; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
1928 ; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0
1929 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
1930 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
1931 ; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
1932 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
1933 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
1934 ; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
1935 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
1936 ; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1]
1937 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
1938 ; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
1939 ; GFX11-NEXT: v_mov_b32_e32 v20, v22
1940 ; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
1941 ; GFX11-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
1942 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20]
1943 ; GFX11-NEXT: v_mov_b32_e32 v20, v18
1944 ; GFX11-NEXT: v_mov_b32_e32 v19, v22
1945 ; GFX11-NEXT: v_mul_lo_u32 v22, v16, v15
1946 ; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
1947 ; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20]
1948 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v8, 0
1949 ; GFX11-NEXT: v_mul_lo_u32 v20, v4, v11
1950 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
1951 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25]
1952 ; GFX11-NEXT: v_mul_lo_u32 v25, v3, v12
1953 ; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
1954 ; GFX11-NEXT: v_mov_b32_e32 v14, v21
1955 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
1956 ; GFX11-NEXT: v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19]
1957 ; GFX11-NEXT: v_mul_lo_u32 v24, v2, v13
1958 ; GFX11-NEXT: v_mov_b32_e32 v13, v1
1959 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12]
1960 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
1961 ; GFX11-NEXT: v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19]
1962 ; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14]
1963 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
1964 ; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2]
1965 ; GFX11-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
1966 ; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11]
1967 ; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13]
1968 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
1969 ; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
1970 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
1971 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
1972 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
1973 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
1974 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
1975 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
1976 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
1977 ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
1978 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
1979 ; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7
1980 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1981 %result = mul i256 %num, %den