1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
; Kernel @udiv_i32: stores an i32 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `udiv i32 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): a float reciprocal estimate of %y
; (uitofp, llvm.amdgcn.rcp.f32, fmul by 0x41EFFFFFC0000000, fptoui), 64-bit
; multiplies with `lshr ..., 32` to take high halves, then two
; `icmp uge` / `select` steps that each may add 1 to the quotient.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
; NOTE(review): the GFX6 assertions here stop at buffer_store_dword, while the
; GFX9 assertions of this kernel end with s_endpgm -- confirm nothing was dropped.
6 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
7 ; CHECK-LABEL: @udiv_i32(
8 ; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
11 ; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12 ; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]]
13 ; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
14 ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
15 ; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
16 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
17 ; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18 ; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
19 ; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
20 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
21 ; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
22 ; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
23 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25 ; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27 ; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
28 ; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
29 ; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
30 ; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP19]], 1
31 ; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
32 ; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
33 ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
34 ; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
35 ; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1
36 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
37 ; CHECK-NEXT: store i32 [[TMP29]], ptr addrspace(1) [[OUT:%.*]], align 4
38 ; CHECK-NEXT: ret void
40 ; GFX6-LABEL: udiv_i32:
42 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
43 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
44 ; GFX6-NEXT: s_mov_b32 s6, -1
45 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
47 ; GFX6-NEXT: s_sub_i32 s4, 0, s3
48 ; GFX6-NEXT: s_mov_b32 s5, s1
49 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
50 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
51 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
52 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
53 ; GFX6-NEXT: s_mov_b32 s4, s0
54 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
55 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
56 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
57 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
58 ; GFX6-NEXT: s_mul_i32 s0, s0, s3
59 ; GFX6-NEXT: s_sub_i32 s0, s2, s0
60 ; GFX6-NEXT: s_sub_i32 s1, s0, s3
61 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
62 ; GFX6-NEXT: s_cmp_ge_u32 s0, s3
63 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
64 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
65 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
66 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
67 ; GFX6-NEXT: s_cmp_ge_u32 s0, s3
68 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
69 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
70 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
73 ; GFX9-LABEL: udiv_i32:
75 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
76 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
77 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
78 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
79 ; GFX9-NEXT: s_sub_i32 s4, 0, s3
80 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
81 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
82 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
83 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
84 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
85 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
86 ; GFX9-NEXT: s_add_i32 s5, s5, s4
87 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
88 ; GFX9-NEXT: s_mul_i32 s5, s4, s3
89 ; GFX9-NEXT: s_sub_i32 s2, s2, s5
90 ; GFX9-NEXT: s_add_i32 s6, s4, 1
91 ; GFX9-NEXT: s_sub_i32 s5, s2, s3
92 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
93 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4
94 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
95 ; GFX9-NEXT: s_add_i32 s5, s4, 1
96 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
97 ; GFX9-NEXT: s_cselect_b32 s2, s5, s4
98 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
99 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
100 ; GFX9-NEXT: s_endpgm
102 store i32 %r, ptr addrspace(1) %out
; Kernel @urem_i32: stores an i32 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `urem i32 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): a float reciprocal estimate of %y
; (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui), 64-bit multiplies with
; `lshr ..., 32`, the difference `x - q * y`, then two `icmp uge` / `select`
; steps that each may subtract %y from that difference before it is stored.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
106 define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
107 ; CHECK-LABEL: @urem_i32(
108 ; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
109 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
110 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
111 ; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
112 ; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]]
113 ; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
114 ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
115 ; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
116 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
117 ; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
118 ; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
119 ; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
120 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
121 ; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
122 ; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
123 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
124 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
125 ; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
126 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
127 ; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
128 ; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
129 ; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
130 ; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
131 ; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
132 ; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
133 ; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
134 ; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
135 ; CHECK-NEXT: store i32 [[TMP27]], ptr addrspace(1) [[OUT:%.*]], align 4
136 ; CHECK-NEXT: ret void
138 ; GFX6-LABEL: urem_i32:
140 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
141 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
142 ; GFX6-NEXT: s_mov_b32 s6, -1
143 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
144 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
145 ; GFX6-NEXT: s_sub_i32 s4, 0, s3
146 ; GFX6-NEXT: s_mov_b32 s5, s1
147 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
148 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
149 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
150 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
151 ; GFX6-NEXT: s_mov_b32 s4, s0
152 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
153 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
154 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
155 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
156 ; GFX6-NEXT: s_mul_i32 s0, s0, s3
157 ; GFX6-NEXT: s_sub_i32 s0, s2, s0
158 ; GFX6-NEXT: s_sub_i32 s1, s0, s3
159 ; GFX6-NEXT: s_cmp_ge_u32 s0, s3
160 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
161 ; GFX6-NEXT: s_sub_i32 s1, s0, s3
162 ; GFX6-NEXT: s_cmp_ge_u32 s0, s3
163 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
164 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
165 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
166 ; GFX6-NEXT: s_endpgm
168 ; GFX9-LABEL: urem_i32:
170 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
171 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
172 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
174 ; GFX9-NEXT: s_sub_i32 s4, 0, s3
175 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
176 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
177 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
178 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
179 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
180 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
181 ; GFX9-NEXT: s_add_i32 s5, s5, s4
182 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
183 ; GFX9-NEXT: s_mul_i32 s4, s4, s3
184 ; GFX9-NEXT: s_sub_i32 s2, s2, s4
185 ; GFX9-NEXT: s_sub_i32 s4, s2, s3
186 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
187 ; GFX9-NEXT: s_cselect_b32 s2, s4, s2
188 ; GFX9-NEXT: s_sub_i32 s4, s2, s3
189 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
190 ; GFX9-NEXT: s_cselect_b32 s2, s4, s2
191 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
192 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
193 ; GFX9-NEXT: s_endpgm
195 store i32 %r, ptr addrspace(1) %out
; Kernel @sdiv_i32: stores an i32 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `sdiv i32 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): `ashr ..., 31` of each operand, then
; add + xor of each operand with its own ashr value, the reciprocal-based
; sequence (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui, 64-bit multiplies, two
; `icmp uge` / `select` steps that each may add 1) on those results, and a
; final xor/sub with the xor of the two ashr values.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
199 define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
200 ; CHECK-LABEL: @sdiv_i32(
201 ; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
202 ; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
203 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
204 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
205 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
206 ; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
207 ; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
208 ; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
209 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
210 ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
211 ; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
212 ; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]]
213 ; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
214 ; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
215 ; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
216 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
217 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
218 ; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
219 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
220 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
221 ; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
222 ; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
223 ; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
224 ; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
225 ; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
226 ; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
227 ; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
228 ; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
229 ; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
230 ; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1
231 ; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
232 ; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
233 ; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
234 ; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
235 ; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1
236 ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
237 ; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
238 ; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
239 ; CHECK-NEXT: store i32 [[TMP38]], ptr addrspace(1) [[OUT:%.*]], align 4
240 ; CHECK-NEXT: ret void
242 ; GFX6-LABEL: sdiv_i32:
244 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
245 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
246 ; GFX6-NEXT: s_mov_b32 s6, -1
247 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
248 ; GFX6-NEXT: s_abs_i32 s8, s3
249 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
250 ; GFX6-NEXT: s_sub_i32 s4, 0, s8
251 ; GFX6-NEXT: s_mov_b32 s5, s1
252 ; GFX6-NEXT: s_xor_b32 s1, s2, s3
253 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
254 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
255 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
256 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
257 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
258 ; GFX6-NEXT: s_mov_b32 s4, s0
259 ; GFX6-NEXT: s_abs_i32 s0, s2
260 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
261 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
262 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
263 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
264 ; GFX6-NEXT: s_mul_i32 s2, s2, s8
265 ; GFX6-NEXT: s_sub_i32 s0, s0, s2
266 ; GFX6-NEXT: s_sub_i32 s2, s0, s8
267 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
268 ; GFX6-NEXT: s_cmp_ge_u32 s0, s8
269 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
270 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
271 ; GFX6-NEXT: s_cselect_b32 s0, s2, s0
272 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
273 ; GFX6-NEXT: s_cmp_ge_u32 s0, s8
274 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
275 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
276 ; GFX6-NEXT: v_xor_b32_e32 v0, s1, v0
277 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0
278 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
279 ; GFX6-NEXT: s_endpgm
281 ; GFX9-LABEL: sdiv_i32:
283 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
284 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
285 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
286 ; GFX9-NEXT: s_abs_i32 s4, s3
287 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
288 ; GFX9-NEXT: s_sub_i32 s5, 0, s4
289 ; GFX9-NEXT: s_xor_b32 s3, s2, s3
290 ; GFX9-NEXT: s_abs_i32 s2, s2
291 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
292 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31
293 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
294 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
295 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
296 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
297 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
298 ; GFX9-NEXT: s_add_i32 s6, s6, s5
299 ; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
300 ; GFX9-NEXT: s_mul_i32 s6, s5, s4
301 ; GFX9-NEXT: s_sub_i32 s2, s2, s6
302 ; GFX9-NEXT: s_add_i32 s7, s5, 1
303 ; GFX9-NEXT: s_sub_i32 s6, s2, s4
304 ; GFX9-NEXT: s_cmp_ge_u32 s2, s4
305 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5
306 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2
307 ; GFX9-NEXT: s_add_i32 s6, s5, 1
308 ; GFX9-NEXT: s_cmp_ge_u32 s2, s4
309 ; GFX9-NEXT: s_cselect_b32 s2, s6, s5
310 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
311 ; GFX9-NEXT: s_sub_i32 s2, s2, s3
312 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
313 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
314 ; GFX9-NEXT: s_endpgm
316 store i32 %r, ptr addrspace(1) %out
; Kernel @srem_i32: stores an i32 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `srem i32 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): `ashr ..., 31` of each operand, then
; add + xor of each operand with its own ashr value, the reciprocal-based
; sequence (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui, 64-bit multiplies), the
; difference `a - q * b` with two `icmp uge` / `select` steps that each may
; subtract the divisor, and a final xor/sub with the ashr value of %x.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
320 define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
321 ; CHECK-LABEL: @srem_i32(
322 ; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
323 ; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
324 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
325 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
326 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
327 ; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
328 ; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
329 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
330 ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
331 ; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
332 ; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]]
333 ; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
334 ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
335 ; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
336 ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
337 ; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
338 ; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
339 ; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
340 ; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
341 ; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
342 ; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
343 ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
344 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
345 ; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
346 ; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
347 ; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
348 ; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
349 ; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
350 ; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
351 ; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
352 ; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
353 ; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
354 ; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
355 ; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
356 ; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
357 ; CHECK-NEXT: store i32 [[TMP35]], ptr addrspace(1) [[OUT:%.*]], align 4
358 ; CHECK-NEXT: ret void
360 ; GFX6-LABEL: srem_i32:
362 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
363 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
364 ; GFX6-NEXT: s_mov_b32 s6, -1
365 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX6-NEXT: s_abs_i32 s3, s3
367 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
368 ; GFX6-NEXT: s_sub_i32 s4, 0, s3
369 ; GFX6-NEXT: s_abs_i32 s8, s2
370 ; GFX6-NEXT: s_mov_b32 s5, s1
371 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
372 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
373 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
374 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
375 ; GFX6-NEXT: s_mov_b32 s4, s0
376 ; GFX6-NEXT: s_ashr_i32 s0, s2, 31
377 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
378 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
379 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
380 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0
381 ; GFX6-NEXT: s_mul_i32 s1, s1, s3
382 ; GFX6-NEXT: s_sub_i32 s1, s8, s1
383 ; GFX6-NEXT: s_sub_i32 s2, s1, s3
384 ; GFX6-NEXT: s_cmp_ge_u32 s1, s3
385 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1
386 ; GFX6-NEXT: s_sub_i32 s2, s1, s3
387 ; GFX6-NEXT: s_cmp_ge_u32 s1, s3
388 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1
389 ; GFX6-NEXT: s_xor_b32 s1, s1, s0
390 ; GFX6-NEXT: s_sub_i32 s0, s1, s0
391 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
392 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
393 ; GFX6-NEXT: s_endpgm
395 ; GFX9-LABEL: srem_i32:
397 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
398 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
399 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
400 ; GFX9-NEXT: s_abs_i32 s3, s3
401 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
402 ; GFX9-NEXT: s_sub_i32 s5, 0, s3
403 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31
404 ; GFX9-NEXT: s_abs_i32 s2, s2
405 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
406 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
407 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
408 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
409 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
410 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
411 ; GFX9-NEXT: s_add_i32 s6, s6, s5
412 ; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
413 ; GFX9-NEXT: s_mul_i32 s5, s5, s3
414 ; GFX9-NEXT: s_sub_i32 s2, s2, s5
415 ; GFX9-NEXT: s_sub_i32 s5, s2, s3
416 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
417 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
418 ; GFX9-NEXT: s_sub_i32 s5, s2, s3
419 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
420 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
421 ; GFX9-NEXT: s_xor_b32 s2, s2, s4
422 ; GFX9-NEXT: s_sub_i32 s2, s2, s4
423 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
424 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
425 ; GFX9-NEXT: s_endpgm
427 store i32 %r, ptr addrspace(1) %out
; Kernel @udiv_i16: stores an i16 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `udiv i16 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): both operands are zero-extended to i32
; and converted with uitofp; the quotient is `trunc(x * rcp(y))`, converted
; back with fptoui, plus 1 when |fmad.ftz(-q, y, x)| >= |y|; the sum is masked
; with 65535 and truncated to i16 before the store.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
431 define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
432 ; CHECK-LABEL: @udiv_i16(
433 ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
434 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
435 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
436 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
437 ; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
438 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
439 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
440 ; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
441 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
442 ; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
443 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
444 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
445 ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
446 ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
447 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
448 ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535
449 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
450 ; CHECK-NEXT: store i16 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 2
451 ; CHECK-NEXT: ret void
453 ; GFX6-LABEL: udiv_i16:
455 ; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
456 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
457 ; GFX6-NEXT: s_mov_b32 s2, -1
458 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX6-NEXT: s_lshr_b32 s1, s0, 16
460 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1
461 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
462 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0
463 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
464 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
465 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
466 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
467 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
468 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
469 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
470 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
471 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
473 ; GFX6-NEXT: s_endpgm
475 ; GFX9-LABEL: udiv_i16:
477 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
478 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
479 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
480 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16
481 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
482 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
483 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0
484 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
485 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
486 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
487 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
488 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
489 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
490 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
491 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
492 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
493 ; GFX9-NEXT: global_store_short v3, v0, s[0:1]
494 ; GFX9-NEXT: s_endpgm
496 store i16 %r, ptr addrspace(1) %out
; Kernel @urem_i16: stores an i16 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `urem i16 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): both operands are zero-extended to i32
; and converted with uitofp; a quotient is formed as `trunc(x * rcp(y))`,
; converted with fptoui, plus 1 when |fmad.ftz(-q, y, x)| >= |y|; the stored
; value is `x - q * y`, masked with 65535 and truncated to i16.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
500 define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
501 ; CHECK-LABEL: @urem_i16(
502 ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
503 ; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
504 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
505 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
506 ; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
507 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
508 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
509 ; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
510 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
511 ; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
512 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
513 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
514 ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
515 ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
516 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
517 ; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
518 ; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
519 ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535
520 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
521 ; CHECK-NEXT: store i16 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 2
522 ; CHECK-NEXT: ret void
524 ; GFX6-LABEL: urem_i16:
526 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
527 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
528 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
529 ; GFX6-NEXT: s_lshr_b32 s2, s6, 16
530 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
531 ; GFX6-NEXT: s_and_b32 s0, s6, 0xffff
532 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0
533 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
534 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
535 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
536 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
537 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
538 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
539 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
540 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
541 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
542 ; GFX6-NEXT: s_mov_b32 s2, -1
543 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
544 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
545 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
546 ; GFX6-NEXT: s_endpgm
548 ; GFX9-LABEL: urem_i16:
550 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
551 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
552 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
553 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
554 ; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
555 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0
556 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
557 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
558 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
559 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
560 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2
561 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
562 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
563 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
564 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
565 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
566 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
567 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX9-NEXT: global_store_short v1, v0, s[0:1]
569 ; GFX9-NEXT: s_endpgm
571 store i16 %r, ptr addrspace(1) %out
; Kernel @sdiv_i16: stores an i16 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `sdiv i16 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): both operands are sign-extended to i32
; and converted with sitofp; the quotient is `trunc(x * rcp(y))`, converted
; back with fptosi; when |fmad.ftz(-q, y, x)| >= |y| the value
; `(ashr (xor x, y), 30) | 1` is added, otherwise 0; the sum goes through
; shl 16 / ashr 16 and is truncated to i16 before the store.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
575 define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
576 ; CHECK-LABEL: @sdiv_i16(
577 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
578 ; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
579 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
580 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
581 ; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
582 ; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
583 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
584 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
585 ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
586 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
587 ; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
588 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
589 ; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
590 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
591 ; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
592 ; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
593 ; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
594 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
595 ; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16
596 ; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
597 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
598 ; CHECK-NEXT: store i16 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 2
599 ; CHECK-NEXT: ret void
601 ; GFX6-LABEL: sdiv_i16:
603 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
604 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
605 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
606 ; GFX6-NEXT: s_mov_b32 s2, -1
607 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
608 ; GFX6-NEXT: s_ashr_i32 s4, s6, 16
609 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
610 ; GFX6-NEXT: s_sext_i32_i16 s5, s6
611 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
612 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
613 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
614 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
615 ; GFX6-NEXT: s_or_b32 s6, s4, 1
616 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
617 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
618 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
619 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
620 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
621 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
622 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
623 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2
624 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
625 ; GFX6-NEXT: s_endpgm
627 ; GFX9-LABEL: sdiv_i16:
629 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
630 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
631 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
632 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
633 ; GFX9-NEXT: s_ashr_i32 s3, s2, 16
634 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3
635 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
636 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2
637 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
638 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
639 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
640 ; GFX9-NEXT: s_or_b32 s4, s2, 1
641 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
642 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
643 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
644 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
645 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
646 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
647 ; GFX9-NEXT: s_cselect_b32 s2, s4, 0
648 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
649 ; GFX9-NEXT: global_store_short v1, v0, s[0:1]
650 ; GFX9-NEXT: s_endpgm
652 store i16 %r, ptr addrspace(1) %out
; Kernel @srem_i16: stores an i16 value %r to the addrspace(1) pointer %out.
; The instruction defining %r is not visible in this view; presumably
; `srem i16 %x, %y` given the kernel name -- TODO confirm.
; Expected opt output (prefix CHECK): both operands are sign-extended to i32
; and converted with sitofp; a quotient is formed as `trunc(x * rcp(y))`,
; converted with fptosi, plus `(ashr (xor x, y), 30) | 1` when
; |fmad.ftz(-q, y, x)| >= |y|; the stored value is `x - q * y` after
; shl 16 / ashr 16 and truncation to i16.
; The GFX6 and GFX9 prefixes hold the expected llc assembly for -mcpu=tahiti
; and -mcpu=gfx900 respectively.
656 define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
657 ; CHECK-LABEL: @srem_i16(
658 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
659 ; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
660 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
661 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
662 ; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
663 ; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
664 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
665 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
666 ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
667 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
668 ; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
669 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
670 ; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
671 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
672 ; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
673 ; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
674 ; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
675 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
676 ; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
677 ; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
678 ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16
679 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
680 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
681 ; CHECK-NEXT: store i16 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 2
682 ; CHECK-NEXT: ret void
684 ; GFX6-LABEL: srem_i16:
686 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
687 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
688 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
689 ; GFX6-NEXT: s_ashr_i32 s7, s6, 16
690 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s7
691 ; GFX6-NEXT: s_sext_i32_i16 s2, s6
692 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2
693 ; GFX6-NEXT: s_xor_b32 s2, s2, s7
694 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
695 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30
696 ; GFX6-NEXT: s_or_b32 s4, s2, 1
697 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
698 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
699 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
700 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
701 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
702 ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec
703 ; GFX6-NEXT: s_cselect_b32 s2, s4, 0
704 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
705 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s7
706 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
707 ; GFX6-NEXT: s_mov_b32 s2, -1
708 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
709 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
710 ; GFX6-NEXT: s_endpgm
712 ; GFX9-LABEL: srem_i16:
714 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
715 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
716 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
717 ; GFX9-NEXT: s_ashr_i32 s7, s6, 16
718 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s7
719 ; GFX9-NEXT: s_sext_i32_i16 s2, s6
720 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2
721 ; GFX9-NEXT: s_xor_b32 s2, s2, s7
722 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
723 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
724 ; GFX9-NEXT: s_or_b32 s4, s2, 1
725 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
726 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
727 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
728 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
729 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
730 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
731 ; GFX9-NEXT: s_cselect_b32 s2, s4, 0
732 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
733 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s7
734 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
735 ; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0
736 ; GFX9-NEXT: global_store_short v1, v0, s[0:1]
737 ; GFX9-NEXT: s_endpgm
739 store i16 %r, ptr addrspace(1) %out
; Test kernel @udiv_i8: takes two i8 kernel arguments and stores an i8 result
; %r to %out. The instruction defining %r is not visible in this excerpt;
; presumably it is an i8 udiv of %x by %y, per the function name (confirm).
; The IR-level expectations (CHECK prefix) cover the codegenprepare output:
; both operands are zero-extended to i32 and converted with uitofp, the
; quotient is estimated as trunc(x * rcp(y)), a 0/1 correction is selected by
; comparing |fmad(-q, y, x)| against |y|, and the sum is masked with 255 before
; truncating back to i8.
; The GFX6 and GFX9 prefixes cover the final ISA for the two llc invocations
; listed at the top of the file.
743 define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
744 ; CHECK-LABEL: @udiv_i8(
745 ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
746 ; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
747 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
748 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
749 ; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
750 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
751 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
752 ; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
753 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
754 ; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
755 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
756 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
757 ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
758 ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
759 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
760 ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255
761 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
762 ; CHECK-NEXT: store i8 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1
763 ; CHECK-NEXT: ret void
765 ; GFX6-LABEL: udiv_i8:
767 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
768 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
769 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
770 ; GFX6-NEXT: s_mov_b32 s2, -1
771 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
772 ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6
773 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0
774 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6
775 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
776 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
777 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
778 ; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2
779 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
780 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
781 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
782 ; GFX6-NEXT: s_endpgm
784 ; GFX9-LABEL: udiv_i8:
786 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
787 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
788 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
789 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
790 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
791 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
792 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
793 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
794 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
795 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1
796 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3
797 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
798 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
799 ; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
800 ; GFX9-NEXT: s_endpgm
802 store i8 %r, ptr addrspace(1) %out
; Test kernel @urem_i8: takes two i8 kernel arguments and stores an i8 result
; %r to %out. The instruction defining %r is not visible in this excerpt;
; presumably it is an i8 urem of %x by %y, per the function name (confirm).
; The IR-level expectations (CHECK prefix) compute the same float-based
; quotient as the unsigned i8 divide case (zext, uitofp, rcp, fmul, trunc,
; fmad-based 0/1 correction), then multiply that quotient by the
; zero-extended divisor, subtract the product from the zero-extended dividend,
; mask with 255 and truncate to i8.
; The GFX6 and GFX9 prefixes cover the final ISA for the two llc invocations
; listed at the top of the file.
806 define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
807 ; CHECK-LABEL: @urem_i8(
808 ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
809 ; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
810 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
811 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
812 ; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
813 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
814 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
815 ; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
816 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
817 ; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
818 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
819 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
820 ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
821 ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
822 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
823 ; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
824 ; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
825 ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255
826 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
827 ; CHECK-NEXT: store i8 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1
828 ; CHECK-NEXT: ret void
830 ; GFX6-LABEL: urem_i8:
832 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
833 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
834 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
835 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
836 ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6
837 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0
838 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6
839 ; GFX6-NEXT: s_lshr_b32 s2, s6, 8
840 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
841 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
842 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
843 ; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2
844 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
845 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
846 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
847 ; GFX6-NEXT: s_mov_b32 s2, -1
848 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
849 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
850 ; GFX6-NEXT: s_endpgm
852 ; GFX9-LABEL: urem_i8:
854 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
855 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
856 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
857 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
858 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
859 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2
860 ; GFX9-NEXT: s_lshr_b32 s3, s2, 8
861 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
862 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
863 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
864 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2
865 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
866 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
867 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
868 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
869 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
870 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
871 ; GFX9-NEXT: s_endpgm
873 store i8 %r, ptr addrspace(1) %out
; Test kernel @sdiv_i8: takes two i8 kernel arguments and stores an i8 result
; %r to %out. The instruction defining %r is not visible in this excerpt;
; presumably it is an i8 sdiv of %x by %y, per the function name (confirm).
; The IR-level expectations (CHECK prefix) sign-extend both operands to i32
; and build a correction value from them (xor, ashr by 30, or with 1). The
; quotient is estimated in float as trunc(x * rcp(y)) using sitofp/fptosi;
; the correction value (or 0) is selected by comparing |fmad(-q, y, x)|
; against |y| and added to the quotient. The sum is sign-extended from 8 bits
; via shl 24 / ashr 24 and truncated to i8.
; The GFX6 and GFX9 prefixes cover the final ISA for the two llc invocations
; listed at the top of the file.
877 define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
878 ; CHECK-LABEL: @sdiv_i8(
879 ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
880 ; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
881 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
882 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
883 ; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
884 ; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
885 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
886 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
887 ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
888 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
889 ; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
890 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
891 ; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
892 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
893 ; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
894 ; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
895 ; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
896 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
897 ; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24
898 ; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
899 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
900 ; CHECK-NEXT: store i8 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1
901 ; CHECK-NEXT: ret void
903 ; GFX6-LABEL: sdiv_i8:
905 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
906 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
907 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
908 ; GFX6-NEXT: s_mov_b32 s2, -1
909 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
910 ; GFX6-NEXT: s_bfe_i32 s4, s6, 0x80008
911 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
912 ; GFX6-NEXT: s_sext_i32_i8 s5, s6
913 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
914 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
915 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
916 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
917 ; GFX6-NEXT: s_or_b32 s6, s4, 1
918 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
919 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
920 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
921 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
922 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
923 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
924 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
925 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2
926 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
927 ; GFX6-NEXT: s_endpgm
929 ; GFX9-LABEL: sdiv_i8:
931 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
932 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
933 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
934 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
935 ; GFX9-NEXT: s_bfe_i32 s3, s2, 0x80008
936 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3
937 ; GFX9-NEXT: s_sext_i32_i8 s2, s2
938 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2
939 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
940 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
941 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
942 ; GFX9-NEXT: s_or_b32 s4, s2, 1
943 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
944 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
945 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
946 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
947 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
948 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
949 ; GFX9-NEXT: s_cselect_b32 s2, s4, 0
950 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
951 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
952 ; GFX9-NEXT: s_endpgm
954 store i8 %r, ptr addrspace(1) %out
; Test kernel @srem_i8: takes two i8 kernel arguments and stores an i8 result
; %r to %out. The instruction defining %r is not visible in this excerpt;
; presumably it is an i8 srem of %x by %y, per the function name (confirm).
; The IR-level expectations (CHECK prefix) compute the same float-based signed
; quotient as the signed i8 divide case (sext, xor/ashr 30/or 1 correction
; value, sitofp, rcp, fmul, trunc, fmad-based select), then multiply the
; quotient by the sign-extended divisor, subtract the product from the
; sign-extended dividend, sign-extend from 8 bits via shl 24 / ashr 24 and
; truncate to i8.
; The GFX6 and GFX9 prefixes cover the final ISA for the two llc invocations
; listed at the top of the file.
958 define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
959 ; CHECK-LABEL: @srem_i8(
960 ; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
961 ; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
962 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
963 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
964 ; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
965 ; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
966 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
967 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
968 ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
969 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
970 ; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
971 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
972 ; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
973 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
974 ; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
975 ; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
976 ; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
977 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
978 ; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
979 ; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
980 ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24
981 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
982 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
983 ; CHECK-NEXT: store i8 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1
984 ; CHECK-NEXT: ret void
986 ; GFX6-LABEL: srem_i8:
988 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
989 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
990 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
991 ; GFX6-NEXT: s_bfe_i32 s2, s6, 0x80008
992 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2
993 ; GFX6-NEXT: s_sext_i32_i8 s3, s6
994 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3
995 ; GFX6-NEXT: s_xor_b32 s2, s3, s2
996 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
997 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30
998 ; GFX6-NEXT: s_lshr_b32 s4, s6, 8
999 ; GFX6-NEXT: s_or_b32 s5, s2, 1
1000 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
1001 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
1002 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
1003 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
1004 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
1005 ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec
1006 ; GFX6-NEXT: s_cselect_b32 s2, s5, 0
1007 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1008 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4
1009 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1010 ; GFX6-NEXT: s_mov_b32 s2, -1
1011 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
1012 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
1013 ; GFX6-NEXT: s_endpgm
1015 ; GFX9-LABEL: srem_i8:
1017 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
1018 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1019 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1020 ; GFX9-NEXT: s_bfe_i32 s2, s6, 0x80008
1021 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
1022 ; GFX9-NEXT: s_sext_i32_i8 s3, s6
1023 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3
1024 ; GFX9-NEXT: s_xor_b32 s2, s3, s2
1025 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
1026 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
1027 ; GFX9-NEXT: s_lshr_b32 s4, s6, 8
1028 ; GFX9-NEXT: s_or_b32 s5, s2, 1
1029 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
1030 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
1031 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
1032 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
1033 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
1034 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
1035 ; GFX9-NEXT: s_cselect_b32 s2, s5, 0
1036 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
1037 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4
1038 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1039 ; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0
1040 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
1041 ; GFX9-NEXT: s_endpgm
1043 store i8 %r, ptr addrspace(1) %out
; Test kernel @udiv_v4i32: computes udiv <4 x i32> %x, %y and stores the
; vector result to %out.
; The IR-level expectations (CHECK prefix) show the vector divide handled one
; lane at a time: for each of the four lanes the operands are pulled out with
; extractelement, a scalar 32-bit expansion is emitted, and the lane result is
; put back with insertelement. Per lane, the expansion converts the divisor
; with uitofp, takes rcp, scales by the constant 0x41EFFFFFC0000000 and
; converts back with fptoui; that value z is then adjusted by adding the high
; 32 bits of z * (-y * z). The quotient estimate is the high 32 bits of
; x * z, followed by two rounds of "if remainder uge y then quotient + 1"
; selects.
; The GFX6 and GFX9 prefixes cover the final ISA for the two llc invocations
; listed at the top of the file.
1047 define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
1048 ; CHECK-LABEL: @udiv_v4i32(
1049 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1050 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1051 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1052 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1053 ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1054 ; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1055 ; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1056 ; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1057 ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1058 ; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1059 ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1060 ; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1061 ; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1062 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1063 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1064 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1065 ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1066 ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1067 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1068 ; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1069 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1070 ; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1071 ; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1072 ; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1073 ; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1
1074 ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
1075 ; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1076 ; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
1077 ; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
1078 ; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1
1079 ; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
1080 ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP31]], i64 0
1081 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
1082 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1083 ; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
1084 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
1085 ; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
1086 ; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
1087 ; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]]
1088 ; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
1089 ; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
1090 ; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
1091 ; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
1092 ; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1093 ; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
1094 ; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
1095 ; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
1096 ; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
1097 ; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
1098 ; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
1099 ; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1100 ; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
1101 ; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
1102 ; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
1103 ; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
1104 ; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
1105 ; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1
1106 ; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
1107 ; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
1108 ; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
1109 ; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
1110 ; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1
1111 ; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
1112 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
1113 ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
1114 ; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1115 ; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
1116 ; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
1117 ; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
1118 ; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
1119 ; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]]
1120 ; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
1121 ; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
1122 ; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
1123 ; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
1124 ; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
1125 ; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
1126 ; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
1127 ; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
1128 ; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
1129 ; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
1130 ; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
1131 ; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
1132 ; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
1133 ; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
1134 ; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
1135 ; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
1136 ; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
1137 ; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1
1138 ; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
1139 ; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
1140 ; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
1141 ; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
1142 ; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1
1143 ; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
1144 ; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
1145 ; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
1146 ; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1147 ; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
1148 ; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
1149 ; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
1150 ; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
1151 ; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]]
1152 ; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
1153 ; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
1154 ; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1155 ; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1156 ; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1157 ; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1158 ; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1159 ; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
1160 ; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
1161 ; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
1162 ; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1163 ; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1164 ; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1165 ; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1166 ; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
1167 ; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
1168 ; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
1169 ; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1
1170 ; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
1171 ; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
1172 ; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
1173 ; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
1174 ; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1
1175 ; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
1176 ; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
1177 ; CHECK-NEXT: store <4 x i32> [[TMP128]], ptr addrspace(1) [[OUT:%.*]], align 16
1178 ; CHECK-NEXT: ret void
1180 ; GFX6-LABEL: udiv_v4i32:
1182 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
1183 ; GFX6-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x9
1184 ; GFX6-NEXT: s_mov_b32 s19, 0xf000
1185 ; GFX6-NEXT: s_mov_b32 s18, -1
1186 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1187 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12
1188 ; GFX6-NEXT: s_sub_i32 s0, 0, s12
1189 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13
1190 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14
1191 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
1192 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s15
1193 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
1194 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4
1195 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1196 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
1197 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6
1198 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0
1199 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
1200 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1201 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
1202 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
1203 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
1204 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1205 ; GFX6-NEXT: s_mul_i32 s0, s0, s12
1206 ; GFX6-NEXT: s_sub_i32 s0, s8, s0
1207 ; GFX6-NEXT: s_sub_i32 s1, s0, s12
1208 ; GFX6-NEXT: s_cmp_ge_u32 s0, s12
1209 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
1210 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
1211 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1212 ; GFX6-NEXT: s_cmp_ge_u32 s0, s12
1213 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
1214 ; GFX6-NEXT: s_sub_i32 s2, 0, s13
1215 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1
1216 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1217 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
1218 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
1219 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1220 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
1221 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1
1222 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4
1223 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3
1224 ; GFX6-NEXT: v_readfirstlane_b32 s2, v1
1225 ; GFX6-NEXT: s_mul_i32 s2, s2, s13
1226 ; GFX6-NEXT: s_sub_i32 s2, s9, s2
1227 ; GFX6-NEXT: s_sub_i32 s3, s2, s13
1228 ; GFX6-NEXT: s_cmp_ge_u32 s2, s13
1229 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1
1230 ; GFX6-NEXT: s_cselect_b32 s2, s3, s2
1231 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1232 ; GFX6-NEXT: s_cmp_ge_u32 s2, s13
1233 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
1234 ; GFX6-NEXT: s_sub_i32 s6, 0, s14
1235 ; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3
1236 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1237 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1
1238 ; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5
1239 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3]
1240 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
1241 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v3
1242 ; GFX6-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v6
1243 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5
1244 ; GFX6-NEXT: v_readfirstlane_b32 s6, v3
1245 ; GFX6-NEXT: s_mul_i32 s6, s6, s14
1246 ; GFX6-NEXT: s_sub_i32 s6, s10, s6
1247 ; GFX6-NEXT: s_sub_i32 s7, s6, s14
1248 ; GFX6-NEXT: s_cmp_ge_u32 s6, s14
1249 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3
1250 ; GFX6-NEXT: s_cselect_b32 s6, s7, s6
1251 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1252 ; GFX6-NEXT: s_cmp_ge_u32 s6, s14
1253 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
1254 ; GFX6-NEXT: s_sub_i32 s8, 0, s15
1255 ; GFX6-NEXT: v_mul_lo_u32 v7, s8, v5
1256 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
1257 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3
1258 ; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7
1259 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[6:7]
1260 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7
1261 ; GFX6-NEXT: v_mul_hi_u32 v5, s11, v5
1262 ; GFX6-NEXT: v_readfirstlane_b32 s0, v5
1263 ; GFX6-NEXT: s_mul_i32 s0, s0, s15
1264 ; GFX6-NEXT: s_sub_i32 s0, s11, s0
1265 ; GFX6-NEXT: s_sub_i32 s1, s0, s15
1266 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v5
1267 ; GFX6-NEXT: s_cmp_ge_u32 s0, s15
1268 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1269 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
1270 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
1271 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3
1272 ; GFX6-NEXT: s_cmp_ge_u32 s0, s15
1273 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1274 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1275 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
1276 ; GFX6-NEXT: s_endpgm
1278 ; GFX9-LABEL: udiv_v4i32:
1280 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1281 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1282 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1283 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1284 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12
1285 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13
1286 ; GFX9-NEXT: s_sub_i32 s2, 0, s12
1287 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14
1288 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1289 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
1290 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1291 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1292 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1293 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
1294 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0
1295 ; GFX9-NEXT: s_mul_i32 s2, s2, s3
1296 ; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
1297 ; GFX9-NEXT: s_add_i32 s3, s3, s2
1298 ; GFX9-NEXT: s_mul_hi_u32 s2, s8, s3
1299 ; GFX9-NEXT: s_mul_i32 s3, s2, s12
1300 ; GFX9-NEXT: s_sub_i32 s3, s8, s3
1301 ; GFX9-NEXT: s_add_i32 s5, s2, 1
1302 ; GFX9-NEXT: s_sub_i32 s6, s3, s12
1303 ; GFX9-NEXT: s_cmp_ge_u32 s3, s12
1304 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
1305 ; GFX9-NEXT: s_cselect_b32 s3, s6, s3
1306 ; GFX9-NEXT: s_add_i32 s5, s2, 1
1307 ; GFX9-NEXT: s_cmp_ge_u32 s3, s12
1308 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1
1309 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
1310 ; GFX9-NEXT: s_sub_i32 s3, 0, s13
1311 ; GFX9-NEXT: s_mul_i32 s3, s3, s4
1312 ; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3
1313 ; GFX9-NEXT: s_add_i32 s4, s4, s3
1314 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2
1315 ; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4
1316 ; GFX9-NEXT: s_mul_i32 s4, s3, s13
1317 ; GFX9-NEXT: s_sub_i32 s4, s9, s4
1318 ; GFX9-NEXT: s_add_i32 s5, s3, 1
1319 ; GFX9-NEXT: s_sub_i32 s6, s4, s13
1320 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1321 ; GFX9-NEXT: s_cmp_ge_u32 s4, s13
1322 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1323 ; GFX9-NEXT: s_cselect_b32 s3, s5, s3
1324 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4
1325 ; GFX9-NEXT: s_add_i32 s5, s3, 1
1326 ; GFX9-NEXT: s_cmp_ge_u32 s4, s13
1327 ; GFX9-NEXT: s_cselect_b32 s3, s5, s3
1328 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
1329 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15
1330 ; GFX9-NEXT: s_sub_i32 s4, 0, s14
1331 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
1332 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
1333 ; GFX9-NEXT: s_add_i32 s5, s5, s4
1334 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1335 ; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5
1336 ; GFX9-NEXT: s_mul_i32 s5, s4, s14
1337 ; GFX9-NEXT: s_sub_i32 s5, s10, s5
1338 ; GFX9-NEXT: s_add_i32 s6, s4, 1
1339 ; GFX9-NEXT: s_sub_i32 s7, s5, s14
1340 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1341 ; GFX9-NEXT: s_cmp_ge_u32 s5, s14
1342 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1343 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4
1344 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5
1345 ; GFX9-NEXT: s_add_i32 s6, s4, 1
1346 ; GFX9-NEXT: s_cmp_ge_u32 s5, s14
1347 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4
1348 ; GFX9-NEXT: s_sub_i32 s5, 0, s15
1349 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
1350 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
1351 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
1352 ; GFX9-NEXT: s_add_i32 s6, s6, s5
1353 ; GFX9-NEXT: s_mul_hi_u32 s5, s11, s6
1354 ; GFX9-NEXT: s_mul_i32 s6, s5, s15
1355 ; GFX9-NEXT: s_sub_i32 s6, s11, s6
1356 ; GFX9-NEXT: s_add_i32 s7, s5, 1
1357 ; GFX9-NEXT: s_sub_i32 s8, s6, s15
1358 ; GFX9-NEXT: s_cmp_ge_u32 s6, s15
1359 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5
1360 ; GFX9-NEXT: s_cselect_b32 s6, s8, s6
1361 ; GFX9-NEXT: s_add_i32 s7, s5, 1
1362 ; GFX9-NEXT: s_cmp_ge_u32 s6, s15
1363 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5
1364 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1365 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1366 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
1367 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
1368 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1369 ; GFX9-NEXT: s_endpgm
1370 %r = udiv <4 x i32> %x, %y
1371 store <4 x i32> %r, ptr addrspace(1) %out
; Test: unsigned remainder of two <4 x i32> kernel arguments (%x urem %y), stored to %out.
;
; Lines with the CHECK prefix (the opt -amdgpu-codegenprepare run) expect the vector
; urem to be scalarized: each of the four lanes is extracted, the divisor goes through
; uitofp / llvm.amdgcn.rcp.f32 / fmul / fptoui to form a reciprocal estimate, that
; estimate is refined and applied with 64-bit multiplies whose high halves are taken,
; the remainder is formed as x - q*y, and two icmp-uge / sub / select steps correct it
; before the lane is inserted back into the result vector.
;
; Lines with the GFX6 and GFX9 prefixes pin the llc output for tahiti and gfx900.
;
; All assertion lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
1375 define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
1376 ; CHECK-LABEL: @urem_v4i32(
1377 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1378 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1379 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1380 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1381 ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1382 ; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1383 ; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1384 ; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1385 ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1386 ; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1387 ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1388 ; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1389 ; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1390 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1391 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1392 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1393 ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1394 ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1395 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1396 ; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1397 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1398 ; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1399 ; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1400 ; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1401 ; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1402 ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
1403 ; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
1404 ; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
1405 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
1406 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i64 0
1407 ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
1408 ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1409 ; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
1410 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
1411 ; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
1412 ; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
1413 ; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]]
1414 ; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
1415 ; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
1416 ; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
1417 ; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
1418 ; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
1419 ; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
1420 ; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1421 ; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
1422 ; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1423 ; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1424 ; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1425 ; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1426 ; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1427 ; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1428 ; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1429 ; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1430 ; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1431 ; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1432 ; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1433 ; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1434 ; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1435 ; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1436 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1437 ; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1438 ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1439 ; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1440 ; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1441 ; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1442 ; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1443 ; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1444 ; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1445 ; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1446 ; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1447 ; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1448 ; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1449 ; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1450 ; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1451 ; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1452 ; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1453 ; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1454 ; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1455 ; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1456 ; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1457 ; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1458 ; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1459 ; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1460 ; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1461 ; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1462 ; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1463 ; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1464 ; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1465 ; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1466 ; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1467 ; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1468 ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1469 ; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1470 ; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1471 ; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1472 ; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1473 ; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1474 ; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1475 ; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1476 ; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1477 ; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1478 ; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1479 ; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1480 ; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1481 ; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1482 ; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1483 ; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1484 ; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1485 ; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1486 ; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1487 ; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1488 ; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1489 ; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1490 ; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1491 ; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1492 ; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1493 ; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1494 ; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1495 ; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1496 ; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1497 ; CHECK-NEXT: store <4 x i32> [[TMP120]], ptr addrspace(1) [[OUT:%.*]], align 16
1498 ; CHECK-NEXT: ret void
1500 ; GFX6-LABEL: urem_v4i32:
1502 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
1503 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1504 ; GFX6-NEXT: s_mov_b32 s2, -1
1505 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1506 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12
1507 ; GFX6-NEXT: s_sub_i32 s0, 0, s12
1508 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13
1509 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
1510 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
1511 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1512 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
1513 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0
1514 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
1515 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1516 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
1517 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
1518 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
1519 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s14
1520 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1521 ; GFX6-NEXT: s_mul_i32 s0, s0, s12
1522 ; GFX6-NEXT: s_sub_i32 s0, s8, s0
1523 ; GFX6-NEXT: s_sub_i32 s1, s0, s12
1524 ; GFX6-NEXT: s_cmp_ge_u32 s0, s12
1525 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
1526 ; GFX6-NEXT: s_sub_i32 s1, s0, s12
1527 ; GFX6-NEXT: s_cmp_ge_u32 s0, s12
1528 ; GFX6-NEXT: s_cselect_b32 s6, s1, s0
1529 ; GFX6-NEXT: s_sub_i32 s0, 0, s13
1530 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1
1531 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
1532 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
1533 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
1534 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
1535 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
1536 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
1537 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s15
1538 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1539 ; GFX6-NEXT: s_mul_i32 s0, s0, s13
1540 ; GFX6-NEXT: s_sub_i32 s0, s9, s0
1541 ; GFX6-NEXT: s_sub_i32 s1, s0, s13
1542 ; GFX6-NEXT: s_cmp_ge_u32 s0, s13
1543 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
1544 ; GFX6-NEXT: s_sub_i32 s1, s0, s13
1545 ; GFX6-NEXT: s_cmp_ge_u32 s0, s13
1546 ; GFX6-NEXT: s_cselect_b32 s7, s1, s0
1547 ; GFX6-NEXT: s_sub_i32 s0, 0, s14
1548 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1
1549 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
1550 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
1551 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
1552 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
1553 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
1554 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
1555 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
1556 ; GFX6-NEXT: s_mul_i32 s0, s0, s14
1557 ; GFX6-NEXT: s_sub_i32 s0, s10, s0
1558 ; GFX6-NEXT: s_sub_i32 s1, s0, s14
1559 ; GFX6-NEXT: s_cmp_ge_u32 s0, s14
1560 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
1561 ; GFX6-NEXT: s_sub_i32 s1, s0, s14
1562 ; GFX6-NEXT: s_cmp_ge_u32 s0, s14
1563 ; GFX6-NEXT: s_cselect_b32 s8, s1, s0
1564 ; GFX6-NEXT: s_sub_i32 s0, 0, s15
1565 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1
1566 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
1567 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
1568 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
1569 ; GFX6-NEXT: v_mul_hi_u32 v2, s11, v0
1570 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
1571 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
1572 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2
1573 ; GFX6-NEXT: s_mul_i32 s4, s4, s15
1574 ; GFX6-NEXT: s_sub_i32 s4, s11, s4
1575 ; GFX6-NEXT: s_sub_i32 s5, s4, s15
1576 ; GFX6-NEXT: s_cmp_ge_u32 s4, s15
1577 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4
1578 ; GFX6-NEXT: s_sub_i32 s5, s4, s15
1579 ; GFX6-NEXT: s_cmp_ge_u32 s4, s15
1580 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4
1581 ; GFX6-NEXT: v_mov_b32_e32 v2, s8
1582 ; GFX6-NEXT: v_mov_b32_e32 v3, s4
1583 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1584 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1585 ; GFX6-NEXT: s_endpgm
1587 ; GFX9-LABEL: urem_v4i32:
1589 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1590 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1591 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1592 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1593 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12
1594 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13
1595 ; GFX9-NEXT: s_sub_i32 s2, 0, s12
1596 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14
1597 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1598 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
1599 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
1600 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1601 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1602 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1603 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
1604 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
1605 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0
1606 ; GFX9-NEXT: s_mul_i32 s2, s2, s3
1607 ; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
1608 ; GFX9-NEXT: s_add_i32 s3, s3, s2
1609 ; GFX9-NEXT: s_mul_hi_u32 s2, s8, s3
1610 ; GFX9-NEXT: s_mul_i32 s2, s2, s12
1611 ; GFX9-NEXT: s_sub_i32 s2, s8, s2
1612 ; GFX9-NEXT: s_sub_i32 s3, s2, s12
1613 ; GFX9-NEXT: s_cmp_ge_u32 s2, s12
1614 ; GFX9-NEXT: s_cselect_b32 s2, s3, s2
1615 ; GFX9-NEXT: s_sub_i32 s3, s2, s12
1616 ; GFX9-NEXT: s_cmp_ge_u32 s2, s12
1617 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1
1618 ; GFX9-NEXT: s_cselect_b32 s2, s3, s2
1619 ; GFX9-NEXT: s_sub_i32 s3, 0, s13
1620 ; GFX9-NEXT: s_mul_i32 s3, s3, s4
1621 ; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3
1622 ; GFX9-NEXT: s_add_i32 s4, s4, s3
1623 ; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4
1624 ; GFX9-NEXT: s_mul_i32 s3, s3, s13
1625 ; GFX9-NEXT: s_sub_i32 s3, s9, s3
1626 ; GFX9-NEXT: s_sub_i32 s4, s3, s13
1627 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
1628 ; GFX9-NEXT: s_cmp_ge_u32 s3, s13
1629 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3
1630 ; GFX9-NEXT: s_sub_i32 s4, s3, s13
1631 ; GFX9-NEXT: s_cmp_ge_u32 s3, s13
1632 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15
1633 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3
1634 ; GFX9-NEXT: s_sub_i32 s4, 0, s14
1635 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2
1636 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
1637 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
1638 ; GFX9-NEXT: s_add_i32 s5, s5, s4
1639 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1640 ; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5
1641 ; GFX9-NEXT: s_mul_i32 s4, s4, s14
1642 ; GFX9-NEXT: s_sub_i32 s4, s10, s4
1643 ; GFX9-NEXT: s_sub_i32 s5, s4, s14
1644 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1645 ; GFX9-NEXT: s_cmp_ge_u32 s4, s14
1646 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1647 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4
1648 ; GFX9-NEXT: s_sub_i32 s5, s4, s14
1649 ; GFX9-NEXT: s_cmp_ge_u32 s4, s14
1650 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4
1651 ; GFX9-NEXT: s_sub_i32 s5, 0, s15
1652 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
1653 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
1654 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
1655 ; GFX9-NEXT: s_add_i32 s6, s6, s5
1656 ; GFX9-NEXT: s_mul_hi_u32 s5, s11, s6
1657 ; GFX9-NEXT: s_mul_i32 s5, s5, s15
1658 ; GFX9-NEXT: s_sub_i32 s5, s11, s5
1659 ; GFX9-NEXT: s_sub_i32 s6, s5, s15
1660 ; GFX9-NEXT: s_cmp_ge_u32 s5, s15
1661 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
1662 ; GFX9-NEXT: s_sub_i32 s6, s5, s15
1663 ; GFX9-NEXT: s_cmp_ge_u32 s5, s15
1664 ; GFX9-NEXT: s_cselect_b32 s5, s6, s5
1665 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1666 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1667 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
1668 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
1669 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1670 ; GFX9-NEXT: s_endpgm
1671 %r = urem <4 x i32> %x, %y
1672 store <4 x i32> %r, ptr addrspace(1) %out
1676 define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
1677 ; CHECK-LABEL: @sdiv_v4i32(
1678 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1679 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1680 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1681 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1682 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1683 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1684 ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1685 ; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1686 ; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1687 ; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1688 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1689 ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1690 ; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1691 ; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1692 ; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1693 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1694 ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1695 ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1696 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1697 ; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1698 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1699 ; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1700 ; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1701 ; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1702 ; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1703 ; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1704 ; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1705 ; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1706 ; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1707 ; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1708 ; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1709 ; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1
1710 ; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1711 ; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1712 ; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1713 ; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1714 ; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1
1715 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1716 ; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1717 ; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1718 ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP40]], i64 0
1719 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1720 ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1721 ; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1722 ; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1723 ; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1724 ; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1725 ; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1726 ; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1727 ; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1728 ; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1729 ; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1730 ; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1731 ; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1732 ; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1733 ; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1734 ; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1735 ; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1736 ; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1737 ; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1738 ; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1739 ; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1740 ; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1741 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1742 ; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1743 ; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1744 ; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1745 ; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1746 ; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1747 ; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1748 ; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1749 ; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1750 ; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1
1751 ; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1752 ; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1753 ; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1754 ; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1755 ; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1
1756 ; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1757 ; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1758 ; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1759 ; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1760 ; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1761 ; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1762 ; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1763 ; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1764 ; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1765 ; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1766 ; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1767 ; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1768 ; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1769 ; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1770 ; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1771 ; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1772 ; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1773 ; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1774 ; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1775 ; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1776 ; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1777 ; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1778 ; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1779 ; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1780 ; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1781 ; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1782 ; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1783 ; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1784 ; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1785 ; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1786 ; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1787 ; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1788 ; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1789 ; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1790 ; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1791 ; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1
1792 ; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1793 ; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1794 ; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1795 ; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1796 ; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1
1797 ; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1798 ; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1799 ; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1800 ; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1801 ; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1802 ; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1803 ; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1804 ; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1805 ; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1806 ; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1807 ; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1808 ; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1809 ; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1810 ; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1811 ; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1812 ; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1813 ; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1814 ; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1815 ; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1816 ; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1817 ; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1818 ; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1819 ; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1820 ; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1821 ; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1822 ; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1823 ; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1824 ; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1825 ; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1826 ; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1827 ; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1828 ; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1829 ; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1830 ; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1831 ; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1832 ; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1
1833 ; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1834 ; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1835 ; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1836 ; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1837 ; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1
1838 ; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1839 ; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1840 ; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1841 ; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1842 ; CHECK-NEXT: store <4 x i32> [[TMP164]], ptr addrspace(1) [[OUT:%.*]], align 16
1843 ; CHECK-NEXT: ret void
1845 ; GFX6-LABEL: sdiv_v4i32:
1847 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
1848 ; GFX6-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x9
1849 ; GFX6-NEXT: s_mov_b32 s19, 0xf000
1850 ; GFX6-NEXT: s_mov_b32 s18, -1
1851 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1852 ; GFX6-NEXT: s_abs_i32 s0, s12
1853 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0
1854 ; GFX6-NEXT: s_sub_i32 s1, 0, s0
1855 ; GFX6-NEXT: s_xor_b32 s2, s8, s12
1856 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
1857 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1858 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
1859 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0
1860 ; GFX6-NEXT: s_abs_i32 s1, s8
1861 ; GFX6-NEXT: s_ashr_i32 s8, s2, 31
1862 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
1863 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1864 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0
1865 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
1866 ; GFX6-NEXT: s_mul_i32 s2, s2, s0
1867 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
1868 ; GFX6-NEXT: s_sub_i32 s2, s1, s0
1869 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0
1870 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
1871 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1
1872 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1873 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0
1874 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
1875 ; GFX6-NEXT: s_abs_i32 s2, s13
1876 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2
1877 ; GFX6-NEXT: s_sub_i32 s3, 0, s2
1878 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1879 ; GFX6-NEXT: s_xor_b32 s6, s9, s13
1880 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
1881 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
1882 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
1883 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
1884 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
1885 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0
1886 ; GFX6-NEXT: v_mul_lo_u32 v3, s3, v2
1887 ; GFX6-NEXT: s_abs_i32 s3, s9
1888 ; GFX6-NEXT: s_ashr_i32 s9, s6, 31
1889 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3
1890 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
1891 ; GFX6-NEXT: v_mul_hi_u32 v2, s3, v2
1892 ; GFX6-NEXT: v_readfirstlane_b32 s6, v2
1893 ; GFX6-NEXT: s_mul_i32 s6, s6, s2
1894 ; GFX6-NEXT: s_sub_i32 s3, s3, s6
1895 ; GFX6-NEXT: s_sub_i32 s6, s3, s2
1896 ; GFX6-NEXT: s_cmp_ge_u32 s3, s2
1897 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2
1898 ; GFX6-NEXT: s_cselect_b32 s3, s6, s3
1899 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1900 ; GFX6-NEXT: s_cmp_ge_u32 s3, s2
1901 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
1902 ; GFX6-NEXT: s_abs_i32 s6, s14
1903 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6
1904 ; GFX6-NEXT: s_sub_i32 s7, 0, s6
1905 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
1906 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2
1907 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4
1908 ; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
1909 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4
1910 ; GFX6-NEXT: v_mul_lo_u32 v5, s7, v4
1911 ; GFX6-NEXT: s_abs_i32 s7, s10
1912 ; GFX6-NEXT: s_xor_b32 s10, s10, s14
1913 ; GFX6-NEXT: s_ashr_i32 s10, s10, 31
1914 ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5
1915 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
1916 ; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4
1917 ; GFX6-NEXT: v_readfirstlane_b32 s12, v4
1918 ; GFX6-NEXT: s_mul_i32 s12, s12, s6
1919 ; GFX6-NEXT: s_sub_i32 s7, s7, s12
1920 ; GFX6-NEXT: s_sub_i32 s12, s7, s6
1921 ; GFX6-NEXT: s_cmp_ge_u32 s7, s6
1922 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4
1923 ; GFX6-NEXT: s_cselect_b32 s7, s12, s7
1924 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1925 ; GFX6-NEXT: s_cmp_ge_u32 s7, s6
1926 ; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
1927 ; GFX6-NEXT: s_abs_i32 s12, s15
1928 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12
1929 ; GFX6-NEXT: s_sub_i32 s0, 0, s12
1930 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
1931 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4
1932 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v6
1933 ; GFX6-NEXT: s_abs_i32 s1, s11
1934 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0
1935 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
1936 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v1
1937 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[2:3]
1938 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[6:7]
1939 ; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1
1940 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v6
1941 ; GFX6-NEXT: s_xor_b32 s0, s11, s15
1942 ; GFX6-NEXT: v_xor_b32_e32 v3, s10, v3
1943 ; GFX6-NEXT: s_ashr_i32 s0, s0, 31
1944 ; GFX6-NEXT: v_mul_hi_u32 v2, v6, v2
1945 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1
1946 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2
1947 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2
1948 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v3
1949 ; GFX6-NEXT: v_readfirstlane_b32 s2, v4
1950 ; GFX6-NEXT: s_mul_i32 s2, s2, s12
1951 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
1952 ; GFX6-NEXT: s_sub_i32 s2, s1, s12
1953 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v4
1954 ; GFX6-NEXT: s_cmp_ge_u32 s1, s12
1955 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1956 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
1957 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1
1958 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3
1959 ; GFX6-NEXT: s_cmp_ge_u32 s1, s12
1960 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
1961 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
1962 ; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3
1963 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3
1964 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
1965 ; GFX6-NEXT: s_endpgm
1967 ; GFX9-LABEL: sdiv_v4i32:
1969 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
1970 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1971 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1972 ; GFX9-NEXT: s_abs_i32 s0, s12
1973 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0
1974 ; GFX9-NEXT: s_sub_i32 s3, 0, s0
1975 ; GFX9-NEXT: s_abs_i32 s2, s8
1976 ; GFX9-NEXT: s_xor_b32 s1, s8, s12
1977 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
1978 ; GFX9-NEXT: s_ashr_i32 s1, s1, 31
1979 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
1980 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
1981 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
1982 ; GFX9-NEXT: s_mul_i32 s3, s3, s6
1983 ; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3
1984 ; GFX9-NEXT: s_add_i32 s6, s6, s3
1985 ; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6
1986 ; GFX9-NEXT: s_mul_i32 s6, s3, s0
1987 ; GFX9-NEXT: s_sub_i32 s2, s2, s6
1988 ; GFX9-NEXT: s_add_i32 s7, s3, 1
1989 ; GFX9-NEXT: s_sub_i32 s6, s2, s0
1990 ; GFX9-NEXT: s_cmp_ge_u32 s2, s0
1991 ; GFX9-NEXT: s_cselect_b32 s3, s7, s3
1992 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2
1993 ; GFX9-NEXT: s_add_i32 s6, s3, 1
1994 ; GFX9-NEXT: s_cmp_ge_u32 s2, s0
1995 ; GFX9-NEXT: s_cselect_b32 s0, s6, s3
1996 ; GFX9-NEXT: s_abs_i32 s2, s13
1997 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
1998 ; GFX9-NEXT: s_xor_b32 s0, s0, s1
1999 ; GFX9-NEXT: s_sub_i32 s7, 0, s2
2000 ; GFX9-NEXT: s_sub_i32 s8, s0, s1
2001 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2002 ; GFX9-NEXT: s_abs_i32 s6, s9
2003 ; GFX9-NEXT: s_xor_b32 s3, s9, s13
2004 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31
2005 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2006 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2007 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2008 ; GFX9-NEXT: s_mul_i32 s7, s7, s0
2009 ; GFX9-NEXT: s_mul_hi_u32 s1, s0, s7
2010 ; GFX9-NEXT: s_add_i32 s0, s0, s1
2011 ; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0
2012 ; GFX9-NEXT: s_mul_i32 s1, s0, s2
2013 ; GFX9-NEXT: s_sub_i32 s1, s6, s1
2014 ; GFX9-NEXT: s_add_i32 s7, s0, 1
2015 ; GFX9-NEXT: s_sub_i32 s6, s1, s2
2016 ; GFX9-NEXT: s_cmp_ge_u32 s1, s2
2017 ; GFX9-NEXT: s_cselect_b32 s0, s7, s0
2018 ; GFX9-NEXT: s_cselect_b32 s1, s6, s1
2019 ; GFX9-NEXT: s_add_i32 s6, s0, 1
2020 ; GFX9-NEXT: s_cmp_ge_u32 s1, s2
2021 ; GFX9-NEXT: s_cselect_b32 s0, s6, s0
2022 ; GFX9-NEXT: s_abs_i32 s1, s14
2023 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
2024 ; GFX9-NEXT: s_xor_b32 s0, s0, s3
2025 ; GFX9-NEXT: s_sub_i32 s7, 0, s1
2026 ; GFX9-NEXT: s_sub_i32 s3, s0, s3
2027 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2028 ; GFX9-NEXT: s_abs_i32 s6, s10
2029 ; GFX9-NEXT: s_xor_b32 s2, s10, s14
2030 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31
2031 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2032 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2033 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2034 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2035 ; GFX9-NEXT: s_mul_i32 s7, s7, s0
2036 ; GFX9-NEXT: s_mul_hi_u32 s7, s0, s7
2037 ; GFX9-NEXT: s_add_i32 s0, s0, s7
2038 ; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0
2039 ; GFX9-NEXT: s_mul_i32 s7, s0, s1
2040 ; GFX9-NEXT: s_sub_i32 s6, s6, s7
2041 ; GFX9-NEXT: s_add_i32 s9, s0, 1
2042 ; GFX9-NEXT: s_sub_i32 s7, s6, s1
2043 ; GFX9-NEXT: s_cmp_ge_u32 s6, s1
2044 ; GFX9-NEXT: s_cselect_b32 s0, s9, s0
2045 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6
2046 ; GFX9-NEXT: s_add_i32 s7, s0, 1
2047 ; GFX9-NEXT: s_cmp_ge_u32 s6, s1
2048 ; GFX9-NEXT: s_cselect_b32 s6, s7, s0
2049 ; GFX9-NEXT: s_abs_i32 s7, s15
2050 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7
2051 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2052 ; GFX9-NEXT: s_xor_b32 s5, s6, s2
2053 ; GFX9-NEXT: s_sub_i32 s6, 0, s7
2054 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
2055 ; GFX9-NEXT: s_sub_i32 s2, s5, s2
2056 ; GFX9-NEXT: s_abs_i32 s4, s11
2057 ; GFX9-NEXT: s_xor_b32 s3, s11, s15
2058 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
2059 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
2060 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
2061 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31
2062 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2
2063 ; GFX9-NEXT: s_mul_i32 s6, s6, s5
2064 ; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
2065 ; GFX9-NEXT: s_add_i32 s5, s5, s6
2066 ; GFX9-NEXT: s_mul_hi_u32 s5, s4, s5
2067 ; GFX9-NEXT: s_mul_i32 s6, s5, s7
2068 ; GFX9-NEXT: s_sub_i32 s4, s4, s6
2069 ; GFX9-NEXT: s_add_i32 s8, s5, 1
2070 ; GFX9-NEXT: s_sub_i32 s6, s4, s7
2071 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7
2072 ; GFX9-NEXT: s_cselect_b32 s5, s8, s5
2073 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4
2074 ; GFX9-NEXT: s_add_i32 s6, s5, 1
2075 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7
2076 ; GFX9-NEXT: s_cselect_b32 s4, s6, s5
2077 ; GFX9-NEXT: s_xor_b32 s4, s4, s3
2078 ; GFX9-NEXT: s_sub_i32 s3, s4, s3
2079 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2080 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2081 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2082 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2083 ; GFX9-NEXT: s_endpgm
2084 %r = sdiv <4 x i32> %x, %y
2085 store <4 x i32> %r, ptr addrspace(1) %out
; srem_v4i32: signed remainder of two <4 x i32> kernel arguments, stored to %out.
;
; The CHECK lines (the opt -amdgpu-codegenprepare RUN line) expect the vector
; srem to be split into four scalar expansions, one per element index 0..3:
;   * each operand is made non-negative with an ashr-by-31 sign mask
;     (add the mask, then xor with it);
;   * a scaled reciprocal estimate of the divisor is built with uitofp,
;     llvm.amdgcn.rcp.f32, fmul and fptoui, then adjusted using the high half
;     of a 64-bit multiply;
;   * the quotient estimate is multiplied back and subtracted from the
;     dividend, followed by two compare-and-subtract correction steps;
;   * the result is xor'ed with, then reduced by, the dividend's sign mask and
;     inserted into the result vector.
; The GFX6 and GFX9 lines pin the llc output of the tahiti and gfx900 RUN lines.
;
; All assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
2089 define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) {
2090 ; CHECK-LABEL: @srem_v4i32(
2091 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2092 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2093 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2094 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2095 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
2096 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
2097 ; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
2098 ; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
2099 ; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
2100 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2101 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
2102 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
2103 ; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]]
2104 ; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
2105 ; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
2106 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
2107 ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
2108 ; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
2109 ; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
2110 ; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
2111 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
2112 ; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
2113 ; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
2114 ; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
2115 ; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
2116 ; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
2117 ; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
2118 ; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
2119 ; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
2120 ; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
2121 ; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
2122 ; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
2123 ; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
2124 ; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
2125 ; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
2126 ; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
2127 ; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
2128 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i64 0
2129 ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
2130 ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2131 ; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
2132 ; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
2133 ; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
2134 ; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
2135 ; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
2136 ; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
2137 ; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
2138 ; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
2139 ; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
2140 ; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
2141 ; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]]
2142 ; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
2143 ; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
2144 ; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
2145 ; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
2146 ; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
2147 ; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
2148 ; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
2149 ; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
2150 ; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
2151 ; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
2152 ; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
2153 ; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
2154 ; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
2155 ; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
2156 ; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
2157 ; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
2158 ; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
2159 ; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
2160 ; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
2161 ; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
2162 ; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
2163 ; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
2164 ; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
2165 ; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
2166 ; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
2167 ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
2168 ; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2169 ; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
2170 ; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
2171 ; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
2172 ; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
2173 ; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
2174 ; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
2175 ; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
2176 ; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
2177 ; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
2178 ; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
2179 ; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]]
2180 ; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
2181 ; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
2182 ; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
2183 ; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
2184 ; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
2185 ; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
2186 ; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
2187 ; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
2188 ; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
2189 ; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2190 ; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2191 ; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2192 ; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2193 ; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2194 ; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
2195 ; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
2196 ; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
2197 ; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
2198 ; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
2199 ; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
2200 ; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
2201 ; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
2202 ; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
2203 ; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
2204 ; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
2205 ; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
2206 ; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2207 ; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
2208 ; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
2209 ; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
2210 ; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
2211 ; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
2212 ; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
2213 ; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
2214 ; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
2215 ; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
2216 ; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
2217 ; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]]
2218 ; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
2219 ; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
2220 ; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
2221 ; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
2222 ; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
2223 ; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
2224 ; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
2225 ; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
2226 ; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
2227 ; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
2228 ; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
2229 ; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
2230 ; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
2231 ; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
2232 ; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
2233 ; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
2234 ; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
2235 ; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
2236 ; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
2237 ; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
2238 ; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
2239 ; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
2240 ; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
2241 ; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
2242 ; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
2243 ; CHECK-NEXT: store <4 x i32> [[TMP152]], ptr addrspace(1) [[OUT:%.*]], align 16
2244 ; CHECK-NEXT: ret void
2246 ; GFX6-LABEL: srem_v4i32:
2248 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
2249 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2250 ; GFX6-NEXT: s_abs_i32 s0, s12
2251 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0
2252 ; GFX6-NEXT: s_sub_i32 s1, 0, s0
2253 ; GFX6-NEXT: s_ashr_i32 s2, s8, 31
2254 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
2255 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2256 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
2257 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0
2258 ; GFX6-NEXT: s_abs_i32 s1, s8
2259 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
2260 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2261 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0
2262 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0
2263 ; GFX6-NEXT: s_mul_i32 s3, s3, s0
2264 ; GFX6-NEXT: s_sub_i32 s1, s1, s3
2265 ; GFX6-NEXT: s_sub_i32 s3, s1, s0
2266 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0
2267 ; GFX6-NEXT: s_cselect_b32 s1, s3, s1
2268 ; GFX6-NEXT: s_sub_i32 s3, s1, s0
2269 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0
2270 ; GFX6-NEXT: s_cselect_b32 s0, s3, s1
2271 ; GFX6-NEXT: s_abs_i32 s1, s13
2272 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1
2273 ; GFX6-NEXT: s_sub_i32 s3, 0, s1
2274 ; GFX6-NEXT: s_xor_b32 s0, s0, s2
2275 ; GFX6-NEXT: s_sub_i32 s7, s0, s2
2276 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
2277 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31
2278 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2279 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
2280 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
2281 ; GFX6-NEXT: s_abs_i32 s3, s9
2282 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
2283 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2284 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
2285 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2286 ; GFX6-NEXT: s_mul_i32 s0, s0, s1
2287 ; GFX6-NEXT: s_sub_i32 s0, s3, s0
2288 ; GFX6-NEXT: s_sub_i32 s2, s0, s1
2289 ; GFX6-NEXT: s_cmp_ge_u32 s0, s1
2290 ; GFX6-NEXT: s_cselect_b32 s0, s2, s0
2291 ; GFX6-NEXT: s_sub_i32 s2, s0, s1
2292 ; GFX6-NEXT: s_cmp_ge_u32 s0, s1
2293 ; GFX6-NEXT: s_cselect_b32 s0, s2, s0
2294 ; GFX6-NEXT: s_abs_i32 s1, s14
2295 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1
2296 ; GFX6-NEXT: s_sub_i32 s2, 0, s1
2297 ; GFX6-NEXT: s_xor_b32 s0, s0, s6
2298 ; GFX6-NEXT: s_sub_i32 s6, s0, s6
2299 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
2300 ; GFX6-NEXT: s_ashr_i32 s8, s10, 31
2301 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
2302 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2303 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
2304 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0
2305 ; GFX6-NEXT: s_abs_i32 s2, s10
2306 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
2307 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2308 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
2309 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
2310 ; GFX6-NEXT: s_mul_i32 s0, s0, s1
2311 ; GFX6-NEXT: s_sub_i32 s0, s2, s0
2312 ; GFX6-NEXT: s_sub_i32 s2, s0, s1
2313 ; GFX6-NEXT: s_cmp_ge_u32 s0, s1
2314 ; GFX6-NEXT: s_cselect_b32 s0, s2, s0
2315 ; GFX6-NEXT: s_sub_i32 s2, s0, s1
2316 ; GFX6-NEXT: s_cmp_ge_u32 s0, s1
2317 ; GFX6-NEXT: s_cselect_b32 s9, s2, s0
2318 ; GFX6-NEXT: s_abs_i32 s10, s15
2319 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10
2320 ; GFX6-NEXT: s_sub_i32 s0, 0, s10
2321 ; GFX6-NEXT: s_mov_b32 s2, -1
2322 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
2323 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2324 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0
2325 ; GFX6-NEXT: v_mov_b32_e32 v0, s7
2326 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v2
2327 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2328 ; GFX6-NEXT: s_abs_i32 s4, s11
2329 ; GFX6-NEXT: s_ashr_i32 s5, s11, 31
2330 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1
2331 ; GFX6-NEXT: v_mov_b32_e32 v1, s6
2332 ; GFX6-NEXT: s_xor_b32 s6, s9, s8
2333 ; GFX6-NEXT: s_sub_i32 s6, s6, s8
2334 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
2335 ; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2
2336 ; GFX6-NEXT: v_readfirstlane_b32 s7, v2
2337 ; GFX6-NEXT: s_mul_i32 s7, s7, s10
2338 ; GFX6-NEXT: s_sub_i32 s4, s4, s7
2339 ; GFX6-NEXT: s_sub_i32 s7, s4, s10
2340 ; GFX6-NEXT: s_cmp_ge_u32 s4, s10
2341 ; GFX6-NEXT: s_cselect_b32 s4, s7, s4
2342 ; GFX6-NEXT: s_sub_i32 s7, s4, s10
2343 ; GFX6-NEXT: s_cmp_ge_u32 s4, s10
2344 ; GFX6-NEXT: s_cselect_b32 s4, s7, s4
2345 ; GFX6-NEXT: s_xor_b32 s4, s4, s5
2346 ; GFX6-NEXT: s_sub_i32 s4, s4, s5
2347 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
2348 ; GFX6-NEXT: v_mov_b32_e32 v3, s4
2349 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2350 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2351 ; GFX6-NEXT: s_endpgm
2353 ; GFX9-LABEL: srem_v4i32:
2355 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2356 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2357 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2358 ; GFX9-NEXT: s_abs_i32 s0, s12
2359 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0
2360 ; GFX9-NEXT: s_sub_i32 s3, 0, s0
2361 ; GFX9-NEXT: s_abs_i32 s2, s8
2362 ; GFX9-NEXT: s_ashr_i32 s1, s8, 31
2363 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2364 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2365 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2366 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
2367 ; GFX9-NEXT: s_mul_i32 s3, s3, s6
2368 ; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3
2369 ; GFX9-NEXT: s_add_i32 s6, s6, s3
2370 ; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6
2371 ; GFX9-NEXT: s_mul_i32 s3, s3, s0
2372 ; GFX9-NEXT: s_sub_i32 s2, s2, s3
2373 ; GFX9-NEXT: s_sub_i32 s3, s2, s0
2374 ; GFX9-NEXT: s_cmp_ge_u32 s2, s0
2375 ; GFX9-NEXT: s_cselect_b32 s2, s3, s2
2376 ; GFX9-NEXT: s_sub_i32 s3, s2, s0
2377 ; GFX9-NEXT: s_cmp_ge_u32 s2, s0
2378 ; GFX9-NEXT: s_cselect_b32 s0, s3, s2
2379 ; GFX9-NEXT: s_abs_i32 s2, s13
2380 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
2381 ; GFX9-NEXT: s_xor_b32 s0, s0, s1
2382 ; GFX9-NEXT: s_sub_i32 s7, 0, s2
2383 ; GFX9-NEXT: s_sub_i32 s8, s0, s1
2384 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2385 ; GFX9-NEXT: s_abs_i32 s6, s9
2386 ; GFX9-NEXT: s_ashr_i32 s3, s9, 31
2387 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2388 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2389 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2390 ; GFX9-NEXT: s_mul_i32 s7, s7, s0
2391 ; GFX9-NEXT: s_mul_hi_u32 s1, s0, s7
2392 ; GFX9-NEXT: s_add_i32 s0, s0, s1
2393 ; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0
2394 ; GFX9-NEXT: s_mul_i32 s0, s0, s2
2395 ; GFX9-NEXT: s_sub_i32 s0, s6, s0
2396 ; GFX9-NEXT: s_sub_i32 s1, s0, s2
2397 ; GFX9-NEXT: s_cmp_ge_u32 s0, s2
2398 ; GFX9-NEXT: s_cselect_b32 s0, s1, s0
2399 ; GFX9-NEXT: s_sub_i32 s1, s0, s2
2400 ; GFX9-NEXT: s_cmp_ge_u32 s0, s2
2401 ; GFX9-NEXT: s_cselect_b32 s0, s1, s0
2402 ; GFX9-NEXT: s_abs_i32 s1, s14
2403 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
2404 ; GFX9-NEXT: s_xor_b32 s0, s0, s3
2405 ; GFX9-NEXT: s_sub_i32 s7, 0, s1
2406 ; GFX9-NEXT: s_sub_i32 s3, s0, s3
2407 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
2408 ; GFX9-NEXT: s_abs_i32 s6, s10
2409 ; GFX9-NEXT: s_ashr_i32 s2, s10, 31
2410 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
2411 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
2412 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2413 ; GFX9-NEXT: s_mul_i32 s7, s7, s0
2414 ; GFX9-NEXT: s_mul_hi_u32 s7, s0, s7
2415 ; GFX9-NEXT: s_add_i32 s0, s0, s7
2416 ; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0
2417 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
2418 ; GFX9-NEXT: s_sub_i32 s0, s6, s0
2419 ; GFX9-NEXT: s_sub_i32 s6, s0, s1
2420 ; GFX9-NEXT: s_cmp_ge_u32 s0, s1
2421 ; GFX9-NEXT: s_cselect_b32 s0, s6, s0
2422 ; GFX9-NEXT: s_sub_i32 s6, s0, s1
2423 ; GFX9-NEXT: s_cmp_ge_u32 s0, s1
2424 ; GFX9-NEXT: s_cselect_b32 s6, s6, s0
2425 ; GFX9-NEXT: s_abs_i32 s7, s15
2426 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
2427 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2428 ; GFX9-NEXT: s_xor_b32 s5, s6, s2
2429 ; GFX9-NEXT: s_sub_i32 s6, 0, s7
2430 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1
2431 ; GFX9-NEXT: s_sub_i32 s2, s5, s2
2432 ; GFX9-NEXT: s_abs_i32 s4, s11
2433 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2434 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
2435 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
2436 ; GFX9-NEXT: s_ashr_i32 s3, s11, 31
2437 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
2438 ; GFX9-NEXT: v_readfirstlane_b32 s5, v2
2439 ; GFX9-NEXT: s_mul_i32 s6, s6, s5
2440 ; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
2441 ; GFX9-NEXT: s_add_i32 s5, s5, s6
2442 ; GFX9-NEXT: s_mul_hi_u32 s5, s4, s5
2443 ; GFX9-NEXT: s_mul_i32 s5, s5, s7
2444 ; GFX9-NEXT: s_sub_i32 s4, s4, s5
2445 ; GFX9-NEXT: s_sub_i32 s5, s4, s7
2446 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7
2447 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4
2448 ; GFX9-NEXT: s_sub_i32 s5, s4, s7
2449 ; GFX9-NEXT: s_cmp_ge_u32 s4, s7
2450 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4
2451 ; GFX9-NEXT: s_xor_b32 s4, s4, s3
2452 ; GFX9-NEXT: s_sub_i32 s3, s4, s3
2453 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2454 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2455 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2456 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2457 ; GFX9-NEXT: s_endpgm
2458 %r = srem <4 x i32> %x, %y
2459 store <4 x i32> %r, ptr addrspace(1) %out
2463 define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
2464 ; CHECK-LABEL: @udiv_v4i16(
2465 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2466 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2467 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2468 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2469 ; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2470 ; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2471 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2472 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2473 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2474 ; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]]
2475 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2476 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2477 ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2478 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2479 ; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2480 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2481 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2482 ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2483 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2484 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> poison, i16 [[TMP19]], i64 0
2485 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
2486 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2487 ; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2488 ; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2489 ; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2490 ; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2491 ; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2492 ; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2493 ; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2494 ; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]]
2495 ; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2496 ; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2497 ; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2498 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2499 ; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2500 ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2501 ; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2502 ; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2503 ; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2504 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2505 ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
2506 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2507 ; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2508 ; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2509 ; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2510 ; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2511 ; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2512 ; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2513 ; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2514 ; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]]
2515 ; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2516 ; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2517 ; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2518 ; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2519 ; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2520 ; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2521 ; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2522 ; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2523 ; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2524 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2525 ; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
2526 ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2527 ; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
2528 ; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
2529 ; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
2530 ; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
2531 ; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
2532 ; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
2533 ; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
2534 ; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]]
2535 ; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
2536 ; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
2537 ; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
2538 ; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2539 ; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
2540 ; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
2541 ; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
2542 ; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535
2543 ; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
2544 ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
2545 ; CHECK-NEXT: store <4 x i16> [[TMP80]], ptr addrspace(1) [[OUT:%.*]], align 8
2546 ; CHECK-NEXT: ret void
2548 ; GFX6-LABEL: udiv_v4i16:
2550 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
2551 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2552 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
2553 ; GFX6-NEXT: s_mov_b32 s2, -1
2554 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2555 ; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
2556 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
2557 ; GFX6-NEXT: s_lshr_b32 s5, s10, 16
2558 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
2559 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
2560 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
2561 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
2562 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16
2563 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
2564 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
2565 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
2566 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
2567 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
2568 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
2569 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
2570 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
2571 ; GFX6-NEXT: s_and_b32 s4, s11, 0xffff
2572 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
2573 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4
2574 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
2575 ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff
2576 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
2577 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
2578 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
2579 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
2580 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
2581 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc
2582 ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6
2583 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
2584 ; GFX6-NEXT: s_lshr_b32 s4, s11, 16
2585 ; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5
2586 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
2587 ; GFX6-NEXT: s_lshr_b32 s4, s9, 16
2588 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4
2589 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
2590 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5
2591 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
2592 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2593 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2594 ; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7
2595 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
2596 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3
2597 ; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6
2598 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
2599 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2600 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
2601 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2602 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2603 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
2604 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
2605 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2606 ; GFX6-NEXT: s_endpgm
2608 ; GFX9-LABEL: udiv_v4i16:
2610 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
2611 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
2612 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2613 ; GFX9-NEXT: s_and_b32 s7, s2, 0xffff
2614 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
2615 ; GFX9-NEXT: s_and_b32 s6, s0, 0xffff
2616 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
2617 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
2618 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
2619 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
2620 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
2621 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
2622 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
2623 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
2624 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
2625 ; GFX9-NEXT: s_and_b32 s0, s3, 0xffff
2626 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
2627 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
2628 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
2629 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
2630 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
2631 ; GFX9-NEXT: s_and_b32 s0, s1, 0xffff
2632 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2633 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5
2634 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
2635 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
2636 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
2637 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
2638 ; GFX9-NEXT: s_lshr_b32 s0, s3, 16
2639 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
2640 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
2641 ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5
2642 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
2643 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
2644 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16
2645 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0
2646 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
2647 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2648 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
2649 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
2650 ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8
2651 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
2652 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
2653 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2654 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2655 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7
2656 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
2657 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2658 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
2659 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2660 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
2661 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
2662 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2663 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
2664 ; GFX9-NEXT: s_endpgm
2665 %r = udiv <4 x i16> %x, %y
2666 store <4 x i16> %r, ptr addrspace(1) %out
; Test: unsigned remainder of two <4 x i16> kernel arguments, result stored to %out.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
; Three sets of expectations follow, one per RUN line:
;   * prefix CHECK - IR after opt -amdgpu-codegenprepare (tahiti)
;   * prefix GFX6  - llc ISA for tahiti
;   * prefix GFX9  - llc ISA for gfx900
; In the expected IR the vector op is scalarized. Each lane is zero-extended to
; i32, converted to float, and a quotient estimate trunc(x * rcp(y)) is formed.
; The estimate is bumped by 1 when |fmad(-q, y, x)| >= |y|. The remainder is
; then x - q * y, masked to the low 16 bits and inserted back into the vector.
2670 define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
2671 ; CHECK-LABEL: @urem_v4i16(
; Lane 0.
2672 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2673 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2674 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2675 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2676 ; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2677 ; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2678 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2679 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2680 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2681 ; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]]
2682 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2683 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2684 ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2685 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2686 ; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2687 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2688 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2689 ; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2690 ; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2691 ; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2692 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2693 ; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> poison, i16 [[TMP21]], i64 0
; Lane 1 (same sequence as lane 0).
2694 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
2695 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2696 ; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2697 ; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2698 ; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2699 ; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2700 ; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2701 ; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2702 ; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2703 ; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]]
2704 ; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2705 ; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2706 ; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2707 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2708 ; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2709 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2710 ; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2711 ; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2712 ; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2713 ; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2714 ; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2715 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
; Lane 2 (same sequence as lane 0).
2716 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
2717 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2718 ; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2719 ; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2720 ; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2721 ; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2722 ; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2723 ; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2724 ; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2725 ; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]]
2726 ; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2727 ; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2728 ; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2729 ; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2730 ; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2731 ; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2732 ; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2733 ; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2734 ; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2735 ; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2736 ; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2737 ; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
; Lane 3 (same sequence as lane 0), followed by the store of the rebuilt vector.
2738 ; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
2739 ; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2740 ; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
2741 ; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
2742 ; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
2743 ; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
2744 ; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
2745 ; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
2746 ; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
2747 ; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]]
2748 ; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
2749 ; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
2750 ; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
2751 ; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
2752 ; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
2753 ; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
2754 ; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
2755 ; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
2756 ; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
2757 ; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535
2758 ; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
2759 ; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
2760 ; CHECK-NEXT: store <4 x i16> [[TMP88]], ptr addrspace(1) [[OUT:%.*]], align 8
2761 ; CHECK-NEXT: ret void
; Expected llc output for tahiti. The 16-bit lanes are unpacked from the loaded
; dwords with s_and_b32 0xffff / s_lshr_b32 16 and repacked before the store.
2763 ; GFX6-LABEL: urem_v4i16:
2765 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
2766 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
2767 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
2768 ; GFX6-NEXT: s_mov_b32 s2, -1
2769 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
2770 ; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
2771 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
2772 ; GFX6-NEXT: s_lshr_b32 s5, s10, 16
2773 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
2774 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
2775 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
2776 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
2777 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16
2778 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
2779 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
2780 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
2781 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
2782 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
2783 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
2784 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
2785 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
2786 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
2787 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
2788 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
2789 ; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4
2790 ; GFX6-NEXT: s_and_b32 s6, s11, 0xffff
2791 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2
2792 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6
2793 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
2794 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5
2795 ; GFX6-NEXT: s_and_b32 s5, s9, 0xffff
2796 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5
2797 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
2798 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1
2799 ; GFX6-NEXT: s_lshr_b32 s4, s11, 16
2800 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4
2801 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
2802 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
2803 ; GFX6-NEXT: s_lshr_b32 s5, s9, 16
2804 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5
2805 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4
2806 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
2807 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
2808 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3
2809 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
2810 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
2811 ; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7
2812 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
2813 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
2814 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2815 ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6
2816 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
2817 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
2818 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11
2819 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4
2820 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
2821 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1
2822 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2
2823 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2824 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
2825 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
2826 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5
2827 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
2828 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2829 ; GFX6-NEXT: s_endpgm
; Expected llc output for gfx900. Lanes are repacked with v_lshl_or_b32 and the
; result is written with global_store_dwordx2.
2831 ; GFX9-LABEL: urem_v4i16:
2833 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
2834 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2835 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
2836 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2837 ; GFX9-NEXT: s_and_b32 s9, s2, 0xffff
2838 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
2839 ; GFX9-NEXT: s_and_b32 s8, s0, 0xffff
2840 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
2841 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
2842 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
2843 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
2844 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
2845 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
2846 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
2847 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
2848 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
2849 ; GFX9-NEXT: s_and_b32 s4, s3, 0xffff
2850 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
2851 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
2852 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4
2853 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
2854 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
2855 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5
2856 ; GFX9-NEXT: s_and_b32 s5, s1, 0xffff
2857 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2858 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
2859 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
2860 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5
2861 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
2862 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
2863 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
2864 ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7
2865 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2
2866 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
2867 ; GFX9-NEXT: s_lshr_b32 s2, s3, 16
2868 ; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5
2869 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
2870 ; GFX9-NEXT: s_lshr_b32 s1, s1, 16
2871 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1
2872 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
2873 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
2874 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
2875 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
2876 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2877 ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8
2878 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
2879 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
2880 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7
2881 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
2882 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
2883 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2884 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
2885 ; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
2886 ; GFX9-NEXT: v_sub_u32_e32 v4, s0, v1
2887 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v2
2888 ; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3
2889 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
2890 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2891 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
2892 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
2893 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
2894 ; GFX9-NEXT: s_endpgm
; Input IR under test: a single vector urem followed by a store to %out.
2895 %r = urem <4 x i16> %x, %y
2896 store <4 x i16> %r, ptr addrspace(1) %out
; Test: signed division of two <4 x i16> kernel arguments, result stored to %out.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
; Three sets of expectations follow, one per RUN line:
;   * prefix CHECK - IR after opt -amdgpu-codegenprepare (tahiti)
;   * prefix GFX6  - llc ISA for tahiti
;   * prefix GFX9  - llc ISA for gfx900
; In the expected IR the vector op is scalarized. Each lane is sign-extended to
; i32 and a correction step ((x ^ y) ashr 30) | 1 is computed, which is +1 when
; the operand signs match and -1 otherwise. The quotient estimate is
; fptosi(trunc(x * rcp(y))); the correction step is added to it when
; |fmad(-q, y, x)| >= |y|. The result is sign-extended from its low 16 bits
; (shl 16 / ashr 16), truncated to i16 and inserted back into the vector.
2900 define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
2901 ; CHECK-LABEL: @sdiv_v4i16(
; Lane 0.
2902 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2903 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2904 ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2905 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2906 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2907 ; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2908 ; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1
2909 ; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2910 ; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2911 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2912 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2913 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2914 ; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]]
2915 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2916 ; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2917 ; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2918 ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2919 ; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2920 ; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2921 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2922 ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2923 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2924 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2925 ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> poison, i16 [[TMP23]], i64 0
; Lane 1 (same sequence as lane 0).
2926 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2927 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2928 ; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2929 ; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2930 ; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2931 ; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2932 ; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1
2933 ; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2934 ; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2935 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2936 ; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2937 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2938 ; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]]
2939 ; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2940 ; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2941 ; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2942 ; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2943 ; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2944 ; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2945 ; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2946 ; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2947 ; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2948 ; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2949 ; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
; Lane 2 (same sequence as lane 0).
2950 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2951 ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2952 ; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2953 ; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2954 ; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2955 ; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2956 ; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1
2957 ; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2958 ; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2959 ; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2960 ; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2961 ; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2962 ; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]]
2963 ; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2964 ; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2965 ; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2966 ; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2967 ; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2968 ; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2969 ; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2970 ; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2971 ; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2972 ; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2973 ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
; Lane 3 (same sequence as lane 0), followed by the store of the rebuilt vector.
2974 ; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2975 ; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2976 ; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2977 ; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2978 ; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2979 ; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2980 ; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1
2981 ; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2982 ; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2983 ; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2984 ; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2985 ; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2986 ; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]]
2987 ; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2988 ; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2989 ; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2990 ; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2991 ; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2992 ; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2993 ; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2994 ; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2995 ; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2996 ; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2997 ; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2998 ; CHECK-NEXT: store <4 x i16> [[TMP96]], ptr addrspace(1) [[OUT:%.*]], align 8
2999 ; CHECK-NEXT: ret void
; Expected llc output for tahiti. The low lane of each dword is unpacked with
; s_sext_i32_i16 and the high lane with s_ashr_i32 16; the correction step is
; chosen with s_cselect_b32 after the float compare.
3001 ; GFX6-LABEL: sdiv_v4i16:
3003 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
3004 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3005 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3006 ; GFX6-NEXT: s_mov_b32 s2, -1
3007 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3008 ; GFX6-NEXT: s_sext_i32_i16 s4, s10
3009 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
3010 ; GFX6-NEXT: s_sext_i32_i16 s5, s8
3011 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
3012 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
3013 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
3014 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3015 ; GFX6-NEXT: s_or_b32 s6, s4, 1
3016 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
3017 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
3018 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
3019 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3020 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3021 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
3022 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
3023 ; GFX6-NEXT: s_ashr_i32 s5, s10, 16
3024 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
3025 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2
3026 ; GFX6-NEXT: s_ashr_i32 s4, s8, 16
3027 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
3028 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
3029 ; GFX6-NEXT: s_xor_b32 s4, s4, s5
3030 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3031 ; GFX6-NEXT: s_or_b32 s6, s4, 1
3032 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
3033 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
3034 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
3035 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3036 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3037 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
3038 ; GFX6-NEXT: s_sext_i32_i16 s5, s11
3039 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
3040 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
3041 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3
3042 ; GFX6-NEXT: s_sext_i32_i16 s4, s9
3043 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
3044 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0
3045 ; GFX6-NEXT: s_xor_b32 s4, s4, s5
3046 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3047 ; GFX6-NEXT: s_or_b32 s6, s4, 1
3048 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4
3049 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
3050 ; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1
3051 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3052 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3053 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
3054 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
3055 ; GFX6-NEXT: s_ashr_i32 s5, s11, 16
3056 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
3057 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4
3058 ; GFX6-NEXT: s_ashr_i32 s4, s9, 16
3059 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4
3060 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0
3061 ; GFX6-NEXT: s_xor_b32 s4, s4, s5
3062 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3063 ; GFX6-NEXT: s_or_b32 s6, s4, 1
3064 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5
3065 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5
3066 ; GFX6-NEXT: v_mad_f32 v4, -v5, v0, v4
3067 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
3068 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v0|
3069 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3070 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
3071 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v5
3072 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3073 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3074 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v0
3075 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3
3076 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
3077 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
3078 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3079 ; GFX6-NEXT: s_endpgm
; Expected llc output for gfx900. Lanes are repacked with v_lshl_or_b32 and the
; result is written with global_store_dwordx2.
3081 ; GFX9-LABEL: sdiv_v4i16:
3083 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
3084 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
3085 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3086 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3087 ; GFX9-NEXT: s_sext_i32_i16 s4, s2
3088 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
3089 ; GFX9-NEXT: s_sext_i32_i16 s5, s0
3090 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5
3091 ; GFX9-NEXT: s_xor_b32 s4, s5, s4
3092 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
3093 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30
3094 ; GFX9-NEXT: s_or_b32 s8, s4, 1
3095 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
3096 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
3097 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1
3098 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3099 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
3100 ; GFX9-NEXT: s_cselect_b32 s4, s8, 0
3101 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16
3102 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
3103 ; GFX9-NEXT: s_ashr_i32 s0, s0, 16
3104 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0
3105 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
3106 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
3107 ; GFX9-NEXT: s_xor_b32 s0, s0, s2
3108 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
3109 ; GFX9-NEXT: s_sext_i32_i16 s2, s3
3110 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4
3111 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
3112 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1
3113 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
3114 ; GFX9-NEXT: v_add_u32_e32 v3, s4, v3
3115 ; GFX9-NEXT: s_or_b32 s0, s0, 1
3116 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3117 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
3118 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
3119 ; GFX9-NEXT: s_cselect_b32 s0, s0, 0
3120 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v4
3121 ; GFX9-NEXT: s_sext_i32_i16 s0, s1
3122 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0
3123 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0
3124 ; GFX9-NEXT: s_xor_b32 s0, s0, s2
3125 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
3126 ; GFX9-NEXT: s_or_b32 s0, s0, 1
3127 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5
3128 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
3129 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1
3130 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3131 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
3132 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5
3133 ; GFX9-NEXT: s_cselect_b32 s0, s0, 0
3134 ; GFX9-NEXT: s_ashr_i32 s2, s3, 16
3135 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
3136 ; GFX9-NEXT: v_add_u32_e32 v1, s0, v5
3137 ; GFX9-NEXT: s_ashr_i32 s0, s1, 16
3138 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0
3139 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0
3140 ; GFX9-NEXT: s_xor_b32 s0, s0, s2
3141 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
3142 ; GFX9-NEXT: s_or_b32 s2, s0, 1
3143 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6
3144 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
3145 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5
3146 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6
3147 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
3148 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
3149 ; GFX9-NEXT: s_cselect_b32 s0, s2, 0
3150 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v6
3151 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
3152 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
3153 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3
3154 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
3155 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
3156 ; GFX9-NEXT: s_endpgm
; Input IR under test: a single vector sdiv followed by a store to %out.
3157 %r = sdiv <4 x i16> %x, %y
3158 store <4 x i16> %r, ptr addrspace(1) %out
3162 define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) {
3163 ; CHECK-LABEL: @srem_v4i16(
3164 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3165 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3166 ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3167 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3168 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3169 ; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3170 ; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1
3171 ; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3172 ; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3173 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3174 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3175 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3176 ; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]]
3177 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3178 ; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3179 ; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3180 ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3181 ; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3182 ; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3183 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3184 ; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3185 ; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3186 ; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3187 ; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3188 ; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3189 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
3190 ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
3191 ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3192 ; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3193 ; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3194 ; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3195 ; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3196 ; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1
3197 ; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3198 ; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3199 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3200 ; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3201 ; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3202 ; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]]
3203 ; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3204 ; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3205 ; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3206 ; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3207 ; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3208 ; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3209 ; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3210 ; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3211 ; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3212 ; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3213 ; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3214 ; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3215 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3216 ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
3217 ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3218 ; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3219 ; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3220 ; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3221 ; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3222 ; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1
3223 ; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3224 ; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3225 ; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3226 ; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3227 ; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3228 ; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]]
3229 ; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3230 ; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3231 ; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3232 ; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3233 ; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3234 ; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3235 ; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3236 ; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3237 ; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3238 ; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3239 ; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3240 ; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3241 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3242 ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
3243 ; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3244 ; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
3245 ; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
3246 ; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
3247 ; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
3248 ; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1
3249 ; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
3250 ; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
3251 ; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
3252 ; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
3253 ; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
3254 ; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]]
3255 ; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
3256 ; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
3257 ; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
3258 ; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
3259 ; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
3260 ; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
3261 ; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
3262 ; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
3263 ; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
3264 ; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16
3265 ; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
3266 ; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
3267 ; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
3268 ; CHECK-NEXT: store <4 x i16> [[TMP104]], ptr addrspace(1) [[OUT:%.*]], align 8
3269 ; CHECK-NEXT: ret void
3271 ; GFX6-LABEL: srem_v4i16:
3273 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
3274 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3275 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3276 ; GFX6-NEXT: s_mov_b32 s2, -1
3277 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3278 ; GFX6-NEXT: s_sext_i32_i16 s4, s10
3279 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
3280 ; GFX6-NEXT: s_sext_i32_i16 s5, s8
3281 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
3282 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
3283 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
3284 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3285 ; GFX6-NEXT: s_or_b32 s6, s4, 1
3286 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
3287 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
3288 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
3289 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
3290 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3291 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3292 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
3293 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2
3294 ; GFX6-NEXT: s_ashr_i32 s4, s10, 16
3295 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
3296 ; GFX6-NEXT: s_ashr_i32 s5, s8, 16
3297 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5
3298 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
3299 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1
3300 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
3301 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3302 ; GFX6-NEXT: s_lshr_b32 s6, s8, 16
3303 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
3304 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
3305 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2
3306 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
3307 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
3308 ; GFX6-NEXT: s_lshr_b32 s7, s10, 16
3309 ; GFX6-NEXT: s_or_b32 s8, s4, 1
3310 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
3311 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3312 ; GFX6-NEXT: s_cselect_b32 s4, s8, 0
3313 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3
3314 ; GFX6-NEXT: s_sext_i32_i16 s4, s11
3315 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
3316 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
3317 ; GFX6-NEXT: s_sext_i32_i16 s5, s9
3318 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
3319 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1
3320 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
3321 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
3322 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3323 ; GFX6-NEXT: s_or_b32 s6, s4, 1
3324 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3325 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4
3326 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
3327 ; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1
3328 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
3329 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v2|
3330 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3331 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
3332 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4
3333 ; GFX6-NEXT: s_ashr_i32 s4, s11, 16
3334 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
3335 ; GFX6-NEXT: s_ashr_i32 s5, s9, 16
3336 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5
3337 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
3338 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
3339 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3340 ; GFX6-NEXT: s_lshr_b32 s6, s9, 16
3341 ; GFX6-NEXT: s_lshr_b32 s7, s11, 16
3342 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5
3343 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5
3344 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4
3345 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
3346 ; GFX6-NEXT: s_or_b32 s8, s4, 1
3347 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
3348 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3349 ; GFX6-NEXT: s_cselect_b32 s4, s8, 0
3350 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5
3351 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11
3352 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7
3353 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1
3354 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2
3355 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
3356 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
3357 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
3358 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
3359 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
3360 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3361 ; GFX6-NEXT: s_endpgm
3363 ; GFX9-LABEL: srem_v4i16:
3365 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
3366 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
3367 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3368 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3369 ; GFX9-NEXT: s_sext_i32_i16 s8, s2
3370 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8
3371 ; GFX9-NEXT: s_sext_i32_i16 s9, s0
3372 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9
3373 ; GFX9-NEXT: s_xor_b32 s4, s9, s8
3374 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
3375 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30
3376 ; GFX9-NEXT: s_or_b32 s10, s4, 1
3377 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
3378 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
3379 ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1
3380 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3381 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
3382 ; GFX9-NEXT: s_cselect_b32 s4, s10, 0
3383 ; GFX9-NEXT: s_ashr_i32 s10, s0, 16
3384 ; GFX9-NEXT: s_ashr_i32 s0, s2, 16
3385 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
3386 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0
3387 ; GFX9-NEXT: s_xor_b32 s2, s10, s0
3388 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
3389 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v3
3390 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10
3391 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
3392 ; GFX9-NEXT: s_or_b32 s2, s2, 1
3393 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
3394 ; GFX9-NEXT: s_sext_i32_i16 s8, s1
3395 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
3396 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
3397 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
3398 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
3399 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
3400 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
3401 ; GFX9-NEXT: s_cselect_b32 s2, s2, 0
3402 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v4
3403 ; GFX9-NEXT: s_sext_i32_i16 s2, s3
3404 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2
3405 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8
3406 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
3407 ; GFX9-NEXT: s_xor_b32 s0, s8, s2
3408 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3
3409 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
3410 ; GFX9-NEXT: s_or_b32 s0, s0, 1
3411 ; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0
3412 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
3413 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
3414 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4
3415 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
3416 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5
3417 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
3418 ; GFX9-NEXT: s_cselect_b32 s0, s0, 0
3419 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16
3420 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3
3421 ; GFX9-NEXT: v_add_u32_e32 v3, s0, v5
3422 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
3423 ; GFX9-NEXT: s_ashr_i32 s2, s1, 16
3424 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2
3425 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4
3426 ; GFX9-NEXT: s_xor_b32 s0, s2, s3
3427 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
3428 ; GFX9-NEXT: s_or_b32 s4, s0, 1
3429 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6
3430 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
3431 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5
3432 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6
3433 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
3434 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
3435 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0
3436 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v6
3437 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3
3438 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1
3439 ; GFX9-NEXT: v_sub_u32_e32 v1, s8, v3
3440 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
3441 ; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4
3442 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
3443 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5
3444 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3
3445 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
3446 ; GFX9-NEXT: s_endpgm
3447 %r = srem <4 x i16> %x, %y
3448 store <4 x i16> %r, ptr addrspace(1) %out
; Kernel taking two i3 operands and storing an i3 result %r to %out.
; NOTE(review): the instruction that defines %r is not visible in this excerpt;
; presumably `udiv i3 %x, %y` going by the function name - confirm.
;
; Lines with the CHECK prefix pin the IR expected from the opt RUN line
; (-amdgpu-codegenprepare): both operands are zero-extended to i32 and converted
; with uitofp, the quotient is estimated as trunc(x * rcp(y)), bumped by 1 when
; |fmad(-q, y, x)| >= |y|, then masked with 7 and truncated back to i3.
; Lines with the GFX6 and GFX9 prefixes pin the llc output for tahiti and gfx900.
3452 define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3453 ; CHECK-LABEL: @udiv_i3(
3454 ; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3455 ; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3456 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3457 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3458 ; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3459 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3460 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3461 ; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
3462 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3463 ; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3464 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3465 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3466 ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3467 ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3468 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3469 ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7
3470 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
3471 ; CHECK-NEXT: store i3 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1
3472 ; CHECK-NEXT: ret void
3474 ; GFX6-LABEL: udiv_i3:
3476 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
3477 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3478 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3479 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3480 ; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008
3481 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
3482 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0
3483 ; GFX6-NEXT: s_and_b32 s4, s6, 7
3484 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
3485 ; GFX6-NEXT: s_mov_b32 s2, -1
3486 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
3487 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
3488 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
3489 ; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2
3490 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
3491 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
3492 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
3493 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
3494 ; GFX6-NEXT: s_endpgm
3496 ; GFX9-LABEL: udiv_i3:
3498 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
3499 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3500 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3501 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3502 ; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008
3503 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
3504 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
3505 ; GFX9-NEXT: s_and_b32 s2, s2, 7
3506 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
3507 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
3508 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
3509 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1
3510 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3
3511 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
3512 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
3513 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
3514 ; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
3515 ; GFX9-NEXT: s_endpgm
3517 store i3 %r, ptr addrspace(1) %out
; Kernel taking two i3 operands and storing an i3 result %r to %out.
; NOTE(review): the instruction that defines %r is not visible in this excerpt;
; presumably `urem i3 %x, %y` going by the function name - confirm.
;
; Lines with the CHECK prefix pin the IR expected from the opt RUN line
; (-amdgpu-codegenprepare): a quotient is formed from the zero-extended operands
; as trunc(x * rcp(y)), bumped by 1 when |fmad(-q, y, x)| >= |y|; the result is
; then x - q * y on the i32 values, masked with 7 and truncated back to i3.
; Lines with the GFX6 and GFX9 prefixes pin the llc output for tahiti and gfx900.
3521 define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3522 ; CHECK-LABEL: @urem_i3(
3523 ; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3524 ; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3525 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3526 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3527 ; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3528 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3529 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3530 ; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
3531 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3532 ; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3533 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3534 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3535 ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3536 ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3537 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3538 ; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
3539 ; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
3540 ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7
3541 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
3542 ; CHECK-NEXT: store i3 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1
3543 ; CHECK-NEXT: ret void
3545 ; GFX6-LABEL: urem_i3:
3547 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
3548 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3549 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3550 ; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008
3551 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
3552 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0
3553 ; GFX6-NEXT: s_and_b32 s3, s6, 7
3554 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3
3555 ; GFX6-NEXT: s_lshr_b32 s2, s6, 8
3556 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
3557 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
3558 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
3559 ; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2
3560 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
3561 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3562 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
3563 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
3564 ; GFX6-NEXT: s_mov_b32 s2, -1
3565 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
3566 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
3567 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
3568 ; GFX6-NEXT: s_endpgm
3570 ; GFX9-LABEL: urem_i3:
3572 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
3573 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3574 ; GFX9-NEXT: s_bfe_u32 s0, s2, 0x30008
3575 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
3576 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
3577 ; GFX9-NEXT: s_and_b32 s1, s2, 7
3578 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1
3579 ; GFX9-NEXT: s_lshr_b32 s0, s2, 8
3580 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
3581 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
3582 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
3583 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2
3584 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
3585 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
3586 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3587 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
3588 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3589 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
3590 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
3591 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3592 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
3593 ; GFX9-NEXT: s_endpgm
3595 store i3 %r, ptr addrspace(1) %out
; Kernel taking two i3 operands and storing an i3 result %r to %out.
; NOTE(review): the instruction that defines %r is not visible in this excerpt;
; presumably `sdiv i3 %x, %y` going by the function name - confirm.
;
; Lines with the CHECK prefix pin the IR expected from the opt RUN line
; (-amdgpu-codegenprepare): both operands are sign-extended to i32 and converted
; with sitofp, the quotient is estimated as trunc(x * rcp(y)) and converted with
; fptosi; when |fmad(-q, y, x)| >= |y| the term ((x ^ y) ashr 30) | 1 is added,
; otherwise 0. The sum is sign-extended in place (shl 29 / ashr 29) and
; truncated back to i3.
; Lines with the GFX6 and GFX9 prefixes pin the llc output for tahiti and gfx900.
3599 define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3600 ; CHECK-LABEL: @sdiv_i3(
3601 ; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3602 ; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3603 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3604 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3605 ; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
3606 ; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3607 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3608 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3609 ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3610 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3611 ; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
3612 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3613 ; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3614 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3615 ; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3616 ; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3617 ; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3618 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3619 ; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29
3620 ; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
3621 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
3622 ; CHECK-NEXT: store i3 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1
3623 ; CHECK-NEXT: ret void
3625 ; GFX6-LABEL: sdiv_i3:
3627 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
3628 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3629 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3630 ; GFX6-NEXT: s_mov_b32 s2, -1
3631 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3632 ; GFX6-NEXT: s_bfe_i32 s4, s6, 0x30008
3633 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
3634 ; GFX6-NEXT: s_bfe_i32 s5, s6, 0x30000
3635 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
3636 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
3637 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
3638 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
3639 ; GFX6-NEXT: s_or_b32 s6, s4, 1
3640 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
3641 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
3642 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
3643 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
3644 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
3645 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
3646 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
3647 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2
3648 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
3649 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
3650 ; GFX6-NEXT: s_endpgm
3652 ; GFX9-LABEL: sdiv_i3:
3654 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
3655 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3656 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
3657 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3658 ; GFX9-NEXT: s_bfe_i32 s3, s2, 0x30008
3659 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3
3660 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x30000
3661 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2
3662 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
3663 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
3664 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
3665 ; GFX9-NEXT: s_or_b32 s4, s2, 1
3666 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
3667 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
3668 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
3669 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
3670 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
3671 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
3672 ; GFX9-NEXT: s_cselect_b32 s2, s4, 0
3673 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
3674 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
3675 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
3676 ; GFX9-NEXT: s_endpgm
3678 store i3 %r, ptr addrspace(1) %out
; Kernel taking two i3 operands and storing an i3 result %r to %out.
; NOTE(review): the instruction that defines %r is not visible in this excerpt;
; presumably `srem i3 %x, %y` going by the function name - confirm.
;
; Lines with the CHECK prefix pin the IR expected from the opt RUN line
; (-amdgpu-codegenprepare): a quotient is formed from the sign-extended operands
; as fptosi(trunc(x * rcp(y))), with ((x ^ y) ashr 30) | 1 added when
; |fmad(-q, y, x)| >= |y|; the result is then x - q * y on the i32 values,
; sign-extended in place (shl 29 / ashr 29) and truncated back to i3.
; Lines with the GFX6 and GFX9 prefixes pin the llc output for tahiti and gfx900.
3682 define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
3683 ; CHECK-LABEL: @srem_i3(
3684 ; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3685 ; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3686 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3687 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3688 ; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
3689 ; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3690 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3691 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3692 ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3693 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3694 ; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
3695 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3696 ; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3697 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3698 ; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3699 ; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3700 ; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3701 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3702 ; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
3703 ; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
3704 ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29
3705 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
3706 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
3707 ; CHECK-NEXT: store i3 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1
3708 ; CHECK-NEXT: ret void
3710 ; GFX6-LABEL: srem_i3:
3712 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
3713 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3714 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3715 ; GFX6-NEXT: s_bfe_i32 s2, s6, 0x30008
3716 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2
3717 ; GFX6-NEXT: s_bfe_i32 s3, s6, 0x30000
3718 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3
3719 ; GFX6-NEXT: s_xor_b32 s2, s3, s2
3720 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
3721 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30
3722 ; GFX6-NEXT: s_lshr_b32 s4, s6, 8
3723 ; GFX6-NEXT: s_or_b32 s5, s2, 1
3724 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
3725 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
3726 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
3727 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
3728 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
3729 ; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec
3730 ; GFX6-NEXT: s_cselect_b32 s2, s5, 0
3731 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
3732 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4
3733 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3734 ; GFX6-NEXT: s_mov_b32 s2, -1
3735 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
3736 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0
3737 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
3738 ; GFX6-NEXT: s_endpgm
3740 ; GFX9-LABEL: srem_i3:
3742 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
3743 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3744 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x30008
3745 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0
3746 ; GFX9-NEXT: s_bfe_i32 s1, s2, 0x30000
3747 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1
3748 ; GFX9-NEXT: s_xor_b32 s0, s1, s0
3749 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
3750 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
3751 ; GFX9-NEXT: s_lshr_b32 s3, s2, 8
3752 ; GFX9-NEXT: s_or_b32 s6, s0, 1
3753 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
3754 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
3755 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
3756 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
3757 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3758 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
3759 ; GFX9-NEXT: s_cselect_b32 s0, s6, 0
3760 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v2
3761 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
3762 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
3763 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
3764 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
3765 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
3766 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3767 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
3768 ; GFX9-NEXT: s_endpgm
3770 store i3 %r, ptr addrspace(1) %out
3774 define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
3775 ; CHECK-LABEL: @udiv_v3i16(
3776 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3777 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3778 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3779 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3780 ; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3781 ; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3782 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3783 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3784 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3785 ; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]]
3786 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3787 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3788 ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3789 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3790 ; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3791 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3792 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3793 ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535
3794 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
3795 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> poison, i16 [[TMP19]], i64 0
3796 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
3797 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3798 ; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
3799 ; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
3800 ; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3801 ; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3802 ; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3803 ; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3804 ; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3805 ; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]]
3806 ; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3807 ; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3808 ; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3809 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3810 ; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3811 ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3812 ; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3813 ; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535
3814 ; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
3815 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
3816 ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
3817 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3818 ; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
3819 ; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
3820 ; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3821 ; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3822 ; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3823 ; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3824 ; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3825 ; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]]
3826 ; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3827 ; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3828 ; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3829 ; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3830 ; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3831 ; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3832 ; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3833 ; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535
3834 ; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
3835 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
3836 ; CHECK-NEXT: store <3 x i16> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
3837 ; CHECK-NEXT: ret void
3839 ; GFX6-LABEL: udiv_v3i16:
3841 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
3842 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
3843 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
3844 ; GFX6-NEXT: s_mov_b32 s2, -1
3845 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
3846 ; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
3847 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
3848 ; GFX6-NEXT: s_lshr_b32 s5, s10, 16
3849 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
3850 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
3851 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
3852 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
3853 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16
3854 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
3855 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
3856 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
3857 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
3858 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
3859 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
3860 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
3861 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
3862 ; GFX6-NEXT: s_and_b32 s4, s11, 0xffff
3863 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
3864 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4
3865 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
3866 ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff
3867 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
3868 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
3869 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
3870 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
3871 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
3872 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
3873 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6
3874 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
3875 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
3876 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3877 ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5
3878 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
3879 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
3880 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3881 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
3882 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
3883 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
3884 ; GFX6-NEXT: s_endpgm
3886 ; GFX9-LABEL: udiv_v3i16:
3888 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
3889 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
3890 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3891 ; GFX9-NEXT: s_and_b32 s7, s2, 0xffff
3892 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
3893 ; GFX9-NEXT: s_and_b32 s6, s0, 0xffff
3894 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
3895 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
3896 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
3897 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
3898 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
3899 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
3900 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
3901 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
3902 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
3903 ; GFX9-NEXT: s_and_b32 s0, s3, 0xffff
3904 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
3905 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
3906 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
3907 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
3908 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
3909 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5
3910 ; GFX9-NEXT: s_and_b32 s0, s1, 0xffff
3911 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
3912 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
3913 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
3914 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
3915 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
3916 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
3917 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
3918 ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7
3919 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
3920 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
3921 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2
3922 ; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5
3923 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
3924 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
3925 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
3926 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
3927 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3928 ; GFX9-NEXT: global_store_short v6, v2, s[6:7] offset:4
3929 ; GFX9-NEXT: global_store_dword v6, v0, s[6:7]
3930 ; GFX9-NEXT: s_endpgm
3931 %r = udiv <3 x i16> %x, %y
3932 store <3 x i16> %r, ptr addrspace(1) %out
; Test: unsigned remainder of two <3 x i16> kernel arguments, stored to %out.
;
; The IR assertions (from the opt invocation with -amdgpu-codegenprepare)
; expect the vector urem to be scalarized lane by lane:
;   * extract lane i of %x and %y, zero-extend both to i32, convert to float;
;   * estimate the quotient as trunc(x * rcp(y)) and convert it back to i32;
;   * form the residual fmad(-q, y, x) and add 1 to the quotient when
;     |residual| >= |y|, otherwise add 0;
;   * compute the remainder as x - q * y, mask it with 65535, truncate to i16
;     and insert it into the result vector.
; The assembly assertions cover the two llc invocations (tahiti and gfx900).
;
; All assertion lines in this function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
3936 define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
3937 ; CHECK-LABEL: @urem_v3i16(
3938 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3939 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3940 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3941 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3942 ; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3943 ; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3944 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3945 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3946 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3947 ; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]]
3948 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3949 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3950 ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3951 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3952 ; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3953 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3954 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3955 ; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3956 ; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3957 ; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3958 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3959 ; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> poison, i16 [[TMP21]], i64 0
3960 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
3961 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3962 ; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3963 ; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3964 ; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3965 ; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3966 ; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3967 ; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3968 ; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3969 ; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]]
3970 ; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3971 ; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3972 ; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3973 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3974 ; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3975 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3976 ; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3977 ; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3978 ; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3979 ; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3980 ; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3981 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3982 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
3983 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3984 ; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3985 ; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3986 ; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3987 ; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3988 ; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3989 ; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3990 ; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3991 ; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]]
3992 ; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3993 ; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3994 ; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3995 ; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3996 ; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3997 ; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3998 ; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3999 ; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4000 ; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4001 ; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535
4002 ; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
4003 ; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
4004 ; CHECK-NEXT: store <3 x i16> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8
4005 ; CHECK-NEXT: ret void
4007 ; GFX6-LABEL: urem_v3i16:
4009 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
4010 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4011 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
4012 ; GFX6-NEXT: s_mov_b32 s2, -1
4013 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
4014 ; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
4015 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
4016 ; GFX6-NEXT: s_lshr_b32 s5, s10, 16
4017 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
4018 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5
4019 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
4020 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
4021 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16
4022 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4
4023 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
4024 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3
4025 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
4026 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1
4027 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3
4028 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
4029 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
4030 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
4031 ; GFX6-NEXT: s_and_b32 s6, s11, 0xffff
4032 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4
4033 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6
4034 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc
4035 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
4036 ; GFX6-NEXT: s_and_b32 s6, s9, 0xffff
4037 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6
4038 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
4039 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
4040 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
4041 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
4042 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6
4043 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
4044 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
4045 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4046 ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5
4047 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4
4048 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5
4049 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
4050 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11
4051 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1
4052 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4053 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2
4054 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
4055 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
4056 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
4057 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
4058 ; GFX6-NEXT: s_endpgm
4060 ; GFX9-LABEL: urem_v3i16:
4062 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
4063 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
4064 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4065 ; GFX9-NEXT: s_and_b32 s9, s2, 0xffff
4066 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
4067 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
4068 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
4069 ; GFX9-NEXT: s_and_b32 s8, s0, 0xffff
4070 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
4071 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
4072 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
4073 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
4074 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
4075 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
4076 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
4077 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
4078 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
4079 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
4080 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
4081 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
4082 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
4083 ; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3
4084 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3
4085 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
4086 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4087 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5
4088 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
4089 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3
4090 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1
4091 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
4092 ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6
4093 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
4094 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
4095 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5
4096 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
4097 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
4098 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
4099 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2
4100 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
4101 ; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
4102 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
4103 ; GFX9-NEXT: v_sub_u32_e32 v1, s0, v1
4104 ; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2
4105 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
4106 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
4107 ; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4
4108 ; GFX9-NEXT: global_store_dword v3, v0, s[6:7]
4109 ; GFX9-NEXT: s_endpgm
4110 %r = urem <3 x i16> %x, %y
4111 store <3 x i16> %r, ptr addrspace(1) %out
; Test: signed division of two <3 x i16> kernel arguments, stored to %out.
;
; The IR assertions (from the opt invocation with -amdgpu-codegenprepare)
; expect the vector sdiv to be scalarized lane by lane:
;   * extract lane i of %x and %y and sign-extend both to i32;
;   * compute ((x ^ y) >> 30) | 1 with an arithmetic shift, which is -1 when
;     the sign bits of the operands differ and +1 otherwise;
;   * convert both operands to float, estimate the quotient as
;     trunc(x * rcp(y)) and convert it back with fptosi;
;   * form the residual fmad(-q, y, x) and add the +1/-1 value to the quotient
;     when |residual| >= |y|, otherwise add 0;
;   * sign-extend the low 16 bits in place (shl 16 then ashr 16), truncate to
;     i16 and insert into the result vector.
; The assembly assertions cover the two llc invocations (tahiti and gfx900).
;
; All assertion lines in this function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4115 define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
4116 ; CHECK-LABEL: @sdiv_v3i16(
4117 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4118 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4119 ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4120 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4121 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4122 ; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4123 ; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1
4124 ; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4125 ; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4126 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4127 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4128 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4129 ; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]]
4130 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4131 ; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4132 ; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4133 ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4134 ; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4135 ; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4136 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4137 ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16
4138 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
4139 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
4140 ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> poison, i16 [[TMP23]], i64 0
4141 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
4142 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4143 ; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
4144 ; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
4145 ; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4146 ; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4147 ; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1
4148 ; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4149 ; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4150 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4151 ; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4152 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4153 ; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]]
4154 ; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4155 ; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4156 ; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4157 ; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4158 ; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4159 ; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4160 ; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4161 ; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16
4162 ; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
4163 ; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
4164 ; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
4165 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
4166 ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4167 ; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
4168 ; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
4169 ; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4170 ; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4171 ; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1
4172 ; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4173 ; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4174 ; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4175 ; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4176 ; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4177 ; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]]
4178 ; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4179 ; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4180 ; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4181 ; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4182 ; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4183 ; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4184 ; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4185 ; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16
4186 ; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
4187 ; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
4188 ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
4189 ; CHECK-NEXT: store <3 x i16> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], align 8
4190 ; CHECK-NEXT: ret void
4192 ; GFX6-LABEL: sdiv_v3i16:
4194 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
4195 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4196 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
4197 ; GFX6-NEXT: s_mov_b32 s2, -1
4198 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
4199 ; GFX6-NEXT: s_sext_i32_i16 s4, s10
4200 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
4201 ; GFX6-NEXT: s_sext_i32_i16 s5, s8
4202 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
4203 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
4204 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
4205 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
4206 ; GFX6-NEXT: s_or_b32 s6, s4, 1
4207 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
4208 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
4209 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
4210 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
4211 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
4212 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
4213 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
4214 ; GFX6-NEXT: s_ashr_i32 s5, s10, 16
4215 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
4216 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2
4217 ; GFX6-NEXT: s_ashr_i32 s4, s8, 16
4218 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
4219 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0
4220 ; GFX6-NEXT: s_xor_b32 s4, s4, s5
4221 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
4222 ; GFX6-NEXT: s_or_b32 s6, s4, 1
4223 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
4224 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
4225 ; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2
4226 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
4227 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
4228 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
4229 ; GFX6-NEXT: s_sext_i32_i16 s5, s11
4230 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5
4231 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
4232 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3
4233 ; GFX6-NEXT: s_sext_i32_i16 s4, s9
4234 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4
4235 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0
4236 ; GFX6-NEXT: s_xor_b32 s4, s4, s5
4237 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
4238 ; GFX6-NEXT: s_or_b32 s6, s4, 1
4239 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
4240 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
4241 ; GFX6-NEXT: v_mad_f32 v3, -v4, v0, v3
4242 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
4243 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
4244 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
4245 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
4246 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4
4247 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
4248 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
4249 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
4250 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
4251 ; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0
4252 ; GFX6-NEXT: s_endpgm
4254 ; GFX9-LABEL: sdiv_v3i16:
4256 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
4257 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
4258 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4259 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4260 ; GFX9-NEXT: s_sext_i32_i16 s4, s2
4261 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
4262 ; GFX9-NEXT: s_sext_i32_i16 s5, s0
4263 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5
4264 ; GFX9-NEXT: s_xor_b32 s4, s5, s4
4265 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
4266 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30
4267 ; GFX9-NEXT: s_or_b32 s8, s4, 1
4268 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
4269 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
4270 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
4271 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
4272 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
4273 ; GFX9-NEXT: s_cselect_b32 s4, s8, 0
4274 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16
4275 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
4276 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
4277 ; GFX9-NEXT: s_ashr_i32 s0, s0, 16
4278 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3
4279 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0
4280 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
4281 ; GFX9-NEXT: s_xor_b32 s0, s0, s2
4282 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
4283 ; GFX9-NEXT: s_sext_i32_i16 s2, s3
4284 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
4285 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
4286 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3
4287 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
4288 ; GFX9-NEXT: s_or_b32 s0, s0, 1
4289 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
4290 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
4291 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
4292 ; GFX9-NEXT: s_cselect_b32 s0, s0, 0
4293 ; GFX9-NEXT: v_add_u32_e32 v3, s0, v4
4294 ; GFX9-NEXT: s_sext_i32_i16 s0, s1
4295 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0
4296 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0
4297 ; GFX9-NEXT: s_xor_b32 s0, s0, s2
4298 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
4299 ; GFX9-NEXT: s_or_b32 s2, s0, 1
4300 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
4301 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
4302 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4
4303 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5
4304 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
4305 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
4306 ; GFX9-NEXT: s_cselect_b32 s0, s2, 0
4307 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v5
4308 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
4309 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
4310 ; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4
4311 ; GFX9-NEXT: global_store_dword v1, v2, s[6:7]
4312 ; GFX9-NEXT: s_endpgm
4313 %r = sdiv <3 x i16> %x, %y
4314 store <3 x i16> %r, ptr addrspace(1) %out
4318 define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) {
4319 ; CHECK-LABEL: @srem_v3i16(
4320 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4321 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4322 ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4323 ; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4324 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4325 ; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4326 ; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1
4327 ; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4328 ; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4329 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4330 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4331 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4332 ; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]]
4333 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4334 ; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4335 ; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4336 ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4337 ; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4338 ; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4339 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4340 ; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
4341 ; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
4342 ; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16
4343 ; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
4344 ; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
4345 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> poison, i16 [[TMP25]], i64 0
4346 ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
4347 ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4348 ; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
4349 ; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
4350 ; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
4351 ; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
4352 ; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1
4353 ; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
4354 ; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
4355 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
4356 ; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
4357 ; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
4358 ; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]]
4359 ; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
4360 ; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
4361 ; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
4362 ; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
4363 ; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
4364 ; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
4365 ; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
4366 ; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
4367 ; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
4368 ; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16
4369 ; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
4370 ; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
4371 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
4372 ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
4373 ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4374 ; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
4375 ; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
4376 ; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
4377 ; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
4378 ; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1
4379 ; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
4380 ; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
4381 ; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
4382 ; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
4383 ; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
4384 ; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]]
4385 ; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
4386 ; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
4387 ; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
4388 ; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
4389 ; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
4390 ; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
4391 ; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
4392 ; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
4393 ; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
4394 ; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16
4395 ; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
4396 ; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
4397 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
4398 ; CHECK-NEXT: store <3 x i16> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8
4399 ; CHECK-NEXT: ret void
4401 ; GFX6-LABEL: srem_v3i16:
4403 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
4404 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
4405 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
4406 ; GFX6-NEXT: s_mov_b32 s2, -1
4407 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
4408 ; GFX6-NEXT: s_sext_i32_i16 s4, s10
4409 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4
4410 ; GFX6-NEXT: s_sext_i32_i16 s5, s8
4411 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5
4412 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
4413 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
4414 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
4415 ; GFX6-NEXT: s_or_b32 s6, s4, 1
4416 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
4417 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
4418 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
4419 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
4420 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
4421 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
4422 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
4423 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2
4424 ; GFX6-NEXT: s_ashr_i32 s4, s10, 16
4425 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4
4426 ; GFX6-NEXT: s_ashr_i32 s5, s8, 16
4427 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5
4428 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10
4429 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1
4430 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
4431 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
4432 ; GFX6-NEXT: s_lshr_b32 s6, s8, 16
4433 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3
4434 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
4435 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2
4436 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3
4437 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
4438 ; GFX6-NEXT: s_lshr_b32 s7, s10, 16
4439 ; GFX6-NEXT: s_or_b32 s8, s4, 1
4440 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1|
4441 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
4442 ; GFX6-NEXT: s_cselect_b32 s4, s8, 0
4443 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3
4444 ; GFX6-NEXT: s_sext_i32_i16 s4, s11
4445 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
4446 ; GFX6-NEXT: s_sext_i32_i16 s5, s9
4447 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5
4448 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
4449 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
4450 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
4451 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7
4452 ; GFX6-NEXT: s_or_b32 s7, s4, 1
4453 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
4454 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
4455 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3
4456 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
4457 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
4458 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
4459 ; GFX6-NEXT: s_cselect_b32 s4, s7, 0
4460 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4
4461 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11
4462 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1
4463 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4464 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2
4465 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
4466 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
4467 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
4468 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
4469 ; GFX6-NEXT: s_endpgm
4471 ; GFX9-LABEL: srem_v3i16:
4473 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
4474 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
4475 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4476 ; GFX9-NEXT: s_sext_i32_i16 s8, s2
4477 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8
4478 ; GFX9-NEXT: s_sext_i32_i16 s9, s0
4479 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9
4480 ; GFX9-NEXT: s_xor_b32 s4, s9, s8
4481 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
4482 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30
4483 ; GFX9-NEXT: s_or_b32 s10, s4, 1
4484 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
4485 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
4486 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
4487 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
4488 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
4489 ; GFX9-NEXT: s_cselect_b32 s4, s10, 0
4490 ; GFX9-NEXT: s_ashr_i32 s10, s0, 16
4491 ; GFX9-NEXT: s_ashr_i32 s0, s2, 16
4492 ; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
4493 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0
4494 ; GFX9-NEXT: s_xor_b32 s2, s10, s0
4495 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
4496 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v2
4497 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10
4498 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
4499 ; GFX9-NEXT: s_or_b32 s2, s2, 1
4500 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
4501 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
4502 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
4503 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
4504 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
4505 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
4506 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
4507 ; GFX9-NEXT: s_cselect_b32 s2, s2, 0
4508 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
4509 ; GFX9-NEXT: s_sext_i32_i16 s2, s3
4510 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2
4511 ; GFX9-NEXT: s_sext_i32_i16 s3, s1
4512 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3
4513 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
4514 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
4515 ; GFX9-NEXT: s_xor_b32 s0, s3, s2
4516 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30
4517 ; GFX9-NEXT: s_or_b32 s4, s0, 1
4518 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
4519 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
4520 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3
4521 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
4522 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
4523 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
4524 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0
4525 ; GFX9-NEXT: v_add_u32_e32 v2, s0, v4
4526 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
4527 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
4528 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
4529 ; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0
4530 ; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
4531 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
4532 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
4533 ; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4
4534 ; GFX9-NEXT: global_store_dword v3, v0, s[6:7]
4535 ; GFX9-NEXT: s_endpgm
4536 %r = srem <3 x i16> %x, %y
4537 store <3 x i16> %r, ptr addrspace(1) %out
; udiv_v3i15 exercises unsigned division of two <3 x i15> kernel arguments.
;
; The opt-level assertions show each of the three lanes being extracted,
; zero-extended to i32, converted with uitofp and divided through a float
; reciprocal (llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32). The truncated
; quotient is incremented by 1 when |x - q*y| (computed with
; llvm.amdgcn.fmad.ftz.f32) compares oge against |y|; the sum is then masked
; with 32767, truncated back to i15 and inserted into the result vector.
;
; The GFX6 and GFX9 assertions show the three 15-bit results being combined
; with shifts of 15 and 30 and written out as one dword store plus one short
; store at offset 4 (the short is masked with 0x1fff first).
;
; NOTE(review): per the note at the top of this file the assertions below are
; generated by utils/update_llc_test_checks.py - regenerate them instead of
; editing by hand.
4541 define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
4542 ; CHECK-LABEL: @udiv_v3i15(
4543 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4544 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4545 ; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4546 ; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4547 ; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4548 ; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4549 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4550 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4551 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4552 ; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]]
4553 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4554 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4555 ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4556 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4557 ; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4558 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4559 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4560 ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767
4561 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
4562 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> poison, i15 [[TMP19]], i64 0
4563 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
4564 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4565 ; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
4566 ; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
4567 ; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
4568 ; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
4569 ; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
4570 ; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
4571 ; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
4572 ; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]]
4573 ; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
4574 ; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
4575 ; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
4576 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
4577 ; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
4578 ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
4579 ; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
4580 ; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767
4581 ; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
4582 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
4583 ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
4584 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4585 ; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
4586 ; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
4587 ; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
4588 ; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
4589 ; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
4590 ; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
4591 ; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
4592 ; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]]
4593 ; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
4594 ; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
4595 ; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
4596 ; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
4597 ; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
4598 ; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
4599 ; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
4600 ; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767
4601 ; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
4602 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
4603 ; CHECK-NEXT: store <3 x i15> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
4604 ; CHECK-NEXT: ret void
4606 ; GFX6-LABEL: udiv_v3i15:
4608 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
4609 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
4610 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
4611 ; GFX6-NEXT: s_mov_b32 s2, -1
4612 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
4613 ; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff
4614 ; GFX6-NEXT: s_and_b32 s7, s4, 0x7fff
4615 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7
4616 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
4617 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf000f
4618 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6
4619 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1
4620 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4
4621 ; GFX6-NEXT: s_bfe_u32 s7, s10, 0xf000f
4622 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30
4623 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
4624 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s7
4625 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5
4626 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2
4627 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
4628 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3
4629 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4
4630 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2
4631 ; GFX6-NEXT: v_mov_b32_e32 v0, s10
4632 ; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30
4633 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
4634 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7
4635 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0
4636 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
4637 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
4638 ; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6
4639 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
4640 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
4641 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2
4642 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5
4643 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
4644 ; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6
4645 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
4646 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1
4647 ; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0
4648 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2
4649 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3
4650 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc
4651 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4
4652 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
4653 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3
4654 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
4655 ; GFX6-NEXT: s_mov_b32 s0, s8
4656 ; GFX6-NEXT: s_mov_b32 s1, s9
4657 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
4658 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
4659 ; GFX6-NEXT: s_waitcnt expcnt(0)
4660 ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
4661 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
4662 ; GFX6-NEXT: s_endpgm
4664 ; GFX9-LABEL: udiv_v3i15:
4666 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4667 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
4668 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4669 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4670 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4671 ; GFX9-NEXT: s_and_b32 s5, s6, 0x7fff
4672 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5
4673 ; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff
4674 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30
4675 ; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f
4676 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4
4677 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
4678 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3
4679 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f
4680 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
4681 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30
4682 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
4683 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2
4684 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6
4685 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3
4686 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
4687 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4
4688 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
4689 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3
4690 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1
4691 ; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8
4692 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
4693 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
4694 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
4695 ; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7
4696 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
4697 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
4698 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3
4699 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6
4700 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4701 ; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7
4702 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
4703 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1
4704 ; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0
4705 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3
4706 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4
4707 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4708 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5
4709 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1]
4710 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4
4711 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
4712 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
4713 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
4714 ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1
4715 ; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4
4716 ; GFX9-NEXT: s_endpgm
; Input IR under test: lane-wise unsigned division of %x by %y, with the
; <3 x i15> result stored through %out.
4717 %r = udiv <3 x i15> %x, %y
4718 store <3 x i15> %r, ptr addrspace(1) %out
; urem_v3i15 exercises unsigned remainder of two <3 x i15> kernel arguments.
;
; The opt-level assertions show the same per-lane quotient estimate used for
; division (zext to i32, uitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32,
; and a +1 adjustment when |x - q*y| compares oge against |y|). The remainder
; is then formed as x - q*y in i32 (mul followed by sub), masked with 32767,
; truncated back to i15 and inserted into the result vector.
;
; The GFX6 and GFX9 assertions show the three 15-bit results being combined
; with shifts of 15 and 30 and written out as one dword store plus one short
; store at offset 4 (the short is masked with 0x1fff first).
;
; NOTE(review): per the note at the top of this file the assertions below are
; generated by utils/update_llc_test_checks.py - regenerate them instead of
; editing by hand.
4722 define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
4723 ; CHECK-LABEL: @urem_v3i15(
4724 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4725 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4726 ; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4727 ; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4728 ; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4729 ; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4730 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4731 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4732 ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4733 ; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]]
4734 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4735 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4736 ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4737 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4738 ; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4739 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4740 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4741 ; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
4742 ; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
4743 ; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767
4744 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
4745 ; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> poison, i15 [[TMP21]], i64 0
4746 ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
4747 ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4748 ; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
4749 ; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
4750 ; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
4751 ; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
4752 ; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
4753 ; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
4754 ; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
4755 ; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]]
4756 ; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
4757 ; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
4758 ; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4759 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
4760 ; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
4761 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
4762 ; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
4763 ; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
4764 ; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
4765 ; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767
4766 ; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
4767 ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
4768 ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
4769 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4770 ; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
4771 ; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
4772 ; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
4773 ; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
4774 ; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
4775 ; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
4776 ; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
4777 ; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]]
4778 ; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
4779 ; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
4780 ; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
4781 ; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
4782 ; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
4783 ; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
4784 ; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
4785 ; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4786 ; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4787 ; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767
4788 ; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
4789 ; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
4790 ; CHECK-NEXT: store <3 x i15> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8
4791 ; CHECK-NEXT: ret void
4793 ; GFX6-LABEL: urem_v3i15:
4795 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
4796 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
4797 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
4798 ; GFX6-NEXT: s_mov_b32 s2, -1
4799 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
4800 ; GFX6-NEXT: s_mov_b32 s0, s8
4801 ; GFX6-NEXT: s_and_b32 s8, s4, 0x7fff
4802 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8
4803 ; GFX6-NEXT: s_and_b32 s7, s10, 0x7fff
4804 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7
4805 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
4806 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1
4807 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30
4808 ; GFX6-NEXT: s_bfe_u32 s5, s4, 0xf000f
4809 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5
4810 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
4811 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
4812 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3
4813 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4
4814 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
4815 ; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f
4816 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8
4817 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
4818 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4
4819 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5
4820 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2
4821 ; GFX6-NEXT: v_mov_b32_e32 v0, s10
4822 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1
4823 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4
4824 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2
4825 ; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30
4826 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0
4827 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0
4828 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4
4829 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
4830 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3
4831 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
4832 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
4833 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8
4834 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
4835 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3
4836 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4837 ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7
4838 ; GFX6-NEXT: s_lshr_b32 s4, s4, 15
4839 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
4840 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4
4841 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
4842 ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2
4843 ; GFX6-NEXT: s_lshr_b32 s6, s10, 15
4844 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1
4845 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
4846 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3
4847 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
4848 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6
4849 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3
4850 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
4851 ; GFX6-NEXT: s_mov_b32 s1, s9
4852 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
4853 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
4854 ; GFX6-NEXT: s_waitcnt expcnt(0)
4855 ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
4856 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
4857 ; GFX6-NEXT: s_endpgm
4859 ; GFX9-LABEL: urem_v3i15:
4861 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4862 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
4863 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4864 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4865 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4866 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30
4867 ; GFX9-NEXT: s_and_b32 s3, s6, 0x7fff
4868 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
4869 ; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff
4870 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4
4871 ; GFX9-NEXT: s_bfe_u32 s4, s6, 0xf000f
4872 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
4873 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s4
4874 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
4875 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30
4876 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
4877 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
4878 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4
4879 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
4880 ; GFX9-NEXT: s_bfe_u32 s5, s2, 0xf000f
4881 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3
4882 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1
4883 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5
4884 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6
4885 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
4886 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
4887 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
4888 ; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8
4889 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
4890 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5
4891 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4
4892 ; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7
4893 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
4894 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6
4895 ; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9
4896 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
4897 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
4898 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
4899 ; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8
4900 ; GFX9-NEXT: s_lshr_b32 s3, s6, 15
4901 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5
4902 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3
4903 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
4904 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6
4905 ; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3
4906 ; GFX9-NEXT: s_lshr_b32 s3, s2, 15
4907 ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4
4908 ; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1
4909 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
4910 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4
4911 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1]
4912 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5
4913 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4
4914 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
4915 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
4916 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
4917 ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1
4918 ; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4
4919 ; GFX9-NEXT: s_endpgm
; Input IR under test: lane-wise unsigned remainder of %x by %y, with the
; <3 x i15> result stored through %out.
4920 %r = urem <3 x i15> %x, %y
4921 store <3 x i15> %r, ptr addrspace(1) %out
4925 define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
4926 ; CHECK-LABEL: @sdiv_v3i15(
4927 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4928 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4929 ; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
4930 ; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
4931 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4932 ; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4933 ; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1
4934 ; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4935 ; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4936 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4937 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4938 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4939 ; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]]
4940 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4941 ; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4942 ; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4943 ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4944 ; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4945 ; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4946 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4947 ; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17
4948 ; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
4949 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
4950 ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> poison, i15 [[TMP23]], i64 0
4951 ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
4952 ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4953 ; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
4954 ; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
4955 ; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4956 ; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4957 ; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1
4958 ; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4959 ; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4960 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4961 ; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4962 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4963 ; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]]
4964 ; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4965 ; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4966 ; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4967 ; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4968 ; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4969 ; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4970 ; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4971 ; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17
4972 ; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
4973 ; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
4974 ; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
4975 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
4976 ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4977 ; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
4978 ; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
4979 ; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4980 ; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4981 ; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1
4982 ; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4983 ; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4984 ; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4985 ; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4986 ; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4987 ; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]]
4988 ; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4989 ; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4990 ; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4991 ; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4992 ; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4993 ; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4994 ; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4995 ; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17
4996 ; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
4997 ; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
4998 ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
4999 ; CHECK-NEXT: store <3 x i15> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], align 8
5000 ; CHECK-NEXT: ret void
5002 ; GFX6-LABEL: sdiv_v3i15:
5004 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
5005 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
5006 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
5007 ; GFX6-NEXT: s_mov_b32 s2, -1
5008 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5009 ; GFX6-NEXT: v_mov_b32_e32 v0, s10
5010 ; GFX6-NEXT: s_bfe_i32 s6, s4, 0xf0000
5011 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6
5012 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
5013 ; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 30
5014 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf0000
5015 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5
5016 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2
5017 ; GFX6-NEXT: s_xor_b32 s5, s5, s6
5018 ; GFX6-NEXT: s_ashr_i32 s5, s5, 30
5019 ; GFX6-NEXT: s_or_b32 s5, s5, 1
5020 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4
5021 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
5022 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3
5023 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v2|
5024 ; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
5025 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4
5026 ; GFX6-NEXT: s_cselect_b32 s5, s5, 0
5027 ; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f
5028 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4
5029 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s5, v4
5030 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f
5031 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5
5032 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2
5033 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
5034 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
5035 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15
5036 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5
5037 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5
5038 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4
5039 ; GFX6-NEXT: s_or_b32 s6, s4, 1
5040 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5
5041 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
5042 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1
5043 ; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30
5044 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
5045 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
5046 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15
5047 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5
5048 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0
5049 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2
5050 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1
5051 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0
5052 ; GFX6-NEXT: v_or_b32_e32 v0, 1, v0
5053 ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6
5054 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
5055 ; GFX6-NEXT: v_mad_f32 v5, -v1, v2, v5
5056 ; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1
5057 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2|
5058 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
5059 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3
5060 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
5061 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4
5062 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
5063 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3
5064 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
5065 ; GFX6-NEXT: s_mov_b32 s0, s8
5066 ; GFX6-NEXT: s_mov_b32 s1, s9
5067 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
5068 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
5069 ; GFX6-NEXT: s_waitcnt expcnt(0)
5070 ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
5071 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
5072 ; GFX6-NEXT: s_endpgm
5074 ; GFX9-LABEL: sdiv_v3i15:
5076 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5077 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
5078 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5079 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5080 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5081 ; GFX9-NEXT: s_bfe_i32 s4, s6, 0xf0000
5082 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4
5083 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30
5084 ; GFX9-NEXT: s_bfe_i32 s3, s2, 0xf0000
5085 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3
5086 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3
5087 ; GFX9-NEXT: s_xor_b32 s3, s3, s4
5088 ; GFX9-NEXT: s_ashr_i32 s3, s3, 30
5089 ; GFX9-NEXT: s_or_b32 s3, s3, 1
5090 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
5091 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
5092 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4
5093 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
5094 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
5095 ; GFX9-NEXT: s_cselect_b32 s3, s3, 0
5096 ; GFX9-NEXT: s_bfe_i32 s4, s6, 0xf000f
5097 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5
5098 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4
5099 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0xf000f
5100 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
5101 ; GFX9-NEXT: v_add_u32_e32 v4, s3, v5
5102 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2
5103 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3
5104 ; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30
5105 ; GFX9-NEXT: s_xor_b32 s2, s2, s4
5106 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30
5107 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6
5108 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
5109 ; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5
5110 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15
5111 ; GFX9-NEXT: s_or_b32 s4, s2, 1
5112 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6
5113 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v3|
5114 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1
5115 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
5116 ; GFX9-NEXT: s_cselect_b32 s2, s4, 0
5117 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15
5118 ; GFX9-NEXT: v_add_u32_e32 v5, s2, v6
5119 ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0
5120 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3
5121 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
5122 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
5123 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
5124 ; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7
5125 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
5126 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1
5127 ; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6
5128 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3|
5129 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
5130 ; GFX9-NEXT: v_add_u32_e32 v0, v7, v0
5131 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4
5132 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5
5133 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1]
5134 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4
5135 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
5136 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
5137 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
5138 ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1
5139 ; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4
5140 ; GFX9-NEXT: s_endpgm
5141 %r = sdiv <3 x i15> %x, %y
5142 store <3 x i15> %r, ptr addrspace(1) %out
; Test: signed remainder of two <3 x i15> kernel arguments, stored to %out.
; Per the opt-run assertions below, each lane is sign-extended to i32 and the
; quotient is estimated in float (sitofp, amdgcn.rcp, fmul, trunc, fmad.ftz).
; A sign-derived +/-1 (xor of the operands, ashr 30, or 1) is added to the
; quotient when |residual| >= |divisor|; the remainder is then x - q * y,
; re-sign-extended from 15 bits (shl 17 / ashr 17) and truncated back to i15.
; Both llc runs pack the three 15-bit results into one dword store plus a
; short store at offset 4.
5146 define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) {
5147 ; CHECK-LABEL: @srem_v3i15(
5148 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
5149 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
5150 ; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
5151 ; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
5152 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5153 ; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5154 ; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1
5155 ; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5156 ; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5157 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5158 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5159 ; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5160 ; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]]
5161 ; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5162 ; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5163 ; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5164 ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5165 ; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5166 ; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5167 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5168 ; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
5169 ; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
5170 ; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17
5171 ; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
5172 ; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
5173 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> poison, i15 [[TMP25]], i64 0
5174 ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
5175 ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
5176 ; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
5177 ; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
5178 ; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
5179 ; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
5180 ; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1
5181 ; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
5182 ; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
5183 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5184 ; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
5185 ; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
5186 ; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]]
5187 ; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
5188 ; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
5189 ; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
5190 ; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
5191 ; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
5192 ; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
5193 ; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
5194 ; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
5195 ; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
5196 ; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17
5197 ; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
5198 ; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
5199 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
5200 ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
5201 ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
5202 ; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
5203 ; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
5204 ; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
5205 ; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
5206 ; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1
5207 ; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
5208 ; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
5209 ; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
5210 ; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
5211 ; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
5212 ; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]]
5213 ; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
5214 ; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
5215 ; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
5216 ; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
5217 ; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
5218 ; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
5219 ; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
5220 ; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
5221 ; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
5222 ; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17
5223 ; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
5224 ; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
5225 ; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
5226 ; CHECK-NEXT: store <3 x i15> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8
5227 ; CHECK-NEXT: ret void
5229 ; GFX6-LABEL: srem_v3i15:
5231 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
5232 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
5233 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
5234 ; GFX6-NEXT: s_mov_b32 s2, -1
5235 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5236 ; GFX6-NEXT: s_bfe_i32 s6, s10, 0xf0000
5237 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
5238 ; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30
5239 ; GFX6-NEXT: s_bfe_i32 s5, s4, 0xf0000
5240 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5
5241 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s6
5242 ; GFX6-NEXT: s_xor_b32 s5, s6, s5
5243 ; GFX6-NEXT: s_ashr_i32 s5, s5, 30
5244 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
5245 ; GFX6-NEXT: s_mov_b32 s0, s8
5246 ; GFX6-NEXT: s_mov_b32 s1, s9
5247 ; GFX6-NEXT: s_lshr_b32 s8, s10, 15
5248 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6
5249 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6
5250 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5
5251 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6
5252 ; GFX6-NEXT: s_lshr_b32 s9, s4, 15
5253 ; GFX6-NEXT: s_or_b32 s5, s5, 1
5254 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v5|, |v4|
5255 ; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec
5256 ; GFX6-NEXT: s_cselect_b32 s5, s5, 0
5257 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, s5, v6
5258 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, s4
5259 ; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f
5260 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4
5261 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f
5262 ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5
5263 ; GFX6-NEXT: s_xor_b32 s4, s5, s4
5264 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5
5265 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2
5266 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30
5267 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15
5268 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7
5269 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7
5270 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6
5271 ; GFX6-NEXT: s_or_b32 s6, s4, 1
5272 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7
5273 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
5274 ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2
5275 ; GFX6-NEXT: v_mov_b32_e32 v0, s10
5276 ; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30
5277 ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec
5278 ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0
5279 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0
5280 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15
5281 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7
5282 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0
5283 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6
5284 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
5285 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4
5286 ; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8
5287 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2
5288 ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7
5289 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
5290 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0
5291 ; GFX6-NEXT: v_or_b32_e32 v0, 1, v0
5292 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5293 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
5294 ; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9
5295 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
5296 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3
5297 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5
5298 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2
5299 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
5300 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30
5301 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4
5302 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2
5303 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
5304 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
5305 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
5306 ; GFX6-NEXT: s_waitcnt expcnt(0)
5307 ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1
5308 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
5309 ; GFX6-NEXT: s_endpgm
5311 ; GFX9-LABEL: srem_v3i15:
5313 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5314 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
5315 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5316 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5317 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5318 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30
5319 ; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf0000
5320 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3
5321 ; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf0000
5322 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s4
5323 ; GFX9-NEXT: s_xor_b32 s3, s4, s3
5324 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4
5325 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
5326 ; GFX9-NEXT: s_ashr_i32 s3, s3, 30
5327 ; GFX9-NEXT: s_lshr_b32 s8, s2, 15
5328 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6
5329 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
5330 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5
5331 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6
5332 ; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30
5333 ; GFX9-NEXT: s_lshr_b32 s7, s6, 15
5334 ; GFX9-NEXT: s_or_b32 s3, s3, 1
5335 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4|
5336 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
5337 ; GFX9-NEXT: s_cselect_b32 s3, s3, 0
5338 ; GFX9-NEXT: v_add_u32_e32 v4, s3, v6
5339 ; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf000f
5340 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s3
5341 ; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf000f
5342 ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s4
5343 ; GFX9-NEXT: s_xor_b32 s3, s4, s3
5344 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5
5345 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1
5346 ; GFX9-NEXT: s_ashr_i32 s3, s3, 30
5347 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15
5348 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7
5349 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7
5350 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6
5351 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7
5352 ; GFX9-NEXT: s_or_b32 s3, s3, 1
5353 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
5354 ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1
5355 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
5356 ; GFX9-NEXT: s_cselect_b32 s3, s3, 0
5357 ; GFX9-NEXT: v_add_u32_e32 v5, s3, v7
5358 ; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15
5359 ; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7
5360 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6
5361 ; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1
5362 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1
5363 ; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
5364 ; GFX9-NEXT: v_mul_f32_e32 v7, v8, v9
5365 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7
5366 ; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7
5367 ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8
5368 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5369 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
5370 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6
5371 ; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7
5372 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1
5373 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3
5374 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
5375 ; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4
5376 ; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5
5377 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
5378 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4
5379 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1]
5380 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3
5381 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4
5382 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
5383 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
5384 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
5385 ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1
5386 ; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4
5387 ; GFX9-NEXT: s_endpgm
5388 %r = srem <3 x i15> %x, %y
5389 store <3 x i15> %r, ptr addrspace(1) %out
; Test: unsigned i32 division by the odd constant 1235195.
; Per the opt-run assertions below, the IR udiv is left as-is by the pass.
; Both llc runs lower it without a divide: mul-high of %x by 0xb2a50881,
; then sub, shift right by 1, add, and a final shift right by 20.
; The tahiti run uses VALU ops for that sequence; the gfx900 run uses SALU ops.
5393 define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
5394 ; CHECK-LABEL: @udiv_i32_oddk_denom(
5395 ; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
5396 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5397 ; CHECK-NEXT: ret void
5399 ; GFX6-LABEL: udiv_i32_oddk_denom:
5401 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
5402 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
5403 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881
5404 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
5405 ; GFX6-NEXT: s_mov_b32 s2, -1
5406 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5407 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
5408 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v0
5409 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
5410 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
5411 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
5412 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
5413 ; GFX6-NEXT: s_endpgm
5415 ; GFX9-LABEL: udiv_i32_oddk_denom:
5417 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
5418 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
5419 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5420 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5421 ; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xb2a50881
5422 ; GFX9-NEXT: s_sub_i32 s2, s2, s3
5423 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1
5424 ; GFX9-NEXT: s_add_i32 s2, s2, s3
5425 ; GFX9-NEXT: s_lshr_b32 s2, s2, 20
5426 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5427 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5428 ; GFX9-NEXT: s_endpgm
5429 %r = udiv i32 %x, 1235195
5430 store i32 %r, ptr addrspace(1) %out
; Test: unsigned i32 division by the power-of-two constant 4096.
; Per the opt-run assertions below, the IR udiv is left as-is by the pass.
; Both llc runs reduce the division to a single scalar shift right by 12.
5434 define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
5435 ; CHECK-LABEL: @udiv_i32_pow2k_denom(
5436 ; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096
5437 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5438 ; CHECK-NEXT: ret void
5440 ; GFX6-LABEL: udiv_i32_pow2k_denom:
5442 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
5443 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
5444 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
5445 ; GFX6-NEXT: s_mov_b32 s2, -1
5446 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5447 ; GFX6-NEXT: s_lshr_b32 s4, s6, 12
5448 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
5449 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
5450 ; GFX6-NEXT: s_endpgm
5452 ; GFX9-LABEL: udiv_i32_pow2k_denom:
5454 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
5455 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
5456 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5457 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5458 ; GFX9-NEXT: s_lshr_b32 s2, s2, 12
5459 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5460 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5461 ; GFX9-NEXT: s_endpgm
5462 %r = udiv i32 %x, 4096
5463 store i32 %r, ptr addrspace(1) %out
; Test: unsigned i32 division where the divisor is 4096 shifted left by %y.
; Per the opt-run assertions below, the shl and udiv are left as-is by the pass.
; Both llc runs fold the division into a shift: they add 12 to %y and shift
; %x right by that amount.
5467 define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
5468 ; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
5469 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5470 ; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
5471 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5472 ; CHECK-NEXT: ret void
5474 ; GFX6-LABEL: udiv_i32_pow2_shl_denom:
5476 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5477 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
5478 ; GFX6-NEXT: s_mov_b32 s6, -1
5479 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5480 ; GFX6-NEXT: s_mov_b32 s4, s0
5481 ; GFX6-NEXT: s_add_i32 s0, s3, 12
5482 ; GFX6-NEXT: s_lshr_b32 s0, s2, s0
5483 ; GFX6-NEXT: s_mov_b32 s5, s1
5484 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
5485 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
5486 ; GFX6-NEXT: s_endpgm
5488 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
5490 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5491 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5492 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5493 ; GFX9-NEXT: s_add_i32 s3, s3, 12
5494 ; GFX9-NEXT: s_lshr_b32 s2, s2, s3
5495 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5496 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5497 ; GFX9-NEXT: s_endpgm
5498 %shl.y = shl i32 4096, %y
5499 %r = udiv i32 %x, %shl.y
5500 store i32 %r, ptr addrspace(1) %out
; Test: unsigned <2 x i32> division by the splat constant <4096, 4096>.
; Per the opt-run assertions below, the vector udiv is scalarized into one
; "udiv i32 ..., 4096" per lane (extractelement / udiv / insertelement).
; Both llc runs reduce each lane to a scalar shift right by 12.
5504 define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
5505 ; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
5506 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5507 ; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5508 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
5509 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5510 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
5511 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5512 ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
5513 ; CHECK-NEXT: ret void
5515 ; GFX6-LABEL: udiv_v2i32_pow2k_denom:
5517 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5518 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
5519 ; GFX6-NEXT: s_mov_b32 s6, -1
5520 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5521 ; GFX6-NEXT: s_mov_b32 s4, s0
5522 ; GFX6-NEXT: s_mov_b32 s5, s1
5523 ; GFX6-NEXT: s_lshr_b32 s0, s2, 12
5524 ; GFX6-NEXT: s_lshr_b32 s1, s3, 12
5525 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
5526 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
5527 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5528 ; GFX6-NEXT: s_endpgm
5530 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
5532 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5533 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5534 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5535 ; GFX9-NEXT: s_lshr_b32 s2, s2, 12
5536 ; GFX9-NEXT: s_lshr_b32 s3, s3, 12
5537 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5538 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5539 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
5540 ; GFX9-NEXT: s_endpgm
5541 %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
5542 store <2 x i32> %r, ptr addrspace(1) %out
; Test: unsigned <2 x i32> division by the non-splat constant <4096, 4095>,
; i.e. one power-of-two lane and one non-power-of-two lane.
; Per the opt-run assertions below, the vector udiv is scalarized into a
; per-lane udiv by 4096 and by 4095 respectively.
; In both llc runs lane 0 becomes a shift right by 12, while lane 1 uses a
; mul-high by 0x100101 followed by sub, shift right by 1, add, and shift
; right by 11.
5546 define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
5547 ; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
5548 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5549 ; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5550 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
5551 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5552 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
5553 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5554 ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
5555 ; CHECK-NEXT: ret void
5557 ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
5559 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5560 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101
5561 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
5562 ; GFX6-NEXT: s_mov_b32 s6, -1
5563 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5564 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
5565 ; GFX6-NEXT: s_mov_b32 s4, s0
5566 ; GFX6-NEXT: s_lshr_b32 s0, s2, 12
5567 ; GFX6-NEXT: s_mov_b32 s5, s1
5568 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0
5569 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
5570 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
5571 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0
5572 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
5573 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5574 ; GFX6-NEXT: s_endpgm
5576 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
5578 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5579 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5580 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5581 ; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101
5582 ; GFX9-NEXT: s_sub_i32 s3, s3, s4
5583 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1
5584 ; GFX9-NEXT: s_add_i32 s3, s3, s4
5585 ; GFX9-NEXT: s_lshr_b32 s2, s2, 12
5586 ; GFX9-NEXT: s_lshr_b32 s3, s3, 11
5587 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5588 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5589 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
5590 ; GFX9-NEXT: s_endpgm
5591 %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
5592 store <2 x i32> %r, ptr addrspace(1) %out
; Test: unsigned <2 x i32> division where the divisor is <4096, 4096> shifted
; left lane-wise by %y.
; Per the opt-run assertions below, the vector udiv is scalarized and each
; lane receives the general reciprocal-based expansion: uitofp, amdgcn.rcp,
; scale by 0x41EFFFFFC0000000, fptoui, one mul-high refinement of the
; reciprocal, a mul-high quotient estimate, then two compare-and-adjust steps.
; Neither llc run turns this into a shift (compare the scalar
; udiv_i32_pow2_shl_denom test); both emit the v_rcp_iflag_f32 / mul_hi
; sequence per lane.
5596 define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
5597 ; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
5598 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
5599 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5600 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5601 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5602 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5603 ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5604 ; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5605 ; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5606 ; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5607 ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5608 ; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5609 ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5610 ; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5611 ; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5612 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5613 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5614 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5615 ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5616 ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5617 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5618 ; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5619 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5620 ; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5621 ; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5622 ; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5623 ; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1
5624 ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
5625 ; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5626 ; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
5627 ; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
5628 ; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1
5629 ; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
5630 ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i64 0
5631 ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
5632 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5633 ; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
5634 ; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5635 ; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
5636 ; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
5637 ; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]]
5638 ; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
5639 ; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
5640 ; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
5641 ; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
5642 ; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5643 ; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
5644 ; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
5645 ; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
5646 ; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
5647 ; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
5648 ; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
5649 ; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5650 ; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
5651 ; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
5652 ; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
5653 ; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
5654 ; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
5655 ; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1
5656 ; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
5657 ; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
5658 ; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
5659 ; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
5660 ; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1
5661 ; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
5662 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
5663 ; CHECK-NEXT: store <2 x i32> [[TMP64]], ptr addrspace(1) [[OUT:%.*]], align 8
5664 ; CHECK-NEXT: ret void
5666 ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
5668 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb
5669 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
5670 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
5671 ; GFX6-NEXT: s_mov_b32 s6, -1
5672 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5673 ; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10
5674 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0
5675 ; GFX6-NEXT: s_sub_i32 s1, 0, s0
5676 ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11
5677 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2
5678 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
5679 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
5680 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
5681 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
5682 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0
5683 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
5684 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
5685 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
5686 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
5687 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
5688 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0
5689 ; GFX6-NEXT: s_mul_i32 s1, s1, s0
5690 ; GFX6-NEXT: s_sub_i32 s1, s8, s1
5691 ; GFX6-NEXT: s_sub_i32 s3, s1, s0
5692 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0
5693 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
5694 ; GFX6-NEXT: s_cselect_b32 s1, s3, s1
5695 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
5696 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0
5697 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
5698 ; GFX6-NEXT: s_sub_i32 s3, 0, s2
5699 ; GFX6-NEXT: v_mul_lo_u32 v3, s3, v1
5700 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5701 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
5702 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3
5703 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
5704 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
5705 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1
5706 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1
5707 ; GFX6-NEXT: s_mul_i32 s0, s0, s2
5708 ; GFX6-NEXT: s_sub_i32 s0, s9, s0
5709 ; GFX6-NEXT: s_sub_i32 s1, s0, s2
5710 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1
5711 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2
5712 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
5713 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
5714 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
5715 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1
5716 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2
5717 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
5718 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
5719 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5720 ; GFX6-NEXT: s_endpgm
5722 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
5724 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
5725 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5726 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5727 ; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2
5728 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
5729 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3
5730 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
5731 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
5732 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
5733 ; GFX9-NEXT: s_sub_i32 s4, 0, s7
5734 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
5735 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
5736 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
5737 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
5738 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
5739 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
5740 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
5741 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
5742 ; GFX9-NEXT: s_add_i32 s5, s5, s4
5743 ; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5
5744 ; GFX9-NEXT: s_mul_i32 s5, s4, s7
5745 ; GFX9-NEXT: s_sub_i32 s0, s0, s5
5746 ; GFX9-NEXT: s_add_i32 s9, s4, 1
5747 ; GFX9-NEXT: s_sub_i32 s5, s0, s7
5748 ; GFX9-NEXT: s_cmp_ge_u32 s0, s7
5749 ; GFX9-NEXT: s_cselect_b32 s4, s9, s4
5750 ; GFX9-NEXT: s_cselect_b32 s0, s5, s0
5751 ; GFX9-NEXT: s_add_i32 s5, s4, 1
5752 ; GFX9-NEXT: s_cmp_ge_u32 s0, s7
5753 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1
5754 ; GFX9-NEXT: s_cselect_b32 s0, s5, s4
5755 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
5756 ; GFX9-NEXT: s_mul_i32 s4, s4, s8
5757 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
5758 ; GFX9-NEXT: s_add_i32 s8, s8, s4
5759 ; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8
5760 ; GFX9-NEXT: s_mul_i32 s5, s4, s6
5761 ; GFX9-NEXT: s_sub_i32 s1, s1, s5
5762 ; GFX9-NEXT: s_add_i32 s7, s4, 1
5763 ; GFX9-NEXT: s_sub_i32 s5, s1, s6
5764 ; GFX9-NEXT: s_cmp_ge_u32 s1, s6
5765 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4
5766 ; GFX9-NEXT: s_cselect_b32 s1, s5, s1
5767 ; GFX9-NEXT: s_add_i32 s5, s4, 1
5768 ; GFX9-NEXT: s_cmp_ge_u32 s1, s6
5769 ; GFX9-NEXT: s_cselect_b32 s1, s5, s4
5770 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5771 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5772 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5773 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5774 ; GFX9-NEXT: s_endpgm
5775 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
5776 %r = udiv <2 x i32> %x, %shl.y
5777 store <2 x i32> %r, ptr addrspace(1) %out
; urem of an i32 kernel argument by the odd constant 1235195 (0x12d8fb).
; The opt output is expected to keep the urem unchanged. Both llc outputs
; expect a multiply-high by 0xb2a50881 followed by shifts, a multiply by the
; divisor and a final subtract from the dividend; no v_rcp sequence appears.
5781 define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
5782 ; CHECK-LABEL: @urem_i32_oddk_denom(
5783 ; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195
5784 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5785 ; CHECK-NEXT: ret void
5787 ; GFX6-LABEL: urem_i32_oddk_denom:
5789 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
5790 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
5791 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881
5792 ; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb
5793 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
5794 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5795 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
5796 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v0
5797 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
5798 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
5799 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
5800 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
5801 ; GFX6-NEXT: s_mov_b32 s2, -1
5802 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
5803 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
5804 ; GFX6-NEXT: s_endpgm
5806 ; GFX9-LABEL: urem_i32_oddk_denom:
5808 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
5809 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
5810 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5811 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5812 ; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xb2a50881
5813 ; GFX9-NEXT: s_sub_i32 s4, s2, s3
5814 ; GFX9-NEXT: s_lshr_b32 s4, s4, 1
5815 ; GFX9-NEXT: s_add_i32 s4, s4, s3
5816 ; GFX9-NEXT: s_lshr_b32 s3, s4, 20
5817 ; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb
5818 ; GFX9-NEXT: s_sub_i32 s2, s2, s3
5819 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5820 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5821 ; GFX9-NEXT: s_endpgm
5822 %r = urem i32 %x, 1235195
5823 store i32 %r, ptr addrspace(1) %out
; urem of an i32 kernel argument by the power-of-two constant 4096.
; The opt output is expected to keep the urem unchanged. Both llc outputs
; expect the remainder to be a single s_and_b32 with the mask 0xfff.
5827 define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
5828 ; CHECK-LABEL: @urem_i32_pow2k_denom(
5829 ; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096
5830 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5831 ; CHECK-NEXT: ret void
5833 ; GFX6-LABEL: urem_i32_pow2k_denom:
5835 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
5836 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
5837 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
5838 ; GFX6-NEXT: s_mov_b32 s2, -1
5839 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5840 ; GFX6-NEXT: s_and_b32 s4, s6, 0xfff
5841 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
5842 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
5843 ; GFX6-NEXT: s_endpgm
5845 ; GFX9-LABEL: urem_i32_pow2k_denom:
5847 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
5848 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
5849 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5850 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5851 ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff
5852 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5853 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5854 ; GFX9-NEXT: s_endpgm
5855 %r = urem i32 %x, 4096
5856 store i32 %r, ptr addrspace(1) %out
; urem of i32 %x by a divisor computed at run time as (4096 << %y).
; The opt output is expected to keep the shl and urem unchanged. Both llc
; outputs expect the remainder to be formed as %x & ((4096 << %y) - 1), i.e.
; s_lshl_b32, s_add_i32 with -1, then s_and_b32.
5860 define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
5861 ; CHECK-LABEL: @urem_i32_pow2_shl_denom(
5862 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5863 ; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
5864 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
5865 ; CHECK-NEXT: ret void
5867 ; GFX6-LABEL: urem_i32_pow2_shl_denom:
5869 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5870 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
5871 ; GFX6-NEXT: s_mov_b32 s6, -1
5872 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5873 ; GFX6-NEXT: s_mov_b32 s4, s0
5874 ; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s3
5875 ; GFX6-NEXT: s_add_i32 s0, s0, -1
5876 ; GFX6-NEXT: s_and_b32 s0, s2, s0
5877 ; GFX6-NEXT: s_mov_b32 s5, s1
5878 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
5879 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
5880 ; GFX6-NEXT: s_endpgm
5882 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
5884 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5885 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5886 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5887 ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
5888 ; GFX9-NEXT: s_add_i32 s3, s3, -1
5889 ; GFX9-NEXT: s_and_b32 s2, s2, s3
5890 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5891 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5892 ; GFX9-NEXT: s_endpgm
5893 %shl.y = shl i32 4096, %y
5894 %r = urem i32 %x, %shl.y
5895 store i32 %r, ptr addrspace(1) %out
; <2 x i32> urem by a vector whose elements are both the constant 4096.
; The opt output is expected to split the vector op into one scalar
; urem-by-4096 per element (extractelement / urem / insertelement). Both llc
; outputs expect one s_and_b32 with the mask 0xfff per element.
5899 define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
5900 ; CHECK-LABEL: @urem_v2i32_pow2k_denom(
5901 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5902 ; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
5903 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
5904 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5905 ; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
5906 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5907 ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
5908 ; CHECK-NEXT: ret void
5910 ; GFX6-LABEL: urem_v2i32_pow2k_denom:
5912 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5913 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
5914 ; GFX6-NEXT: s_mov_b32 s6, -1
5915 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
5916 ; GFX6-NEXT: s_mov_b32 s4, s0
5917 ; GFX6-NEXT: s_mov_b32 s5, s1
5918 ; GFX6-NEXT: s_and_b32 s0, s2, 0xfff
5919 ; GFX6-NEXT: s_and_b32 s1, s3, 0xfff
5920 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
5921 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
5922 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5923 ; GFX6-NEXT: s_endpgm
5925 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
5927 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5928 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5929 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5930 ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff
5931 ; GFX9-NEXT: s_and_b32 s3, s3, 0xfff
5932 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5933 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5934 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
5935 ; GFX9-NEXT: s_endpgm
5936 %r = urem <2 x i32> %x, <i32 4096, i32 4096>
5937 store <2 x i32> %r, ptr addrspace(1) %out
; <2 x i32> urem by a vector divisor computed at run time as (4096 << %y)
; per element.
; The opt output is expected to expand each element into the
; llvm.amdgcn.rcp.f32 based sequence: a float reciprocal estimate scaled and
; converted back to i32, a correction using the high half of a 64-bit
; multiply, a quotient estimate, and two compare/subtract/select steps on the
; remainder. Both llc outputs keep that expansion (v_rcp_iflag_f32 plus
; s_cmp_ge_u32 / s_cselect_b32 pairs); this differs from the scalar test
; @urem_i32_pow2_shl_denom, whose expected code is a shift/add/and sequence.
5941 define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
5942 ; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
5943 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
5944 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5945 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5946 ; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5947 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5948 ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5949 ; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5950 ; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5951 ; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5952 ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5953 ; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5954 ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5955 ; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5956 ; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5957 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5958 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5959 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5960 ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5961 ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5962 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5963 ; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5964 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5965 ; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5966 ; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5967 ; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5968 ; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5969 ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
5970 ; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
5971 ; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
5972 ; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
5973 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i64 0
5974 ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
5975 ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5976 ; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
5977 ; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
5978 ; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
5979 ; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
5980 ; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]]
5981 ; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
5982 ; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
5983 ; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
5984 ; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
5985 ; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
5986 ; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
5987 ; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5988 ; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
5989 ; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
5990 ; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
5991 ; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
5992 ; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
5993 ; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
5994 ; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5995 ; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
5996 ; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
5997 ; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
5998 ; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
5999 ; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
6000 ; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
6001 ; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
6002 ; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
6003 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
6004 ; CHECK-NEXT: store <2 x i32> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8
6005 ; CHECK-NEXT: ret void
6007 ; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
6009 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
6010 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
6011 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
6012 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6013 ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2
6014 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
6015 ; GFX6-NEXT: s_sub_i32 s6, 0, s2
6016 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
6017 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3
6018 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
6019 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
6020 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6021 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
6022 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
6023 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
6024 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
6025 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
6026 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
6027 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
6028 ; GFX6-NEXT: v_readfirstlane_b32 s6, v0
6029 ; GFX6-NEXT: s_mul_i32 s6, s6, s2
6030 ; GFX6-NEXT: s_sub_i32 s0, s0, s6
6031 ; GFX6-NEXT: s_sub_i32 s6, s0, s2
6032 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2
6033 ; GFX6-NEXT: s_cselect_b32 s0, s6, s0
6034 ; GFX6-NEXT: s_sub_i32 s6, s0, s2
6035 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2
6036 ; GFX6-NEXT: s_cselect_b32 s0, s6, s0
6037 ; GFX6-NEXT: s_sub_i32 s2, 0, s3
6038 ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1
6039 ; GFX6-NEXT: s_mov_b32 s6, -1
6040 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0
6041 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
6042 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0
6043 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
6044 ; GFX6-NEXT: s_mul_i32 s2, s2, s3
6045 ; GFX6-NEXT: s_sub_i32 s1, s1, s2
6046 ; GFX6-NEXT: s_sub_i32 s2, s1, s3
6047 ; GFX6-NEXT: s_cmp_ge_u32 s1, s3
6048 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1
6049 ; GFX6-NEXT: s_sub_i32 s2, s1, s3
6050 ; GFX6-NEXT: s_cmp_ge_u32 s1, s3
6051 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1
6052 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
6053 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
6054 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6055 ; GFX6-NEXT: s_endpgm
6057 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
6059 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
6060 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6061 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6062 ; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2
6063 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
6064 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3
6065 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
6066 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
6067 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
6068 ; GFX9-NEXT: s_sub_i32 s4, 0, s7
6069 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
6070 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6071 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
6072 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
6073 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
6074 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
6075 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
6076 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
6077 ; GFX9-NEXT: s_add_i32 s5, s5, s4
6078 ; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5
6079 ; GFX9-NEXT: s_mul_i32 s4, s4, s7
6080 ; GFX9-NEXT: s_sub_i32 s0, s0, s4
6081 ; GFX9-NEXT: s_sub_i32 s4, s0, s7
6082 ; GFX9-NEXT: s_cmp_ge_u32 s0, s7
6083 ; GFX9-NEXT: s_cselect_b32 s0, s4, s0
6084 ; GFX9-NEXT: s_sub_i32 s4, s0, s7
6085 ; GFX9-NEXT: s_cmp_ge_u32 s0, s7
6086 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1
6087 ; GFX9-NEXT: s_cselect_b32 s0, s4, s0
6088 ; GFX9-NEXT: s_sub_i32 s4, 0, s6
6089 ; GFX9-NEXT: s_mul_i32 s4, s4, s8
6090 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
6091 ; GFX9-NEXT: s_add_i32 s8, s8, s4
6092 ; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8
6093 ; GFX9-NEXT: s_mul_i32 s4, s4, s6
6094 ; GFX9-NEXT: s_sub_i32 s1, s1, s4
6095 ; GFX9-NEXT: s_sub_i32 s4, s1, s6
6096 ; GFX9-NEXT: s_cmp_ge_u32 s1, s6
6097 ; GFX9-NEXT: s_cselect_b32 s1, s4, s1
6098 ; GFX9-NEXT: s_sub_i32 s4, s1, s6
6099 ; GFX9-NEXT: s_cmp_ge_u32 s1, s6
6100 ; GFX9-NEXT: s_cselect_b32 s1, s4, s1
6101 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
6102 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
6103 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6104 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
6105 ; GFX9-NEXT: s_endpgm
6106 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6107 %r = urem <2 x i32> %x, %shl.y
6108 store <2 x i32> %r, ptr addrspace(1) %out
; sdiv of an i32 kernel argument by the odd constant 1235195.
; The opt output is expected to keep the sdiv unchanged. Both llc outputs
; expect a signed multiply-high by 0xd9528441, an add of the dividend, and a
; shift-based fix-up (logical shift right by 31, arithmetic shift right by 20,
; then an add of the two).
6112 define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
6113 ; CHECK-LABEL: @sdiv_i32_oddk_denom(
6114 ; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
6115 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6116 ; CHECK-NEXT: ret void
6118 ; GFX6-LABEL: sdiv_i32_oddk_denom:
6120 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
6121 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
6122 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441
6123 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
6124 ; GFX6-NEXT: s_mov_b32 s2, -1
6125 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6126 ; GFX6-NEXT: v_mul_hi_i32 v0, s6, v0
6127 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v0
6128 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0
6129 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0
6130 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
6131 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
6132 ; GFX6-NEXT: s_endpgm
6134 ; GFX9-LABEL: sdiv_i32_oddk_denom:
6136 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
6137 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
6138 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6139 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6140 ; GFX9-NEXT: s_mul_hi_i32 s3, s2, 0xd9528441
6141 ; GFX9-NEXT: s_add_i32 s3, s3, s2
6142 ; GFX9-NEXT: s_lshr_b32 s2, s3, 31
6143 ; GFX9-NEXT: s_ashr_i32 s3, s3, 20
6144 ; GFX9-NEXT: s_add_i32 s2, s3, s2
6145 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6146 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
6147 ; GFX9-NEXT: s_endpgm
6148 %r = sdiv i32 %x, 1235195
6149 store i32 %r, ptr addrspace(1) %out
; sdiv of an i32 kernel argument by the power-of-two constant 4096.
; The opt output is expected to keep the sdiv unchanged. Both llc outputs
; expect a shift-only sequence: arithmetic shift right by 31, logical shift
; right by 20, add to the dividend, then arithmetic shift right by 12.
6153 define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
6154 ; CHECK-LABEL: @sdiv_i32_pow2k_denom(
6155 ; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
6156 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6157 ; CHECK-NEXT: ret void
6159 ; GFX6-LABEL: sdiv_i32_pow2k_denom:
6161 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
6162 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
6163 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
6164 ; GFX6-NEXT: s_mov_b32 s2, -1
6165 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6166 ; GFX6-NEXT: s_ashr_i32 s4, s6, 31
6167 ; GFX6-NEXT: s_lshr_b32 s4, s4, 20
6168 ; GFX6-NEXT: s_add_i32 s6, s6, s4
6169 ; GFX6-NEXT: s_ashr_i32 s4, s6, 12
6170 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
6171 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
6172 ; GFX6-NEXT: s_endpgm
6174 ; GFX9-LABEL: sdiv_i32_pow2k_denom:
6176 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
6177 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
6178 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6179 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6180 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
6181 ; GFX9-NEXT: s_lshr_b32 s3, s3, 20
6182 ; GFX9-NEXT: s_add_i32 s2, s2, s3
6183 ; GFX9-NEXT: s_ashr_i32 s2, s2, 12
6184 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6185 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
6186 ; GFX9-NEXT: s_endpgm
6187 %r = sdiv i32 %x, 4096
6188 store i32 %r, ptr addrspace(1) %out
; sdiv of i32 %x by a divisor computed at run time as (4096 << %y).
; The opt output is expected to keep the shl and sdiv unchanged. Both llc
; outputs expect the v_rcp_iflag_f32 based division sequence applied to the
; sign-stripped operands (ashr 31 / add / xor on each), followed by two
; compare-and-select quotient corrections, with the result sign restored at
; the end by an xor and a subtract of the combined sign mask.
6192 define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
6193 ; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
6194 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6195 ; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
6196 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6197 ; CHECK-NEXT: ret void
6199 ; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
6201 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
6202 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
6203 ; GFX6-NEXT: s_mov_b32 s6, -1
6204 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6205 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
6206 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31
6207 ; GFX6-NEXT: s_add_i32 s3, s3, s8
6208 ; GFX6-NEXT: s_xor_b32 s3, s3, s8
6209 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
6210 ; GFX6-NEXT: s_sub_i32 s4, 0, s3
6211 ; GFX6-NEXT: s_ashr_i32 s9, s2, 31
6212 ; GFX6-NEXT: s_add_i32 s2, s2, s9
6213 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
6214 ; GFX6-NEXT: s_xor_b32 s2, s2, s9
6215 ; GFX6-NEXT: s_mov_b32 s5, s1
6216 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6217 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
6218 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
6219 ; GFX6-NEXT: s_mov_b32 s4, s0
6220 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
6221 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
6222 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
6223 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0
6224 ; GFX6-NEXT: s_mul_i32 s0, s0, s3
6225 ; GFX6-NEXT: s_sub_i32 s0, s2, s0
6226 ; GFX6-NEXT: s_sub_i32 s1, s0, s3
6227 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
6228 ; GFX6-NEXT: s_cmp_ge_u32 s0, s3
6229 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
6230 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
6231 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
6232 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
6233 ; GFX6-NEXT: s_cmp_ge_u32 s0, s3
6234 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
6235 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
6236 ; GFX6-NEXT: s_xor_b32 s0, s9, s8
6237 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
6238 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
6239 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
6240 ; GFX6-NEXT: s_endpgm
6242 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
6244 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6245 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6246 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6247 ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
6248 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
6249 ; GFX9-NEXT: s_add_i32 s3, s3, s4
6250 ; GFX9-NEXT: s_xor_b32 s3, s3, s4
6251 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
6252 ; GFX9-NEXT: s_sub_i32 s6, 0, s3
6253 ; GFX9-NEXT: s_ashr_i32 s5, s2, 31
6254 ; GFX9-NEXT: s_add_i32 s2, s2, s5
6255 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
6256 ; GFX9-NEXT: s_xor_b32 s2, s2, s5
6257 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6258 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
6259 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0
6260 ; GFX9-NEXT: s_mul_i32 s6, s6, s7
6261 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
6262 ; GFX9-NEXT: s_add_i32 s7, s7, s6
6263 ; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7
6264 ; GFX9-NEXT: s_mul_i32 s8, s6, s3
6265 ; GFX9-NEXT: s_sub_i32 s2, s2, s8
6266 ; GFX9-NEXT: s_add_i32 s7, s6, 1
6267 ; GFX9-NEXT: s_sub_i32 s8, s2, s3
6268 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
6269 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6
6270 ; GFX9-NEXT: s_cselect_b32 s2, s8, s2
6271 ; GFX9-NEXT: s_add_i32 s7, s6, 1
6272 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
6273 ; GFX9-NEXT: s_cselect_b32 s2, s7, s6
6274 ; GFX9-NEXT: s_xor_b32 s3, s5, s4
6275 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
6276 ; GFX9-NEXT: s_sub_i32 s2, s2, s3
6277 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6278 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
6279 ; GFX9-NEXT: s_endpgm
6280 %shl.y = shl i32 4096, %y
6281 %r = sdiv i32 %x, %shl.y
6282 store i32 %r, ptr addrspace(1) %out
; <2 x i32> sdiv by a vector whose elements are both the constant 4096.
; The opt output is expected to split the vector op into one scalar
; sdiv-by-4096 per element (extractelement / sdiv / insertelement). Both llc
; outputs expect the shift-only sequence (ashr 31, lshr 20, add, ashr 12) for
; each element.
6286 define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
6287 ; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
6288 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6289 ; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6290 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
6291 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6292 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
6293 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6294 ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
6295 ; CHECK-NEXT: ret void
6297 ; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
6299 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
6300 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
6301 ; GFX6-NEXT: s_mov_b32 s6, -1
6302 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6303 ; GFX6-NEXT: s_mov_b32 s4, s0
6304 ; GFX6-NEXT: s_mov_b32 s5, s1
6305 ; GFX6-NEXT: s_ashr_i32 s0, s2, 31
6306 ; GFX6-NEXT: s_ashr_i32 s1, s3, 31
6307 ; GFX6-NEXT: s_lshr_b32 s0, s0, 20
6308 ; GFX6-NEXT: s_lshr_b32 s1, s1, 20
6309 ; GFX6-NEXT: s_add_i32 s0, s2, s0
6310 ; GFX6-NEXT: s_add_i32 s1, s3, s1
6311 ; GFX6-NEXT: s_ashr_i32 s0, s0, 12
6312 ; GFX6-NEXT: s_ashr_i32 s1, s1, 12
6313 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
6314 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
6315 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6316 ; GFX6-NEXT: s_endpgm
6318 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
6320 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6321 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6322 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6323 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31
6324 ; GFX9-NEXT: s_ashr_i32 s5, s3, 31
6325 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
6326 ; GFX9-NEXT: s_lshr_b32 s5, s5, 20
6327 ; GFX9-NEXT: s_add_i32 s2, s2, s4
6328 ; GFX9-NEXT: s_add_i32 s3, s3, s5
6329 ; GFX9-NEXT: s_ashr_i32 s2, s2, 12
6330 ; GFX9-NEXT: s_ashr_i32 s3, s3, 12
6331 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6332 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
6333 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
6334 ; GFX9-NEXT: s_endpgm
6335 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
6336 store <2 x i32> %r, ptr addrspace(1) %out
; <2 x i32> sdiv by the non-uniform constant vector <4096, 4095>.
; The opt output is expected to split the vector op into one scalar sdiv per
; element, each keeping its own constant divisor. In both llc outputs element 0
; (divisor 4096) uses the shift-only sequence ending in an arithmetic shift
; right by 12, while element 1 (divisor 4095) uses a signed multiply-high by
; 0x80080081, an add of the dividend, and shifts by 31 and 11.
; NOTE(review): the "ssdiv" prefix in the function name looks like a typo for
; "sdiv"; confirm before renaming, since the label assertions embed the name
; and would have to be regenerated.
6340 define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
6341 ; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
6342 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6343 ; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6344 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
6345 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6346 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
6347 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6348 ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
6349 ; CHECK-NEXT: ret void
6351 ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6353 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
6354 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081
6355 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
6356 ; GFX6-NEXT: s_mov_b32 s6, -1
6357 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6358 ; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0
6359 ; GFX6-NEXT: s_mov_b32 s4, s0
6360 ; GFX6-NEXT: s_ashr_i32 s0, s2, 31
6361 ; GFX6-NEXT: s_lshr_b32 s0, s0, 20
6362 ; GFX6-NEXT: s_add_i32 s0, s2, s0
6363 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0
6364 ; GFX6-NEXT: s_ashr_i32 s0, s0, 12
6365 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0
6366 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0
6367 ; GFX6-NEXT: s_mov_b32 s5, s1
6368 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1
6369 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
6370 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6371 ; GFX6-NEXT: s_endpgm
6373 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6375 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6376 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6377 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6378 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31
6379 ; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081
6380 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
6381 ; GFX9-NEXT: s_add_i32 s5, s5, s3
6382 ; GFX9-NEXT: s_add_i32 s2, s2, s4
6383 ; GFX9-NEXT: s_lshr_b32 s3, s5, 31
6384 ; GFX9-NEXT: s_ashr_i32 s4, s5, 11
6385 ; GFX9-NEXT: s_ashr_i32 s2, s2, 12
6386 ; GFX9-NEXT: s_add_i32 s4, s4, s3
6387 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6388 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6389 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
6390 ; GFX9-NEXT: s_endpgm
6391 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
6392 store <2 x i32> %r, ptr addrspace(1) %out
6396 define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
6397 ; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
6398 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
6399 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6400 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6401 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6402 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6403 ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6404 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
6405 ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
6406 ; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
6407 ; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
6408 ; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
6409 ; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
6410 ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
6411 ; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
6412 ; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]]
6413 ; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
6414 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
6415 ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
6416 ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
6417 ; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
6418 ; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
6419 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
6420 ; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
6421 ; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
6422 ; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
6423 ; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
6424 ; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
6425 ; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
6426 ; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
6427 ; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
6428 ; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
6429 ; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
6430 ; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1
6431 ; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
6432 ; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
6433 ; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
6434 ; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
6435 ; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1
6436 ; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
6437 ; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
6438 ; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
6439 ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> poison, i32 [[TMP40]], i64 0
6440 ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
6441 ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6442 ; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
6443 ; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
6444 ; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
6445 ; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
6446 ; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
6447 ; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
6448 ; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
6449 ; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
6450 ; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
6451 ; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
6452 ; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
6453 ; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]]
6454 ; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
6455 ; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
6456 ; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
6457 ; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
6458 ; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
6459 ; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
6460 ; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
6461 ; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
6462 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
6463 ; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
6464 ; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
6465 ; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
6466 ; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
6467 ; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
6468 ; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
6469 ; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
6470 ; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
6471 ; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1
6472 ; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
6473 ; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
6474 ; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
6475 ; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
6476 ; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1
6477 ; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
6478 ; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
6479 ; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
6480 ; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
6481 ; CHECK-NEXT: store <2 x i32> [[TMP82]], ptr addrspace(1) [[OUT:%.*]], align 8
6482 ; CHECK-NEXT: ret void
6484 ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
6486 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
6487 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
6488 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6489 ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2
6490 ; GFX6-NEXT: s_abs_i32 s6, s2
6491 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
6492 ; GFX6-NEXT: s_sub_i32 s7, 0, s6
6493 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
6494 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
6495 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6496 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
6497 ; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0
6498 ; GFX6-NEXT: s_abs_i32 s7, s0
6499 ; GFX6-NEXT: s_xor_b32 s0, s0, s2
6500 ; GFX6-NEXT: s_ashr_i32 s0, s0, 31
6501 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
6502 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
6503 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
6504 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
6505 ; GFX6-NEXT: s_mul_i32 s2, s2, s6
6506 ; GFX6-NEXT: s_sub_i32 s2, s7, s2
6507 ; GFX6-NEXT: s_sub_i32 s7, s2, s6
6508 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
6509 ; GFX6-NEXT: s_cmp_ge_u32 s2, s6
6510 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
6511 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
6512 ; GFX6-NEXT: s_cselect_b32 s2, s7, s2
6513 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
6514 ; GFX6-NEXT: s_cmp_ge_u32 s2, s6
6515 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
6516 ; GFX6-NEXT: s_abs_i32 s2, s3
6517 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2
6518 ; GFX6-NEXT: s_sub_i32 s6, 0, s2
6519 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
6520 ; GFX6-NEXT: s_xor_b32 s3, s1, s3
6521 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
6522 ; GFX6-NEXT: s_abs_i32 s1, s1
6523 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
6524 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
6525 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
6526 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
6527 ; GFX6-NEXT: s_ashr_i32 s3, s3, 31
6528 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
6529 ; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2
6530 ; GFX6-NEXT: s_mov_b32 s6, -1
6531 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3
6532 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
6533 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1
6534 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1
6535 ; GFX6-NEXT: s_mul_i32 s0, s0, s2
6536 ; GFX6-NEXT: s_sub_i32 s0, s1, s0
6537 ; GFX6-NEXT: s_sub_i32 s1, s0, s2
6538 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1
6539 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2
6540 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
6541 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
6542 ; GFX6-NEXT: s_cselect_b32 s0, s1, s0
6543 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1
6544 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2
6545 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
6546 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
6547 ; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1
6548 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v1
6549 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6550 ; GFX6-NEXT: s_endpgm
6552 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
6554 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
6555 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6556 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6557 ; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2
6558 ; GFX9-NEXT: s_abs_i32 s6, s2
6559 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
6560 ; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3
6561 ; GFX9-NEXT: s_abs_i32 s3, s0
6562 ; GFX9-NEXT: s_xor_b32 s0, s0, s2
6563 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
6564 ; GFX9-NEXT: s_sub_i32 s2, 0, s6
6565 ; GFX9-NEXT: s_ashr_i32 s0, s0, 31
6566 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6567 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
6568 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0
6569 ; GFX9-NEXT: s_mul_i32 s2, s2, s8
6570 ; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2
6571 ; GFX9-NEXT: s_add_i32 s8, s8, s2
6572 ; GFX9-NEXT: s_mul_hi_u32 s2, s3, s8
6573 ; GFX9-NEXT: s_mul_i32 s8, s2, s6
6574 ; GFX9-NEXT: s_sub_i32 s3, s3, s8
6575 ; GFX9-NEXT: s_add_i32 s9, s2, 1
6576 ; GFX9-NEXT: s_sub_i32 s8, s3, s6
6577 ; GFX9-NEXT: s_cmp_ge_u32 s3, s6
6578 ; GFX9-NEXT: s_cselect_b32 s2, s9, s2
6579 ; GFX9-NEXT: s_cselect_b32 s3, s8, s3
6580 ; GFX9-NEXT: s_add_i32 s8, s2, 1
6581 ; GFX9-NEXT: s_cmp_ge_u32 s3, s6
6582 ; GFX9-NEXT: s_cselect_b32 s6, s8, s2
6583 ; GFX9-NEXT: s_abs_i32 s8, s7
6584 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
6585 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
6586 ; GFX9-NEXT: s_xor_b32 s5, s6, s0
6587 ; GFX9-NEXT: s_sub_i32 s6, 0, s8
6588 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
6589 ; GFX9-NEXT: s_sub_i32 s0, s5, s0
6590 ; GFX9-NEXT: s_xor_b32 s4, s1, s7
6591 ; GFX9-NEXT: s_abs_i32 s1, s1
6592 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6593 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
6594 ; GFX9-NEXT: s_ashr_i32 s4, s4, 31
6595 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
6596 ; GFX9-NEXT: s_mul_i32 s6, s6, s5
6597 ; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
6598 ; GFX9-NEXT: s_add_i32 s5, s5, s6
6599 ; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
6600 ; GFX9-NEXT: s_mul_i32 s6, s5, s8
6601 ; GFX9-NEXT: s_sub_i32 s1, s1, s6
6602 ; GFX9-NEXT: s_add_i32 s7, s5, 1
6603 ; GFX9-NEXT: s_sub_i32 s6, s1, s8
6604 ; GFX9-NEXT: s_cmp_ge_u32 s1, s8
6605 ; GFX9-NEXT: s_cselect_b32 s5, s7, s5
6606 ; GFX9-NEXT: s_cselect_b32 s1, s6, s1
6607 ; GFX9-NEXT: s_add_i32 s6, s5, 1
6608 ; GFX9-NEXT: s_cmp_ge_u32 s1, s8
6609 ; GFX9-NEXT: s_cselect_b32 s1, s6, s5
6610 ; GFX9-NEXT: s_xor_b32 s1, s1, s4
6611 ; GFX9-NEXT: s_sub_i32 s1, s1, s4
6612 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
6613 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
6614 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6615 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
6616 ; GFX9-NEXT: s_endpgm
6617 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6618 %r = sdiv <2 x i32> %x, %shl.y
6619 store <2 x i32> %r, ptr addrspace(1) %out
; srem_i32_oddk_denom: signed 32-bit remainder by the odd constant 1235195.
; The opt run is expected to leave the srem in the IR unchanged. Both llc runs
; are expected to lower it to a signed multiply-high by 0xd9528441 plus the
; numerator, a sign-bit/shift fix-up that forms the quotient, and finally
; x - quotient * 0x12d8fb (0x12d8fb == 1235195).
6623 define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
6624 ; CHECK-LABEL: @srem_i32_oddk_denom(
6625 ; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195
6626 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6627 ; CHECK-NEXT: ret void
6629 ; GFX6-LABEL: srem_i32_oddk_denom:
6631 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
6632 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
6633 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441
6634 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
6635 ; GFX6-NEXT: s_mov_b32 s2, -1
6636 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6637 ; GFX6-NEXT: v_mul_hi_i32 v0, s6, v0
6638 ; GFX6-NEXT: v_readfirstlane_b32 s4, v0
6639 ; GFX6-NEXT: s_add_i32 s4, s4, s6
6640 ; GFX6-NEXT: s_lshr_b32 s5, s4, 31
6641 ; GFX6-NEXT: s_ashr_i32 s4, s4, 20
6642 ; GFX6-NEXT: s_add_i32 s4, s4, s5
6643 ; GFX6-NEXT: s_mul_i32 s4, s4, 0x12d8fb
6644 ; GFX6-NEXT: s_sub_i32 s4, s6, s4
6645 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
6646 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
6647 ; GFX6-NEXT: s_endpgm
6649 ; GFX9-LABEL: srem_i32_oddk_denom:
6651 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
6652 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
6653 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6654 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6655 ; GFX9-NEXT: s_mul_hi_i32 s3, s2, 0xd9528441
6656 ; GFX9-NEXT: s_add_i32 s3, s3, s2
6657 ; GFX9-NEXT: s_lshr_b32 s4, s3, 31
6658 ; GFX9-NEXT: s_ashr_i32 s3, s3, 20
6659 ; GFX9-NEXT: s_add_i32 s3, s3, s4
6660 ; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb
6661 ; GFX9-NEXT: s_sub_i32 s2, s2, s3
6662 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6663 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
6664 ; GFX9-NEXT: s_endpgm
; Input IR under test: the remainder is written to %out.
6665 %r = srem i32 %x, 1235195
6666 store i32 %r, ptr addrspace(1) %out
; srem_i32_pow2k_denom: signed 32-bit remainder by the power-of-two constant
; 4096. The opt run is expected to keep the srem as is. Both llc runs are
; expected to avoid any division: the sign mask (x >> 31) is logically shifted
; right by 20 to form a 0-or-4095 bias, the biased value is masked with
; 0xfffff000, and the result is subtracted from x.
6670 define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
6671 ; CHECK-LABEL: @srem_i32_pow2k_denom(
6672 ; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096
6673 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6674 ; CHECK-NEXT: ret void
6676 ; GFX6-LABEL: srem_i32_pow2k_denom:
6678 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
6679 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
6680 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
6681 ; GFX6-NEXT: s_mov_b32 s2, -1
6682 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6683 ; GFX6-NEXT: s_ashr_i32 s4, s6, 31
6684 ; GFX6-NEXT: s_lshr_b32 s4, s4, 20
6685 ; GFX6-NEXT: s_add_i32 s4, s6, s4
6686 ; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000
6687 ; GFX6-NEXT: s_sub_i32 s4, s6, s4
6688 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
6689 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
6690 ; GFX6-NEXT: s_endpgm
6692 ; GFX9-LABEL: srem_i32_pow2k_denom:
6694 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
6695 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
6696 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6697 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6698 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
6699 ; GFX9-NEXT: s_lshr_b32 s3, s3, 20
6700 ; GFX9-NEXT: s_add_i32 s3, s2, s3
6701 ; GFX9-NEXT: s_and_b32 s3, s3, 0xfffff000
6702 ; GFX9-NEXT: s_sub_i32 s2, s2, s3
6703 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6704 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
6705 ; GFX9-NEXT: s_endpgm
; Input IR under test: the remainder is written to %out.
6706 %r = srem i32 %x, 4096
6707 store i32 %r, ptr addrspace(1) %out
; srem_i32_pow2_shl_denom: signed 32-bit remainder where the denominator is
; the run-time value 4096 << %y. The opt run is expected to keep the shl and
; the srem unchanged. Both llc runs are expected to emit the generic expansion:
; absolute values of both operands (ashr/add/xor), a float reciprocal of the
; denominator scaled by 0x4f7ffffe, one multiply-high refinement step, two
; compare-and-subtract corrections of the remainder, and finally restoring the
; sign of the numerator with xor/sub.
6711 define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) {
6712 ; CHECK-LABEL: @srem_i32_pow2_shl_denom(
6713 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6714 ; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
6715 ; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
6716 ; CHECK-NEXT: ret void
6718 ; GFX6-LABEL: srem_i32_pow2_shl_denom:
6720 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
6721 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6722 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
6723 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31
6724 ; GFX6-NEXT: s_add_i32 s3, s3, s4
6725 ; GFX6-NEXT: s_xor_b32 s4, s3, s4
6726 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4
6727 ; GFX6-NEXT: s_sub_i32 s3, 0, s4
6728 ; GFX6-NEXT: s_ashr_i32 s5, s2, 31
6729 ; GFX6-NEXT: s_add_i32 s2, s2, s5
6730 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
6731 ; GFX6-NEXT: s_xor_b32 s6, s2, s5
6732 ; GFX6-NEXT: s_mov_b32 s2, -1
6733 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6734 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
6735 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
6736 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
6737 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
6738 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
6739 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
6740 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0
6741 ; GFX6-NEXT: s_mul_i32 s7, s7, s4
6742 ; GFX6-NEXT: s_sub_i32 s6, s6, s7
6743 ; GFX6-NEXT: s_sub_i32 s7, s6, s4
6744 ; GFX6-NEXT: s_cmp_ge_u32 s6, s4
6745 ; GFX6-NEXT: s_cselect_b32 s6, s7, s6
6746 ; GFX6-NEXT: s_sub_i32 s7, s6, s4
6747 ; GFX6-NEXT: s_cmp_ge_u32 s6, s4
6748 ; GFX6-NEXT: s_cselect_b32 s4, s7, s6
6749 ; GFX6-NEXT: s_xor_b32 s4, s4, s5
6750 ; GFX6-NEXT: s_sub_i32 s4, s4, s5
6751 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
6752 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
6753 ; GFX6-NEXT: s_endpgm
6755 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
6757 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6758 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6759 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6760 ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
6761 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
6762 ; GFX9-NEXT: s_add_i32 s3, s3, s4
6763 ; GFX9-NEXT: s_xor_b32 s3, s3, s4
6764 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
6765 ; GFX9-NEXT: s_sub_i32 s5, 0, s3
6766 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31
6767 ; GFX9-NEXT: s_add_i32 s2, s2, s4
6768 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
6769 ; GFX9-NEXT: s_xor_b32 s2, s2, s4
6770 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6771 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
6772 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
6773 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
6774 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
6775 ; GFX9-NEXT: s_add_i32 s6, s6, s5
6776 ; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
6777 ; GFX9-NEXT: s_mul_i32 s5, s5, s3
6778 ; GFX9-NEXT: s_sub_i32 s2, s2, s5
6779 ; GFX9-NEXT: s_sub_i32 s5, s2, s3
6780 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
6781 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
6782 ; GFX9-NEXT: s_sub_i32 s5, s2, s3
6783 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
6784 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
6785 ; GFX9-NEXT: s_xor_b32 s2, s2, s4
6786 ; GFX9-NEXT: s_sub_i32 s2, s2, s4
6787 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6788 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
6789 ; GFX9-NEXT: s_endpgm
; Input IR under test: denominator is 4096 shifted left by %y.
6790 %shl.y = shl i32 4096, %y
6791 %r = srem i32 %x, %shl.y
6792 store i32 %r, ptr addrspace(1) %out
; srem_v2i32_pow2k_denom: <2 x i32> signed remainder by a vector whose two
; elements are both 4096. The opt run is expected to scalarize the operation
; into one extractelement / srem-by-4096 / insertelement triple per lane.
; Both llc runs are expected to emit, for each lane, the same division-free
; sequence as the scalar power-of-two case (ashr 31, lshr 20, add,
; and 0xfffff000, sub) and a single 64-bit store.
6796 define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) {
6797 ; CHECK-LABEL: @srem_v2i32_pow2k_denom(
6798 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6799 ; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
6800 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
6801 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6802 ; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
6803 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6804 ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8
6805 ; CHECK-NEXT: ret void
6807 ; GFX6-LABEL: srem_v2i32_pow2k_denom:
6809 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
6810 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
6811 ; GFX6-NEXT: s_mov_b32 s6, -1
6812 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6813 ; GFX6-NEXT: s_mov_b32 s4, s0
6814 ; GFX6-NEXT: s_mov_b32 s5, s1
6815 ; GFX6-NEXT: s_ashr_i32 s0, s2, 31
6816 ; GFX6-NEXT: s_ashr_i32 s1, s3, 31
6817 ; GFX6-NEXT: s_lshr_b32 s0, s0, 20
6818 ; GFX6-NEXT: s_lshr_b32 s1, s1, 20
6819 ; GFX6-NEXT: s_add_i32 s0, s2, s0
6820 ; GFX6-NEXT: s_add_i32 s1, s3, s1
6821 ; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000
6822 ; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000
6823 ; GFX6-NEXT: s_sub_i32 s0, s2, s0
6824 ; GFX6-NEXT: s_sub_i32 s1, s3, s1
6825 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
6826 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
6827 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6828 ; GFX6-NEXT: s_endpgm
6830 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
6832 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6833 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6834 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6835 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31
6836 ; GFX9-NEXT: s_ashr_i32 s5, s3, 31
6837 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
6838 ; GFX9-NEXT: s_lshr_b32 s5, s5, 20
6839 ; GFX9-NEXT: s_add_i32 s4, s2, s4
6840 ; GFX9-NEXT: s_add_i32 s5, s3, s5
6841 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
6842 ; GFX9-NEXT: s_sub_i32 s2, s2, s4
6843 ; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000
6844 ; GFX9-NEXT: s_sub_i32 s3, s3, s4
6845 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
6846 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
6847 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
6848 ; GFX9-NEXT: s_endpgm
; Input IR under test: both lanes are reduced modulo 4096 and stored to %out.
6849 %r = srem <2 x i32> %x, <i32 4096, i32 4096>
6850 store <2 x i32> %r, ptr addrspace(1) %out
; srem_v2i32_pow2_shl_denom: <2 x i32> signed remainder where each lane of the
; denominator is the run-time value 4096 << %y[lane].
; The opt run is expected to scalarize the operation and expand every lane in
; IR: sign masks via ashr 31, absolute values via add/xor, uitofp plus
; llvm.amdgcn.rcp.f32 scaled by 0x41EFFFFFC0000000, one 64-bit multiply-high
; refinement of the reciprocal, a multiply-high by the numerator, two
; icmp-uge/select corrections of the remainder, and a final xor/sub with the
; numerator's sign mask before the lane is inserted into the result vector.
; Both llc runs are expected to emit the matching per-lane machine sequence
; (s_abs_i32, v_rcp_iflag_f32, multiply-high, two compare/select steps,
; xor/sub sign restore) followed by a single 64-bit store.
6854 define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) {
6855 ; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
6856 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]]
6857 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6858 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6859 ; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6860 ; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6861 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
6862 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
6863 ; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
6864 ; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
6865 ; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
6866 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6867 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
6868 ; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
6869 ; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]]
6870 ; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
6871 ; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
6872 ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
6873 ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
6874 ; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
6875 ; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
6876 ; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
6877 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
6878 ; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
6879 ; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
6880 ; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
6881 ; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
6882 ; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
6883 ; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
6884 ; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
6885 ; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
6886 ; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
6887 ; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
6888 ; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
6889 ; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
6890 ; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
6891 ; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
6892 ; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
6893 ; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
6894 ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> poison, i32 [[TMP37]], i64 0
6895 ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
6896 ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6897 ; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
6898 ; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
6899 ; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
6900 ; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
6901 ; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
6902 ; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
6903 ; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
6904 ; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
6905 ; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
6906 ; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
6907 ; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]]
6908 ; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
6909 ; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
6910 ; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
6911 ; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
6912 ; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
6913 ; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
6914 ; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
6915 ; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
6916 ; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
6917 ; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
6918 ; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
6919 ; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
6920 ; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
6921 ; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
6922 ; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
6923 ; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
6924 ; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
6925 ; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
6926 ; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
6927 ; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
6928 ; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
6929 ; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
6930 ; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
6931 ; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
6932 ; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
6933 ; CHECK-NEXT: store <2 x i32> [[TMP76]], ptr addrspace(1) [[OUT:%.*]], align 8
6934 ; CHECK-NEXT: ret void
6936 ; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
6938 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
6939 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
6940 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
6941 ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2
6942 ; GFX6-NEXT: s_abs_i32 s2, s2
6943 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
6944 ; GFX6-NEXT: s_sub_i32 s6, 0, s2
6945 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
6946 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
6947 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6948 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
6949 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
6950 ; GFX6-NEXT: s_abs_i32 s6, s0
6951 ; GFX6-NEXT: s_ashr_i32 s0, s0, 31
6952 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
6953 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
6954 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
6955 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0
6956 ; GFX6-NEXT: s_mul_i32 s7, s7, s2
6957 ; GFX6-NEXT: s_sub_i32 s6, s6, s7
6958 ; GFX6-NEXT: s_sub_i32 s7, s6, s2
6959 ; GFX6-NEXT: s_cmp_ge_u32 s6, s2
6960 ; GFX6-NEXT: s_cselect_b32 s6, s7, s6
6961 ; GFX6-NEXT: s_sub_i32 s7, s6, s2
6962 ; GFX6-NEXT: s_cmp_ge_u32 s6, s2
6963 ; GFX6-NEXT: s_cselect_b32 s2, s7, s6
6964 ; GFX6-NEXT: s_abs_i32 s3, s3
6965 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
6966 ; GFX6-NEXT: s_sub_i32 s6, 0, s3
6967 ; GFX6-NEXT: s_abs_i32 s8, s1
6968 ; GFX6-NEXT: s_xor_b32 s2, s2, s0
6969 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
6970 ; GFX6-NEXT: s_sub_i32 s0, s2, s0
6971 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31
6972 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
6973 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
6974 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
6975 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0
6976 ; GFX6-NEXT: s_mov_b32 s6, -1
6977 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
6978 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
6979 ; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
6980 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0
6981 ; GFX6-NEXT: s_mul_i32 s2, s2, s3
6982 ; GFX6-NEXT: s_sub_i32 s2, s8, s2
6983 ; GFX6-NEXT: s_sub_i32 s8, s2, s3
6984 ; GFX6-NEXT: s_cmp_ge_u32 s2, s3
6985 ; GFX6-NEXT: s_cselect_b32 s2, s8, s2
6986 ; GFX6-NEXT: s_sub_i32 s8, s2, s3
6987 ; GFX6-NEXT: s_cmp_ge_u32 s2, s3
6988 ; GFX6-NEXT: s_cselect_b32 s2, s8, s2
6989 ; GFX6-NEXT: s_xor_b32 s2, s2, s1
6990 ; GFX6-NEXT: s_sub_i32 s1, s2, s1
6991 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
6992 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
6993 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6994 ; GFX6-NEXT: s_endpgm
6996 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
6998 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
6999 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7001 ; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2
7002 ; GFX9-NEXT: s_abs_i32 s2, s2
7003 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
7004 ; GFX9-NEXT: s_sub_i32 s7, 0, s2
7005 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31
7006 ; GFX9-NEXT: s_abs_i32 s0, s0
7007 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
7008 ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
7009 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
7010 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
7011 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0
7012 ; GFX9-NEXT: s_mul_i32 s7, s7, s8
7013 ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7
7014 ; GFX9-NEXT: s_add_i32 s8, s8, s7
7015 ; GFX9-NEXT: s_mul_hi_u32 s7, s0, s8
7016 ; GFX9-NEXT: s_mul_i32 s7, s7, s2
7017 ; GFX9-NEXT: s_sub_i32 s0, s0, s7
7018 ; GFX9-NEXT: s_sub_i32 s7, s0, s2
7019 ; GFX9-NEXT: s_cmp_ge_u32 s0, s2
7020 ; GFX9-NEXT: s_cselect_b32 s0, s7, s0
7021 ; GFX9-NEXT: s_sub_i32 s7, s0, s2
7022 ; GFX9-NEXT: s_cmp_ge_u32 s0, s2
7023 ; GFX9-NEXT: s_cselect_b32 s0, s7, s0
7024 ; GFX9-NEXT: s_abs_i32 s7, s3
7025 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
7026 ; GFX9-NEXT: s_xor_b32 s0, s0, s6
7027 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
7028 ; GFX9-NEXT: s_sub_i32 s5, 0, s7
7029 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
7030 ; GFX9-NEXT: s_sub_i32 s0, s0, s6
7031 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31
7032 ; GFX9-NEXT: s_abs_i32 s1, s1
7033 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
7034 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
7035 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0
7036 ; GFX9-NEXT: s_mul_i32 s5, s5, s6
7037 ; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
7038 ; GFX9-NEXT: s_add_i32 s6, s6, s5
7039 ; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6
7040 ; GFX9-NEXT: s_mul_i32 s5, s5, s7
7041 ; GFX9-NEXT: s_sub_i32 s1, s1, s5
7042 ; GFX9-NEXT: s_sub_i32 s5, s1, s7
7043 ; GFX9-NEXT: s_cmp_ge_u32 s1, s7
7044 ; GFX9-NEXT: s_cselect_b32 s1, s5, s1
7045 ; GFX9-NEXT: s_sub_i32 s5, s1, s7
7046 ; GFX9-NEXT: s_cmp_ge_u32 s1, s7
7047 ; GFX9-NEXT: s_cselect_b32 s1, s5, s1
7048 ; GFX9-NEXT: s_xor_b32 s1, s1, s4
7049 ; GFX9-NEXT: s_sub_i32 s1, s1, s4
7050 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
7051 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
7052 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7053 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
7054 ; GFX9-NEXT: s_endpgm
; Input IR under test: each denominator lane is 4096 shifted left by %y.
7055 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7056 %r = srem <2 x i32> %x, %shl.y
7057 store <2 x i32> %r, ptr addrspace(1) %out
; udiv_i64_oddk_denom: unsigned 64-bit division by the odd constant
; 1235195949943. The opt run is expected to leave the udiv unchanged. Both
; llc runs are expected to compute the quotient with 32-bit partial products
; against the constants 0x64c139ef and 0x38f83e5 (multiply-high/low with
; carry propagation), shift the resulting high word right by 2, and store it
; with a zero upper dword.
7061 define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
7062 ; CHECK-LABEL: @udiv_i64_oddk_denom(
7063 ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
7064 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7065 ; CHECK-NEXT: ret void
7067 ; GFX6-LABEL: udiv_i64_oddk_denom:
7069 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7070 ; GFX6-NEXT: v_mov_b32_e32 v1, 0x64c139ef
7071 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x38f83e5
7072 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7073 ; GFX6-NEXT: s_mov_b32 s6, -1
7074 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7075 ; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1
7076 ; GFX6-NEXT: v_mul_hi_u32 v3, s3, v1
7077 ; GFX6-NEXT: s_mov_b32 s5, s1
7078 ; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0
7079 ; GFX6-NEXT: s_mul_i32 s1, s3, 0x64c139ef
7080 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v4
7081 ; GFX6-NEXT: s_mov_b32 s4, s0
7082 ; GFX6-NEXT: s_mul_i32 s0, s2, 0x38f83e5
7083 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
7084 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v4
7085 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
7086 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
7087 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
7088 ; GFX6-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
7089 ; GFX6-NEXT: s_mul_i32 s0, s3, 0x38f83e5
7090 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v2
7091 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc
7092 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
7093 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v0
7094 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7095 ; GFX6-NEXT: s_endpgm
7097 ; GFX9-LABEL: udiv_i64_oddk_denom:
7099 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7100 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7101 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7102 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x38f83e5
7103 ; GFX9-NEXT: s_mul_i32 s5, s2, 0x38f83e5
7104 ; GFX9-NEXT: s_mul_i32 s7, s3, 0x64c139ef
7105 ; GFX9-NEXT: s_mul_hi_u32 s2, s2, 0x64c139ef
7106 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0x64c139ef
7107 ; GFX9-NEXT: s_add_u32 s2, s7, s2
7108 ; GFX9-NEXT: s_addc_u32 s6, s6, 0
7109 ; GFX9-NEXT: s_add_u32 s2, s5, s2
7110 ; GFX9-NEXT: s_addc_u32 s2, s4, 0
7111 ; GFX9-NEXT: s_add_u32 s2, s6, s2
7112 ; GFX9-NEXT: s_addc_u32 s4, 0, 0
7113 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, 0x38f83e5
7114 ; GFX9-NEXT: s_mul_i32 s3, s3, 0x38f83e5
7115 ; GFX9-NEXT: s_add_u32 s2, s3, s2
7116 ; GFX9-NEXT: s_addc_u32 s2, s5, s4
7117 ; GFX9-NEXT: s_lshr_b32 s2, s2, 2
7118 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7119 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
7120 ; GFX9-NEXT: s_endpgm
; Input IR under test: the quotient is written to %out.
7121 %r = udiv i64 %x, 1235195949943
7122 store i64 %r, ptr addrspace(1) %out
; udiv_i64_pow2k_denom: unsigned 64-bit division by the power-of-two constant
; 4096. The opt run is expected to leave the udiv unchanged. Both llc runs
; are expected to reduce it to a single 64-bit logical shift right by 12.
7126 define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
7127 ; CHECK-LABEL: @udiv_i64_pow2k_denom(
7128 ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096
7129 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7130 ; CHECK-NEXT: ret void
7132 ; GFX6-LABEL: udiv_i64_pow2k_denom:
7134 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7135 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7136 ; GFX6-NEXT: s_mov_b32 s6, -1
7137 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7138 ; GFX6-NEXT: s_mov_b32 s4, s0
7139 ; GFX6-NEXT: s_mov_b32 s5, s1
7140 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12
7141 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
7142 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
7143 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7144 ; GFX6-NEXT: s_endpgm
7146 ; GFX9-LABEL: udiv_i64_pow2k_denom:
7148 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7149 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7150 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7151 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12
7152 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7153 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7154 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7155 ; GFX9-NEXT: s_endpgm
; Input IR under test: the quotient is written to %out.
7156 %r = udiv i64 %x, 4096
7157 store i64 %r, ptr addrspace(1) %out
; udiv_i64_pow2_shl_denom: unsigned 64-bit division where the denominator is
; the run-time value 4096 << %y. The opt run is expected to keep the shl and
; the udiv unchanged. Both llc runs are expected to fold the division into a
; shift: a single dword is loaded for the shift amount, 12 is added to it,
; and the numerator is shifted right by that amount with one 64-bit lshr.
7161 define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
7162 ; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
7163 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7164 ; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
7165 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7166 ; CHECK-NEXT: ret void
7168 ; GFX6-LABEL: udiv_i64_pow2_shl_denom:
7170 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7171 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd
7172 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7173 ; GFX6-NEXT: s_mov_b32 s6, -1
7174 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7175 ; GFX6-NEXT: s_mov_b32 s4, s0
7176 ; GFX6-NEXT: s_add_i32 s8, s8, 12
7177 ; GFX6-NEXT: s_mov_b32 s5, s1
7178 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], s8
7179 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
7180 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
7181 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7182 ; GFX6-NEXT: s_endpgm
7184 ; GFX9-LABEL: udiv_i64_pow2_shl_denom:
7186 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34
7187 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7188 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7189 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7190 ; GFX9-NEXT: s_add_i32 s6, s6, 12
7191 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
7192 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7193 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7194 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7195 ; GFX9-NEXT: s_endpgm
; Input IR under test: denominator is 4096 shifted left by %y.
7196 %shl.y = shl i64 4096, %y
7197 %r = udiv i64 %x, %shl.y
7198 store i64 %r, ptr addrspace(1) %out
; udiv_v2i64_pow2k_denom: <2 x i64> unsigned division by a vector whose two
; elements are both 4096. The opt run is expected to scalarize the operation
; into one extractelement / udiv-by-4096 / insertelement triple per lane.
; Both llc runs are expected to emit one 64-bit logical shift right by 12 per
; lane and a single 128-bit store.
7202 define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
7203 ; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
7204 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7205 ; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7206 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
7207 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7208 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
7209 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7210 ; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
7211 ; CHECK-NEXT: ret void
7213 ; GFX6-LABEL: udiv_v2i64_pow2k_denom:
7215 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
7216 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
7217 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7218 ; GFX6-NEXT: s_mov_b32 s6, -1
7219 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7220 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12
7221 ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12
7222 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
7223 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
7224 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
7225 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
7226 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7227 ; GFX6-NEXT: s_endpgm
7229 ; GFX9-LABEL: udiv_v2i64_pow2k_denom:
7231 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
7232 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
7233 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7234 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7235 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 12
7236 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12
7237 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
7238 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
7239 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
7240 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
7241 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
7242 ; GFX9-NEXT: s_endpgm
; Input IR under test: both lanes are divided by 4096 and stored to %out.
7243 %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
7244 store <2 x i64> %r, ptr addrspace(1) %out
; Unsigned <2 x i64> division where only element 0 has a power-of-two divisor
; (4096); element 1 divides by 4095. The opt run expects per-element scalar
; udivs. In the llc output element 0 is a plain s_lshr_b64 by 12, while
; element 1 is expanded into a multiply-high sequence using the constants
; 0x100100 / 0x10010011, followed by a subtract from %x, a shift right by 1,
; an add and a final shift right by 11.
7248 define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
7249 ; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
7250 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7251 ; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7252 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
7253 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7254 ; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
7255 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7256 ; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
7257 ; CHECK-NEXT: ret void
7259 ; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
7261 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
7262 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
7263 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x10010011
7264 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100100
7265 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
7266 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7267 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
7268 ; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2
7269 ; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0
7270 ; GFX6-NEXT: s_mul_i32 s7, s11, 0x10010011
7271 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s7, v3
7272 ; GFX6-NEXT: s_mul_i32 s6, s10, 0x100100
7273 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
7274 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v3
7275 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7276 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
7277 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
7278 ; GFX6-NEXT: v_addc_u32_e64 v2, s[6:7], 0, 0, vcc
7279 ; GFX6-NEXT: s_mul_i32 s6, s11, 0x100100
7280 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v1
7281 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc
7282 ; GFX6-NEXT: v_mov_b32_e32 v1, s11
7283 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v3
7284 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
7285 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
7286 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], 12
7287 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
7288 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
7289 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[0:1], 11
7290 ; GFX6-NEXT: s_mov_b32 s2, -1
7291 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
7292 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
7293 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7294 ; GFX6-NEXT: s_endpgm
7296 ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
7298 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
7299 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
7300 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7301 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7302 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 12
7303 ; GFX9-NEXT: s_mul_i32 s9, s3, 0x10010011
7304 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, 0x10010011
7305 ; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x10010011
7306 ; GFX9-NEXT: s_add_u32 s9, s9, s10
7307 ; GFX9-NEXT: s_mul_i32 s5, s2, 0x100100
7308 ; GFX9-NEXT: s_addc_u32 s8, s8, 0
7309 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x100100
7310 ; GFX9-NEXT: s_add_u32 s5, s5, s9
7311 ; GFX9-NEXT: s_addc_u32 s4, s4, 0
7312 ; GFX9-NEXT: s_add_u32 s4, s8, s4
7313 ; GFX9-NEXT: s_addc_u32 s5, 0, 0
7314 ; GFX9-NEXT: s_mul_i32 s9, s3, 0x100100
7315 ; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x100100
7316 ; GFX9-NEXT: s_add_u32 s4, s9, s4
7317 ; GFX9-NEXT: s_addc_u32 s5, s8, s5
7318 ; GFX9-NEXT: s_sub_u32 s2, s2, s4
7319 ; GFX9-NEXT: s_subb_u32 s3, s3, s5
7320 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
7321 ; GFX9-NEXT: s_add_u32 s2, s2, s4
7322 ; GFX9-NEXT: s_addc_u32 s3, s3, s5
7323 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 11
7324 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
7325 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
7326 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
7327 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
7328 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
7329 ; GFX9-NEXT: s_endpgm
7330 %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
7331 store <2 x i64> %r, ptr addrspace(1) %out
; Unsigned <2 x i64> division by (splat 4096) << %y, i.e. a per-element
; variable power-of-two divisor. The opt run keeps the shl and scalarizes the
; udiv into one i64 udiv per element. Both llc runs expect each element to
; become a single s_lshr_b64 of %x whose shift amount is formed by an
; s_add_i32 of 12 to a dword loaded from the %y operand.
7335 define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
7336 ; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
7337 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
7338 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7339 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7340 ; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
7341 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
7342 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7343 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7344 ; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
7345 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7346 ; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
7347 ; CHECK-NEXT: ret void
7349 ; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
7351 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
7352 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
7353 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
7354 ; GFX6-NEXT: s_mov_b32 s2, -1
7355 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7356 ; GFX6-NEXT: s_add_i32 s4, s12, 12
7357 ; GFX6-NEXT: s_add_i32 s6, s14, 12
7358 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], s4
7359 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s6
7360 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
7361 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
7362 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
7363 ; GFX6-NEXT: v_mov_b32_e32 v3, s7
7364 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7365 ; GFX6-NEXT: s_endpgm
7367 ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
7369 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
7370 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
7371 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7372 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7373 ; GFX9-NEXT: s_add_i32 s2, s12, 12
7374 ; GFX9-NEXT: s_add_i32 s4, s14, 12
7375 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2
7376 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s4
7377 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7378 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7379 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
7380 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
7381 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
7382 ; GFX9-NEXT: s_endpgm
7383 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7384 %r = udiv <2 x i64> %x, %shl.y
7385 store <2 x i64> %r, ptr addrspace(1) %out
; Unsigned i64 remainder by the odd constant 1235195393993 (high dword 0x11f,
; low dword 0x9761f7c9). The opt run leaves the urem untouched. The llc runs
; expect a multiply-high sequence with the constants 0xe3e10011 / 0xf6841139
; and a shift right by 8, after which the shifted value is multiplied by the
; two dwords of the divisor and that product is subtracted from %x.
7389 define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
7390 ; CHECK-LABEL: @urem_i64_oddk_denom(
7391 ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
7392 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7393 ; CHECK-NEXT: ret void
7395 ; GFX6-LABEL: urem_i64_oddk_denom:
7397 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7398 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xf6841139
7399 ; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e10011
7400 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7401 ; GFX6-NEXT: s_mov_b32 s6, -1
7402 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7403 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2
7404 ; GFX6-NEXT: v_mul_hi_u32 v2, s3, v2
7405 ; GFX6-NEXT: v_mul_hi_u32 v1, s2, v0
7406 ; GFX6-NEXT: s_mul_i32 s5, s3, 0xf6841139
7407 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s5, v3
7408 ; GFX6-NEXT: s_mov_b32 s4, s0
7409 ; GFX6-NEXT: s_mul_i32 s0, s2, 0xe3e10011
7410 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
7411 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s0, v3
7412 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
7413 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7414 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
7415 ; GFX6-NEXT: s_mul_i32 s0, s3, 0xe3e10011
7416 ; GFX6-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc
7417 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1
7418 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v2, vcc
7419 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0
7420 ; GFX6-NEXT: s_movk_i32 s0, 0x11f
7421 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s0
7422 ; GFX6-NEXT: s_mov_b32 s0, 0x9761f7c9
7423 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0
7424 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0
7425 ; GFX6-NEXT: s_mov_b32 s5, s1
7426 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
7427 ; GFX6-NEXT: v_mov_b32_e32 v2, s3
7428 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
7429 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
7430 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7431 ; GFX6-NEXT: s_endpgm
7433 ; GFX9-LABEL: urem_i64_oddk_denom:
7435 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7436 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7437 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7438 ; GFX9-NEXT: s_mul_i32 s7, s3, 0xf6841139
7439 ; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0xf6841139
7440 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xf6841139
7441 ; GFX9-NEXT: s_add_u32 s7, s7, s8
7442 ; GFX9-NEXT: s_mul_i32 s5, s2, 0xe3e10011
7443 ; GFX9-NEXT: s_addc_u32 s6, s6, 0
7444 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0xe3e10011
7445 ; GFX9-NEXT: s_add_u32 s5, s5, s7
7446 ; GFX9-NEXT: s_addc_u32 s4, s4, 0
7447 ; GFX9-NEXT: s_add_u32 s4, s6, s4
7448 ; GFX9-NEXT: s_addc_u32 s5, 0, 0
7449 ; GFX9-NEXT: s_mul_i32 s7, s3, 0xe3e10011
7450 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xe3e10011
7451 ; GFX9-NEXT: s_add_u32 s4, s7, s4
7452 ; GFX9-NEXT: s_addc_u32 s4, s6, s5
7453 ; GFX9-NEXT: s_lshr_b32 s4, s4, 8
7454 ; GFX9-NEXT: s_mul_i32 s5, s4, 0x11f
7455 ; GFX9-NEXT: s_mul_hi_u32 s6, s4, 0x9761f7c9
7456 ; GFX9-NEXT: s_add_i32 s6, s6, s5
7457 ; GFX9-NEXT: s_mul_i32 s4, s4, 0x9761f7c9
7458 ; GFX9-NEXT: s_sub_u32 s2, s2, s4
7459 ; GFX9-NEXT: s_subb_u32 s3, s3, s6
7460 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7461 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7462 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7463 ; GFX9-NEXT: s_endpgm
7464 %r = urem i64 %x, 1235195393993
7465 store i64 %r, ptr addrspace(1) %out
; Unsigned i64 remainder by 4096. The opt run leaves the urem untouched; the
; llc runs expect it to become an s_and_b32 of the low dword of %x with 0xfff,
; with zero stored as the high dword of the result.
7469 define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
7470 ; CHECK-LABEL: @urem_i64_pow2k_denom(
7471 ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096
7472 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7473 ; CHECK-NEXT: ret void
7475 ; GFX6-LABEL: urem_i64_pow2k_denom:
7477 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7478 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7479 ; GFX6-NEXT: s_mov_b32 s6, -1
7480 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
7481 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7482 ; GFX6-NEXT: s_mov_b32 s4, s0
7483 ; GFX6-NEXT: s_and_b32 s0, s2, 0xfff
7484 ; GFX6-NEXT: s_mov_b32 s5, s1
7485 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
7486 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7487 ; GFX6-NEXT: s_endpgm
7489 ; GFX9-LABEL: urem_i64_pow2k_denom:
7491 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7492 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7493 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7494 ; GFX9-NEXT: s_and_b32 s2, s2, 0xfff
7495 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7496 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
7497 ; GFX9-NEXT: s_endpgm
7498 %r = urem i64 %x, 4096
7499 store i64 %r, ptr addrspace(1) %out
; Unsigned i64 remainder by 4096 << %y. The opt run leaves the IR untouched;
; the llc runs expect the mask form x & ((0x1000 << y) - 1), built from
; s_lshl_b64, a 64-bit add of -1 (s_add_u32 / s_addc_u32) and s_and_b64.
7503 define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
7504 ; CHECK-LABEL: @urem_i64_pow2_shl_denom(
7505 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7506 ; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
7507 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7508 ; CHECK-NEXT: ret void
7510 ; GFX6-LABEL: urem_i64_pow2_shl_denom:
7512 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7513 ; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd
7514 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7515 ; GFX6-NEXT: s_mov_b32 s6, -1
7516 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7517 ; GFX6-NEXT: s_mov_b32 s4, s0
7518 ; GFX6-NEXT: s_mov_b32 s5, s1
7519 ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8
7520 ; GFX6-NEXT: s_add_u32 s0, s0, -1
7521 ; GFX6-NEXT: s_addc_u32 s1, s1, -1
7522 ; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
7523 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
7524 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
7525 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7526 ; GFX6-NEXT: s_endpgm
7528 ; GFX9-LABEL: urem_i64_pow2_shl_denom:
7530 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34
7531 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7532 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7533 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7534 ; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s6
7535 ; GFX9-NEXT: s_add_u32 s4, s4, -1
7536 ; GFX9-NEXT: s_addc_u32 s5, s5, -1
7537 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
7538 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7539 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7540 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7541 ; GFX9-NEXT: s_endpgm
7542 %shl.y = shl i64 4096, %y
7543 %r = urem i64 %x, %shl.y
7544 store i64 %r, ptr addrspace(1) %out
; Unsigned <2 x i64> remainder by splat 4096. The opt run scalarizes the urem
; into one i64 urem per element; the llc runs mask the low dword of each
; element with 0xfff (s_and_b32) and store zero for both high dwords.
7548 define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
7549 ; CHECK-LABEL: @urem_v2i64_pow2k_denom(
7550 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7551 ; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
7552 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
7553 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7554 ; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
7555 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7556 ; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
7557 ; CHECK-NEXT: ret void
7559 ; GFX6-LABEL: urem_v2i64_pow2k_denom:
7561 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
7562 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
7563 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
7564 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7565 ; GFX6-NEXT: s_mov_b32 s6, -1
7566 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7567 ; GFX6-NEXT: s_and_b32 s0, s0, 0xfff
7568 ; GFX6-NEXT: s_and_b32 s1, s2, 0xfff
7569 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
7570 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
7571 ; GFX6-NEXT: v_mov_b32_e32 v3, v1
7572 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7573 ; GFX6-NEXT: s_endpgm
7575 ; GFX9-LABEL: urem_v2i64_pow2k_denom:
7577 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
7578 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
7579 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7580 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
7581 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7582 ; GFX9-NEXT: s_and_b32 s0, s0, 0xfff
7583 ; GFX9-NEXT: s_and_b32 s1, s2, 0xfff
7584 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
7585 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
7586 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[6:7]
7587 ; GFX9-NEXT: s_endpgm
7588 %r = urem <2 x i64> %x, <i64 4096, i64 4096>
7589 store <2 x i64> %r, ptr addrspace(1) %out
; Unsigned <2 x i64> remainder by (splat 4096) << %y. The opt run keeps the
; shl and scalarizes the urem into one i64 urem per element. The llc runs
; expect, for each element, the mask form x & ((0x1000 << y) - 1) built from
; s_lshl_b64, s_add_u32 / s_addc_u32 with -1 and s_and_b64.
7593 define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
7594 ; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
7595 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
7596 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7597 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7598 ; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
7599 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
7600 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7601 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7602 ; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
7603 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7604 ; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
7605 ; CHECK-NEXT: ret void
7607 ; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
7609 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
7610 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
7611 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
7612 ; GFX6-NEXT: s_mov_b32 s2, -1
7613 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7614 ; GFX6-NEXT: s_lshl_b64 s[4:5], 0x1000, s14
7615 ; GFX6-NEXT: s_lshl_b64 s[6:7], 0x1000, s12
7616 ; GFX6-NEXT: s_add_u32 s6, s6, -1
7617 ; GFX6-NEXT: s_addc_u32 s7, s7, -1
7618 ; GFX6-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7]
7619 ; GFX6-NEXT: s_add_u32 s4, s4, -1
7620 ; GFX6-NEXT: s_addc_u32 s5, s5, -1
7621 ; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5]
7622 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
7623 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
7624 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
7625 ; GFX6-NEXT: v_mov_b32_e32 v3, s5
7626 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7627 ; GFX6-NEXT: s_endpgm
7629 ; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
7631 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
7632 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
7633 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7634 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7635 ; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s14
7636 ; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s12
7637 ; GFX9-NEXT: s_add_u32 s4, s4, -1
7638 ; GFX9-NEXT: s_addc_u32 s5, s5, -1
7639 ; GFX9-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5]
7640 ; GFX9-NEXT: s_add_u32 s2, s2, -1
7641 ; GFX9-NEXT: s_addc_u32 s3, s3, -1
7642 ; GFX9-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3]
7643 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
7644 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
7645 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
7646 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
7647 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
7648 ; GFX9-NEXT: s_endpgm
7649 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7650 %r = urem <2 x i64> %x, %shl.y
7651 store <2 x i64> %r, ptr addrspace(1) %out
; Signed i64 division by the odd constant 1235195. The opt run leaves the sdiv
; untouched. The llc runs expect a multiply-high sequence with the constants
; 0x6ca94220 / 0xfd81e19, including terms for the sign word obtained by an
; arithmetic shift right of the high dword of %x by 31. The 64-bit sum is then
; shifted right arithmetically by 19, and the sign bit of the unshifted high
; dword (lshr by 31) is added to the shifted value.
7655 define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
7656 ; CHECK-LABEL: @sdiv_i64_oddk_denom(
7657 ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
7658 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7659 ; CHECK-NEXT: ret void
7661 ; GFX6-LABEL: sdiv_i64_oddk_denom:
7663 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7664 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xfd81e19
7665 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x6ca94220
7666 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7667 ; GFX6-NEXT: s_mov_b32 s6, -1
7668 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7669 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2
7670 ; GFX6-NEXT: v_mul_hi_u32 v4, s3, v2
7671 ; GFX6-NEXT: s_mov_b32 s5, s1
7672 ; GFX6-NEXT: v_mul_hi_u32 v1, s2, v0
7673 ; GFX6-NEXT: s_mul_i32 s1, s3, 0xfd81e19
7674 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v3
7675 ; GFX6-NEXT: s_mov_b32 s4, s0
7676 ; GFX6-NEXT: s_mul_i32 s0, s2, 0x6ca94220
7677 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
7678 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s0, v3
7679 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7680 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1
7681 ; GFX6-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
7682 ; GFX6-NEXT: s_ashr_i32 s1, s3, 31
7683 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
7684 ; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2
7685 ; GFX6-NEXT: s_mul_i32 s0, s3, 0x6ca94220
7686 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1
7687 ; GFX6-NEXT: s_mul_i32 s0, s1, 0x6ca94220
7688 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc
7689 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
7690 ; GFX6-NEXT: s_mul_i32 s1, s1, 0xfd81e19
7691 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s1, v0
7692 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v1
7693 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
7694 ; GFX6-NEXT: v_ashr_i64 v[2:3], v[0:1], 19
7695 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
7696 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
7697 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
7698 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7699 ; GFX6-NEXT: s_endpgm
7701 ; GFX9-LABEL: sdiv_i64_oddk_denom:
7703 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7704 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7705 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7706 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x6ca94220
7707 ; GFX9-NEXT: s_mul_i32 s5, s2, 0x6ca94220
7708 ; GFX9-NEXT: s_mul_i32 s7, s3, 0xfd81e19
7709 ; GFX9-NEXT: s_mul_hi_u32 s2, s2, 0xfd81e19
7710 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19
7711 ; GFX9-NEXT: s_add_u32 s2, s7, s2
7712 ; GFX9-NEXT: s_addc_u32 s6, s6, 0
7713 ; GFX9-NEXT: s_add_u32 s2, s5, s2
7714 ; GFX9-NEXT: s_addc_u32 s2, s4, 0
7715 ; GFX9-NEXT: s_add_u32 s2, s6, s2
7716 ; GFX9-NEXT: s_addc_u32 s4, 0, 0
7717 ; GFX9-NEXT: s_mul_i32 s6, s3, 0x6ca94220
7718 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, 0x6ca94220
7719 ; GFX9-NEXT: s_add_u32 s2, s6, s2
7720 ; GFX9-NEXT: s_addc_u32 s4, s5, s4
7721 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31
7722 ; GFX9-NEXT: s_mul_i32 s5, s3, 0x6ca94220
7723 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19
7724 ; GFX9-NEXT: s_add_i32 s5, s6, s5
7725 ; GFX9-NEXT: s_mul_i32 s3, s3, 0xfd81e19
7726 ; GFX9-NEXT: s_add_i32 s5, s5, s3
7727 ; GFX9-NEXT: s_add_u32 s2, s2, s3
7728 ; GFX9-NEXT: s_addc_u32 s3, s4, s5
7729 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 19
7730 ; GFX9-NEXT: s_lshr_b32 s2, s3, 31
7731 ; GFX9-NEXT: s_add_u32 s2, s4, s2
7732 ; GFX9-NEXT: s_addc_u32 s3, s5, 0
7733 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7734 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7735 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7736 ; GFX9-NEXT: s_endpgm
7737 %r = sdiv i64 %x, 1235195
7738 store i64 %r, ptr addrspace(1) %out
; Signed i64 division by 4096. The opt run leaves the sdiv untouched. The llc
; runs expect the sign word of %x (s_ashr_i32 of the high dword by 31) to be
; shifted right logically by 20, giving a bias of 0 or 4095 that is added to
; %x before a 64-bit arithmetic shift right by 12 (s_ashr_i64).
7742 define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
7743 ; CHECK-LABEL: @sdiv_i64_pow2k_denom(
7744 ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
7745 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7746 ; CHECK-NEXT: ret void
7748 ; GFX6-LABEL: sdiv_i64_pow2k_denom:
7750 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7751 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7752 ; GFX6-NEXT: s_mov_b32 s6, -1
7753 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7754 ; GFX6-NEXT: s_mov_b32 s4, s0
7755 ; GFX6-NEXT: s_ashr_i32 s0, s3, 31
7756 ; GFX6-NEXT: s_lshr_b32 s0, s0, 20
7757 ; GFX6-NEXT: s_add_u32 s0, s2, s0
7758 ; GFX6-NEXT: s_mov_b32 s5, s1
7759 ; GFX6-NEXT: s_addc_u32 s1, s3, 0
7760 ; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
7761 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
7762 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
7763 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7764 ; GFX6-NEXT: s_endpgm
7766 ; GFX9-LABEL: sdiv_i64_pow2k_denom:
7768 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7769 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7770 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7771 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
7772 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
7773 ; GFX9-NEXT: s_add_u32 s2, s2, s4
7774 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
7775 ; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
7776 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
7777 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7778 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7779 ; GFX9-NEXT: s_endpgm
7780 %r = sdiv i64 %x, 4096
7781 store i64 %r, ptr addrspace(1) %out
7785 define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
7786 ; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
7787 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7788 ; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
7789 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
7790 ; CHECK-NEXT: ret void
7792 ; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
7794 ; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
7795 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
7796 ; GFX6-NEXT: s_mov_b32 s6, -1
7797 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7798 ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
7799 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31
7800 ; GFX6-NEXT: s_add_u32 s0, s0, s8
7801 ; GFX6-NEXT: s_mov_b32 s9, s8
7802 ; GFX6-NEXT: s_addc_u32 s1, s1, s8
7803 ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
7804 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10
7805 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11
7806 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7807 ; GFX6-NEXT: s_sub_u32 s4, 0, s10
7808 ; GFX6-NEXT: s_subb_u32 s5, 0, s11
7809 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
7810 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
7811 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7812 ; GFX6-NEXT: s_ashr_i32 s12, s3, 31
7813 ; GFX6-NEXT: s_add_u32 s2, s2, s12
7814 ; GFX6-NEXT: s_mov_b32 s13, s12
7815 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
7816 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
7817 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
7818 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
7819 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
7820 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
7821 ; GFX6-NEXT: s_addc_u32 s3, s3, s12
7822 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13]
7823 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
7824 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
7825 ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0
7826 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0
7827 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
7828 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
7829 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
7830 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
7831 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
7832 ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4
7833 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4
7834 ; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2
7835 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
7836 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
7837 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
7838 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
7839 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
7840 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
7841 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
7842 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
7843 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
7844 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
7845 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
7846 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
7847 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0
7848 ; GFX6-NEXT: s_mov_b32 s5, s1
7849 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
7850 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0
7851 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
7852 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
7853 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
7854 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
7855 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3
7856 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
7857 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2
7858 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
7859 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
7860 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
7861 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
7862 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
7863 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
7864 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
7865 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
7866 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
7867 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
7868 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
7869 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
7870 ; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1
7871 ; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1
7872 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1
7873 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
7874 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
7875 ; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0
7876 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
7877 ; GFX6-NEXT: s_mov_b32 s4, s0
7878 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
7879 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
7880 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
7881 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
7882 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
7883 ; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1
7884 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0
7885 ; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0
7886 ; GFX6-NEXT: v_mov_b32_e32 v5, s11
7887 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
7888 ; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0
7889 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
7890 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
7891 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
7892 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
7893 ; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3
7894 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7895 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4
7896 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7897 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5
7898 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7899 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4
7900 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
7901 ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0
7902 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
7903 ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0
7904 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
7905 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
7906 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1]
7907 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
7908 ; GFX6-NEXT: v_mov_b32_e32 v6, s3
7909 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc
7910 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2
7911 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
7912 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3
7913 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
7914 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2
7915 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
7916 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
7917 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
7918 ; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9]
7919 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
7920 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
7921 ; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1
7922 ; GFX6-NEXT: v_mov_b32_e32 v2, s1
7923 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
7924 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
7925 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7926 ; GFX6-NEXT: s_endpgm
7928 ; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
7930 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
7931 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
7932 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7933 ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
7934 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31
7935 ; GFX9-NEXT: s_add_u32 s0, s0, s2
7936 ; GFX9-NEXT: s_mov_b32 s3, s2
7937 ; GFX9-NEXT: s_addc_u32 s1, s1, s2
7938 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3]
7939 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
7940 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
7941 ; GFX9-NEXT: s_sub_u32 s0, 0, s6
7942 ; GFX9-NEXT: s_subb_u32 s1, 0, s7
7943 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
7944 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0
7945 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
7946 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
7947 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
7948 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
7949 ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
7950 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
7951 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
7952 ; GFX9-NEXT: v_readfirstlane_b32 s4, v2
7953 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1
7954 ; GFX9-NEXT: s_mul_i32 s12, s0, s4
7955 ; GFX9-NEXT: s_mul_hi_u32 s14, s0, s5
7956 ; GFX9-NEXT: s_mul_i32 s13, s1, s5
7957 ; GFX9-NEXT: s_add_i32 s12, s14, s12
7958 ; GFX9-NEXT: s_mul_i32 s15, s0, s5
7959 ; GFX9-NEXT: s_add_i32 s12, s12, s13
7960 ; GFX9-NEXT: s_mul_hi_u32 s14, s5, s15
7961 ; GFX9-NEXT: s_mul_hi_u32 s13, s5, s12
7962 ; GFX9-NEXT: s_mul_i32 s5, s5, s12
7963 ; GFX9-NEXT: s_add_u32 s5, s14, s5
7964 ; GFX9-NEXT: s_addc_u32 s13, 0, s13
7965 ; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
7966 ; GFX9-NEXT: s_mul_i32 s15, s4, s15
7967 ; GFX9-NEXT: s_add_u32 s5, s5, s15
7968 ; GFX9-NEXT: s_mul_hi_u32 s14, s4, s12
7969 ; GFX9-NEXT: s_addc_u32 s5, s13, s16
7970 ; GFX9-NEXT: s_addc_u32 s13, s14, 0
7971 ; GFX9-NEXT: s_mul_i32 s12, s4, s12
7972 ; GFX9-NEXT: s_add_u32 s5, s5, s12
7973 ; GFX9-NEXT: s_addc_u32 s12, 0, s13
7974 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s5, v1
7975 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
7976 ; GFX9-NEXT: s_addc_u32 s4, s4, s12
7977 ; GFX9-NEXT: v_readfirstlane_b32 s12, v1
7978 ; GFX9-NEXT: s_mul_i32 s5, s0, s4
7979 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12
7980 ; GFX9-NEXT: s_add_i32 s5, s13, s5
7981 ; GFX9-NEXT: s_mul_i32 s1, s1, s12
7982 ; GFX9-NEXT: s_add_i32 s5, s5, s1
7983 ; GFX9-NEXT: s_mul_i32 s0, s0, s12
7984 ; GFX9-NEXT: s_mul_hi_u32 s13, s4, s0
7985 ; GFX9-NEXT: s_mul_i32 s14, s4, s0
7986 ; GFX9-NEXT: s_mul_i32 s16, s12, s5
7987 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0
7988 ; GFX9-NEXT: s_mul_hi_u32 s15, s12, s5
7989 ; GFX9-NEXT: s_add_u32 s0, s0, s16
7990 ; GFX9-NEXT: s_addc_u32 s12, 0, s15
7991 ; GFX9-NEXT: s_add_u32 s0, s0, s14
7992 ; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5
7993 ; GFX9-NEXT: s_addc_u32 s0, s12, s13
7994 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
7995 ; GFX9-NEXT: s_mul_i32 s5, s4, s5
7996 ; GFX9-NEXT: s_add_u32 s0, s0, s5
7997 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
7998 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
7999 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8000 ; GFX9-NEXT: s_addc_u32 s12, s4, s1
8001 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31
8002 ; GFX9-NEXT: s_add_u32 s0, s10, s4
8003 ; GFX9-NEXT: s_mov_b32 s5, s4
8004 ; GFX9-NEXT: s_addc_u32 s1, s11, s4
8005 ; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[4:5]
8006 ; GFX9-NEXT: v_readfirstlane_b32 s13, v1
8007 ; GFX9-NEXT: s_mul_i32 s1, s10, s12
8008 ; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13
8009 ; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12
8010 ; GFX9-NEXT: s_add_u32 s1, s14, s1
8011 ; GFX9-NEXT: s_addc_u32 s0, 0, s0
8012 ; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13
8013 ; GFX9-NEXT: s_mul_i32 s13, s11, s13
8014 ; GFX9-NEXT: s_add_u32 s1, s1, s13
8015 ; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12
8016 ; GFX9-NEXT: s_addc_u32 s0, s0, s15
8017 ; GFX9-NEXT: s_addc_u32 s1, s14, 0
8018 ; GFX9-NEXT: s_mul_i32 s12, s11, s12
8019 ; GFX9-NEXT: s_add_u32 s12, s0, s12
8020 ; GFX9-NEXT: s_addc_u32 s13, 0, s1
8021 ; GFX9-NEXT: s_mul_i32 s0, s6, s13
8022 ; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12
8023 ; GFX9-NEXT: s_add_i32 s0, s1, s0
8024 ; GFX9-NEXT: s_mul_i32 s1, s7, s12
8025 ; GFX9-NEXT: s_add_i32 s14, s0, s1
8026 ; GFX9-NEXT: s_mul_i32 s1, s6, s12
8027 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
8028 ; GFX9-NEXT: s_sub_i32 s0, s11, s14
8029 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1
8030 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8031 ; GFX9-NEXT: s_subb_u32 s10, s0, s7
8032 ; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1
8033 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
8034 ; GFX9-NEXT: s_subb_u32 s10, s10, 0
8035 ; GFX9-NEXT: s_cmp_ge_u32 s10, s7
8036 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0
8037 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2
8038 ; GFX9-NEXT: s_cmp_eq_u32 s10, s7
8039 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
8040 ; GFX9-NEXT: v_mov_b32_e32 v3, s15
8041 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
8042 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
8043 ; GFX9-NEXT: s_add_u32 s0, s12, 1
8044 ; GFX9-NEXT: s_addc_u32 s10, s13, 0
8045 ; GFX9-NEXT: s_add_u32 s1, s12, 2
8046 ; GFX9-NEXT: s_addc_u32 s15, s13, 0
8047 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
8048 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
8049 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
8050 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
8051 ; GFX9-NEXT: v_mov_b32_e32 v3, s10
8052 ; GFX9-NEXT: v_mov_b32_e32 v4, s15
8053 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8054 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
8055 ; GFX9-NEXT: s_subb_u32 s0, s11, s14
8056 ; GFX9-NEXT: s_cmp_ge_u32 s0, s7
8057 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0
8058 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1
8059 ; GFX9-NEXT: s_cmp_eq_u32 s0, s7
8060 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
8061 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
8062 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
8063 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
8064 ; GFX9-NEXT: v_mov_b32_e32 v4, s13
8065 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
8066 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
8067 ; GFX9-NEXT: v_mov_b32_e32 v3, s12
8068 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
8069 ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
8070 ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2
8071 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1
8072 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
8073 ; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2
8074 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v4, vcc
8075 ; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9]
8076 ; GFX9-NEXT: s_endpgm
8077 %shl.y = shl i64 4096, %y
8078 %r = sdiv i64 %x, %shl.y
8079 store i64 %r, ptr addrspace(1) %out
; Test: signed division of a <2 x i64> vector by a constant splat of 4096
; (both lanes divide by the same power of two); the quotient is stored to %out.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
;
; What the expectations show:
;  * opt + amdgpu-codegenprepare (CHECK prefix): the vector sdiv is split into
;    one scalar `sdiv i64 ..., 4096` per lane and the results are reassembled
;    with insertelement; the divide itself is still an sdiv at IR level.
;  * llc for tahiti and gfx900 (GFX6 / GFX9 prefixes): each lane is lowered to
;    a bias-and-shift sequence - the sign mask of the high dword
;    (s_ashr_i32 ..., 31) shifted right by 20 yields 0 or 4095, which is added
;    to the 64-bit dividend with carry before a 64-bit arithmetic shift right
;    by 12. No reciprocal-based division expansion appears in this function.
8083 define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
8084 ; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
8085 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8086 ; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8087 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
8088 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8089 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
8090 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8091 ; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
8092 ; CHECK-NEXT: ret void
8094 ; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
8096 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
8097 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
8098 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
8099 ; GFX6-NEXT: s_mov_b32 s6, -1
8100 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
8101 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31
8102 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20
8103 ; GFX6-NEXT: s_add_u32 s0, s0, s8
8104 ; GFX6-NEXT: s_addc_u32 s1, s1, 0
8105 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31
8106 ; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
8107 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20
8108 ; GFX6-NEXT: s_add_u32 s2, s2, s8
8109 ; GFX6-NEXT: s_addc_u32 s3, s3, 0
8110 ; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
8111 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
8112 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
8113 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
8114 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
8115 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8116 ; GFX6-NEXT: s_endpgm
8118 ; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
8120 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
8121 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
8122 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
8123 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8124 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31
8125 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
8126 ; GFX9-NEXT: s_add_u32 s0, s0, s4
8127 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
8128 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
8129 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
8130 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
8131 ; GFX9-NEXT: s_add_u32 s2, s2, s4
8132 ; GFX9-NEXT: s_addc_u32 s3, s3, 0
8133 ; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
8134 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
8135 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
8136 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
8137 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
8138 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
8139 ; GFX9-NEXT: s_endpgm
8140 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
8141 store <2 x i64> %r, ptr addrspace(1) %out
; Test: signed division of a <2 x i64> vector by the constant vector
; <4096, 4095>, i.e. lane 0 divides by a power of two and lane 1 does not.
; The quotient is stored to %out.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
;
; What the expectations show:
;  * opt + amdgpu-codegenprepare (CHECK prefix): the vector sdiv is split into
;    `sdiv i64 ..., 4096` for lane 0 and `sdiv i64 ..., 4095` for lane 1, then
;    reassembled with insertelement.
;  * llc for tahiti and gfx900 (GFX6 / GFX9 prefixes):
;      - lane 0 uses the bias-and-shift form: sign mask of the high dword
;        shifted right by 20 is added to the dividend, followed by a 64-bit
;        arithmetic shift right by 12;
;      - lane 1 uses a multiply-by-constant sequence built from the dwords
;        0x8008009 and 0x80080080, adds the dividend back, shifts
;        arithmetically right by 11 and finally adds the result's sign bit
;        (logical shift right by 31 of the high dword).
;    The tahiti output mixes VALU multiplies (v_mul_hi_u32) with scalar
;    s_mul_i32, whereas the gfx900 output keeps the whole sequence on the
;    scalar unit (s_mul_i32 / s_mul_hi_u32).
;
; NOTE(review): the "ssdiv" prefix in the function name looks like a typo for
; "sdiv" - confirm before renaming, since every label line would have to be
; regenerated along with it.
8145 define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
8146 ; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
8147 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8148 ; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8149 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
8150 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8151 ; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
8152 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8153 ; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
8154 ; CHECK-NEXT: ret void
8156 ; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8158 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
8159 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
8160 ; GFX6-NEXT: v_mov_b32_e32 v2, 0x8008009
8161 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080080
8162 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
8163 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
8164 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
8165 ; GFX6-NEXT: v_mul_hi_u32 v4, s11, v2
8166 ; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0
8167 ; GFX6-NEXT: s_mul_i32 s7, s11, 0x8008009
8168 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s7, v3
8169 ; GFX6-NEXT: s_mul_i32 s6, s10, 0x80080080
8170 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
8171 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v3
8172 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31
8173 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8174 ; GFX6-NEXT: s_lshr_b32 s4, s4, 20
8175 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1
8176 ; GFX6-NEXT: s_add_u32 s4, s8, s4
8177 ; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
8178 ; GFX6-NEXT: s_addc_u32 s5, s9, 0
8179 ; GFX6-NEXT: s_ashr_i32 s7, s11, 31
8180 ; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
8181 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2
8182 ; GFX6-NEXT: s_mul_i32 s6, s11, 0x80080080
8183 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1
8184 ; GFX6-NEXT: s_mul_i32 s6, s7, 0x80080080
8185 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc
8186 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2
8187 ; GFX6-NEXT: s_mul_i32 s6, s7, 0x8008009
8188 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2
8189 ; GFX6-NEXT: v_mov_b32_e32 v3, s6
8190 ; GFX6-NEXT: v_mov_b32_e32 v4, s11
8191 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v3
8192 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc
8193 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
8194 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc
8195 ; GFX6-NEXT: v_mov_b32_e32 v3, s11
8196 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s10, v1
8197 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
8198 ; GFX6-NEXT: v_ashr_i64 v[2:3], v[0:1], 11
8199 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
8200 ; GFX6-NEXT: s_ashr_i64 s[4:5], s[4:5], 12
8201 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0
8202 ; GFX6-NEXT: s_mov_b32 s2, -1
8203 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
8204 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
8205 ; GFX6-NEXT: v_mov_b32_e32 v1, s5
8206 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
8207 ; GFX6-NEXT: s_endpgm
8209 ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8211 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
8212 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
8213 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
8214 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8215 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31
8216 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
8217 ; GFX9-NEXT: s_add_u32 s0, s0, s4
8218 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
8219 ; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
8220 ; GFX9-NEXT: s_mul_i32 s9, s3, 0x8008009
8221 ; GFX9-NEXT: s_mul_hi_u32 s10, s2, 0x8008009
8222 ; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x8008009
8223 ; GFX9-NEXT: s_add_u32 s9, s9, s10
8224 ; GFX9-NEXT: s_mul_i32 s8, s2, 0x80080080
8225 ; GFX9-NEXT: s_addc_u32 s4, s4, 0
8226 ; GFX9-NEXT: s_mul_hi_u32 s5, s2, 0x80080080
8227 ; GFX9-NEXT: s_add_u32 s8, s8, s9
8228 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
8229 ; GFX9-NEXT: s_add_u32 s4, s4, s5
8230 ; GFX9-NEXT: s_addc_u32 s5, 0, 0
8231 ; GFX9-NEXT: s_mul_i32 s9, s3, 0x80080080
8232 ; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x80080080
8233 ; GFX9-NEXT: s_add_u32 s4, s9, s4
8234 ; GFX9-NEXT: s_addc_u32 s5, s8, s5
8235 ; GFX9-NEXT: s_ashr_i32 s8, s3, 31
8236 ; GFX9-NEXT: s_mul_i32 s9, s8, 0x80080080
8237 ; GFX9-NEXT: s_mul_hi_u32 s10, s8, 0x8008009
8238 ; GFX9-NEXT: s_add_i32 s9, s10, s9
8239 ; GFX9-NEXT: s_mul_i32 s8, s8, 0x8008009
8240 ; GFX9-NEXT: s_add_i32 s9, s9, s8
8241 ; GFX9-NEXT: s_sub_u32 s8, s8, s2
8242 ; GFX9-NEXT: s_subb_u32 s9, s9, s3
8243 ; GFX9-NEXT: s_add_u32 s4, s4, s8
8244 ; GFX9-NEXT: s_addc_u32 s5, s5, s9
8245 ; GFX9-NEXT: s_add_u32 s2, s4, s2
8246 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
8247 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 11
8248 ; GFX9-NEXT: s_lshr_b32 s2, s3, 31
8249 ; GFX9-NEXT: s_add_u32 s2, s4, s2
8250 ; GFX9-NEXT: s_addc_u32 s3, s5, 0
8251 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
8252 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
8253 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
8254 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
8255 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
8256 ; GFX9-NEXT: s_endpgm
8257 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
8258 store <2 x i64> %r, ptr addrspace(1) %out
8262 define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
8263 ; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
8264 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
8265 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8266 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
8267 ; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
8268 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
8269 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
8270 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
8271 ; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
8272 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
8273 ; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
8274 ; CHECK-NEXT: ret void
8276 ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
8278 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
8279 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
8280 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
8281 ; GFX6-NEXT: s_mov_b32 s6, -1
8282 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
8283 ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12
8284 ; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s14
8285 ; GFX6-NEXT: s_ashr_i32 s12, s1, 31
8286 ; GFX6-NEXT: s_add_u32 s0, s0, s12
8287 ; GFX6-NEXT: s_mov_b32 s13, s12
8288 ; GFX6-NEXT: s_addc_u32 s1, s1, s12
8289 ; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[12:13]
8290 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
8291 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3
8292 ; GFX6-NEXT: s_sub_u32 s0, 0, s2
8293 ; GFX6-NEXT: s_subb_u32 s1, 0, s3
8294 ; GFX6-NEXT: s_ashr_i32 s16, s9, 31
8295 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
8296 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
8297 ; GFX6-NEXT: s_mov_b32 s17, s16
8298 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
8299 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
8300 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
8301 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
8302 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
8303 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
8304 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1
8305 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0
8306 ; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0
8307 ; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0
8308 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
8309 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
8310 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
8311 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
8312 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
8313 ; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4
8314 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4
8315 ; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2
8316 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
8317 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
8318 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
8319 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
8320 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
8321 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
8322 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
8323 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
8324 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
8325 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
8326 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1
8327 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0
8328 ; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0
8329 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
8330 ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0
8331 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
8332 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
8333 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
8334 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
8335 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3
8336 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
8337 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2
8338 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
8339 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
8340 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
8341 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
8342 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
8343 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
8344 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
8345 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
8346 ; GFX6-NEXT: s_add_u32 s0, s8, s16
8347 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
8348 ; GFX6-NEXT: s_addc_u32 s1, s9, s16
8349 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
8350 ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[16:17]
8351 ; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1
8352 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0
8353 ; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1
8354 ; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1
8355 ; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1
8356 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
8357 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
8358 ; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0
8359 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
8360 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
8361 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
8362 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
8363 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
8364 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
8365 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
8366 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
8367 ; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0
8368 ; GFX6-NEXT: v_mov_b32_e32 v5, s3
8369 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
8370 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0
8371 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
8372 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s9, v2
8373 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3
8374 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
8375 ; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3
8376 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
8377 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4
8378 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8379 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5
8380 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8381 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4
8382 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
8383 ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0
8384 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
8385 ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0
8386 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
8387 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
8388 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1]
8389 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
8390 ; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[12:13]
8391 ; GFX6-NEXT: s_ashr_i32 s8, s15, 31
8392 ; GFX6-NEXT: s_add_u32 s12, s14, s8
8393 ; GFX6-NEXT: v_mov_b32_e32 v6, s9
8394 ; GFX6-NEXT: s_mov_b32 s9, s8
8395 ; GFX6-NEXT: s_addc_u32 s13, s15, s8
8396 ; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9]
8397 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc
8398 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12
8399 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13
8400 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
8401 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
8402 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
8403 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
8404 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6
8405 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
8406 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2
8407 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
8408 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
8409 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6
8410 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
8411 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3
8412 ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
8413 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
8414 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3
8415 ; GFX6-NEXT: s_sub_u32 s2, 0, s12
8416 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
8417 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
8418 ; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2
8419 ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3
8420 ; GFX6-NEXT: s_subb_u32 s3, 0, s13
8421 ; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2
8422 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
8423 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
8424 ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2
8425 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
8426 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4
8427 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5
8428 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4
8429 ; GFX6-NEXT: v_mul_hi_u32 v9, v3, v4
8430 ; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4
8431 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
8432 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
8433 ; GFX6-NEXT: v_mul_lo_u32 v8, v3, v5
8434 ; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5
8435 ; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1
8436 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8
8437 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
8438 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
8439 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
8440 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
8441 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
8442 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
8443 ; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3
8444 ; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2
8445 ; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2
8446 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
8447 ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2
8448 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
8449 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4
8450 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5
8451 ; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4
8452 ; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5
8453 ; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5
8454 ; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4
8455 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8
8456 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
8457 ; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4
8458 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5
8459 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc
8460 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
8461 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
8462 ; GFX6-NEXT: s_ashr_i32 s2, s11, 31
8463 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
8464 ; GFX6-NEXT: s_add_u32 s10, s10, s2
8465 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
8466 ; GFX6-NEXT: s_mov_b32 s3, s2
8467 ; GFX6-NEXT: s_addc_u32 s11, s11, s2
8468 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
8469 ; GFX6-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3]
8470 ; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3
8471 ; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2
8472 ; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3
8473 ; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3
8474 ; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3
8475 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
8476 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
8477 ; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2
8478 ; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2
8479 ; GFX6-NEXT: v_mov_b32_e32 v6, s1
8480 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7
8481 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
8482 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
8483 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
8484 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
8485 ; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3
8486 ; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2
8487 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
8488 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
8489 ; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2
8490 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
8491 ; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2
8492 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4
8493 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v4
8494 ; GFX6-NEXT: v_mov_b32_e32 v7, s13
8495 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s10, v5
8496 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
8497 ; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5
8498 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
8499 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6
8500 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
8501 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7
8502 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
8503 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6
8504 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1]
8505 ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2
8506 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
8507 ; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 2, v2
8508 ; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
8509 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
8510 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v7, v9, s[0:1]
8511 ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1]
8512 ; GFX6-NEXT: v_mov_b32_e32 v8, s11
8513 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc
8514 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
8515 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
8516 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5
8517 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
8518 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4
8519 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc
8520 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
8521 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
8522 ; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9]
8523 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
8524 ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2
8525 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3
8526 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
8527 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2
8528 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
8529 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8530 ; GFX6-NEXT: s_endpgm
8532 ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
8534 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
8535 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
8536 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
8537 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8538 ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12
8539 ; GFX9-NEXT: s_lshl_b64 s[6:7], 0x1000, s14
8540 ; GFX9-NEXT: s_ashr_i32 s12, s1, 31
8541 ; GFX9-NEXT: s_add_u32 s0, s0, s12
8542 ; GFX9-NEXT: s_mov_b32 s13, s12
8543 ; GFX9-NEXT: s_addc_u32 s1, s1, s12
8544 ; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13]
8545 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14
8546 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15
8547 ; GFX9-NEXT: s_sub_u32 s0, 0, s14
8548 ; GFX9-NEXT: s_subb_u32 s1, 0, s15
8549 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
8550 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0
8551 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
8552 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
8553 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
8554 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
8555 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
8556 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
8557 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1
8558 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
8559 ; GFX9-NEXT: s_mul_i32 s16, s0, s4
8560 ; GFX9-NEXT: s_mul_hi_u32 s18, s0, s5
8561 ; GFX9-NEXT: s_mul_i32 s17, s1, s5
8562 ; GFX9-NEXT: s_add_i32 s16, s18, s16
8563 ; GFX9-NEXT: s_mul_i32 s19, s0, s5
8564 ; GFX9-NEXT: s_add_i32 s16, s16, s17
8565 ; GFX9-NEXT: s_mul_hi_u32 s17, s5, s16
8566 ; GFX9-NEXT: s_mul_i32 s18, s5, s16
8567 ; GFX9-NEXT: s_mul_hi_u32 s5, s5, s19
8568 ; GFX9-NEXT: s_add_u32 s5, s5, s18
8569 ; GFX9-NEXT: s_addc_u32 s17, 0, s17
8570 ; GFX9-NEXT: s_mul_hi_u32 s20, s4, s19
8571 ; GFX9-NEXT: s_mul_i32 s19, s4, s19
8572 ; GFX9-NEXT: s_add_u32 s5, s5, s19
8573 ; GFX9-NEXT: s_mul_hi_u32 s18, s4, s16
8574 ; GFX9-NEXT: s_addc_u32 s5, s17, s20
8575 ; GFX9-NEXT: s_addc_u32 s17, s18, 0
8576 ; GFX9-NEXT: s_mul_i32 s16, s4, s16
8577 ; GFX9-NEXT: s_add_u32 s5, s5, s16
8578 ; GFX9-NEXT: s_addc_u32 s16, 0, s17
8579 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
8580 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8581 ; GFX9-NEXT: s_addc_u32 s4, s4, s16
8582 ; GFX9-NEXT: v_readfirstlane_b32 s16, v0
8583 ; GFX9-NEXT: s_mul_i32 s5, s0, s4
8584 ; GFX9-NEXT: s_mul_hi_u32 s17, s0, s16
8585 ; GFX9-NEXT: s_add_i32 s5, s17, s5
8586 ; GFX9-NEXT: s_mul_i32 s1, s1, s16
8587 ; GFX9-NEXT: s_add_i32 s5, s5, s1
8588 ; GFX9-NEXT: s_mul_i32 s0, s0, s16
8589 ; GFX9-NEXT: s_mul_hi_u32 s17, s4, s0
8590 ; GFX9-NEXT: s_mul_i32 s18, s4, s0
8591 ; GFX9-NEXT: s_mul_i32 s20, s16, s5
8592 ; GFX9-NEXT: s_mul_hi_u32 s0, s16, s0
8593 ; GFX9-NEXT: s_mul_hi_u32 s19, s16, s5
8594 ; GFX9-NEXT: s_add_u32 s0, s0, s20
8595 ; GFX9-NEXT: s_addc_u32 s16, 0, s19
8596 ; GFX9-NEXT: s_add_u32 s0, s0, s18
8597 ; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5
8598 ; GFX9-NEXT: s_addc_u32 s0, s16, s17
8599 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
8600 ; GFX9-NEXT: s_mul_i32 s5, s4, s5
8601 ; GFX9-NEXT: s_add_u32 s0, s0, s5
8602 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
8603 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
8604 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8605 ; GFX9-NEXT: s_addc_u32 s16, s4, s1
8606 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31
8607 ; GFX9-NEXT: s_add_u32 s0, s8, s4
8608 ; GFX9-NEXT: s_mov_b32 s5, s4
8609 ; GFX9-NEXT: s_addc_u32 s1, s9, s4
8610 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5]
8611 ; GFX9-NEXT: v_readfirstlane_b32 s17, v0
8612 ; GFX9-NEXT: s_mul_i32 s1, s8, s16
8613 ; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
8614 ; GFX9-NEXT: s_mul_hi_u32 s0, s8, s16
8615 ; GFX9-NEXT: s_add_u32 s1, s18, s1
8616 ; GFX9-NEXT: s_addc_u32 s0, 0, s0
8617 ; GFX9-NEXT: s_mul_hi_u32 s19, s9, s17
8618 ; GFX9-NEXT: s_mul_i32 s17, s9, s17
8619 ; GFX9-NEXT: s_add_u32 s1, s1, s17
8620 ; GFX9-NEXT: s_mul_hi_u32 s18, s9, s16
8621 ; GFX9-NEXT: s_addc_u32 s0, s0, s19
8622 ; GFX9-NEXT: s_addc_u32 s1, s18, 0
8623 ; GFX9-NEXT: s_mul_i32 s16, s9, s16
8624 ; GFX9-NEXT: s_add_u32 s16, s0, s16
8625 ; GFX9-NEXT: s_addc_u32 s17, 0, s1
8626 ; GFX9-NEXT: s_mul_i32 s0, s14, s17
8627 ; GFX9-NEXT: s_mul_hi_u32 s1, s14, s16
8628 ; GFX9-NEXT: s_add_i32 s0, s1, s0
8629 ; GFX9-NEXT: s_mul_i32 s1, s15, s16
8630 ; GFX9-NEXT: s_add_i32 s18, s0, s1
8631 ; GFX9-NEXT: s_mul_i32 s1, s14, s16
8632 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
8633 ; GFX9-NEXT: s_sub_i32 s0, s9, s18
8634 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0
8635 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8636 ; GFX9-NEXT: s_subb_u32 s8, s0, s15
8637 ; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s14, v0
8638 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
8639 ; GFX9-NEXT: s_subb_u32 s8, s8, 0
8640 ; GFX9-NEXT: s_cmp_ge_u32 s8, s15
8641 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0
8642 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v1
8643 ; GFX9-NEXT: s_cmp_eq_u32 s8, s15
8644 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
8645 ; GFX9-NEXT: v_mov_b32_e32 v2, s19
8646 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
8647 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1]
8648 ; GFX9-NEXT: s_add_u32 s0, s16, 1
8649 ; GFX9-NEXT: s_addc_u32 s8, s17, 0
8650 ; GFX9-NEXT: s_add_u32 s1, s16, 2
8651 ; GFX9-NEXT: s_addc_u32 s19, s17, 0
8652 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
8653 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
8654 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
8655 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
8656 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
8657 ; GFX9-NEXT: v_mov_b32_e32 v3, s19
8658 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8659 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
8660 ; GFX9-NEXT: s_subb_u32 s0, s9, s18
8661 ; GFX9-NEXT: s_cmp_ge_u32 s0, s15
8662 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0
8663 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v0
8664 ; GFX9-NEXT: s_cmp_eq_u32 s0, s15
8665 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
8666 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
8667 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
8668 ; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[12:13]
8669 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31
8670 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
8671 ; GFX9-NEXT: s_add_u32 s6, s6, s4
8672 ; GFX9-NEXT: v_mov_b32_e32 v3, s17
8673 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
8674 ; GFX9-NEXT: s_mov_b32 s5, s4
8675 ; GFX9-NEXT: s_addc_u32 s7, s7, s4
8676 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
8677 ; GFX9-NEXT: v_mov_b32_e32 v2, s16
8678 ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
8679 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
8680 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
8681 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7
8682 ; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1
8683 ; GFX9-NEXT: v_xor_b32_e32 v5, s1, v0
8684 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v1
8685 ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
8686 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2
8687 ; GFX9-NEXT: s_sub_u32 s0, 0, s6
8688 ; GFX9-NEXT: v_mov_b32_e32 v6, s1
8689 ; GFX9-NEXT: s_subb_u32 s1, 0, s7
8690 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
8691 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
8692 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
8693 ; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
8694 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
8695 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
8696 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v6, vcc
8697 ; GFX9-NEXT: v_readfirstlane_b32 s8, v2
8698 ; GFX9-NEXT: v_readfirstlane_b32 s13, v3
8699 ; GFX9-NEXT: s_mul_hi_u32 s12, s0, s8
8700 ; GFX9-NEXT: s_mul_i32 s14, s0, s13
8701 ; GFX9-NEXT: s_mul_i32 s9, s1, s8
8702 ; GFX9-NEXT: s_add_i32 s12, s12, s14
8703 ; GFX9-NEXT: s_add_i32 s12, s12, s9
8704 ; GFX9-NEXT: s_mul_i32 s15, s0, s8
8705 ; GFX9-NEXT: s_mul_hi_u32 s9, s8, s12
8706 ; GFX9-NEXT: s_mul_i32 s14, s8, s12
8707 ; GFX9-NEXT: s_mul_hi_u32 s8, s8, s15
8708 ; GFX9-NEXT: s_add_u32 s8, s8, s14
8709 ; GFX9-NEXT: s_addc_u32 s9, 0, s9
8710 ; GFX9-NEXT: s_mul_hi_u32 s16, s13, s15
8711 ; GFX9-NEXT: s_mul_i32 s15, s13, s15
8712 ; GFX9-NEXT: s_add_u32 s8, s8, s15
8713 ; GFX9-NEXT: s_mul_hi_u32 s14, s13, s12
8714 ; GFX9-NEXT: s_addc_u32 s8, s9, s16
8715 ; GFX9-NEXT: s_addc_u32 s9, s14, 0
8716 ; GFX9-NEXT: s_mul_i32 s12, s13, s12
8717 ; GFX9-NEXT: s_add_u32 s8, s8, s12
8718 ; GFX9-NEXT: s_addc_u32 s9, 0, s9
8719 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
8720 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8721 ; GFX9-NEXT: s_addc_u32 s8, s13, s9
8722 ; GFX9-NEXT: v_readfirstlane_b32 s12, v2
8723 ; GFX9-NEXT: s_mul_i32 s9, s0, s8
8724 ; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12
8725 ; GFX9-NEXT: s_add_i32 s9, s13, s9
8726 ; GFX9-NEXT: s_mul_i32 s1, s1, s12
8727 ; GFX9-NEXT: s_add_i32 s9, s9, s1
8728 ; GFX9-NEXT: s_mul_i32 s0, s0, s12
8729 ; GFX9-NEXT: s_mul_hi_u32 s13, s8, s0
8730 ; GFX9-NEXT: s_mul_i32 s14, s8, s0
8731 ; GFX9-NEXT: s_mul_i32 s16, s12, s9
8732 ; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0
8733 ; GFX9-NEXT: s_mul_hi_u32 s15, s12, s9
8734 ; GFX9-NEXT: s_add_u32 s0, s0, s16
8735 ; GFX9-NEXT: s_addc_u32 s12, 0, s15
8736 ; GFX9-NEXT: s_add_u32 s0, s0, s14
8737 ; GFX9-NEXT: s_mul_hi_u32 s1, s8, s9
8738 ; GFX9-NEXT: s_addc_u32 s0, s12, s13
8739 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
8740 ; GFX9-NEXT: s_mul_i32 s9, s8, s9
8741 ; GFX9-NEXT: s_add_u32 s0, s0, s9
8742 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
8743 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
8744 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8745 ; GFX9-NEXT: s_addc_u32 s12, s8, s1
8746 ; GFX9-NEXT: s_ashr_i32 s8, s11, 31
8747 ; GFX9-NEXT: s_add_u32 s0, s10, s8
8748 ; GFX9-NEXT: s_mov_b32 s9, s8
8749 ; GFX9-NEXT: s_addc_u32 s1, s11, s8
8750 ; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
8751 ; GFX9-NEXT: v_readfirstlane_b32 s13, v2
8752 ; GFX9-NEXT: s_mul_i32 s1, s10, s12
8753 ; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13
8754 ; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12
8755 ; GFX9-NEXT: s_add_u32 s1, s14, s1
8756 ; GFX9-NEXT: s_addc_u32 s0, 0, s0
8757 ; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13
8758 ; GFX9-NEXT: s_mul_i32 s13, s11, s13
8759 ; GFX9-NEXT: s_add_u32 s1, s1, s13
8760 ; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12
8761 ; GFX9-NEXT: s_addc_u32 s0, s0, s15
8762 ; GFX9-NEXT: s_addc_u32 s1, s14, 0
8763 ; GFX9-NEXT: s_mul_i32 s12, s11, s12
8764 ; GFX9-NEXT: s_add_u32 s12, s0, s12
8765 ; GFX9-NEXT: s_addc_u32 s13, 0, s1
8766 ; GFX9-NEXT: s_mul_i32 s0, s6, s13
8767 ; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12
8768 ; GFX9-NEXT: s_add_i32 s0, s1, s0
8769 ; GFX9-NEXT: s_mul_i32 s1, s7, s12
8770 ; GFX9-NEXT: s_add_i32 s14, s0, s1
8771 ; GFX9-NEXT: s_mul_i32 s1, s6, s12
8772 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
8773 ; GFX9-NEXT: s_sub_i32 s0, s11, s14
8774 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2
8775 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8776 ; GFX9-NEXT: s_subb_u32 s10, s0, s7
8777 ; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s6, v2
8778 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
8779 ; GFX9-NEXT: s_subb_u32 s10, s10, 0
8780 ; GFX9-NEXT: s_cmp_ge_u32 s10, s7
8781 ; GFX9-NEXT: s_cselect_b32 s15, -1, 0
8782 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3
8783 ; GFX9-NEXT: s_cmp_eq_u32 s10, s7
8784 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
8785 ; GFX9-NEXT: v_mov_b32_e32 v5, s15
8786 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
8787 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1]
8788 ; GFX9-NEXT: s_add_u32 s0, s12, 1
8789 ; GFX9-NEXT: s_addc_u32 s10, s13, 0
8790 ; GFX9-NEXT: s_add_u32 s1, s12, 2
8791 ; GFX9-NEXT: s_addc_u32 s15, s13, 0
8792 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
8793 ; GFX9-NEXT: v_mov_b32_e32 v6, s1
8794 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3
8795 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1]
8796 ; GFX9-NEXT: v_mov_b32_e32 v5, s10
8797 ; GFX9-NEXT: v_mov_b32_e32 v6, s15
8798 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
8799 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1]
8800 ; GFX9-NEXT: s_subb_u32 s0, s11, s14
8801 ; GFX9-NEXT: s_cmp_ge_u32 s0, s7
8802 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0
8803 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
8804 ; GFX9-NEXT: s_cmp_eq_u32 s0, s7
8805 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
8806 ; GFX9-NEXT: v_mov_b32_e32 v6, s1
8807 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
8808 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
8809 ; GFX9-NEXT: v_mov_b32_e32 v6, s13
8810 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
8811 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc
8812 ; GFX9-NEXT: v_mov_b32_e32 v5, s12
8813 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
8814 ; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[4:5]
8815 ; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3
8816 ; GFX9-NEXT: v_xor_b32_e32 v5, s1, v2
8817 ; GFX9-NEXT: v_mov_b32_e32 v6, s1
8818 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3
8819 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
8820 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
8821 ; GFX9-NEXT: s_endpgm
8822 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
8823 %r = sdiv <2 x i64> %x, %shl.y
8824 store <2 x i64> %r, ptr addrspace(1) %out
; Signed 64-bit remainder by the odd constant 1235195 (0x12d8fb).
; The opt expectations keep the srem instruction unchanged. The llc
; expectations for both targets show it lowered without a reciprocal-based
; division: multiply by a constant (0x6ca94220:0xfd81e19 pieces), arithmetic
; shift right by 19 plus sign-bit correction, then multiply the quotient by
; 0x12d8fb and subtract it from x.
8828 define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
8829 ; CHECK-LABEL: @srem_i64_oddk_denom(
8830 ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195
8831 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
8832 ; CHECK-NEXT: ret void
8834 ; GFX6-LABEL: srem_i64_oddk_denom:
8836 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
8837 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xfd81e19
8838 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x6ca94220
8839 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
8840 ; GFX6-NEXT: s_mov_b32 s2, -1
8841 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
8842 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
8843 ; GFX6-NEXT: v_mul_hi_u32 v4, s7, v2
8844 ; GFX6-NEXT: s_mov_b32 s0, s4
8845 ; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
8846 ; GFX6-NEXT: s_mul_i32 s4, s7, 0xfd81e19
8847 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3
8848 ; GFX6-NEXT: s_mul_i32 s1, s6, 0x6ca94220
8849 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
8850 ; GFX6-NEXT: s_ashr_i32 s4, s7, 31
8851 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v3
8852 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
8853 ; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2
8854 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8855 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1
8856 ; GFX6-NEXT: s_mul_i32 s1, s7, 0x6ca94220
8857 ; GFX6-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, vcc
8858 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1
8859 ; GFX6-NEXT: s_mul_i32 s1, s4, 0x6ca94220
8860 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc
8861 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v2
8862 ; GFX6-NEXT: s_mul_i32 s4, s4, 0xfd81e19
8863 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v0
8864 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
8865 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
8866 ; GFX6-NEXT: v_ashr_i64 v[2:3], v[0:1], 19
8867 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
8868 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
8869 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
8870 ; GFX6-NEXT: s_mov_b32 s4, 0x12d8fb
8871 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4
8872 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4
8873 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4
8874 ; GFX6-NEXT: s_mov_b32 s1, s5
8875 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
8876 ; GFX6-NEXT: v_mov_b32_e32 v2, s7
8877 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
8878 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
8879 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
8880 ; GFX6-NEXT: s_endpgm
8882 ; GFX9-LABEL: srem_i64_oddk_denom:
8884 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
8885 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8886 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8887 ; GFX9-NEXT: s_mul_i32 s7, s3, 0xfd81e19
8888 ; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0xfd81e19
8889 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19
8890 ; GFX9-NEXT: s_add_u32 s7, s7, s8
8891 ; GFX9-NEXT: s_mul_i32 s5, s2, 0x6ca94220
8892 ; GFX9-NEXT: s_addc_u32 s6, s6, 0
8893 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x6ca94220
8894 ; GFX9-NEXT: s_add_u32 s5, s5, s7
8895 ; GFX9-NEXT: s_addc_u32 s4, s4, 0
8896 ; GFX9-NEXT: s_add_u32 s4, s6, s4
8897 ; GFX9-NEXT: s_addc_u32 s5, 0, 0
8898 ; GFX9-NEXT: s_mul_i32 s7, s3, 0x6ca94220
8899 ; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0x6ca94220
8900 ; GFX9-NEXT: s_add_u32 s4, s7, s4
8901 ; GFX9-NEXT: s_addc_u32 s5, s6, s5
8902 ; GFX9-NEXT: s_ashr_i32 s6, s3, 31
8903 ; GFX9-NEXT: s_mul_i32 s7, s6, 0x6ca94220
8904 ; GFX9-NEXT: s_mul_hi_u32 s8, s6, 0xfd81e19
8905 ; GFX9-NEXT: s_add_i32 s7, s8, s7
8906 ; GFX9-NEXT: s_mul_i32 s6, s6, 0xfd81e19
8907 ; GFX9-NEXT: s_add_i32 s7, s7, s6
8908 ; GFX9-NEXT: s_add_u32 s4, s4, s6
8909 ; GFX9-NEXT: s_addc_u32 s5, s5, s7
8910 ; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 19
8911 ; GFX9-NEXT: s_lshr_b32 s4, s5, 31
8912 ; GFX9-NEXT: s_add_u32 s4, s6, s4
8913 ; GFX9-NEXT: s_addc_u32 s5, s7, 0
8914 ; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb
8915 ; GFX9-NEXT: s_mul_hi_u32 s6, s4, 0x12d8fb
8916 ; GFX9-NEXT: s_add_i32 s6, s6, s5
8917 ; GFX9-NEXT: s_mul_i32 s4, s4, 0x12d8fb
8918 ; GFX9-NEXT: s_sub_u32 s2, s2, s4
8919 ; GFX9-NEXT: s_subb_u32 s3, s3, s6
8920 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
8921 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
8922 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
8923 ; GFX9-NEXT: s_endpgm
8924 %r = srem i64 %x, 1235195
8925 store i64 %r, ptr addrspace(1) %out
; Signed 64-bit remainder by the power-of-two constant 4096.
; The opt expectations keep the srem instruction unchanged. The llc
; expectations for both targets show a short scalar sequence: derive a bias
; from the sign of x (ashr 31, lshr 20), add it to x, mask the low word with
; 0xfffff000, and subtract the result from x.
8929 define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
8930 ; CHECK-LABEL: @srem_i64_pow2k_denom(
8931 ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096
8932 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
8933 ; CHECK-NEXT: ret void
8935 ; GFX6-LABEL: srem_i64_pow2k_denom:
8937 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
8938 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
8939 ; GFX6-NEXT: s_mov_b32 s6, -1
8940 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
8941 ; GFX6-NEXT: s_mov_b32 s4, s0
8942 ; GFX6-NEXT: s_ashr_i32 s0, s3, 31
8943 ; GFX6-NEXT: s_lshr_b32 s0, s0, 20
8944 ; GFX6-NEXT: s_add_u32 s0, s2, s0
8945 ; GFX6-NEXT: s_mov_b32 s5, s1
8946 ; GFX6-NEXT: s_addc_u32 s1, s3, 0
8947 ; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000
8948 ; GFX6-NEXT: s_sub_u32 s0, s2, s0
8949 ; GFX6-NEXT: s_subb_u32 s1, s3, s1
8950 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
8951 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
8952 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8953 ; GFX6-NEXT: s_endpgm
8955 ; GFX9-LABEL: srem_i64_pow2k_denom:
8957 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
8958 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8959 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8960 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
8961 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
8962 ; GFX9-NEXT: s_add_u32 s4, s2, s4
8963 ; GFX9-NEXT: s_addc_u32 s5, s3, 0
8964 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
8965 ; GFX9-NEXT: s_sub_u32 s2, s2, s4
8966 ; GFX9-NEXT: s_subb_u32 s3, s3, s5
8967 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
8968 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
8969 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
8970 ; GFX9-NEXT: s_endpgm
8971 %r = srem i64 %x, 4096
8972 store i64 %r, ptr addrspace(1) %out
; Signed 64-bit remainder whose denominator is 4096 shifted left by the
; run-time value %y, so the divisor is not a compile-time constant.
; The opt expectations keep both the shl and the srem unchanged. The llc
; expectations for both targets show the long 64-bit expansion: the
; denominator and x are made non-negative via ashr/add/xor, a v_rcp_f32
; estimate is refined with multiply/add chains, the remainder is corrected by
; conditional subtraction of the denominator, and the result is xor'ed and
; subtracted with the sign mask derived from x before the store.
8976 define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
8977 ; CHECK-LABEL: @srem_i64_pow2_shl_denom(
8978 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8979 ; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
8980 ; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
8981 ; CHECK-NEXT: ret void
8983 ; GFX6-LABEL: srem_i64_pow2_shl_denom:
8985 ; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
8986 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
8987 ; GFX6-NEXT: s_mov_b32 s6, -1
8988 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
8989 ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
8990 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31
8991 ; GFX6-NEXT: s_add_u32 s0, s0, s2
8992 ; GFX6-NEXT: s_mov_b32 s3, s2
8993 ; GFX6-NEXT: s_addc_u32 s1, s1, s2
8994 ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3]
8995 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
8996 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9
8997 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
8998 ; GFX6-NEXT: s_sub_u32 s4, 0, s8
8999 ; GFX6-NEXT: s_subb_u32 s5, 0, s9
9000 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
9001 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
9002 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
9003 ; GFX6-NEXT: s_ashr_i32 s10, s3, 31
9004 ; GFX6-NEXT: s_add_u32 s2, s2, s10
9005 ; GFX6-NEXT: s_mov_b32 s11, s10
9006 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
9007 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
9008 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
9009 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
9010 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
9011 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
9012 ; GFX6-NEXT: s_addc_u32 s3, s3, s10
9013 ; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11]
9014 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
9015 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
9016 ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0
9017 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0
9018 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
9019 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
9020 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
9021 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
9022 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
9023 ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4
9024 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4
9025 ; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2
9026 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
9027 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
9028 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
9029 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
9030 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
9031 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
9032 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
9033 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
9034 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
9035 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
9036 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
9037 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
9038 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0
9039 ; GFX6-NEXT: s_mov_b32 s5, s1
9040 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
9041 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0
9042 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
9043 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
9044 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
9045 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
9046 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3
9047 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
9048 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2
9049 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
9050 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
9051 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
9052 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
9053 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
9054 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
9055 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
9056 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
9057 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
9058 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
9059 ; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1
9060 ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0
9061 ; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1
9062 ; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1
9063 ; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1
9064 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
9065 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
9066 ; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0
9067 ; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0
9068 ; GFX6-NEXT: s_mov_b32 s4, s0
9069 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
9070 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
9071 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
9072 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
9073 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
9074 ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1
9075 ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
9076 ; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0
9077 ; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0
9078 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
9079 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1
9080 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1
9081 ; GFX6-NEXT: v_mov_b32_e32 v3, s9
9082 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
9083 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
9084 ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0
9085 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
9086 ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5
9087 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3]
9088 ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4
9089 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
9090 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3]
9091 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5
9092 ; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4
9093 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3]
9094 ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
9095 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
9096 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1]
9097 ; GFX6-NEXT: v_mov_b32_e32 v4, s13
9098 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
9099 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
9100 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
9101 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
9102 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1]
9103 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
9104 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1
9105 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
9106 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
9107 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
9108 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
9109 ; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0
9110 ; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1
9111 ; GFX6-NEXT: v_mov_b32_e32 v2, s10
9112 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0
9113 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
9114 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9115 ; GFX6-NEXT: s_endpgm
9117 ; GFX9-LABEL: srem_i64_pow2_shl_denom:
9119 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
9120 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
9121 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
9122 ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
9123 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31
9124 ; GFX9-NEXT: s_add_u32 s0, s0, s2
9125 ; GFX9-NEXT: s_mov_b32 s3, s2
9126 ; GFX9-NEXT: s_addc_u32 s1, s1, s2
9127 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3]
9128 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
9129 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
9130 ; GFX9-NEXT: s_sub_u32 s0, 0, s6
9131 ; GFX9-NEXT: s_subb_u32 s1, 0, s7
9132 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
9133 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0
9134 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
9135 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
9136 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
9137 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
9138 ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
9139 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
9140 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
9141 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
9142 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1
9143 ; GFX9-NEXT: s_mul_i32 s4, s0, s2
9144 ; GFX9-NEXT: s_mul_hi_u32 s12, s0, s3
9145 ; GFX9-NEXT: s_mul_i32 s5, s1, s3
9146 ; GFX9-NEXT: s_add_i32 s4, s12, s4
9147 ; GFX9-NEXT: s_mul_i32 s13, s0, s3
9148 ; GFX9-NEXT: s_add_i32 s4, s4, s5
9149 ; GFX9-NEXT: s_mul_hi_u32 s12, s3, s13
9150 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4
9151 ; GFX9-NEXT: s_mul_i32 s3, s3, s4
9152 ; GFX9-NEXT: s_add_u32 s3, s12, s3
9153 ; GFX9-NEXT: s_addc_u32 s5, 0, s5
9154 ; GFX9-NEXT: s_mul_hi_u32 s14, s2, s13
9155 ; GFX9-NEXT: s_mul_i32 s13, s2, s13
9156 ; GFX9-NEXT: s_add_u32 s3, s3, s13
9157 ; GFX9-NEXT: s_mul_hi_u32 s12, s2, s4
9158 ; GFX9-NEXT: s_addc_u32 s3, s5, s14
9159 ; GFX9-NEXT: s_addc_u32 s5, s12, 0
9160 ; GFX9-NEXT: s_mul_i32 s4, s2, s4
9161 ; GFX9-NEXT: s_add_u32 s3, s3, s4
9162 ; GFX9-NEXT: s_addc_u32 s4, 0, s5
9163 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1
9164 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9165 ; GFX9-NEXT: s_addc_u32 s2, s2, s4
9166 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1
9167 ; GFX9-NEXT: s_mul_i32 s3, s0, s2
9168 ; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4
9169 ; GFX9-NEXT: s_add_i32 s3, s5, s3
9170 ; GFX9-NEXT: s_mul_i32 s1, s1, s4
9171 ; GFX9-NEXT: s_add_i32 s3, s3, s1
9172 ; GFX9-NEXT: s_mul_i32 s0, s0, s4
9173 ; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0
9174 ; GFX9-NEXT: s_mul_i32 s12, s2, s0
9175 ; GFX9-NEXT: s_mul_i32 s14, s4, s3
9176 ; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0
9177 ; GFX9-NEXT: s_mul_hi_u32 s13, s4, s3
9178 ; GFX9-NEXT: s_add_u32 s0, s0, s14
9179 ; GFX9-NEXT: s_addc_u32 s4, 0, s13
9180 ; GFX9-NEXT: s_add_u32 s0, s0, s12
9181 ; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3
9182 ; GFX9-NEXT: s_addc_u32 s0, s4, s5
9183 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
9184 ; GFX9-NEXT: s_mul_i32 s3, s2, s3
9185 ; GFX9-NEXT: s_add_u32 s0, s0, s3
9186 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
9187 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
9188 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9189 ; GFX9-NEXT: s_addc_u32 s2, s2, s1
9190 ; GFX9-NEXT: s_ashr_i32 s4, s11, 31
9191 ; GFX9-NEXT: s_add_u32 s0, s10, s4
9192 ; GFX9-NEXT: s_mov_b32 s5, s4
9193 ; GFX9-NEXT: s_addc_u32 s1, s11, s4
9194 ; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[4:5]
9195 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1
9196 ; GFX9-NEXT: s_mul_i32 s1, s10, s2
9197 ; GFX9-NEXT: s_mul_hi_u32 s5, s10, s3
9198 ; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2
9199 ; GFX9-NEXT: s_add_u32 s1, s5, s1
9200 ; GFX9-NEXT: s_addc_u32 s0, 0, s0
9201 ; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3
9202 ; GFX9-NEXT: s_mul_i32 s3, s11, s3
9203 ; GFX9-NEXT: s_add_u32 s1, s1, s3
9204 ; GFX9-NEXT: s_mul_hi_u32 s5, s11, s2
9205 ; GFX9-NEXT: s_addc_u32 s0, s0, s12
9206 ; GFX9-NEXT: s_addc_u32 s1, s5, 0
9207 ; GFX9-NEXT: s_mul_i32 s2, s11, s2
9208 ; GFX9-NEXT: s_add_u32 s0, s0, s2
9209 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
9210 ; GFX9-NEXT: s_mul_i32 s1, s6, s1
9211 ; GFX9-NEXT: s_mul_hi_u32 s2, s6, s0
9212 ; GFX9-NEXT: s_add_i32 s1, s2, s1
9213 ; GFX9-NEXT: s_mul_i32 s2, s7, s0
9214 ; GFX9-NEXT: s_mul_i32 s0, s6, s0
9215 ; GFX9-NEXT: s_add_i32 s5, s1, s2
9216 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
9217 ; GFX9-NEXT: s_sub_i32 s1, s11, s5
9218 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1
9219 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9220 ; GFX9-NEXT: s_subb_u32 s10, s1, s7
9221 ; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1
9222 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9223 ; GFX9-NEXT: s_subb_u32 s12, s10, 0
9224 ; GFX9-NEXT: s_cmp_ge_u32 s12, s7
9225 ; GFX9-NEXT: s_cselect_b32 s13, -1, 0
9226 ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s6, v2
9227 ; GFX9-NEXT: s_cmp_eq_u32 s12, s7
9228 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[2:3]
9229 ; GFX9-NEXT: v_mov_b32_e32 v4, s13
9230 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
9231 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9232 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3]
9233 ; GFX9-NEXT: s_subb_u32 s2, s10, s7
9234 ; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s6, v2
9235 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9236 ; GFX9-NEXT: s_subb_u32 s2, s2, 0
9237 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3
9238 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
9239 ; GFX9-NEXT: v_mov_b32_e32 v3, s12
9240 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
9241 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9242 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
9243 ; GFX9-NEXT: s_subb_u32 s0, s11, s5
9244 ; GFX9-NEXT: s_cmp_ge_u32 s0, s7
9245 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0
9246 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1
9247 ; GFX9-NEXT: s_cmp_eq_u32 s0, s7
9248 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
9249 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
9250 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
9251 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
9252 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
9253 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
9254 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
9255 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
9256 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
9257 ; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3
9258 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
9259 ; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s4, v1
9260 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
9261 ; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9]
9262 ; GFX9-NEXT: s_endpgm
9263 %shl.y = shl i64 4096, %y
9264 %r = srem i64 %x, %shl.y
9265 store i64 %r, ptr addrspace(1) %out
; Vector form of the power-of-two case: <2 x i64> srem by a splat of 4096.
; The opt expectations show the vector srem split into per-element
; extractelement / srem i64 ..., 4096 / insertelement. The llc expectations
; for both targets repeat the scalar sign-bias, mask (0xfffff000) and subtract
; sequence once per element and write both results with one dwordx4 store.
9269 define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
9270 ; CHECK-LABEL: @srem_v2i64_pow2k_denom(
9271 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9272 ; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
9273 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
9274 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
9275 ; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
9276 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
9277 ; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
9278 ; CHECK-NEXT: ret void
9280 ; GFX6-LABEL: srem_v2i64_pow2k_denom:
9282 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
9283 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
9284 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
9285 ; GFX6-NEXT: s_mov_b32 s6, -1
9286 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
9287 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31
9288 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20
9289 ; GFX6-NEXT: s_add_u32 s8, s0, s8
9290 ; GFX6-NEXT: s_addc_u32 s9, s1, 0
9291 ; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000
9292 ; GFX6-NEXT: s_sub_u32 s0, s0, s8
9293 ; GFX6-NEXT: s_subb_u32 s1, s1, s9
9294 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31
9295 ; GFX6-NEXT: s_lshr_b32 s8, s8, 20
9296 ; GFX6-NEXT: s_add_u32 s8, s2, s8
9297 ; GFX6-NEXT: s_addc_u32 s9, s3, 0
9298 ; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000
9299 ; GFX6-NEXT: s_sub_u32 s2, s2, s8
9300 ; GFX6-NEXT: s_subb_u32 s3, s3, s9
9301 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
9302 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
9303 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
9304 ; GFX6-NEXT: v_mov_b32_e32 v3, s3
9305 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9306 ; GFX6-NEXT: s_endpgm
9308 ; GFX9-LABEL: srem_v2i64_pow2k_denom:
9310 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
9311 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
9312 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
9313 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
9314 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31
9315 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
9316 ; GFX9-NEXT: s_add_u32 s4, s0, s4
9317 ; GFX9-NEXT: s_addc_u32 s5, s1, 0
9318 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
9319 ; GFX9-NEXT: s_sub_u32 s0, s0, s4
9320 ; GFX9-NEXT: s_subb_u32 s1, s1, s5
9321 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31
9322 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20
9323 ; GFX9-NEXT: s_add_u32 s4, s2, s4
9324 ; GFX9-NEXT: s_addc_u32 s5, s3, 0
9325 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
9326 ; GFX9-NEXT: s_sub_u32 s2, s2, s4
9327 ; GFX9-NEXT: s_subb_u32 s3, s3, s5
9328 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
9329 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
9330 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
9331 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
9332 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
9333 ; GFX9-NEXT: s_endpgm
9334 %r = srem <2 x i64> %x, <i64 4096, i64 4096>
9335 store <2 x i64> %r, ptr addrspace(1) %out
9339 define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
9340 ; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
9341 ; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
9342 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9343 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
9344 ; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
9345 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
9346 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
9347 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
9348 ; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
9349 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
9350 ; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16
9351 ; CHECK-NEXT: ret void
9353 ; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
9355 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
9356 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
9357 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
9358 ; GFX6-NEXT: s_mov_b32 s6, -1
9359 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
9360 ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12
9361 ; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s14
9362 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31
9363 ; GFX6-NEXT: s_add_u32 s0, s0, s2
9364 ; GFX6-NEXT: s_mov_b32 s3, s2
9365 ; GFX6-NEXT: s_addc_u32 s1, s1, s2
9366 ; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[2:3]
9367 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14
9368 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15
9369 ; GFX6-NEXT: s_sub_u32 s0, 0, s14
9370 ; GFX6-NEXT: s_subb_u32 s1, 0, s15
9371 ; GFX6-NEXT: s_ashr_i32 s12, s9, 31
9372 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
9373 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0
9374 ; GFX6-NEXT: s_mov_b32 s13, s12
9375 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
9376 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
9377 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
9378 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
9379 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
9380 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
9381 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1
9382 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0
9383 ; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0
9384 ; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0
9385 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
9386 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
9387 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
9388 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
9389 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
9390 ; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4
9391 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4
9392 ; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2
9393 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
9394 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
9395 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
9396 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
9397 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc
9398 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
9399 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
9400 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
9401 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
9402 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
9403 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1
9404 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0
9405 ; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0
9406 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
9407 ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0
9408 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
9409 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
9410 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
9411 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
9412 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3
9413 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
9414 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2
9415 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
9416 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
9417 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
9418 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
9419 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
9420 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
9421 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
9422 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
9423 ; GFX6-NEXT: s_add_u32 s0, s8, s12
9424 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
9425 ; GFX6-NEXT: s_addc_u32 s1, s9, s12
9426 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
9427 ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13]
9428 ; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1
9429 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0
9430 ; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1
9431 ; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1
9432 ; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1
9433 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
9434 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
9435 ; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0
9436 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
9437 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
9438 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
9439 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
9440 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
9441 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
9442 ; GFX6-NEXT: v_mul_lo_u32 v1, s14, v1
9443 ; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0
9444 ; GFX6-NEXT: v_mul_lo_u32 v3, s15, v0
9445 ; GFX6-NEXT: v_mul_lo_u32 v0, s14, v0
9446 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
9447 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1
9448 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1
9449 ; GFX6-NEXT: v_mov_b32_e32 v3, s15
9450 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
9451 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
9452 ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s14, v0
9453 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
9454 ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s15, v5
9455 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3]
9456 ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v4
9457 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
9458 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3]
9459 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s15, v5
9460 ; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s14, v4
9461 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3]
9462 ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
9463 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
9464 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1]
9465 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1]
9466 ; GFX6-NEXT: s_ashr_i32 s0, s17, 31
9467 ; GFX6-NEXT: s_add_u32 s2, s16, s0
9468 ; GFX6-NEXT: s_mov_b32 s1, s0
9469 ; GFX6-NEXT: s_addc_u32 s3, s17, s0
9470 ; GFX6-NEXT: v_mov_b32_e32 v4, s9
9471 ; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[0:1]
9472 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
9473 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8
9474 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s9
9475 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v1
9476 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
9477 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v0
9478 ; GFX6-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
9479 ; GFX6-NEXT: v_rcp_f32_e32 v4, v4
9480 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
9481 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v1
9482 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
9483 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
9484 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
9485 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4
9486 ; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2
9487 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4
9488 ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
9489 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
9490 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4
9491 ; GFX6-NEXT: s_sub_u32 s0, 0, s8
9492 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
9493 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v2
9494 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4
9495 ; GFX6-NEXT: s_subb_u32 s1, 0, s9
9496 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2
9497 ; GFX6-NEXT: s_ashr_i32 s14, s11, 31
9498 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3
9499 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2
9500 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
9501 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v3
9502 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5
9503 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v3
9504 ; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3
9505 ; GFX6-NEXT: v_mul_lo_u32 v3, v4, v3
9506 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
9507 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
9508 ; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5
9509 ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5
9510 ; GFX6-NEXT: s_mov_b32 s15, s14
9511 ; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0
9512 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8
9513 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
9514 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
9515 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3
9516 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
9517 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
9518 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc
9519 ; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3
9520 ; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2
9521 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2
9522 ; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1
9523 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
9524 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2
9525 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
9526 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4
9527 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5
9528 ; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4
9529 ; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5
9530 ; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5
9531 ; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4
9532 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8
9533 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
9534 ; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4
9535 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5
9536 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc
9537 ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
9538 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
9539 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
9540 ; GFX6-NEXT: s_add_u32 s0, s10, s14
9541 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
9542 ; GFX6-NEXT: s_addc_u32 s1, s11, s14
9543 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
9544 ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15]
9545 ; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3
9546 ; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2
9547 ; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3
9548 ; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3
9549 ; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3
9550 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
9551 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
9552 ; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2
9553 ; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2
9554 ; GFX6-NEXT: v_mov_b32_e32 v6, s12
9555 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7
9556 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
9557 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
9558 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
9559 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
9560 ; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3
9561 ; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2
9562 ; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2
9563 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
9564 ; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2
9565 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
9566 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
9567 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3
9568 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3
9569 ; GFX6-NEXT: v_mov_b32_e32 v5, s9
9570 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2
9571 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
9572 ; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2
9573 ; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
9574 ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7
9575 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3]
9576 ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6
9577 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
9578 ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3]
9579 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7
9580 ; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6
9581 ; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3]
9582 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9583 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
9584 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1]
9585 ; GFX6-NEXT: v_mov_b32_e32 v6, s11
9586 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
9587 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3
9588 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
9589 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2
9590 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1]
9591 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
9592 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3
9593 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
9594 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
9595 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
9596 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
9597 ; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2
9598 ; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3
9599 ; GFX6-NEXT: v_mov_b32_e32 v4, s14
9600 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2
9601 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
9602 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9603 ; GFX6-NEXT: s_endpgm
9605 ; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
9607 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
9608 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
9609 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
9610 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
9611 ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12
9612 ; GFX9-NEXT: s_lshl_b64 s[14:15], 0x1000, s14
9613 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31
9614 ; GFX9-NEXT: s_add_u32 s0, s0, s2
9615 ; GFX9-NEXT: s_mov_b32 s3, s2
9616 ; GFX9-NEXT: s_addc_u32 s1, s1, s2
9617 ; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3]
9618 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12
9619 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13
9620 ; GFX9-NEXT: s_sub_u32 s0, 0, s12
9621 ; GFX9-NEXT: s_subb_u32 s1, 0, s13
9622 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
9623 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0
9624 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
9625 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
9626 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
9627 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
9628 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
9629 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
9630 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
9631 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0
9632 ; GFX9-NEXT: s_mul_i32 s4, s0, s2
9633 ; GFX9-NEXT: s_mul_hi_u32 s16, s0, s3
9634 ; GFX9-NEXT: s_mul_i32 s5, s1, s3
9635 ; GFX9-NEXT: s_add_i32 s4, s16, s4
9636 ; GFX9-NEXT: s_mul_i32 s17, s0, s3
9637 ; GFX9-NEXT: s_add_i32 s4, s4, s5
9638 ; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4
9639 ; GFX9-NEXT: s_mul_i32 s16, s3, s4
9640 ; GFX9-NEXT: s_mul_hi_u32 s3, s3, s17
9641 ; GFX9-NEXT: s_add_u32 s3, s3, s16
9642 ; GFX9-NEXT: s_addc_u32 s5, 0, s5
9643 ; GFX9-NEXT: s_mul_hi_u32 s18, s2, s17
9644 ; GFX9-NEXT: s_mul_i32 s17, s2, s17
9645 ; GFX9-NEXT: s_add_u32 s3, s3, s17
9646 ; GFX9-NEXT: s_mul_hi_u32 s16, s2, s4
9647 ; GFX9-NEXT: s_addc_u32 s3, s5, s18
9648 ; GFX9-NEXT: s_addc_u32 s5, s16, 0
9649 ; GFX9-NEXT: s_mul_i32 s4, s2, s4
9650 ; GFX9-NEXT: s_add_u32 s3, s3, s4
9651 ; GFX9-NEXT: s_addc_u32 s4, 0, s5
9652 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0
9653 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9654 ; GFX9-NEXT: s_addc_u32 s2, s2, s4
9655 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0
9656 ; GFX9-NEXT: s_mul_i32 s3, s0, s2
9657 ; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4
9658 ; GFX9-NEXT: s_add_i32 s3, s5, s3
9659 ; GFX9-NEXT: s_mul_i32 s1, s1, s4
9660 ; GFX9-NEXT: s_add_i32 s3, s3, s1
9661 ; GFX9-NEXT: s_mul_i32 s0, s0, s4
9662 ; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0
9663 ; GFX9-NEXT: s_mul_i32 s16, s2, s0
9664 ; GFX9-NEXT: s_mul_i32 s18, s4, s3
9665 ; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0
9666 ; GFX9-NEXT: s_mul_hi_u32 s17, s4, s3
9667 ; GFX9-NEXT: s_add_u32 s0, s0, s18
9668 ; GFX9-NEXT: s_addc_u32 s4, 0, s17
9669 ; GFX9-NEXT: s_add_u32 s0, s0, s16
9670 ; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3
9671 ; GFX9-NEXT: s_addc_u32 s0, s4, s5
9672 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
9673 ; GFX9-NEXT: s_mul_i32 s3, s2, s3
9674 ; GFX9-NEXT: s_add_u32 s0, s0, s3
9675 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
9676 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
9677 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9678 ; GFX9-NEXT: s_addc_u32 s2, s2, s1
9679 ; GFX9-NEXT: s_ashr_i32 s16, s9, 31
9680 ; GFX9-NEXT: s_add_u32 s0, s8, s16
9681 ; GFX9-NEXT: s_mov_b32 s17, s16
9682 ; GFX9-NEXT: s_addc_u32 s1, s9, s16
9683 ; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17]
9684 ; GFX9-NEXT: v_readfirstlane_b32 s3, v0
9685 ; GFX9-NEXT: s_mul_i32 s1, s4, s2
9686 ; GFX9-NEXT: s_mul_hi_u32 s8, s4, s3
9687 ; GFX9-NEXT: s_mul_hi_u32 s0, s4, s2
9688 ; GFX9-NEXT: s_add_u32 s1, s8, s1
9689 ; GFX9-NEXT: s_addc_u32 s0, 0, s0
9690 ; GFX9-NEXT: s_mul_hi_u32 s9, s5, s3
9691 ; GFX9-NEXT: s_mul_i32 s3, s5, s3
9692 ; GFX9-NEXT: s_add_u32 s1, s1, s3
9693 ; GFX9-NEXT: s_mul_hi_u32 s8, s5, s2
9694 ; GFX9-NEXT: s_addc_u32 s0, s0, s9
9695 ; GFX9-NEXT: s_addc_u32 s1, s8, 0
9696 ; GFX9-NEXT: s_mul_i32 s2, s5, s2
9697 ; GFX9-NEXT: s_add_u32 s0, s0, s2
9698 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
9699 ; GFX9-NEXT: s_mul_i32 s1, s12, s1
9700 ; GFX9-NEXT: s_mul_hi_u32 s2, s12, s0
9701 ; GFX9-NEXT: s_add_i32 s1, s2, s1
9702 ; GFX9-NEXT: s_mul_i32 s2, s13, s0
9703 ; GFX9-NEXT: s_mul_i32 s0, s12, s0
9704 ; GFX9-NEXT: s_add_i32 s8, s1, s2
9705 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
9706 ; GFX9-NEXT: s_sub_i32 s1, s5, s8
9707 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
9708 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9709 ; GFX9-NEXT: s_subb_u32 s4, s1, s13
9710 ; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0
9711 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9712 ; GFX9-NEXT: s_subb_u32 s9, s4, 0
9713 ; GFX9-NEXT: s_cmp_ge_u32 s9, s13
9714 ; GFX9-NEXT: s_cselect_b32 s17, -1, 0
9715 ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1
9716 ; GFX9-NEXT: s_cmp_eq_u32 s9, s13
9717 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
9718 ; GFX9-NEXT: v_mov_b32_e32 v3, s17
9719 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
9720 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9721 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3]
9722 ; GFX9-NEXT: s_subb_u32 s2, s4, s13
9723 ; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1
9724 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9725 ; GFX9-NEXT: s_subb_u32 s2, s2, 0
9726 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
9727 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9728 ; GFX9-NEXT: v_mov_b32_e32 v2, s9
9729 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
9730 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9731 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
9732 ; GFX9-NEXT: s_subb_u32 s0, s5, s8
9733 ; GFX9-NEXT: s_cmp_ge_u32 s0, s13
9734 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0
9735 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0
9736 ; GFX9-NEXT: s_cmp_eq_u32 s0, s13
9737 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
9738 ; GFX9-NEXT: v_mov_b32_e32 v5, s1
9739 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
9740 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
9741 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
9742 ; GFX9-NEXT: s_ashr_i32 s0, s15, 31
9743 ; GFX9-NEXT: s_add_u32 s2, s14, s0
9744 ; GFX9-NEXT: s_mov_b32 s1, s0
9745 ; GFX9-NEXT: s_addc_u32 s3, s15, s0
9746 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
9747 ; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1]
9748 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
9749 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4
9750 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5
9751 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
9752 ; GFX9-NEXT: v_xor_b32_e32 v0, s16, v0
9753 ; GFX9-NEXT: v_xor_b32_e32 v2, s16, v2
9754 ; GFX9-NEXT: v_mac_f32_e32 v1, 0x4f800000, v3
9755 ; GFX9-NEXT: v_rcp_f32_e32 v3, v1
9756 ; GFX9-NEXT: v_mov_b32_e32 v5, s16
9757 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v0
9758 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v5, vcc
9759 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3
9760 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
9761 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
9762 ; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
9763 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
9764 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
9765 ; GFX9-NEXT: s_sub_u32 s0, 0, s4
9766 ; GFX9-NEXT: s_subb_u32 s1, 0, s5
9767 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
9768 ; GFX9-NEXT: v_readfirstlane_b32 s9, v3
9769 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
9770 ; GFX9-NEXT: s_mul_i32 s12, s0, s9
9771 ; GFX9-NEXT: s_mul_i32 s3, s1, s2
9772 ; GFX9-NEXT: s_add_i32 s8, s8, s12
9773 ; GFX9-NEXT: s_add_i32 s8, s8, s3
9774 ; GFX9-NEXT: s_mul_i32 s13, s0, s2
9775 ; GFX9-NEXT: s_mul_hi_u32 s3, s2, s8
9776 ; GFX9-NEXT: s_mul_i32 s12, s2, s8
9777 ; GFX9-NEXT: s_mul_hi_u32 s2, s2, s13
9778 ; GFX9-NEXT: s_add_u32 s2, s2, s12
9779 ; GFX9-NEXT: s_addc_u32 s3, 0, s3
9780 ; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13
9781 ; GFX9-NEXT: s_mul_i32 s13, s9, s13
9782 ; GFX9-NEXT: s_add_u32 s2, s2, s13
9783 ; GFX9-NEXT: s_mul_hi_u32 s12, s9, s8
9784 ; GFX9-NEXT: s_addc_u32 s2, s3, s14
9785 ; GFX9-NEXT: s_addc_u32 s3, s12, 0
9786 ; GFX9-NEXT: s_mul_i32 s8, s9, s8
9787 ; GFX9-NEXT: s_add_u32 s2, s2, s8
9788 ; GFX9-NEXT: s_addc_u32 s3, 0, s3
9789 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
9790 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9791 ; GFX9-NEXT: s_addc_u32 s2, s9, s3
9792 ; GFX9-NEXT: v_readfirstlane_b32 s8, v2
9793 ; GFX9-NEXT: s_mul_i32 s3, s0, s2
9794 ; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8
9795 ; GFX9-NEXT: s_add_i32 s3, s9, s3
9796 ; GFX9-NEXT: s_mul_i32 s1, s1, s8
9797 ; GFX9-NEXT: s_add_i32 s3, s3, s1
9798 ; GFX9-NEXT: s_mul_i32 s0, s0, s8
9799 ; GFX9-NEXT: s_mul_hi_u32 s9, s2, s0
9800 ; GFX9-NEXT: s_mul_i32 s12, s2, s0
9801 ; GFX9-NEXT: s_mul_i32 s14, s8, s3
9802 ; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0
9803 ; GFX9-NEXT: s_mul_hi_u32 s13, s8, s3
9804 ; GFX9-NEXT: s_add_u32 s0, s0, s14
9805 ; GFX9-NEXT: s_addc_u32 s8, 0, s13
9806 ; GFX9-NEXT: s_add_u32 s0, s0, s12
9807 ; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3
9808 ; GFX9-NEXT: s_addc_u32 s0, s8, s9
9809 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
9810 ; GFX9-NEXT: s_mul_i32 s3, s2, s3
9811 ; GFX9-NEXT: s_add_u32 s0, s0, s3
9812 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
9813 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
9814 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9815 ; GFX9-NEXT: s_addc_u32 s2, s2, s1
9816 ; GFX9-NEXT: s_ashr_i32 s8, s11, 31
9817 ; GFX9-NEXT: s_add_u32 s0, s10, s8
9818 ; GFX9-NEXT: s_mov_b32 s9, s8
9819 ; GFX9-NEXT: s_addc_u32 s1, s11, s8
9820 ; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
9821 ; GFX9-NEXT: v_readfirstlane_b32 s3, v2
9822 ; GFX9-NEXT: s_mul_i32 s1, s10, s2
9823 ; GFX9-NEXT: s_mul_hi_u32 s9, s10, s3
9824 ; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2
9825 ; GFX9-NEXT: s_add_u32 s1, s9, s1
9826 ; GFX9-NEXT: s_addc_u32 s0, 0, s0
9827 ; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3
9828 ; GFX9-NEXT: s_mul_i32 s3, s11, s3
9829 ; GFX9-NEXT: s_add_u32 s1, s1, s3
9830 ; GFX9-NEXT: s_mul_hi_u32 s9, s11, s2
9831 ; GFX9-NEXT: s_addc_u32 s0, s0, s12
9832 ; GFX9-NEXT: s_addc_u32 s1, s9, 0
9833 ; GFX9-NEXT: s_mul_i32 s2, s11, s2
9834 ; GFX9-NEXT: s_add_u32 s0, s0, s2
9835 ; GFX9-NEXT: s_addc_u32 s1, 0, s1
9836 ; GFX9-NEXT: s_mul_i32 s1, s4, s1
9837 ; GFX9-NEXT: s_mul_hi_u32 s2, s4, s0
9838 ; GFX9-NEXT: s_add_i32 s1, s2, s1
9839 ; GFX9-NEXT: s_mul_i32 s2, s5, s0
9840 ; GFX9-NEXT: s_mul_i32 s0, s4, s0
9841 ; GFX9-NEXT: s_add_i32 s9, s1, s2
9842 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
9843 ; GFX9-NEXT: s_sub_i32 s1, s11, s9
9844 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2
9845 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9846 ; GFX9-NEXT: s_subb_u32 s10, s1, s5
9847 ; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2
9848 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9849 ; GFX9-NEXT: s_subb_u32 s12, s10, 0
9850 ; GFX9-NEXT: s_cmp_ge_u32 s12, s5
9851 ; GFX9-NEXT: s_cselect_b32 s13, -1, 0
9852 ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v3
9853 ; GFX9-NEXT: s_cmp_eq_u32 s12, s5
9854 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3]
9855 ; GFX9-NEXT: v_mov_b32_e32 v6, s13
9856 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
9857 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9858 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
9859 ; GFX9-NEXT: s_subb_u32 s2, s10, s5
9860 ; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v3
9861 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
9862 ; GFX9-NEXT: s_subb_u32 s2, s2, 0
9863 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5
9864 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
9865 ; GFX9-NEXT: v_mov_b32_e32 v5, s12
9866 ; GFX9-NEXT: v_mov_b32_e32 v6, s2
9867 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
9868 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1]
9869 ; GFX9-NEXT: s_subb_u32 s0, s11, s9
9870 ; GFX9-NEXT: s_cmp_ge_u32 s0, s5
9871 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0
9872 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2
9873 ; GFX9-NEXT: s_cmp_eq_u32 s0, s5
9874 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
9875 ; GFX9-NEXT: v_mov_b32_e32 v7, s1
9876 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
9877 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
9878 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
9879 ; GFX9-NEXT: v_mov_b32_e32 v7, s0
9880 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
9881 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
9882 ; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2
9883 ; GFX9-NEXT: v_xor_b32_e32 v3, s8, v5
9884 ; GFX9-NEXT: v_mov_b32_e32 v5, s8
9885 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v2
9886 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
9887 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
9888 ; GFX9-NEXT: s_endpgm
9889 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
9890 %r = srem <2 x i64> %x, %shl.y
9891 store <2 x i64> %r, ptr addrspace(1) %out
; Exact signed division of a <2 x i32> by the power-of-two constants 4096 and
; 1024. The opt checks expect the vector divide to be split into one scalar
; `sdiv exact` per element; the llc checks expect one arithmetic right shift
; per element (by 12 and by 10).
define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) {
; CHECK-LABEL: @v_sdiv_i32_exact(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[NUM:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = sdiv exact i32 [[TMP1]], 4096
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[NUM]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = sdiv exact i32 [[TMP4]], 1024
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
; CHECK-NEXT: ret <2 x i32> [[TMP6]]
; GFX6-LABEL: v_sdiv_i32_exact:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 10, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: v_sdiv_i32_exact:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 10, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024>
  ret <2 x i32> %result
; Exact signed division of a <2 x i64> by the power-of-two constants 4096 and
; 1024. The opt checks expect the vector divide to be split into one scalar
; `sdiv exact` per element; the llc checks expect one 64-bit arithmetic right
; shift per element (by 12 and by 10).
define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) {
; CHECK-LABEL: @v_sdiv_i64_exact(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[NUM:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = sdiv exact i64 [[TMP1]], 4096
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[NUM]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = sdiv exact i64 [[TMP4]], 1024
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT: ret <2 x i64> [[TMP6]]
; GFX6-LABEL: v_sdiv_i64_exact:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashr_i64 v[0:1], v[0:1], 12
; GFX6-NEXT: v_ashr_i64 v[2:3], v[2:3], 10
; GFX6-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: v_sdiv_i64_exact:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 12, v[0:1]
; GFX9-NEXT: v_ashrrev_i64 v[2:3], 10, v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024>
  ret <2 x i64> %result
; Exact unsigned division of a <2 x i32> by the power-of-two constants 4096 and
; 1024. The opt checks expect the vector divide to be split into one scalar
; `udiv exact` per element; the llc checks expect one logical right shift per
; element (by 12 and by 10).
define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) {
; CHECK-LABEL: @v_udiv_i32_exact(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[NUM:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = udiv exact i32 [[TMP1]], 4096
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[NUM]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = udiv exact i32 [[TMP4]], 1024
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
; CHECK-NEXT: ret <2 x i32> [[TMP6]]
; GFX6-LABEL: v_udiv_i32_exact:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 10, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: v_udiv_i32_exact:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024>
  ret <2 x i32> %result
; Exact unsigned division of a <2 x i64> by the power-of-two constants 4096 and
; 1024. The opt checks expect the vector divide to be split into one scalar
; `udiv exact` per element; the llc checks expect one 64-bit logical right
; shift per element (by 12 and by 10).
define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) {
; CHECK-LABEL: @v_udiv_i64_exact(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[NUM:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = udiv exact i64 [[TMP1]], 4096
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[NUM]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = udiv exact i64 [[TMP4]], 1024
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT: ret <2 x i64> [[TMP6]]
; GFX6-LABEL: v_udiv_i64_exact:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 12
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 10
; GFX6-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: v_udiv_i64_exact:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 12, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 10, v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024>
  ret <2 x i64> %result
; Unsigned 64-bit division by the constant 10 where the dividend is
; -1 - sext(%size). Only llc checks are visible for this function. They expect
; a full 64-bit multiply-high by the constant whose 32-bit halves are
; 0xcccccccc (high) and 0xcccccccd (low), followed by a right shift of 3.
; NOTE(review): judging by the name, the point is that the dividend can exceed
; the signed i64 maximum when interpreted as unsigned, so the divide must not
; be narrowed -- confirm against the change that introduced this test.
define i64 @udiv_i64_gt_smax(i8 %size) {
; GFX6-LABEL: udiv_i64_gt_smax:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX6-NEXT: v_not_b32_e32 v1, v1
; GFX6-NEXT: v_not_b32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s4, 0xcccccccd
; GFX6-NEXT: v_mul_lo_u32 v3, v1, s4
; GFX6-NEXT: v_mul_hi_u32 v4, v0, s4
; GFX6-NEXT: s_mov_b32 s6, 0xcccccccc
; GFX6-NEXT: v_mul_hi_u32 v5, v1, s4
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s6
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 3
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: udiv_i64_gt_smax:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 31
; GFX9-NEXT: v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; GFX9-NEXT: s_mov_b32 s4, 0xcccccccd
; GFX9-NEXT: v_ashrrev_i32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_mul_hi_u32 v0, v4, s4
; GFX9-NEXT: v_not_b32_e32 v5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v6, v3
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1]
; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %esize = sext i8 %size to i64
  %minus = sub nuw nsw i64 -1, %esize
  %div = udiv i64 %minus, 10
; Unsigned 64-bit division by the constant 10 where the dividend is
; zext(%size) + 1, i.e. at most 256, which fits in 9 bits. Only llc checks are
; visible for this function. They expect the divide to be carried out in f32:
; convert the dividend, multiply by 0x3dcccccd (approximately 0.1), truncate,
; apply a +1 correction based on a comparison against 0x41200000 (10.0), then
; mask the quotient with 0x1ff and return a zero high half.
define i64 @udiv_i64_9divbits(i8 %size) {
; GFX6-LABEL: udiv_i64_9divbits:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s4, 0x41200000
; GFX6-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1
; GFX6-NEXT: v_mad_f32 v0, -v1, s4, v0
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GFX6-NEXT: v_and_b32_e32 v0, 0x1ff, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: udiv_i64_9divbits:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x41200000
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1
; GFX9-NEXT: v_mad_f32 v0, -v1, s4, v0
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %zextend = zext i8 %size to i64
  %num = add nuw nsw i64 1, %zextend
  %div = udiv i64 %num, 10