1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
5 ; FUNC-LABEL: {{^}}udiv24_i8:
; For the amdgcn runs the output must contain both v_cvt_f32_ubyte and
; v_rcp_iflag_f32; the relative order of the two is not constrained.
7 ; SI-DAG: v_cvt_f32_ubyte
8 ; SI-DAG: v_rcp_iflag_f32
; Kernel under test: loads two adjacent i8 values from %in (element 0 is the
; numerator, element 1 the denominator) and stores their unsigned quotient
; to %out. No function attributes are attached.
15 define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
; Address of the second byte, which holds the denominator.
16 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
17 %num = load i8, i8 addrspace(1) * %in
18 %den = load i8, i8 addrspace(1) * %den_ptr
19 %result = udiv i8 %num, %den
20 store i8 %result, i8 addrspace(1)* %out
24 ; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in_out:
; For the amdgcn runs the output must contain both v_cvt_f32_ubyte and
; v_rcp_iflag_f32; the relative order of the two is not constrained.
26 ; SI-DAG: v_cvt_f32_ubyte
27 ; SI-DAG: v_rcp_iflag_f32
; Kernel under test: i8 unsigned division of two adjacent bytes read from
; %in, result stored to %out. Carries attribute group #0, which sets
; "denormal-fp-math-f32" to "preserve-sign,preserve-sign".
34 define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
; Address of the second byte, which holds the denominator.
35 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
36 %num = load i8, i8 addrspace(1) * %in
37 %den = load i8, i8 addrspace(1) * %den_ptr
38 %result = udiv i8 %num, %den
39 store i8 %result, i8 addrspace(1)* %out
43 ; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in:
; For the amdgcn runs the output must contain both v_cvt_f32_ubyte and
; v_rcp_iflag_f32; the relative order of the two is not constrained.
45 ; SI-DAG: v_cvt_f32_ubyte
46 ; SI-DAG: v_rcp_iflag_f32
; Kernel under test: i8 unsigned division of two adjacent bytes read from
; %in, result stored to %out. Carries attribute group #1, which sets
; "denormal-fp-math-f32" to "ieee,preserve-sign".
53 define amdgpu_kernel void @udiv24_i8_denorm_flush_in(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
; Address of the second byte, which holds the denominator.
54 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
55 %num = load i8, i8 addrspace(1) * %in
56 %den = load i8, i8 addrspace(1) * %den_ptr
57 %result = udiv i8 %num, %den
58 store i8 %result, i8 addrspace(1)* %out
62 ; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_out:
; For the amdgcn runs the output must contain both v_cvt_f32_ubyte and
; v_rcp_iflag_f32; the relative order of the two is not constrained.
64 ; SI-DAG: v_cvt_f32_ubyte
65 ; SI-DAG: v_rcp_iflag_f32
; Kernel under test: i8 unsigned division of two adjacent bytes read from
; %in, result stored to %out. Carries attribute group #2, which sets
; "denormal-fp-math-f32" to "preserve-sign,ieee".
72 define amdgpu_kernel void @udiv24_i8_denorm_flush_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #2 {
; Address of the second byte, which holds the denominator.
73 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
74 %num = load i8, i8 addrspace(1) * %in
75 %den = load i8, i8 addrspace(1) * %den_ptr
76 %result = udiv i8 %num, %den
77 store i8 %result, i8 addrspace(1)* %out
81 ; FUNC-LABEL: {{^}}udiv24_i16:
; Kernel under test: loads two adjacent i16 values from %in (element 0 is
; the numerator, element 1 the denominator) and stores their unsigned
; quotient to %out. All memory accesses are 2-byte aligned.
91 define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; Address of the second i16 element, which holds the denominator.
92 %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
93 %num = load i16, i16 addrspace(1) * %in, align 2
94 %den = load i16, i16 addrspace(1) * %den_ptr, align 2
95 %result = udiv i16 %num, %den
96 store i16 %result, i16 addrspace(1)* %out, align 2
100 ; FUNC-LABEL: {{^}}udiv23_i32:
; For the amdgcn runs the output must contain both v_cvt_f32_u32 and
; v_rcp_iflag_f32, in either order.
102 ; SI-DAG: v_cvt_f32_u32
103 ; SI-DAG: v_rcp_iflag_f32
; For the r600 run the output must contain UINT_TO_FLT.
107 ; EG-DAG: UINT_TO_FLT
; Kernel under test: i32 unsigned division where both operands have been
; reduced to 23 significant bits before the udiv.
110 define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
111 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
112 %num = load i32, i32 addrspace(1) * %in, align 4
113 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; shl 9 followed by lshr 9 clears the top 9 bits, leaving 23-bit values.
114 %num.i23.0 = shl i32 %num, 9
115 %den.i23.0 = shl i32 %den, 9
116 %num.i23 = lshr i32 %num.i23.0, 9
117 %den.i23 = lshr i32 %den.i23.0, 9
118 %result = udiv i32 %num.i23, %den.i23
119 store i32 %result, i32 addrspace(1)* %out, align 4
123 ; FUNC-LABEL: {{^}}udiv24_i32:
; Kernel under test: i32 unsigned division where both operands have been
; reduced to 24 significant bits before the udiv.
127 define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
128 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
129 %num = load i32, i32 addrspace(1) * %in, align 4
130 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; shl 8 followed by lshr 8 clears the top 8 bits, leaving 24-bit values.
131 %num.i24.0 = shl i32 %num, 8
132 %den.i24.0 = shl i32 %den, 8
133 %num.i24 = lshr i32 %num.i24.0, 8
134 %den.i24 = lshr i32 %den.i24.0, 8
135 %result = udiv i32 %num.i24, %den.i24
136 store i32 %result, i32 addrspace(1)* %out, align 4
140 ; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
; Kernel under test: i32 unsigned division of a 23-bit numerator by a
; 24-bit denominator.
; NOTE(review): the "no_" prefix suggests the 24-bit expansion is not
; expected for this operand mix - confirm against the expectation lines.
144 define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
145 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
146 %num = load i32, i32 addrspace(1) * %in, align 4
147 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; Numerator keeps its low 23 bits (shift by 9); denominator keeps its low
; 24 bits (shift by 8).
148 %num.i23.0 = shl i32 %num, 9
149 %den.i24.0 = shl i32 %den, 8
150 %num.i23 = lshr i32 %num.i23.0, 9
151 %den.i24 = lshr i32 %den.i24.0, 8
152 %result = udiv i32 %num.i23, %den.i24
153 store i32 %result, i32 addrspace(1)* %out, align 4
157 ; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
; Kernel under test: i32 unsigned division of a 24-bit numerator by a
; 23-bit denominator.
; NOTE(review): the "no_" prefix suggests the 24-bit expansion is not
; expected for this operand mix - confirm against the expectation lines.
161 define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
162 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
163 %num = load i32, i32 addrspace(1) * %in, align 4
164 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; Numerator keeps its low 24 bits (shift by 8); denominator keeps its low
; 23 bits (shift by 9).
165 %num.i24.0 = shl i32 %num, 8
166 %den.i23.0 = shl i32 %den, 9
167 %num.i24 = lshr i32 %num.i24.0, 8
168 %den.i23 = lshr i32 %den.i23.0, 9
169 %result = udiv i32 %num.i24, %den.i23
170 store i32 %result, i32 addrspace(1)* %out, align 4
174 ; FUNC-LABEL: {{^}}udiv25_i32:
175 ; RCP_IFLAG is for URECIP in the full 32b alg
; For the r600 run the output of this function must not contain UINT_TO_FLT.
179 ; EG-NOT: UINT_TO_FLT
; Kernel under test: i32 unsigned division where both operands are 25 bits
; wide, i.e. one bit wider than the 24-bit cases in this file.
181 define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
182 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
183 %num = load i32, i32 addrspace(1) * %in, align 4
184 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; shl 7 followed by lshr 7 clears the top 7 bits, leaving 25-bit values.
185 %num.i25.0 = shl i32 %num, 7
186 %den.i25.0 = shl i32 %den, 7
187 %num.i25 = lshr i32 %num.i25.0, 7
188 %den.i25 = lshr i32 %den.i25.0, 7
189 %result = udiv i32 %num.i25, %den.i25
190 store i32 %result, i32 addrspace(1)* %out, align 4
194 ; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
195 ; RCP_IFLAG is for URECIP in the full 32b alg
; For the r600 run the output of this function must not contain UINT_TO_FLT.
199 ; EG-NOT: UINT_TO_FLT
; Kernel under test: i32 unsigned division of a 24-bit numerator by a
; 25-bit denominator.
201 define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
202 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
203 %num = load i32, i32 addrspace(1) * %in, align 4
204 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; Numerator keeps its low 24 bits (shift by 8). The denominator is shifted
; by 7 only, so %den.i24 is 25 bits wide despite its name.
205 %num.i24.0 = shl i32 %num, 8
206 %den.i24.0 = shl i32 %den, 7
207 %num.i24 = lshr i32 %num.i24.0, 8
208 %den.i24 = lshr i32 %den.i24.0, 7
209 %result = udiv i32 %num.i24, %den.i24
210 store i32 %result, i32 addrspace(1)* %out, align 4
214 ; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
215 ; RCP_IFLAG is for URECIP in the full 32b alg
; For the r600 run the output of this function must not contain UINT_TO_FLT.
219 ; EG-NOT: UINT_TO_FLT
; Kernel under test: i32 unsigned division of a 25-bit numerator by a
; 24-bit denominator.
221 define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
222 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
223 %num = load i32, i32 addrspace(1) * %in, align 4
224 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; The numerator is shifted by 7 only, so %num.i24 is 25 bits wide despite
; its name. Denominator keeps its low 24 bits (shift by 8).
225 %num.i24.0 = shl i32 %num, 7
226 %den.i24.0 = shl i32 %den, 8
227 %num.i24 = lshr i32 %num.i24.0, 7
228 %den.i24 = lshr i32 %den.i24.0, 8
229 %result = udiv i32 %num.i24, %den.i24
230 store i32 %result, i32 addrspace(1)* %out, align 4
234 ; FUNC-LABEL: {{^}}urem24_i8:
; For the amdgcn runs the output must contain v_cvt_f32_ubyte, followed by
; a second v_cvt_f32_ubyte and v_rcp_iflag_f32 in either order.
235 ; SI: v_cvt_f32_ubyte
236 ; SI-DAG: v_cvt_f32_ubyte
237 ; SI-DAG: v_rcp_iflag_f32
; For the r600 run the output must contain UINT_TO_FLT.
241 ; EG-DAG: UINT_TO_FLT
; Kernel under test: loads two adjacent i8 values from %in (element 0 is the
; numerator, element 1 the denominator) and stores their unsigned remainder
; to %out.
244 define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
; Address of the second byte, which holds the denominator.
245 %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
246 %num = load i8, i8 addrspace(1) * %in
247 %den = load i8, i8 addrspace(1) * %den_ptr
248 %result = urem i8 %num, %den
249 store i8 %result, i8 addrspace(1)* %out
253 ; FUNC-LABEL: {{^}}urem24_i16:
; For the amdgcn runs the output must contain v_rcp_iflag_f32.
256 ; SI: v_rcp_iflag_f32
; For the r600 run the output must contain UINT_TO_FLT.
260 ; EG-DAG: UINT_TO_FLT
; Kernel under test: loads two adjacent i16 values from %in (element 0 is
; the numerator, element 1 the denominator) and stores their unsigned
; remainder to %out. All memory accesses are 2-byte aligned.
263 define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; Address of the second i16 element, which holds the denominator.
264 %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
265 %num = load i16, i16 addrspace(1) * %in, align 2
266 %den = load i16, i16 addrspace(1) * %den_ptr, align 2
267 %result = urem i16 %num, %den
268 store i16 %result, i16 addrspace(1)* %out, align 2
272 ; FUNC-LABEL: {{^}}urem24_i32:
; Kernel under test: i32 unsigned remainder where both operands have been
; reduced to 24 significant bits before the urem.
275 define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
276 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
277 %num = load i32, i32 addrspace(1) * %in, align 4
278 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; shl 8 followed by lshr 8 clears the top 8 bits, leaving 24-bit values.
279 %num.i24.0 = shl i32 %num, 8
280 %den.i24.0 = shl i32 %den, 8
281 %num.i24 = lshr i32 %num.i24.0, 8
282 %den.i24 = lshr i32 %den.i24.0, 8
283 %result = urem i32 %num.i24, %den.i24
284 store i32 %result, i32 addrspace(1)* %out, align 4
288 ; FUNC-LABEL: {{^}}urem25_i32:
289 ; RCP_IFLAG is for URECIP in the full 32b alg
; For the r600 run the output of this function must not contain UINT_TO_FLT.
293 ; EG-NOT: UINT_TO_FLT
; Kernel under test: i32 unsigned remainder where both operands are 25 bits
; wide, i.e. one bit wider than the 24-bit cases in this file.
295 define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
296 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
297 %num = load i32, i32 addrspace(1) * %in, align 4
298 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; shl 7 followed by lshr 7 clears only the top 7 bits, so the values named
; ".i24" below are actually 25 bits wide.
299 %num.i24.0 = shl i32 %num, 7
300 %den.i24.0 = shl i32 %den, 7
301 %num.i24 = lshr i32 %num.i24.0, 7
302 %den.i24 = lshr i32 %den.i24.0, 7
303 %result = urem i32 %num.i24, %den.i24
304 store i32 %result, i32 addrspace(1)* %out, align 4
308 ; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
309 ; RCP_IFLAG is for URECIP in the full 32b alg
; For the r600 run the output of this function must not contain UINT_TO_FLT.
313 ; EG-NOT: UINT_TO_FLT
; Kernel under test: i32 unsigned remainder of a 24-bit numerator by a
; 25-bit denominator.
315 define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
316 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
317 %num = load i32, i32 addrspace(1) * %in, align 4
318 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; Numerator keeps its low 24 bits (shift by 8). The denominator is shifted
; by 7 only, so %den.i24 is 25 bits wide despite its name.
319 %num.i24.0 = shl i32 %num, 8
320 %den.i24.0 = shl i32 %den, 7
321 %num.i24 = lshr i32 %num.i24.0, 8
322 %den.i24 = lshr i32 %den.i24.0, 7
323 %result = urem i32 %num.i24, %den.i24
324 store i32 %result, i32 addrspace(1)* %out, align 4
328 ; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
329 ; RCP_IFLAG is for URECIP in the full 32b alg
; For the r600 run the output of this function must not contain UINT_TO_FLT.
333 ; EG-NOT: UINT_TO_FLT
; Kernel under test: i32 unsigned remainder of a 25-bit numerator by a
; 24-bit denominator.
335 define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
336 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
337 %num = load i32, i32 addrspace(1) * %in, align 4
338 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; The numerator is shifted by 7 only, so %num.i24 is 25 bits wide despite
; its name. Denominator keeps its low 24 bits (shift by 8).
339 %num.i24.0 = shl i32 %num, 7
340 %den.i24.0 = shl i32 %den, 8
341 %num.i24 = lshr i32 %num.i24.0, 7
342 %den.i24 = lshr i32 %den.i24.0, 8
343 %result = urem i32 %num.i24, %den.i24
344 store i32 %result, i32 addrspace(1)* %out, align 4
348 ; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
; For the amdgcn runs the output must contain v_rcp_iflag_f32 and an s_mov_b32
; that loads 0x7fffff (the low 23 bits set) into a scalar register, in either
; order, followed by a v_and_b32_e32 that uses that same scalar register.
349 ; SI-DAG: v_rcp_iflag_f32
350 ; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
351 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
; Kernel under test: i32 unsigned division of a 16-bit numerator by a
; 23-bit denominator.
354 define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
355 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
356 %num = load i32, i32 addrspace(1) * %in, align 4
357 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; Numerator keeps its low 16 bits (shift by 16); denominator keeps its low
; 23 bits (shift by 9).
358 %num.i16.0 = shl i32 %num, 16
359 %den.i23.0 = shl i32 %den, 9
360 %num.i16 = lshr i32 %num.i16.0, 16
361 %den.i23 = lshr i32 %den.i23.0, 9
362 %result = udiv i32 %num.i16, %den.i23
363 store i32 %result, i32 addrspace(1)* %out, align 4
367 ; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
; For the amdgcn runs the output must contain v_rcp_iflag_f32 and an s_mov_b32
; that loads 0x7fffff (the low 23 bits set) into a scalar register, in either
; order, followed by a v_and_b32_e32 that uses that same scalar register.
368 ; SI-DAG: v_rcp_iflag_f32
369 ; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
370 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
; Kernel under test: i32 unsigned division of a 23-bit numerator by a
; 16-bit denominator.
373 define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; Address of the second i32 element, which holds the denominator.
374 %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
375 %num = load i32, i32 addrspace(1) * %in, align 4
376 %den = load i32, i32 addrspace(1) * %den_ptr, align 4
; Numerator keeps its low 23 bits (shift by 9); denominator keeps its low
; 16 bits (shift by 16).
377 %num.i23.0 = shl i32 %num, 9
378 %den.i16.0 = shl i32 %den, 16
379 %num.i23 = lshr i32 %num.i23.0, 9
380 %den.i16 = lshr i32 %den.i16.0, 16
381 %result = udiv i32 %num.i23, %den.i16
382 store i32 %result, i32 addrspace(1)* %out, align 4
; Attribute groups used by the udiv24_i8_denorm_flush_* kernels. Per the LLVM
; Language Reference, a two-element "denormal-fp-math-f32" value is an
; "output,input" pair of f32 denormal handling modes.
; #0: preserve-sign for both outputs and inputs (udiv24_i8_denorm_flush_in_out).
386 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #1: ieee outputs, preserve-sign inputs (udiv24_i8_denorm_flush_in).
387 attributes #1 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
; #2: preserve-sign outputs, ieee inputs (udiv24_i8_denorm_flush_out).
388 attributes #2 = { "denormal-fp-math-f32"="preserve-sign,ieee" }