; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; FUNC-LABEL: {{^}}udiv_i32:
; SI: v_rcp_iflag_f32_e32
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = udiv i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}s_udiv_i32:
; SI: v_rcp_iflag_f32_e32
define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %result = udiv i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; The code generated by udiv is long and complex and may frequently
; change. The goal of this test is to make sure ISel doesn't fail when
; it gets a v4i32 udiv.

; FUNC-LABEL: {{^}}udiv_v2i32:
; SI: v_rcp_iflag_f32_e32
; SI: v_rcp_iflag_f32_e32
define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
  %result = udiv <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}udiv_v4i32:
define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
  %result = udiv <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}udiv_i32_div_pow2:
; SI: buffer_load_dword [[VAL:v[0-9]+]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]]
; SI: buffer_store_dword [[RESULT]]
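; A udiv by 16 is a power-of-two divide, so no magic-number expansion is
; needed; the checks above only expect a single logical shift right by 4.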
define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 16
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}udiv_i32_div_k_even:
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xfabbd9c1
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
; SI: buffer_store_dword [[RESULT]]
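; 0xfabbd9c1 == ceil(2^57 / 34259182), so the expected sequence computes
; (x * K) >> 32 with v_mul_hi_u32 and then shifts right by another 25 bits,
; i.e. an unsigned divide by 34259182 via magic-number multiplication.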
define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259182
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}udiv_i32_div_k_odd:
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7d5deca3
; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
; SI: buffer_store_dword [[RESULT]]
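; Likewise, 0x7d5deca3 == ceil(2^56 / 34259183), so v_mul_hi_u32 followed by
; a shift right of 24 implements the unsigned divide by 34259183.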
define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %result = udiv i32 %a, 34259183
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_udiv_i8:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
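; Sub-32-bit unsigned division is promoted to 32 bits and expanded through the
; f32 reciprocal sequence; this test (and the i16/i23 tests below) only checks
; for the reciprocal plus the mask back to the original width before the
; zero-extended result is stored.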
define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %num = load i8, i8 addrspace(1) * %in
  %den = load i8, i8 addrspace(1) * %den_ptr
  %result = udiv i8 %num, %den
  %result.ext = zext i8 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_udiv_i16:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
  %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %num = load i16, i16 addrspace(1) * %in
  %den = load i16, i16 addrspace(1) * %den_ptr
  %result = udiv i16 %num, %den
  %result.ext = zext i16 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_udiv_i23:
; SI: v_rcp_iflag_f32
; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; SI: buffer_store_dword [[TRUNC]]
define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
  %num = load i23, i23 addrspace(1) * %in
  %den = load i23, i23 addrspace(1) * %den_ptr
  %result = udiv i23 %num, %den
  %result.ext = zext i23 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v_udiv_i24:
define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
  %num = load i24, i24 addrspace(1) * %in
  %den = load i24, i24 addrspace(1) * %den_ptr
  %result = udiv i24 %num, %den
  %result.ext = zext i24 %result to i32
  store i32 %result.ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: @scalarize_mulhu_4xi32
define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; FUNC-LABEL: {{^}}test_udiv2:
; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
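; Another power-of-two case: a udiv by 2 of the scalar kernel argument should
; fold to a single scalar shift right by 1.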
define amdgpu_kernel void @test_udiv2(i32 %p) {
  %i = udiv i32 %p, 2
  store volatile i32 %i, i32 addrspace(1)* undef
  ret void
}

; FUNC-LABEL: {{^}}test_udiv_3_mulhu:
; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
; SI: v_mul_hi_u32 v0, {{s[0-9]+}}, {{v[0-9]+}}
; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
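; 0xaaaaaaab == ceil(2^33 / 3), so v_mul_hi_u32 plus the final shift right by 1
; computes (x * K) >> 33, which is x udiv 3 by magic-number multiplication.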
define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
  %i = udiv i32 %p, 3
  store volatile i32 %i, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}fdiv_test_denormals
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
  %tmp = load i8, i8 addrspace(1)* null, align 1
  %tmp1 = sext i8 %tmp to i32
  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  %tmp5 = sdiv i32 %tmp1, %tmp4
  %tmp6 = trunc i32 %tmp5 to i8
  store i8 %tmp6, i8 addrspace(1)* null, align 1
  ret void
}