; Codegen checks for f32 fdiv lowering on amdgcn gfx900.
; The first llc invocation enables fp32 denormals (+fp32-denormals), the
; second disables them (-fp32-denormals); each has its own check prefix in
; addition to the shared one.
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s
; Scalar 1.0 / x, where the fdiv carries !fpmath !0.
; Denormal-enabled run: the checks expect a scale factor chosen by comparing
; |x| against 0x6f800000 (2^96) and selecting between 1.0 and 0x2f800000
; (2^-32); x is multiplied by that scale, fed to v_rcp_f32, and the result is
; multiplied by the same scale again.
; Denormal-flushing run: the checks expect one v_rcp_f32 of the loaded value.
; Both runs expect the result register to be the one that is stored.
4 ; GCN-LABEL: {{^}}div_1_by_x_25ulp:
5 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
6 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
7 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
8 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
9 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
10 ; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
11 ; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
12 ; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
14 ; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
16 ; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
17 define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
18 %load = load float, float addrspace(1)* %arg, align 4
19 %div = fdiv float 1.000000e+00, %load, !fpmath !0
20 store float %div, float addrspace(1)* %arg, align 4
; Scalar -1.0 / x, where the fdiv carries !fpmath !0.
; Denormal-enabled run: same scale / rcp / rescale sequence as the 1.0 / x
; case, except the pre-scale multiply is expected in its e64 form with the
; scale operand negated, i.e. the sign of the numerator is carried as a
; source modifier rather than a separate instruction.
; Denormal-flushing run: a single v_rcp_f32_e64 of the negated input.
24 ; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp:
25 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
26 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
27 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
28 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
29 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
30 ; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]]
31 ; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
32 ; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
34 ; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
36 ; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
37 define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
38 %load = load float, float addrspace(1)* %arg, align 4
39 %div = fdiv float -1.000000e+00, %load, !fpmath !0
40 store float %div, float addrspace(1)* %arg, align 4
; Scalar 1.0 / (-x), where -x is written as fsub -0.0, x and the fdiv carries
; !fpmath !0.
; Denormal-enabled run: scale / rcp / rescale sequence; the pre-scale
; multiply is expected in e64 form with the loaded value negated, so no
; standalone negate instruction is checked for.
; Denormal-flushing run: a single v_rcp_f32_e64 of the negated input.
44 ; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp:
45 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
46 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
47 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
48 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
49 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
50 ; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]]
51 ; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
52 ; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
54 ; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
56 ; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
57 define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
58 %load = load float, float addrspace(1)* %arg, align 4
59 %neg = fsub float -0.000000e+00, %load
60 %div = fdiv float 1.000000e+00, %neg, !fpmath !0
61 store float %div, float addrspace(1)* %arg, align 4
; Scalar -1.0 / (-x), where -x is written as fsub -0.0, x and the fdiv
; carries !fpmath !0.
; The checks are the same as for plain 1.0 / x: no negate modifier is
; expected on any operand in either run, i.e. the two sign flips are
; expected to cancel.
65 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp:
66 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
67 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
68 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
69 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
70 ; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
71 ; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
72 ; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
73 ; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
75 ; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
77 ; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
78 define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
79 %load = load float, float addrspace(1)* %arg, align 4
80 %neg = fsub float -0.000000e+00, %load
81 %div = fdiv float -1.000000e+00, %neg, !fpmath !0
82 store float %div, float addrspace(1)* %arg, align 4
; <4 x float> splat(1.0) / x, where the fdiv carries !fpmath !0.
; Denormal-enabled run: four copies of the compare / select-scale /
; pre-scale multiply / v_rcp_f32 / rescale multiply sequence are expected,
; in any order (all checks are DAG checks).
; Denormal-flushing run: four v_rcp_f32 are expected; the first and last are
; tied to the first and last registers of the loaded s_load_dwordx4 range,
; and their destinations must bound the register range that is stored.
86 ; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
87 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
88 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
89 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
90 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
91 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
92 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
93 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
94 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
95 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
96 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
97 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
98 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
99 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
100 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
101 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
102 ; GCN-DENORM-DAG: v_rcp_f32_e32
103 ; GCN-DENORM-DAG: v_rcp_f32_e32
104 ; GCN-DENORM-DAG: v_rcp_f32_e32
105 ; GCN-DENORM-DAG: v_rcp_f32_e32
106 ; GCN-DENORM-DAG: v_mul_f32_e32
107 ; GCN-DENORM-DAG: v_mul_f32_e32
108 ; GCN-DENORM-DAG: v_mul_f32_e32
109 ; GCN-DENORM-DAG: v_mul_f32_e32
111 ; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
112 ; GCN-FLUSH: v_rcp_f32_e32
113 ; GCN-FLUSH: v_rcp_f32_e32
114 ; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
115 ; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
116 define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
117 %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
118 %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
119 store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
; <4 x float> splat(-1.0) / x, where the fdiv carries !fpmath !0.
; Denormal-enabled run: four copies of the compare / select-scale /
; pre-scale multiply / v_rcp_f32 / rescale multiply sequence are expected,
; with the pre-scale multiply in e64 form and its scale operand negated.
; Denormal-flushing run: four v_rcp_f32_e64 of negated inputs are expected.
; The first and last are tied to the bounds of the s_load_dwordx4 range
; captured in this function (previously the checks silently reused the
; capture left over from an earlier function), and their destinations must
; bound the stored register range.
; NOTE(review): the flush-mode store check below was added for parity with
; the sibling v4 tests; confirm it against llc output.
123 ; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
124 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
125 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
126 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
127 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
128 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
129 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
130 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
131 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
132 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
133 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
134 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
135 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
136 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
137 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
138 ; GCN-DENORM-DAG: v_rcp_f32_e32
139 ; GCN-DENORM-DAG: v_rcp_f32_e32
140 ; GCN-DENORM-DAG: v_rcp_f32_e32
141 ; GCN-DENORM-DAG: v_rcp_f32_e32
142 ; GCN-DENORM-DAG: v_mul_f32_e32
143 ; GCN-DENORM-DAG: v_mul_f32_e32
144 ; GCN-DENORM-DAG: v_mul_f32_e32
145 ; GCN-DENORM-DAG: v_mul_f32_e32
147 ; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
148 ; GCN-FLUSH: v_rcp_f32_e64
149 ; GCN-FLUSH: v_rcp_f32_e64
150 ; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
151 define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
152 %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
153 %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
154 store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
; <4 x float> splat(1.0) / (-x), where -x is written as fsub splat(-0.0), x
; and the fdiv carries !fpmath !0.
; Denormal-enabled run: four copies of the compare / select-scale /
; pre-scale multiply / v_rcp_f32 / rescale multiply sequence are expected,
; with the pre-scale multiply in e64 form and its loaded operand negated.
; Denormal-flushing run: four v_rcp_f32_e64 of negated inputs are expected.
; The first and last are tied to the bounds of the s_load_dwordx4 range
; captured in this function (previously the checks silently reused the
; capture left over from an earlier function), and their destinations must
; bound the stored register range.
158 ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
159 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
160 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
161 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
162 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
163 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
164 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
165 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
166 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
167 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
168 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
169 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
170 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
171 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
172 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
173 ; GCN-DENORM-DAG: v_rcp_f32_e32
174 ; GCN-DENORM-DAG: v_rcp_f32_e32
175 ; GCN-DENORM-DAG: v_rcp_f32_e32
176 ; GCN-DENORM-DAG: v_rcp_f32_e32
177 ; GCN-DENORM-DAG: v_mul_f32_e32
178 ; GCN-DENORM-DAG: v_mul_f32_e32
179 ; GCN-DENORM-DAG: v_mul_f32_e32
180 ; GCN-DENORM-DAG: v_mul_f32_e32
182 ; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
183 ; GCN-FLUSH: v_rcp_f32_e64
184 ; GCN-FLUSH: v_rcp_f32_e64
185 ; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
186 ; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
187 define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
188 %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
189 %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
190 %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
191 store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
; <4 x float> splat(-1.0) / (-x), where -x is written as fsub splat(-0.0), x
; and the fdiv carries !fpmath !0.
; The checks match the plain splat(1.0) / x case: no negate modifier is
; expected on any operand in either run, i.e. the two sign flips are
; expected to cancel.
195 ; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
196 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
197 ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
198 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
199 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
200 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
201 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
202 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
203 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
204 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
205 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
206 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
207 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
208 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
209 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
210 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
211 ; GCN-DENORM-DAG: v_rcp_f32_e32
212 ; GCN-DENORM-DAG: v_rcp_f32_e32
213 ; GCN-DENORM-DAG: v_rcp_f32_e32
214 ; GCN-DENORM-DAG: v_rcp_f32_e32
215 ; GCN-DENORM-DAG: v_mul_f32_e32
216 ; GCN-DENORM-DAG: v_mul_f32_e32
217 ; GCN-DENORM-DAG: v_mul_f32_e32
218 ; GCN-DENORM-DAG: v_mul_f32_e32
220 ; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
221 ; GCN-FLUSH: v_rcp_f32_e32
222 ; GCN-FLUSH: v_rcp_f32_e32
223 ; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
224 ; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
225 define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
226 %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
227 %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
228 %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
229 store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
; <2.0, 1.0, -1.0, -2.0> / x, where the fdiv carries !fpmath !0: a mix of
; unit and non-unit numerators in one vector.
; Denormal-enabled run: v_div_scale / v_div_fmas / v_div_fixup are expected
; with 2.0 and -2.0 as trailing operands, alongside two scale / rcp / rescale
; sequences (one with a negated scale operand on the pre-scale multiply).
; Denormal-flushing run: one e32 and one e64 v_rcp_f32 are expected and no
; v_div* instruction may appear before the store.
; Both runs: exactly two compare / select-scale pairs are expected; the NOT
; checks reject any further v_cmp_gt_f32_e64 or v_cndmask_b32_e32 before the
; store.
233 ; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
234 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
235 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
236 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
237 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
238 ; GCN-DENORM-DAG: v_rcp_f32_e32
239 ; GCN-DENORM-DAG: v_rcp_f32_e32
241 ; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
242 ; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
244 ; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
245 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
246 ; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
247 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
249 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
250 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
251 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
252 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
253 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
254 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
256 ; GCN-DENORM-DAG: v_div_fmas_f32
257 ; GCN-DENORM-DAG: v_div_fmas_f32
258 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
259 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
261 ; GCN-FLUSH-DAG: v_rcp_f32_e32
262 ; GCN-FLUSH-DAG: v_rcp_f32_e64
264 ; GCN-NOT: v_cmp_gt_f32_e64
265 ; GCN-NOT: v_cndmask_b32_e32
266 ; GCN-FLUSH-NOT: v_div
268 ; GCN: global_store_dwordx4
269 define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
270 %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
271 %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
272 store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
; <2.0, 1.0, -1.0, -2.0> / (-x), where -x is written as fsub splat(-0.0), x
; and the fdiv carries !fpmath !0.
; Denormal-enabled run: v_div_scale / v_div_fmas / v_div_fixup are expected,
; here with -2.0 as the trailing operand on every v_div_scale and
; v_div_fixup check, alongside two scale / rcp / rescale sequences (one with
; the loaded operand negated on the pre-scale multiply).
; Denormal-flushing run: one e32 and one e64 v_rcp_f32 are expected and no
; v_div* instruction may appear before the store.
; Both runs: exactly two compare / select-scale pairs are expected; the NOT
; checks reject any further v_cmp_gt_f32_e64 or v_cndmask_b32_e32 before the
; store.
276 ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
277 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
278 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
279 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
280 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
281 ; GCN-DENORM-DAG: v_rcp_f32_e32
282 ; GCN-DENORM-DAG: v_rcp_f32_e32
284 ; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
285 ; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
287 ; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
288 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
289 ; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
290 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
292 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
293 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
294 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
295 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
296 ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
297 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
299 ; GCN-DENORM-DAG: v_div_fmas_f32
300 ; GCN-DENORM-DAG: v_div_fmas_f32
301 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
302 ; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
304 ; GCN-FLUSH-DAG: v_rcp_f32_e32
305 ; GCN-FLUSH-DAG: v_rcp_f32_e64
307 ; GCN-NOT: v_cmp_gt_f32_e64
308 ; GCN-NOT: v_cndmask_b32_e32
309 ; GCN-FLUSH-NOT: v_div
311 ; GCN: global_store_dwordx4
312 define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
313 %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
314 %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
315 %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
316 store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
; %num / x with a non-constant numerator passed as a kernel argument; the
; fdiv carries !fpmath !0.
; Denormal-enabled run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
; sequence is expected, and the v_div_fixup result is what gets stored.
; Denormal-flushing run: the compare-against-0x6f800000 / select-scale /
; pre-scale multiply / v_rcp_f32 / rescale multiply sequence is expected
; instead, and the final multiply's result is what gets stored.
320 ; GCN-LABEL: {{^}}div_v_by_x_25ulp:
321 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
323 ; GCN-DENORM-DAG: v_div_scale_f32
324 ; GCN-DENORM-DAG: v_rcp_f32_e32
325 ; GCN-DENORM-DAG: v_div_scale_f32
326 ; GCN-DENORM: v_div_fmas_f32
327 ; GCN-DENORM: v_div_fixup_f32 [[OUT:v[0-9]+]],
329 ; GCN-FLUSH-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
330 ; GCN-FLUSH-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
331 ; GCN-FLUSH-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
332 ; GCN-FLUSH-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
333 ; GCN-FLUSH: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
334 ; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
335 ; GCN-FLUSH: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
337 ; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
338 define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
339 %load = load float, float addrspace(1)* %arg, align 4
340 %div = fdiv float %num, %load, !fpmath !0
341 store float %div, float addrspace(1)* %arg, align 4
; 1.0 / x with the `fast` flag on the fdiv and no !fpmath metadata.
; Both runs expect a single v_rcp_f32 of the loaded value, stored directly.
345 ; GCN-LABEL: {{^}}div_1_by_x_fast:
346 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
347 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
348 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
349 define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
350 %load = load float, float addrspace(1)* %arg, align 4
351 %div = fdiv fast float 1.000000e+00, %load
352 store float %div, float addrspace(1)* %arg, align 4
; -1.0 / x with the `fast` flag on the fdiv and no !fpmath metadata.
; Both runs expect a single v_rcp_f32_e64 with the input negated via a
; source modifier, stored directly.
356 ; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
357 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
358 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
359 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
360 define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
361 %load = load float, float addrspace(1)* %arg, align 4
362 %div = fdiv fast float -1.000000e+00, %load
363 store float %div, float addrspace(1)* %arg, align 4
; 1.0 / (-x) with the `fast` flag on the fdiv; -x is written as
; fsub -0.0, x.
; Both runs expect a single v_rcp_f32_e64 with the input negated via a
; source modifier, stored directly.
367 ; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
368 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
369 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
370 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
371 define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
372 %load = load float, float addrspace(1)* %arg, align 4
373 %neg = fsub float -0.000000e+00, %load
374 %div = fdiv fast float 1.000000e+00, %neg
375 store float %div, float addrspace(1)* %arg, align 4
; -1.0 / (-x) with the `fast` flag on the fdiv; -x is written as
; fsub -0.0, x.
; Both runs expect a plain v_rcp_f32_e32 of the loaded value with no negate
; modifier, i.e. the two sign flips are expected to cancel.
379 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
380 ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
381 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
382 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
383 define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
384 %load = load float, float addrspace(1)* %arg, align 4
385 %neg = fsub float -0.000000e+00, %load
386 %div = fdiv fast float -1.000000e+00, %neg
387 store float %div, float addrspace(1)* %arg, align 4
; 1.0 / x with neither fast-math flags nor !fpmath metadata on the fdiv.
; Denormal-enabled run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
; sequence is expected.
; Denormal-flushing run: a single v_rcp_f32 of the loaded value is expected
; and its result is stored directly.
391 ; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
392 ; GCN-DENORM-DAG: v_div_scale_f32
393 ; GCN-DENORM-DAG: v_rcp_f32_e32
394 ; GCN-DENORM-DAG: v_div_scale_f32
395 ; GCN-DENORM: v_div_fmas_f32
396 ; GCN-DENORM: v_div_fixup_f32
398 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
399 ; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
400 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
401 define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
402 %load = load float, float addrspace(1)* %arg, align 4
403 %div = fdiv float 1.000000e+00, %load
404 store float %div, float addrspace(1)* %arg, align 4
; -1.0 / x with neither fast-math flags nor !fpmath metadata on the fdiv.
; Denormal-enabled run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
; sequence is expected.
; Denormal-flushing run: a single v_rcp_f32_e64 with the input negated via a
; source modifier is expected and its result is stored directly.
408 ; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
409 ; GCN-DENORM-DAG: v_div_scale_f32
410 ; GCN-DENORM-DAG: v_rcp_f32_e32
411 ; GCN-DENORM-DAG: v_div_scale_f32
412 ; GCN-DENORM: v_div_fmas_f32
413 ; GCN-DENORM: v_div_fixup_f32
415 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
416 ; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
417 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
418 define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
419 %load = load float, float addrspace(1)* %arg, align 4
420 %div = fdiv float -1.000000e+00, %load
421 store float %div, float addrspace(1)* %arg, align 4
; 1.0 / (-x) with neither fast-math flags nor !fpmath metadata on the fdiv;
; -x is written as fsub -0.0, x.
; Denormal-enabled run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
; sequence is expected.
; Denormal-flushing run: a single v_rcp_f32_e64 with the input negated via a
; source modifier is expected and its result is stored directly.
425 ; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
426 ; GCN-DENORM-DAG: v_div_scale_f32
427 ; GCN-DENORM-DAG: v_rcp_f32_e32
428 ; GCN-DENORM-DAG: v_div_scale_f32
429 ; GCN-DENORM: v_div_fmas_f32
430 ; GCN-DENORM: v_div_fixup_f32
432 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
433 ; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
434 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
435 define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
436 %load = load float, float addrspace(1)* %arg, align 4
437 %neg = fsub float -0.000000e+00, %load
438 %div = fdiv float 1.000000e+00, %neg
439 store float %div, float addrspace(1)* %arg, align 4
; -1.0 / (-x) with neither fast-math flags nor !fpmath metadata on the fdiv;
; -x is written as fsub -0.0, x.
; Denormal-enabled run: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
; sequence is expected.
; Denormal-flushing run: a plain v_rcp_f32_e32 of the loaded value with no
; negate modifier is expected, i.e. the two sign flips are expected to
; cancel.
443 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
444 ; GCN-DENORM-DAG: v_div_scale_f32
445 ; GCN-DENORM-DAG: v_rcp_f32_e32
446 ; GCN-DENORM-DAG: v_div_scale_f32
447 ; GCN-DENORM: v_div_fmas_f32
448 ; GCN-DENORM: v_div_fixup_f32
450 ; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
451 ; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
452 ; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
453 define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
454 %load = load float, float addrspace(1)* %arg, align 4
455 %neg = fsub float -0.000000e+00, %load
456 %div = fdiv float -1.000000e+00, %neg
457 store float %div, float addrspace(1)* %arg, align 4
; Metadata node referenced as !fpmath !0 by the *_25ulp tests above; its
; single float operand is 2.5.
461 !0 = !{float 2.500000e+00}