1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
3 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; div.scale.f32 with both inputs loaded from memory and the i1 selector false.
; %a and %b are volatile loads of in[tid] and in[tid + 1]; only element 0 of the
; { float, i1 } result is stored to %out, so the i1 output is unused.
; On every target the checks expect v_div_scale_f32 to receive %b as src0 and
; src1 and %a as src2, followed by the same end-of-kernel sequence that the
; other kernels in this file check (s_endpgm, preceded on GFX11 by s_nop 0 and
; the VGPR dealloc message).
8 define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
9 ; GFX7-LABEL: test_div_scale_f32_1:
11 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
12 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
13 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
14 ; GFX7-NEXT: s_mov_b32 s6, 0
15 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
17 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
18 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
19 ; GFX7-NEXT: s_waitcnt vmcnt(0)
20 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
21 ; GFX7-NEXT: s_waitcnt vmcnt(0)
22 ; GFX7-NEXT: s_mov_b32 s6, -1
23 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2
24 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
25 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
26 ; GFX7-NEXT: s_endpgm
28 ; GFX8-LABEL: test_div_scale_f32_1:
30 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
31 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
32 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
33 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
34 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
36 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
37 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
38 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
39 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
40 ; GFX8-NEXT: s_waitcnt vmcnt(0)
41 ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
42 ; GFX8-NEXT: s_waitcnt vmcnt(0)
43 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
44 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
45 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
46 ; GFX8-NEXT: flat_store_dword v[0:1], v2
47 ; GFX8-NEXT: s_endpgm
49 ; GFX10-LABEL: test_div_scale_f32_1:
51 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
52 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
53 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
55 ; GFX10-NEXT: s_waitcnt vmcnt(0)
56 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
57 ; GFX10-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1
59 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
60 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
61 ; GFX10-NEXT: s_endpgm
63 ; GFX11-LABEL: test_div_scale_f32_1:
65 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
66 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
67 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
69 ; GFX11-NEXT: s_waitcnt vmcnt(0)
70 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
71 ; GFX11-NEXT: s_waitcnt vmcnt(0)
72 ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
73 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
74 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
75 ; GFX11-NEXT: s_nop 0
76 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
77 ; GFX11-NEXT: s_endpgm
78 %tid = call i32 @llvm.amdgcn.workitem.id.x()
79 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
80 %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
82 %a = load volatile float, ptr addrspace(1) %gep.0, align 4
83 %b = load volatile float, ptr addrspace(1) %gep.1, align 4
85 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
86 %result0 = extractvalue { float, i1 } %result, 0
87 store float %result0, ptr addrspace(1) %out, align 4
; div.scale.f32 with both inputs loaded from memory and the i1 selector true.
; %a and %b are volatile loads of in[tid] and in[tid + 1]; only element 0 of the
; { float, i1 } result is stored to %out, so the i1 output is unused.
; On every target the checks expect v_div_scale_f32 to receive %a as src0 and
; src2 and %b as src1.
91 define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
92 ; GFX7-LABEL: test_div_scale_f32_2:
94 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
95 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
96 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
97 ; GFX7-NEXT: s_mov_b32 s6, 0
98 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
99 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
101 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
102 ; GFX7-NEXT: s_waitcnt vmcnt(0)
103 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
104 ; GFX7-NEXT: s_waitcnt vmcnt(0)
105 ; GFX7-NEXT: s_mov_b32 s6, -1
106 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v2, v0, v2
107 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
108 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
109 ; GFX7-NEXT: s_endpgm
111 ; GFX8-LABEL: test_div_scale_f32_2:
113 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
114 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
115 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
116 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
117 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
118 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
119 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
120 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
121 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
122 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
123 ; GFX8-NEXT: s_waitcnt vmcnt(0)
124 ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
125 ; GFX8-NEXT: s_waitcnt vmcnt(0)
126 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0
127 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
128 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
129 ; GFX8-NEXT: flat_store_dword v[0:1], v2
130 ; GFX8-NEXT: s_endpgm
132 ; GFX10-LABEL: test_div_scale_f32_2:
134 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
135 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
136 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
137 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
138 ; GFX10-NEXT: s_waitcnt vmcnt(0)
139 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
140 ; GFX10-NEXT: s_waitcnt vmcnt(0)
141 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1
142 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
143 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
144 ; GFX10-NEXT: s_endpgm
146 ; GFX11-LABEL: test_div_scale_f32_2:
148 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
149 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
150 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
151 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
152 ; GFX11-NEXT: s_waitcnt vmcnt(0)
153 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
154 ; GFX11-NEXT: s_waitcnt vmcnt(0)
155 ; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1
156 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
157 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
158 ; GFX11-NEXT: s_nop 0
159 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
160 ; GFX11-NEXT: s_endpgm
161 %tid = call i32 @llvm.amdgcn.workitem.id.x()
162 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
163 %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
165 %a = load volatile float, ptr addrspace(1) %gep.0, align 4
166 %b = load volatile float, ptr addrspace(1) %gep.1, align 4
168 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
169 %result0 = extractvalue { float, i1 } %result, 0
170 store float %result0, ptr addrspace(1) %out, align 4
; div.scale.f64 with both inputs loaded from memory and the i1 selector false.
; %a and %b are volatile loads of in[tid] and in[tid + 1]; only element 0 of the
; { double, i1 } result is stored to %out. The %aptr argument is not referenced
; by the body; it only shifts the kernarg offset of %in.
; On every target the checks expect v_div_scale_f64 to receive %b as src0 and
; src1 and %a as src2.
174 define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) {
175 ; GFX7-LABEL: test_div_scale_f64_1:
177 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
178 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
179 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
180 ; GFX7-NEXT: s_mov_b32 s6, 0
181 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
182 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
184 ; GFX7-NEXT: s_waitcnt vmcnt(0)
185 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc
186 ; GFX7-NEXT: s_waitcnt vmcnt(0)
187 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
188 ; GFX7-NEXT: s_mov_b32 s6, -1
189 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3]
190 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
191 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
192 ; GFX7-NEXT: s_endpgm
194 ; GFX8-LABEL: test_div_scale_f64_1:
196 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
197 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
198 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
199 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
200 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
201 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
202 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
203 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
204 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
205 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
206 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
207 ; GFX8-NEXT: s_waitcnt vmcnt(0)
208 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
209 ; GFX8-NEXT: s_waitcnt vmcnt(0)
210 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
211 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
212 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
213 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
214 ; GFX8-NEXT: s_endpgm
216 ; GFX10-LABEL: test_div_scale_f64_1:
218 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
219 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
220 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
221 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
223 ; GFX10-NEXT: s_waitcnt vmcnt(0)
224 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
225 ; GFX10-NEXT: s_waitcnt vmcnt(0)
226 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
227 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
228 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
229 ; GFX10-NEXT: s_endpgm
231 ; GFX11-LABEL: test_div_scale_f64_1:
233 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
234 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
235 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
236 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
238 ; GFX11-NEXT: s_waitcnt vmcnt(0)
239 ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
240 ; GFX11-NEXT: s_waitcnt vmcnt(0)
241 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1]
242 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
243 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
244 ; GFX11-NEXT: s_nop 0
245 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
246 ; GFX11-NEXT: s_endpgm
247 %tid = call i32 @llvm.amdgcn.workitem.id.x()
248 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
249 %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
251 %a = load volatile double, ptr addrspace(1) %gep.0, align 8
252 %b = load volatile double, ptr addrspace(1) %gep.1, align 8
254 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
255 %result0 = extractvalue { double, i1 } %result, 0
256 store double %result0, ptr addrspace(1) %out, align 8
; div.scale.f64 with both inputs loaded from memory and the i1 selector true.
; %a and %b are volatile loads of in[tid] and in[tid + 1]; only element 0 of the
; { double, i1 } result is stored to %out. The %aptr argument is not referenced
; by the body; it only shifts the kernarg offset of %in.
; On every target the checks expect v_div_scale_f64 to receive %a as src0 and
; src2 and %b as src1.
260 define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) {
261 ; GFX7-LABEL: test_div_scale_f64_2:
263 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
264 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
265 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
266 ; GFX7-NEXT: s_mov_b32 s6, 0
267 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
268 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
269 ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
270 ; GFX7-NEXT: s_waitcnt vmcnt(0)
271 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc
272 ; GFX7-NEXT: s_waitcnt vmcnt(0)
273 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
274 ; GFX7-NEXT: s_mov_b32 s6, -1
275 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3]
276 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
277 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
278 ; GFX7-NEXT: s_endpgm
280 ; GFX8-LABEL: test_div_scale_f64_2:
282 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
283 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
284 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
285 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
286 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
287 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
288 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
289 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
290 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
291 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
292 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
293 ; GFX8-NEXT: s_waitcnt vmcnt(0)
294 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
295 ; GFX8-NEXT: s_waitcnt vmcnt(0)
296 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
297 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
298 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
299 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
300 ; GFX8-NEXT: s_endpgm
302 ; GFX10-LABEL: test_div_scale_f64_2:
304 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
305 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
306 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
307 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
308 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
309 ; GFX10-NEXT: s_waitcnt vmcnt(0)
310 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
311 ; GFX10-NEXT: s_waitcnt vmcnt(0)
312 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
313 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
314 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
315 ; GFX10-NEXT: s_endpgm
317 ; GFX11-LABEL: test_div_scale_f64_2:
319 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
320 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
321 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
322 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
324 ; GFX11-NEXT: s_waitcnt vmcnt(0)
325 ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
326 ; GFX11-NEXT: s_waitcnt vmcnt(0)
327 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1]
328 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
329 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
330 ; GFX11-NEXT: s_nop 0
331 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
332 ; GFX11-NEXT: s_endpgm
333 %tid = call i32 @llvm.amdgcn.workitem.id.x()
334 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
335 %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
337 %a = load volatile double, ptr addrspace(1) %gep.0, align 8
338 %b = load volatile double, ptr addrspace(1) %gep.1, align 8
340 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
341 %result0 = extractvalue { double, i1 } %result, 0
342 store double %result0, ptr addrspace(1) %out, align 8
; div.scale.f32 where %a is a kernel argument (scalar) and %b is a plain,
; non-volatile load of in[tid]; the i1 selector is false.
; An unnamed [8 x i32] argument sits between %in and %a, and the checks load %a
; from kernarg offset 0x15 on GFX7 and 0x54 on the other targets.
; The checks expect v_div_scale_f32 to receive the loaded %b as src0 and src1
; and the SGPR holding %a as src2. Only element 0 of the result is stored.
346 define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], float %a) {
347 ; GFX7-LABEL: test_div_scale_f32_scalar_num_1:
349 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
350 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15
351 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
352 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
353 ; GFX7-NEXT: s_mov_b32 s2, 0
354 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
355 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
356 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
357 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
358 ; GFX7-NEXT: s_mov_b32 s2, -1
359 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
360 ; GFX7-NEXT: s_waitcnt vmcnt(0)
361 ; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s8
362 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
363 ; GFX7-NEXT: s_endpgm
365 ; GFX8-LABEL: test_div_scale_f32_scalar_num_1:
367 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
368 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54
369 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
370 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
371 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
372 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
373 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
374 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
375 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
376 ; GFX8-NEXT: s_waitcnt vmcnt(0)
377 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0
378 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
379 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
380 ; GFX8-NEXT: flat_store_dword v[0:1], v2
381 ; GFX8-NEXT: s_endpgm
383 ; GFX10-LABEL: test_div_scale_f32_scalar_num_1:
385 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
386 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
387 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54
388 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
389 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
390 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
391 ; GFX10-NEXT: s_waitcnt vmcnt(0)
392 ; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0
393 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
394 ; GFX10-NEXT: s_endpgm
396 ; GFX11-LABEL: test_div_scale_f32_scalar_num_1:
398 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
399 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
400 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x54
401 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
403 ; GFX11-NEXT: s_waitcnt vmcnt(0)
404 ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s0
405 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
406 ; GFX11-NEXT: s_nop 0
407 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
408 ; GFX11-NEXT: s_endpgm
409 %tid = call i32 @llvm.amdgcn.workitem.id.x()
410 %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
412 %b = load float, ptr addrspace(1) %gep, align 4
414 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
415 %result0 = extractvalue { float, i1 } %result, 0
416 store float %result0, ptr addrspace(1) %out, align 4
; div.scale.f32 where %a is a kernel argument (scalar) and %b is a plain,
; non-volatile load of in[tid]; the i1 selector is true.
; %a directly follows the two pointer arguments, and the checks load it from
; kernarg offset 0xd on GFX7 and 0x34 on the other targets.
; The checks expect v_div_scale_f32 to receive the SGPR holding %a as src0 and
; src2 and the loaded %b as src1. Only element 0 of the result is stored.
420 define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) {
421 ; GFX7-LABEL: test_div_scale_f32_scalar_num_2:
423 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
424 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
425 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
426 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
427 ; GFX7-NEXT: s_mov_b32 s2, 0
428 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
429 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
430 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
431 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
432 ; GFX7-NEXT: s_mov_b32 s2, -1
433 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
434 ; GFX7-NEXT: s_waitcnt vmcnt(0)
435 ; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, v0, s8
436 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
437 ; GFX7-NEXT: s_endpgm
439 ; GFX8-LABEL: test_div_scale_f32_scalar_num_2:
441 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
442 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
443 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
444 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
445 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
446 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
447 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
448 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
449 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
450 ; GFX8-NEXT: s_waitcnt vmcnt(0)
451 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0
452 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
453 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
454 ; GFX8-NEXT: flat_store_dword v[0:1], v2
455 ; GFX8-NEXT: s_endpgm
457 ; GFX10-LABEL: test_div_scale_f32_scalar_num_2:
459 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
460 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
461 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
462 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
463 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
464 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
465 ; GFX10-NEXT: s_waitcnt vmcnt(0)
466 ; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0
467 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
468 ; GFX10-NEXT: s_endpgm
470 ; GFX11-LABEL: test_div_scale_f32_scalar_num_2:
472 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
473 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
474 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
475 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
476 ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
477 ; GFX11-NEXT: s_waitcnt vmcnt(0)
478 ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, v0, s0
479 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
480 ; GFX11-NEXT: s_nop 0
481 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
482 ; GFX11-NEXT: s_endpgm
483 %tid = call i32 @llvm.amdgcn.workitem.id.x()
484 %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
486 %b = load float, ptr addrspace(1) %gep, align 4
488 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
489 %result0 = extractvalue { float, i1 } %result, 0
490 store float %result0, ptr addrspace(1) %out, align 4
; div.scale.f32 where %b is a kernel argument (scalar) and %a is a plain,
; non-volatile load of in[tid]; the i1 selector is false.
; The checks expect v_div_scale_f32 to receive the SGPR holding %b as both src0
; and src1 and the loaded %a as src2. Only element 0 of the result is stored.
494 define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) {
495 ; GFX7-LABEL: test_div_scale_f32_scalar_den_1:
497 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
498 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
499 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
500 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
501 ; GFX7-NEXT: s_mov_b32 s2, 0
502 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
503 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
504 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
505 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
506 ; GFX7-NEXT: s_mov_b32 s2, -1
507 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
508 ; GFX7-NEXT: s_waitcnt vmcnt(0)
509 ; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, s8, v0
510 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
511 ; GFX7-NEXT: s_endpgm
513 ; GFX8-LABEL: test_div_scale_f32_scalar_den_1:
515 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
516 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
517 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
518 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
519 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
520 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
521 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
522 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
523 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
524 ; GFX8-NEXT: s_waitcnt vmcnt(0)
525 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0
526 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
527 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
528 ; GFX8-NEXT: flat_store_dword v[0:1], v2
529 ; GFX8-NEXT: s_endpgm
531 ; GFX10-LABEL: test_div_scale_f32_scalar_den_1:
533 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
534 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
535 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
536 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
537 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
538 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
539 ; GFX10-NEXT: s_waitcnt vmcnt(0)
540 ; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0
541 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
542 ; GFX10-NEXT: s_endpgm
544 ; GFX11-LABEL: test_div_scale_f32_scalar_den_1:
546 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
547 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
548 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
549 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
551 ; GFX11-NEXT: s_waitcnt vmcnt(0)
552 ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, v0
553 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
554 ; GFX11-NEXT: s_nop 0
555 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
556 ; GFX11-NEXT: s_endpgm
557 %tid = call i32 @llvm.amdgcn.workitem.id.x()
558 %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
560 %a = load float, ptr addrspace(1) %gep, align 4
562 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
563 %result0 = extractvalue { float, i1 } %result, 0
564 store float %result0, ptr addrspace(1) %out, align 4
; div.scale.f32 where %b is a kernel argument (scalar) and %a is a plain,
; non-volatile load of in[tid]; the i1 selector is true.
; The checks expect v_div_scale_f32 to receive the loaded %a as src0 and src2
; and the SGPR holding %b as src1. Only element 0 of the result is stored.
568 define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) {
569 ; GFX7-LABEL: test_div_scale_f32_scalar_den_2:
571 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
572 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
573 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
574 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
575 ; GFX7-NEXT: s_mov_b32 s2, 0
576 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
577 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
578 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
579 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
580 ; GFX7-NEXT: s_mov_b32 s2, -1
581 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
582 ; GFX7-NEXT: s_waitcnt vmcnt(0)
583 ; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, s8, v0
584 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
585 ; GFX7-NEXT: s_endpgm
587 ; GFX8-LABEL: test_div_scale_f32_scalar_den_2:
589 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
590 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
591 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
592 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
593 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
594 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
595 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
596 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
597 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
598 ; GFX8-NEXT: s_waitcnt vmcnt(0)
599 ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0
600 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
601 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
602 ; GFX8-NEXT: flat_store_dword v[0:1], v2
603 ; GFX8-NEXT: s_endpgm
605 ; GFX10-LABEL: test_div_scale_f32_scalar_den_2:
607 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
608 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
609 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
610 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
611 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
612 ; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
613 ; GFX10-NEXT: s_waitcnt vmcnt(0)
614 ; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0
615 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
616 ; GFX10-NEXT: s_endpgm
618 ; GFX11-LABEL: test_div_scale_f32_scalar_den_2:
620 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
621 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
622 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
623 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
624 ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
625 ; GFX11-NEXT: s_waitcnt vmcnt(0)
626 ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s0, v0
627 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
628 ; GFX11-NEXT: s_nop 0
629 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
630 ; GFX11-NEXT: s_endpgm
631 %tid = call i32 @llvm.amdgcn.workitem.id.x()
632 %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
634 %a = load float, ptr addrspace(1) %gep, align 4
636 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
637 %result0 = extractvalue { float, i1 } %result, 0
638 store float %result0, ptr addrspace(1) %out, align 4
; div.scale.f64 where %a is a kernel argument (scalar) and %b is a plain,
; non-volatile load of in[tid]; the i1 selector is false.
; An unnamed [8 x i32] argument sits between %in and %a, and the checks load %a
; from kernarg offset 0x15 on GFX7 and 0x54 on the other targets.
; The checks expect v_div_scale_f64 to receive the loaded %b as src0 and src1
; and the SGPR pair holding %a as src2. Only element 0 of the result is stored.
642 define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) {
643 ; GFX7-LABEL: test_div_scale_f64_scalar_num_1:
645 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
646 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15
647 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
648 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
649 ; GFX7-NEXT: s_mov_b32 s2, 0
650 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
651 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
652 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
653 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
654 ; GFX7-NEXT: s_mov_b32 s2, -1
655 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
656 ; GFX7-NEXT: s_waitcnt vmcnt(0)
657 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[8:9]
658 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
659 ; GFX7-NEXT: s_endpgm
661 ; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
663 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
664 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
665 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
666 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
667 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
668 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
669 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
670 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
671 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
672 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
673 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
674 ; GFX8-NEXT: s_waitcnt vmcnt(0)
675 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
676 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
677 ; GFX8-NEXT: s_endpgm
679 ; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
681 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
682 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
683 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
684 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
685 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
686 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
687 ; GFX10-NEXT: s_waitcnt vmcnt(0)
688 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
689 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
690 ; GFX10-NEXT: s_endpgm
692 ; GFX11-LABEL: test_div_scale_f64_scalar_num_1:
694 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
695 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
696 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
697 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
698 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
699 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
700 ; GFX11-NEXT: s_waitcnt vmcnt(0)
701 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1]
702 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
703 ; GFX11-NEXT: s_nop 0
704 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
705 ; GFX11-NEXT: s_endpgm
706 %tid = call i32 @llvm.amdgcn.workitem.id.x()
707 %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
709 %b = load double, ptr addrspace(1) %gep, align 8
711 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
712 %result0 = extractvalue { double, i1 } %result, 0
713 store double %result0, ptr addrspace(1) %out, align 8
; div.scale.f64 where %a is a kernel argument (scalar) and %b is a plain,
; non-volatile load of in[tid]; the i1 selector is true.
; An unnamed [8 x i32] argument sits between %in and %a, and the checks load %a
; from kernarg offset 0x15 on GFX7 and 0x54 on the other targets.
; The checks expect v_div_scale_f64 to receive the SGPR pair holding %a as src0
; and src2 and the loaded %b as src1. Only element 0 of the result is stored.
717 define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %a) {
718 ; GFX7-LABEL: test_div_scale_f64_scalar_num_2:
720 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
721 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15
722 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
723 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
724 ; GFX7-NEXT: s_mov_b32 s2, 0
725 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
726 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
727 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
728 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
729 ; GFX7-NEXT: s_mov_b32 s2, -1
730 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
731 ; GFX7-NEXT: s_waitcnt vmcnt(0)
732 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[8:9], v[0:1], s[8:9]
733 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
734 ; GFX7-NEXT: s_endpgm
736 ; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
738 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
739 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
740 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
741 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
742 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
743 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
744 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
745 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
746 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
747 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
748 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
749 ; GFX8-NEXT: s_waitcnt vmcnt(0)
750 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
751 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
752 ; GFX8-NEXT: s_endpgm
754 ; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
756 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
757 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
758 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
759 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
760 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
761 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
762 ; GFX10-NEXT: s_waitcnt vmcnt(0)
763 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
764 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
765 ; GFX10-NEXT: s_endpgm
767 ; GFX11-LABEL: test_div_scale_f64_scalar_num_2:
769 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
770 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
771 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
772 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
773 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
774 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
775 ; GFX11-NEXT: s_waitcnt vmcnt(0)
776 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1]
777 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
778 ; GFX11-NEXT: s_nop 0
779 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
780 ; GFX11-NEXT: s_endpgm
781 %tid = call i32 @llvm.amdgcn.workitem.id.x()
782 %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
784 %b = load double, ptr addrspace(1) %gep, align 8
786 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
787 %result0 = extractvalue { double, i1 } %result, 0
788 store double %result0, ptr addrspace(1) %out, align 8
; f64 div.scale where the first intrinsic argument (%a) is loaded per work-item
; from %in and the second (%b) is a kernel argument; the i1 selector is false.
; In every expected v_div_scale_f64 line the source operands are
; (scalar pair, scalar pair, loaded vector pair): the first source repeats the
; scalar-loaded value. Only element 0 of the returned struct is extracted and
; stored to %out.
792 define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) {
793 ; GFX7-LABEL: test_div_scale_f64_scalar_den_1:
795 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
796 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15
797 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
798 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
799 ; GFX7-NEXT: s_mov_b32 s2, 0
800 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
801 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
802 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
803 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
804 ; GFX7-NEXT: s_mov_b32 s2, -1
805 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
806 ; GFX7-NEXT: s_waitcnt vmcnt(0)
807 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[8:9], s[8:9], v[0:1]
808 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
809 ; GFX7-NEXT: s_endpgm
811 ; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
813 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
814 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
815 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
816 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
817 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
818 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
819 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
820 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
821 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
822 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
823 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
824 ; GFX8-NEXT: s_waitcnt vmcnt(0)
825 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
826 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
827 ; GFX8-NEXT: s_endpgm
829 ; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
831 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
832 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
833 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
834 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
835 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
836 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
837 ; GFX10-NEXT: s_waitcnt vmcnt(0)
838 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
839 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
840 ; GFX10-NEXT: s_endpgm
842 ; GFX11-LABEL: test_div_scale_f64_scalar_den_1:
844 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
845 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
846 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
847 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
848 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
849 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
850 ; GFX11-NEXT: s_waitcnt vmcnt(0)
851 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1]
852 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
853 ; GFX11-NEXT: s_nop 0
854 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
855 ; GFX11-NEXT: s_endpgm
; Index %in by the work-item id so %a is a per-lane (vector) value.
856 %tid = call i32 @llvm.amdgcn.workitem.id.x()
857 %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
859 %a = load double, ptr addrspace(1) %gep, align 8
; Selector i1 false; %b comes straight from the kernel arguments.
861 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
862 %result0 = extractvalue { double, i1 } %result, 0
863 store double %result0, ptr addrspace(1) %out, align 8
; Same inputs as the _den_1 variant (loaded %a, kernel-argument %b) but with
; the i1 selector set to true. In every expected v_div_scale_f64 line the
; source operands are (loaded vector pair, scalar pair, loaded vector pair):
; the first source now repeats the loaded value instead of the scalar one.
; Only element 0 of the returned struct is extracted and stored to %out.
867 define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], double %b) {
868 ; GFX7-LABEL: test_div_scale_f64_scalar_den_2:
870 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
871 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15
872 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
873 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
874 ; GFX7-NEXT: s_mov_b32 s2, 0
875 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
876 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
877 ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
878 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
879 ; GFX7-NEXT: s_mov_b32 s2, -1
880 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
881 ; GFX7-NEXT: s_waitcnt vmcnt(0)
882 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[8:9], v[0:1]
883 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
884 ; GFX7-NEXT: s_endpgm
886 ; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
888 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
889 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
890 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
891 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
892 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
893 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
894 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
895 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
896 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
897 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
898 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
899 ; GFX8-NEXT: s_waitcnt vmcnt(0)
900 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
901 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
902 ; GFX8-NEXT: s_endpgm
904 ; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
906 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
907 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
908 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
909 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
910 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
911 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
912 ; GFX10-NEXT: s_waitcnt vmcnt(0)
913 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
914 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
915 ; GFX10-NEXT: s_endpgm
917 ; GFX11-LABEL: test_div_scale_f64_scalar_den_2:
919 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
920 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
921 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
922 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
923 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
924 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
925 ; GFX11-NEXT: s_waitcnt vmcnt(0)
926 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1]
927 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
928 ; GFX11-NEXT: s_nop 0
929 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
930 ; GFX11-NEXT: s_endpgm
; Index %in by the work-item id so %a is a per-lane (vector) value.
931 %tid = call i32 @llvm.amdgcn.workitem.id.x()
932 %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
934 %a = load double, ptr addrspace(1) %gep, align 8
; Selector i1 true; this is the only IR difference from the _den_1 variant.
936 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
937 %result0 = extractvalue { double, i1 } %result, 0
938 store double %result0, ptr addrspace(1) %out, align 8
; f32 div.scale where both value arguments (%a, %b) are kernel arguments and
; the i1 selector is false. No memory load other than the kernel-argument
; loads appears in the expected output.
; Expected shape differs by target: the hawaii and fiji runs first copy one
; scalar into a VGPR (v_mov_b32) and use it for the first two sources, while
; the gfx1010 and gfx1100 runs pass SGPRs for all three sources.
; NOTE(review): presumably this reflects a tighter limit on scalar operands per
; VALU instruction on the older targets - confirm against the ISA docs.
; Only element 0 of the returned struct is extracted and stored to %out.
942 define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) {
943 ; GFX7-LABEL: test_div_scale_f32_all_scalar_1:
945 ; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c
946 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13
947 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
948 ; GFX7-NEXT: s_mov_b32 s2, -1
949 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX7-NEXT: v_mov_b32_e32 v0, s3
951 ; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s4
952 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
953 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
954 ; GFX7-NEXT: s_endpgm
956 ; GFX8-LABEL: test_div_scale_f32_all_scalar_1:
958 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
959 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
960 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
961 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
962 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
963 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3
964 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
965 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
966 ; GFX8-NEXT: flat_store_dword v[0:1], v2
967 ; GFX8-NEXT: s_endpgm
969 ; GFX10-LABEL: test_div_scale_f32_all_scalar_1:
971 ; GFX10-NEXT: s_clause 0x2
972 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c
973 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70
974 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
975 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
976 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
977 ; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4
978 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
979 ; GFX10-NEXT: s_endpgm
981 ; GFX11-LABEL: test_div_scale_f32_all_scalar_1:
983 ; GFX11-NEXT: s_clause 0x2
984 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
985 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
986 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
987 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
988 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
989 ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2
990 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
991 ; GFX11-NEXT: s_nop 0
992 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
993 ; GFX11-NEXT: s_endpgm
; Both operands are uniform kernel arguments; selector i1 false.
994 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
995 %result0 = extractvalue { float, i1 } %result, 0
996 store float %result0, ptr addrspace(1) %out, align 4
; f32 div.scale where both value arguments (%a, %b) are kernel arguments and
; the i1 selector is true (the _1 variant uses false).
; Expected shape differs by target: the hawaii and fiji runs copy one scalar
; into a VGPR and use it only as the second source, with the same SGPR in the
; first and third sources; the gfx1010 and gfx1100 runs use SGPRs for all
; three sources.
; Only element 0 of the returned struct is extracted and stored to %out.
1000 define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) {
1001 ; GFX7-LABEL: test_div_scale_f32_all_scalar_2:
1003 ; GFX7-NEXT: s_load_dword s3, s[0:1], 0x1c
1004 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x13
1005 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1006 ; GFX7-NEXT: s_mov_b32 s2, -1
1007 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1008 ; GFX7-NEXT: v_mov_b32_e32 v0, s3
1009 ; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s4, v0, s4
1010 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1011 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1012 ; GFX7-NEXT: s_endpgm
1014 ; GFX8-LABEL: test_div_scale_f32_all_scalar_2:
1016 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
1017 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
1018 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1019 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1020 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1021 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3
1022 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1023 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1024 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1025 ; GFX8-NEXT: s_endpgm
1027 ; GFX10-LABEL: test_div_scale_f32_all_scalar_2:
1029 ; GFX10-NEXT: s_clause 0x2
1030 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x4c
1031 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x70
1032 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1033 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1034 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1035 ; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4
1036 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
1037 ; GFX10-NEXT: s_endpgm
1039 ; GFX11-LABEL: test_div_scale_f32_all_scalar_2:
1041 ; GFX11-NEXT: s_clause 0x2
1042 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
1043 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
1044 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1045 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1046 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1047 ; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2
1048 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1049 ; GFX11-NEXT: s_nop 0
1050 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1051 ; GFX11-NEXT: s_endpgm
; Both operands are uniform kernel arguments; selector i1 true.
1052 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
1053 %result0 = extractvalue { float, i1 } %result, 0
1054 store float %result0, ptr addrspace(1) %out, align 4
; f64 counterpart of the f32 all-scalar test with selector false: both value
; arguments (%a, %b) are kernel arguments.
; Expected shape differs by target: the hawaii and fiji runs copy one scalar
; pair into v[0:1] (two v_mov_b32) and use it for the first two sources; the
; gfx1010 and gfx1100 runs use SGPR pairs for all three sources.
; Only element 0 of the returned struct is extracted and stored to %out.
1058 define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) {
1059 ; GFX7-LABEL: test_div_scale_f64_all_scalar_1:
1061 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d
1062 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
1063 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1064 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1065 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
1066 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
1067 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5]
1068 ; GFX7-NEXT: s_mov_b32 s2, -1
1069 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1070 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1071 ; GFX7-NEXT: s_endpgm
1073 ; GFX8-LABEL: test_div_scale_f64_all_scalar_1:
1075 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
1076 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
1077 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1078 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1079 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1080 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1081 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5]
1082 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1083 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1084 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1085 ; GFX8-NEXT: s_endpgm
1087 ; GFX10-LABEL: test_div_scale_f64_all_scalar_1:
1089 ; GFX10-NEXT: s_clause 0x1
1090 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
1091 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
1092 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1093 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1094 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1095 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
1096 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1097 ; GFX10-NEXT: s_endpgm
1099 ; GFX11-LABEL: test_div_scale_f64_all_scalar_1:
1101 ; GFX11-NEXT: s_clause 0x2
1102 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
1103 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
1104 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1105 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1106 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1107 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3]
1108 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1109 ; GFX11-NEXT: s_nop 0
1110 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1111 ; GFX11-NEXT: s_endpgm
; Both operands are uniform kernel arguments; selector i1 false.
1112 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
1113 %result0 = extractvalue { double, i1 } %result, 0
1114 store double %result0, ptr addrspace(1) %out, align 8
; f64 all-scalar test with the i1 selector set to true (the _1 variant uses
; false): both value arguments (%a, %b) are kernel arguments.
; Expected shape differs by target: the hawaii and fiji runs copy one scalar
; pair into v[0:1] and use it only as the second source, with the same SGPR
; pair in the first and third sources; the gfx1010 and gfx1100 runs use SGPR
; pairs for all three sources.
; Only element 0 of the returned struct is extracted and stored to %out.
1118 define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], double %b) {
1119 ; GFX7-LABEL: test_div_scale_f64_all_scalar_2:
1121 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x1d
1122 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
1123 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1124 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1125 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
1126 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
1127 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5]
1128 ; GFX7-NEXT: s_mov_b32 s2, -1
1129 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1130 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1131 ; GFX7-NEXT: s_endpgm
1133 ; GFX8-LABEL: test_div_scale_f64_all_scalar_2:
1135 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
1136 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
1137 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1138 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1139 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1140 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1141 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5]
1142 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1143 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1144 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1145 ; GFX8-NEXT: s_endpgm
1147 ; GFX10-LABEL: test_div_scale_f64_all_scalar_2:
1149 ; GFX10-NEXT: s_clause 0x1
1150 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
1151 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
1152 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1153 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1154 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1155 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
1156 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1157 ; GFX10-NEXT: s_endpgm
1159 ; GFX11-LABEL: test_div_scale_f64_all_scalar_2:
1161 ; GFX11-NEXT: s_clause 0x2
1162 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
1163 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
1164 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1165 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1166 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1167 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3]
1168 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1169 ; GFX11-NEXT: s_nop 0
1170 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1171 ; GFX11-NEXT: s_endpgm
; Both operands are uniform kernel arguments; selector i1 true.
1172 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
1173 %result0 = extractvalue { double, i1 } %result, 0
1174 store double %result0, ptr addrspace(1) %out, align 8
; f32 div.scale whose first intrinsic argument is the constant 1.0 and whose
; second is a value loaded per work-item from %in; the i1 selector is false.
; On all four targets the expected v_div_scale_f32 carries the constant
; directly as the operand "1.0" in the third source position, with the loaded
; value in the first two; no separate move of the constant is expected.
; Only element 0 of the returned struct is extracted and stored to %out.
1178 define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1179 ; GFX7-LABEL: test_div_scale_f32_inline_imm_num:
1181 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1182 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1183 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1184 ; GFX7-NEXT: s_mov_b32 s6, 0
1185 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1186 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1187 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
1188 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1189 ; GFX7-NEXT: s_mov_b32 s6, -1
1190 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1191 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, 1.0
1192 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
1193 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1194 ; GFX7-NEXT: s_endpgm
1196 ; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
1198 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1199 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1200 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1201 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1202 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1203 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1204 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1205 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1206 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
1208 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1209 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1210 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1211 ; GFX8-NEXT: s_endpgm
1213 ; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
1215 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1216 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1217 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1218 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1219 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1220 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1221 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0
1222 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1223 ; GFX10-NEXT: s_endpgm
1225 ; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
1227 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1228 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1229 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1230 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1231 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1232 ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0
1233 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1234 ; GFX11-NEXT: s_nop 0
1235 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1236 ; GFX11-NEXT: s_endpgm
; Load the per-work-item value used as the second intrinsic argument.
1237 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1238 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
1239 %a = load float, ptr addrspace(1) %gep.0, align 4
; Constant 1.0 as the first argument; selector i1 false.
1241 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false)
1242 %result0 = extractvalue { float, i1 } %result, 0
1243 store float %result0, ptr addrspace(1) %out, align 4
; f32 div.scale whose second intrinsic argument is the constant 2.0 and whose
; first is a value loaded per work-item from %in; the i1 selector is false.
; On all four targets the expected v_div_scale_f32 carries the constant
; directly as the operand "2.0" in both the first and second source positions,
; with the loaded value in the third; no separate move of the constant is
; expected.
; Only element 0 of the returned struct is extracted and stored to %out.
1247 define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1248 ; GFX7-LABEL: test_div_scale_f32_inline_imm_den:
1250 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1251 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1252 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1253 ; GFX7-NEXT: s_mov_b32 s6, 0
1254 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1255 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1256 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
1257 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1258 ; GFX7-NEXT: s_mov_b32 s6, -1
1259 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1260 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], 2.0, 2.0, v0
1261 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
1262 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1263 ; GFX7-NEXT: s_endpgm
1265 ; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
1267 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1268 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1269 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1270 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1271 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1272 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1273 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1274 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1275 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1276 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
1277 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1278 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1279 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1280 ; GFX8-NEXT: s_endpgm
1282 ; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
1284 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1285 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1286 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1287 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1288 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1289 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1290 ; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0
1291 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1292 ; GFX10-NEXT: s_endpgm
1294 ; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
1296 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1297 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1298 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1299 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1300 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1301 ; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0
1302 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1303 ; GFX11-NEXT: s_nop 0
1304 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1305 ; GFX11-NEXT: s_endpgm
; Load the per-work-item value used as the first intrinsic argument.
1306 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1307 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
1308 %a = load float, ptr addrspace(1) %gep.0, align 4
; Constant 2.0 as the second argument; selector i1 false.
1310 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false)
1311 %result0 = extractvalue { float, i1 } %result, 0
1312 store float %result0, ptr addrspace(1) %out, align 4
; f32 div.scale whose first intrinsic argument is llvm.fabs of a loaded value;
; the second argument is a second loaded value and the i1 selector is false.
; Both loads are volatile and read adjacent floats (%gep.0 and %gep.0 + 1).
; On all four targets the expected output computes the absolute value with a
; separate v_and_b32 against 0x7fffffff and feeds the result as the third
; source of v_div_scale_f32; the v_div_scale_f32 operands themselves are plain
; registers.
; Only element 0 of the returned struct is extracted and stored to %out.
1316 define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1317 ; GFX7-LABEL: test_div_scale_f32_fabs_num:
1319 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1320 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1321 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1322 ; GFX7-NEXT: s_mov_b32 s6, 0
1323 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1324 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1325 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
1326 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
1327 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1328 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
1329 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1330 ; GFX7-NEXT: s_mov_b32 s6, -1
1331 ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
1332 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v1
1333 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
1334 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1335 ; GFX7-NEXT: s_endpgm
1337 ; GFX8-LABEL: test_div_scale_f32_fabs_num:
1339 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1340 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1341 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1342 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1343 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1344 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1345 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1346 ; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
1347 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1348 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
1349 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1350 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
1351 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1352 ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
1353 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v1
1354 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1355 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1356 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1357 ; GFX8-NEXT: s_endpgm
1359 ; GFX10-LABEL: test_div_scale_f32_fabs_num:
1361 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1362 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1363 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1364 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
1365 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1366 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
1367 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1368 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
1369 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1370 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0
1371 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1372 ; GFX10-NEXT: s_endpgm
1374 ; GFX11-LABEL: test_div_scale_f32_fabs_num:
1376 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1377 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1378 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1379 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
1380 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1381 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
1382 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1383 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
1384 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1385 ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
1386 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1387 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1388 ; GFX11-NEXT: s_nop 0
1389 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1390 ; GFX11-NEXT: s_endpgm
; Two adjacent per-work-item floats, both read with volatile loads.
1391 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1392 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
1393 %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
1395 %a = load volatile float, ptr addrspace(1) %gep.0, align 4
1396 %b = load volatile float, ptr addrspace(1) %gep.1, align 4
; fabs is applied to the first intrinsic argument only.
1398 %a.fabs = call float @llvm.fabs.f32(float %a)
1400 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false)
1401 %result0 = extractvalue { float, i1 } %result, 0
1402 store float %result0, ptr addrspace(1) %out, align 4
; f32 div.scale whose second intrinsic argument is llvm.fabs of a loaded value;
; the first argument is another loaded value and the i1 selector is false.
; Both loads are volatile and read adjacent floats (%gep.0 and %gep.0 + 1).
; On all four targets the expected output computes the absolute value with a
; separate v_and_b32 against 0x7fffffff and feeds the result as both the first
; and second sources of v_div_scale_f32; the v_div_scale_f32 operands
; themselves are plain registers.
; Only element 0 of the returned struct is extracted and stored to %out.
1406 define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1407 ; GFX7-LABEL: test_div_scale_f32_fabs_den:
1409 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1410 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1411 ; GFX7-NEXT: v_mov_b32_e32 v1, 0
1412 ; GFX7-NEXT: s_mov_b32 s6, 0
1413 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1414 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1415 ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
1416 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
1417 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1418 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
1419 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1420 ; GFX7-NEXT: s_mov_b32 s6, -1
1421 ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
1422 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2
1423 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
1424 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1425 ; GFX7-NEXT: s_endpgm
1427 ; GFX8-LABEL: test_div_scale_f32_fabs_den:
1429 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1430 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1431 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1432 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
1433 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1434 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
1435 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1436 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
1437 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1438 ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
1439 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1440 ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
1441 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1442 ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
1443 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
1444 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1445 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1446 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1447 ; GFX8-NEXT: s_endpgm
1449 ; GFX10-LABEL: test_div_scale_f32_fabs_den:
1451 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1452 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1453 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1454 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
1455 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1456 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
1457 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1458 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2
1459 ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
1460 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1461 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1462 ; GFX10-NEXT: s_endpgm
1464 ; GFX11-LABEL: test_div_scale_f32_fabs_den:
1466 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1467 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1468 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1469 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
1470 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1471 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
1472 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1473 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
1474 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1475 ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
1476 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1477 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1478 ; GFX11-NEXT: s_nop 0
1479 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1480 ; GFX11-NEXT: s_endpgm
; Two adjacent per-work-item floats, both read with volatile loads.
1481 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1482 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
1483 %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
1485 %a = load volatile float, ptr addrspace(1) %gep.0, align 4
1486 %b = load volatile float, ptr addrspace(1) %gep.1, align 4
; fabs is applied to the second intrinsic argument only.
1488 %b.fabs = call float @llvm.fabs.f32(float %b)
1490 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false)
1491 %result0 = extractvalue { float, i1 } %result, 0
1492 store float %result0, ptr addrspace(1) %out, align 4
; f32 div.scale with the constant 8.0 as the first intrinsic argument and
; undef as the second; the i1 selector is false.
; In the expected output the undef operand shows up as register s0 in the
; first two source positions. The constant appears as 0x41000000 (the IEEE-754
; single-precision encoding of 8.0): the hawaii and fiji runs first move it
; into v0, while the gfx1010 and gfx1100 runs use it as a literal operand of
; v_div_scale_f32. On the fiji run the v_div_scale_f32 is expected before the
; kernel-argument load.
; Only element 0 of the returned struct is extracted and stored to %out.
1496 define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %out) #0 {
1497 ; GFX7-LABEL: test_div_scale_f32_val_undef_val:
1499 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1500 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000
1501 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1502 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0
1503 ; GFX7-NEXT: s_mov_b32 s2, -1
1504 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1505 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1506 ; GFX7-NEXT: s_endpgm
1508 ; GFX8-LABEL: test_div_scale_f32_val_undef_val:
1510 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
1511 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0
1512 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1513 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1514 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1515 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1516 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1517 ; GFX8-NEXT: s_endpgm
1519 ; GFX10-LABEL: test_div_scale_f32_val_undef_val:
1521 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1522 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1523 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1524 ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000
1525 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1526 ; GFX10-NEXT: s_endpgm
1528 ; GFX11-LABEL: test_div_scale_f32_val_undef_val:
1530 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1531 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1533 ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000
1534 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1535 ; GFX11-NEXT: s_nop 0
1536 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1537 ; GFX11-NEXT: s_endpgm
; Second argument is undef; the test still expects an instruction to be emitted.
1538 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
1539 %result0 = extractvalue { float, i1 } %result, 0
1540 store float %result0, ptr addrspace(1) %out, align 4
; f32 div.scale with undef as the first intrinsic argument and the constant
; 8.0 as the second; the i1 selector is false.
; In the expected output the undef operand shows up as register s0 in the
; third source position. The constant appears as 0x41000000 (the IEEE-754
; single-precision encoding of 8.0) in the first two source positions: the
; hawaii and fiji runs first move it into v0, while the gfx1010 and gfx1100
; runs use it as a literal operand. On the fiji run the v_div_scale_f32 is
; expected before the kernel-argument load.
; Only element 0 of the returned struct is extracted and stored to %out.
1544 define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %out) #0 {
1545 ; GFX7-LABEL: test_div_scale_f32_undef_val_val:
1547 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1548 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000
1549 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1550 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0
1551 ; GFX7-NEXT: s_mov_b32 s2, -1
1552 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1553 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1554 ; GFX7-NEXT: s_endpgm
1556 ; GFX8-LABEL: test_div_scale_f32_undef_val_val:
1558 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
1559 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0
1560 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1561 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1562 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1563 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1564 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1565 ; GFX8-NEXT: s_endpgm
1567 ; GFX10-LABEL: test_div_scale_f32_undef_val_val:
1569 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1570 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1571 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1572 ; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0
1573 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1574 ; GFX10-NEXT: s_endpgm
1576 ; GFX11-LABEL: test_div_scale_f32_undef_val_val:
1578 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1579 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1580 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1581 ; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
1582 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1583 ; GFX11-NEXT: s_nop 0
1584 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1585 ; GFX11-NEXT: s_endpgm
; First argument is undef; the test still expects an instruction to be emitted.
1586 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
1587 %result0 = extractvalue { float, i1 } %result, 0
1588 store float %result0, ptr addrspace(1) %out, align 4
; f32 div.scale with both floating-point operands undef (i1 flag false).
; Element 0 of the returned { float, i1 } pair is stored to %out.
; The autogenerated checks below expect every source operand of
; v_div_scale_f32 to be s0 on all four targets. On GFX8 the v_div_scale_f32 is
; expected before the s_load_dwordx2 that fetches %out; on the other targets
; it follows the load and its s_waitcnt.
1592 define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %out) #0 {
1593 ; GFX7-LABEL: test_div_scale_f32_undef_undef_val:
1595 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1596 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1597 ; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0
1598 ; GFX7-NEXT: s_mov_b32 s2, -1
1599 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1600 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1601 ; GFX7-NEXT: s_endpgm
1603 ; GFX8-LABEL: test_div_scale_f32_undef_undef_val:
1605 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0
1606 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1607 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1608 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1609 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1610 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1611 ; GFX8-NEXT: s_endpgm
1613 ; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
1615 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1616 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1617 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1618 ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0
1619 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1620 ; GFX10-NEXT: s_endpgm
1622 ; GFX11-LABEL: test_div_scale_f32_undef_undef_val:
1624 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1625 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1626 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1627 ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0
1628 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1629 ; GFX11-NEXT: s_nop 0
1630 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1631 ; GFX11-NEXT: s_endpgm
1632 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
1633 %result0 = extractvalue { float, i1 } %result, 0
1634 store float %result0, ptr addrspace(1) %out, align 4
; f64 div.scale with the constant 8.0 as the first operand and an undef second
; operand (i1 flag false). Element 0 of the returned { double, i1 } pair is
; stored to %out with 8-byte alignment.
; The autogenerated checks below expect s[0:1] in the first two source
; operands of v_div_scale_f64 (where the undef operand ends up) and 8.0 in the
; third. GFX7 and GFX8 build the constant in v[0:1] as 0 / 0x40200000; GFX10
; and GFX11 use the literal 0x40200000. On GFX8, GFX10 and GFX11 the
; v_div_scale_f64 is expected before the scalar load that fetches %out.
1638 define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 {
1639 ; GFX7-LABEL: test_div_scale_f64_val_undef_val:
1641 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1642 ; GFX7-NEXT: v_mov_b32_e32 v0, 0
1643 ; GFX7-NEXT: v_mov_b32_e32 v1, 0x40200000
1644 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1645 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1]
1646 ; GFX7-NEXT: s_mov_b32 s2, -1
1647 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1648 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1649 ; GFX7-NEXT: s_endpgm
1651 ; GFX8-LABEL: test_div_scale_f64_val_undef_val:
1653 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1654 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000
1655 ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1]
1656 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1657 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1658 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1659 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1660 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1661 ; GFX8-NEXT: s_endpgm
1663 ; GFX10-LABEL: test_div_scale_f64_val_undef_val:
1665 ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000
1666 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1667 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1668 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1669 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1670 ; GFX10-NEXT: s_endpgm
1672 ; GFX11-LABEL: test_div_scale_f64_val_undef_val:
1674 ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000
1675 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1676 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1677 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1678 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1679 ; GFX11-NEXT: s_nop 0
1680 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1681 ; GFX11-NEXT: s_endpgm
1682 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
1683 %result0 = extractvalue { double, i1 } %result, 0
1684 store double %result0, ptr addrspace(1) %out, align 8
; Intrinsic declarations. Both div.scale variants take two floating-point
; operands plus an i1 flag and return a { value, i1 } pair.
; NOTE(review): workitem.id.x and fabs are not called in the tests shown
; nearby; presumably they are used by tests earlier in the file.
1688 declare i32 @llvm.amdgcn.workitem.id.x() #1
1689 declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
1690 declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1
1691 declare float @llvm.fabs.f32(float) #1
; Attribute group #0 is attached to the kernel definitions, #1 to the
; intrinsic declarations above.
1693 attributes #0 = { nounwind }
1694 attributes #1 = { nounwind readnone speculatable }