1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
3 ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; frem_f16: scalar half-precision frem with no fast-math flags on the
; instruction. The dividend is loaded from %in1, the divisor from element 4 of
; %in2, and the remainder is stored to %out.
; Expected bonaire output: both operands are converted to f32 and divided with
; the v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup sequence, then
; v_trunc_f32 and a final v_fma_f32 form the remainder, converted back to f16.
; Expected tonga output: v_rcp_f32 and v_mul_f32 on converted operands, then
; v_div_fixup_f16, v_trunc_f16 and v_fma_f16.
; Attribute group #0 is defined outside this excerpt.
5 define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {
8 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
9 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
10 ; CI-NEXT: s_waitcnt lgkmcnt(0)
11 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
12 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2
13 ; CI-NEXT: s_waitcnt lgkmcnt(0)
14 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
15 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
16 ; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
17 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
18 ; CI-NEXT: v_rcp_f32_e32 v4, v2
19 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
20 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
21 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
22 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
23 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
24 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
25 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
26 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
27 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
28 ; CI-NEXT: s_mov_b32 s6, -1
29 ; CI-NEXT: s_mov_b32 s7, 0xf000
30 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
31 ; CI-NEXT: v_trunc_f32_e32 v2, v2
32 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
33 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
34 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
35 ; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
40 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
41 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
42 ; VI-NEXT: s_waitcnt lgkmcnt(0)
43 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
44 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8
45 ; VI-NEXT: s_waitcnt lgkmcnt(0)
46 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
47 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
48 ; VI-NEXT: v_mov_b32_e32 v1, s0
49 ; VI-NEXT: v_rcp_f32_e32 v2, v2
50 ; VI-NEXT: v_mul_f32_e32 v0, v0, v2
51 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
52 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
53 ; VI-NEXT: v_trunc_f16_e32 v0, v0
54 ; VI-NEXT: v_fma_f16 v2, -v0, v1, s2
55 ; VI-NEXT: v_mov_b32_e32 v0, s4
56 ; VI-NEXT: v_mov_b32_e32 v1, s5
57 ; VI-NEXT: flat_store_short v[0:1], v2
; Test input: the divisor is read from %in2[4], which is why the second scalar
; load in the checks above uses a nonzero offset.
59 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
60 %r0 = load half, half addrspace(1)* %in1, align 4
61 %r1 = load half, half addrspace(1)* %gep2, align 4
62 %r2 = frem half %r0, %r1
63 store half %r2, half addrspace(1)* %out, align 4
; fast_frem_f16: same shape as the plain f16 test, but the frem carries the
; `fast` flag. The dividend is loaded from %in1, the divisor from element 4 of
; %in2, and the remainder is stored to %out.
; Expected bonaire output: operands are converted to f32, then v_rcp_f32,
; v_mul_f32, v_trunc_f32 and v_fma_f32, with no div_scale/div_fixup sequence.
; Expected tonga output: v_rcp_f16, v_mul_f16, v_trunc_f16 and v_fma_f16.
; Attribute group #0 is defined outside this excerpt.
67 define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 {
68 ; CI-LABEL: fast_frem_f16:
70 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
71 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
72 ; CI-NEXT: s_waitcnt lgkmcnt(0)
73 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
74 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2
75 ; CI-NEXT: s_mov_b32 s6, -1
76 ; CI-NEXT: s_mov_b32 s7, 0xf000
77 ; CI-NEXT: s_waitcnt lgkmcnt(0)
78 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
79 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
80 ; CI-NEXT: v_rcp_f32_e32 v2, v1
81 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
82 ; CI-NEXT: v_trunc_f32_e32 v2, v2
83 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
84 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
85 ; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
88 ; VI-LABEL: fast_frem_f16:
90 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
91 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
92 ; VI-NEXT: s_waitcnt lgkmcnt(0)
93 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
94 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8
95 ; VI-NEXT: s_waitcnt lgkmcnt(0)
96 ; VI-NEXT: v_mov_b32_e32 v1, s2
97 ; VI-NEXT: v_rcp_f16_e32 v0, s0
98 ; VI-NEXT: v_mul_f16_e32 v0, s2, v0
99 ; VI-NEXT: v_trunc_f16_e32 v0, v0
100 ; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
101 ; VI-NEXT: v_mov_b32_e32 v0, s4
102 ; VI-NEXT: v_mov_b32_e32 v1, s5
103 ; VI-NEXT: flat_store_short v[0:1], v2
; Test input: the only difference from the non-fast variant is the `fast` flag
; on the frem instruction.
105 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
106 %r0 = load half, half addrspace(1)* %in1, align 4
107 %r1 = load half, half addrspace(1)* %gep2, align 4
108 %r2 = frem fast half %r0, %r1
109 store half %r2, half addrspace(1)* %out, align 4
; unsafe_frem_f16: half-precision frem with no flags on the instruction, but
; the function uses attribute group #1 instead of #0.
; Group #1 is defined outside this excerpt; presumably it enables unsafe FP
; math -- TODO confirm against the attribute definitions.
; The expected output on both targets matches the `fast` f16 variant: a
; reciprocal, multiply, trunc and fma, with no div_scale/div_fixup sequence.
113 define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #1 {
114 ; CI-LABEL: unsafe_frem_f16:
116 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
117 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
118 ; CI-NEXT: s_waitcnt lgkmcnt(0)
119 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
120 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2
121 ; CI-NEXT: s_mov_b32 s6, -1
122 ; CI-NEXT: s_mov_b32 s7, 0xf000
123 ; CI-NEXT: s_waitcnt lgkmcnt(0)
124 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
125 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
126 ; CI-NEXT: v_rcp_f32_e32 v2, v1
127 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
128 ; CI-NEXT: v_trunc_f32_e32 v2, v2
129 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
130 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
131 ; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
134 ; VI-LABEL: unsafe_frem_f16:
136 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
137 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
138 ; VI-NEXT: s_waitcnt lgkmcnt(0)
139 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
140 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8
141 ; VI-NEXT: s_waitcnt lgkmcnt(0)
142 ; VI-NEXT: v_mov_b32_e32 v1, s2
143 ; VI-NEXT: v_rcp_f16_e32 v0, s0
144 ; VI-NEXT: v_mul_f16_e32 v0, s2, v0
145 ; VI-NEXT: v_trunc_f16_e32 v0, v0
146 ; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
147 ; VI-NEXT: v_mov_b32_e32 v0, s4
148 ; VI-NEXT: v_mov_b32_e32 v1, s5
149 ; VI-NEXT: flat_store_short v[0:1], v2
; Test input: plain frem (no instruction-level flags); only the function
; attribute group differs from the first f16 test.
151 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
152 %r0 = load half, half addrspace(1)* %in1, align 4
153 %r1 = load half, half addrspace(1)* %gep2, align 4
154 %r2 = frem half %r0, %r1
155 store half %r2, half addrspace(1)* %out, align 4
; frem_f32: scalar single-precision frem with no fast-math flags on the
; instruction. The dividend is loaded from %in1, the divisor from element 4 of
; %in2, and the remainder is stored to %out.
; Both targets are expected to emit the same arithmetic: v_div_scale_f32 (x2),
; v_rcp_f32, a chain of v_fma_f32 bracketed by s_setreg_imm32_b32 writes to
; HW_REG_MODE, v_div_fmas_f32, v_div_fixup_f32, v_trunc_f32 and a final
; v_fma_f32. They differ in the store (buffer_store_dword on bonaire,
; flat_store_dword on tonga).
; Attribute group #0 is defined outside this excerpt.
159 define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 {
160 ; CI-LABEL: frem_f32:
162 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
163 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
164 ; CI-NEXT: s_waitcnt lgkmcnt(0)
165 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
166 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
167 ; CI-NEXT: s_waitcnt lgkmcnt(0)
168 ; CI-NEXT: v_mov_b32_e32 v0, s0
169 ; CI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
170 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
171 ; CI-NEXT: v_rcp_f32_e32 v3, v1
172 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
173 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
174 ; CI-NEXT: v_fma_f32 v3, v4, v3, v3
175 ; CI-NEXT: v_mul_f32_e32 v4, v2, v3
176 ; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
177 ; CI-NEXT: v_fma_f32 v4, v5, v3, v4
178 ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
179 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
180 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
181 ; CI-NEXT: s_mov_b32 s6, -1
182 ; CI-NEXT: s_mov_b32 s7, 0xf000
183 ; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
184 ; CI-NEXT: v_trunc_f32_e32 v1, v1
185 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s2
186 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
189 ; VI-LABEL: frem_f32:
191 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
192 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
193 ; VI-NEXT: s_waitcnt lgkmcnt(0)
194 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
195 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
196 ; VI-NEXT: s_waitcnt lgkmcnt(0)
197 ; VI-NEXT: v_mov_b32_e32 v0, s0
198 ; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
199 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
200 ; VI-NEXT: v_rcp_f32_e32 v3, v1
201 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
202 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
203 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
204 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
205 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
206 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
207 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
208 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
209 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
210 ; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
211 ; VI-NEXT: v_trunc_f32_e32 v1, v1
212 ; VI-NEXT: v_fma_f32 v2, -v1, v0, s2
213 ; VI-NEXT: v_mov_b32_e32 v0, s4
214 ; VI-NEXT: v_mov_b32_e32 v1, s5
215 ; VI-NEXT: flat_store_dword v[0:1], v2
; Test input: the divisor is read from %in2[4], which is why the second scalar
; load in the checks above uses a nonzero offset.
217 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
218 %r0 = load float, float addrspace(1)* %in1, align 4
219 %r1 = load float, float addrspace(1)* %gep2, align 4
220 %r2 = frem float %r0, %r1
221 store float %r2, float addrspace(1)* %out, align 4
; fast_frem_f32: single-precision frem carrying the `fast` flag. The dividend
; is loaded from %in1, the divisor from element 4 of %in2, and the remainder
; is stored to %out.
; Both targets are expected to emit v_rcp_f32, v_mul_f32, v_trunc_f32 and
; v_fma_f32, with no div_scale/div_fmas/div_fixup sequence. They differ in the
; store (buffer_store_dword on bonaire, flat_store_dword on tonga).
; Attribute group #0 is defined outside this excerpt.
225 define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 {
226 ; CI-LABEL: fast_frem_f32:
228 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
229 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
230 ; CI-NEXT: s_waitcnt lgkmcnt(0)
231 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
232 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
233 ; CI-NEXT: s_mov_b32 s6, -1
234 ; CI-NEXT: s_mov_b32 s7, 0xf000
235 ; CI-NEXT: s_waitcnt lgkmcnt(0)
236 ; CI-NEXT: v_mov_b32_e32 v1, s2
237 ; CI-NEXT: v_rcp_f32_e32 v0, s0
238 ; CI-NEXT: v_mul_f32_e32 v0, s2, v0
239 ; CI-NEXT: v_trunc_f32_e32 v0, v0
240 ; CI-NEXT: v_fma_f32 v0, -v0, s0, v1
241 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
244 ; VI-LABEL: fast_frem_f32:
246 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
247 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
248 ; VI-NEXT: s_waitcnt lgkmcnt(0)
249 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
250 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
251 ; VI-NEXT: s_waitcnt lgkmcnt(0)
252 ; VI-NEXT: v_mov_b32_e32 v1, s2
253 ; VI-NEXT: v_rcp_f32_e32 v0, s0
254 ; VI-NEXT: v_mul_f32_e32 v0, s2, v0
255 ; VI-NEXT: v_trunc_f32_e32 v0, v0
256 ; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
257 ; VI-NEXT: v_mov_b32_e32 v0, s4
258 ; VI-NEXT: v_mov_b32_e32 v1, s5
259 ; VI-NEXT: flat_store_dword v[0:1], v2
; Test input: the only difference from the non-fast f32 variant is the `fast`
; flag on the frem instruction.
261 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
262 %r0 = load float, float addrspace(1)* %in1, align 4
263 %r1 = load float, float addrspace(1)* %gep2, align 4
264 %r2 = frem fast float %r0, %r1
265 store float %r2, float addrspace(1)* %out, align 4
; unsafe_frem_f32: single-precision frem with no flags on the instruction, but
; the function uses attribute group #1 instead of #0.
; Group #1 is defined outside this excerpt; presumably it enables unsafe FP
; math -- TODO confirm against the attribute definitions.
; The expected output on both targets matches the `fast` f32 variant:
; v_rcp_f32, v_mul_f32, v_trunc_f32 and v_fma_f32.
269 define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #1 {
270 ; CI-LABEL: unsafe_frem_f32:
272 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
273 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
274 ; CI-NEXT: s_waitcnt lgkmcnt(0)
275 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
276 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
277 ; CI-NEXT: s_mov_b32 s6, -1
278 ; CI-NEXT: s_mov_b32 s7, 0xf000
279 ; CI-NEXT: s_waitcnt lgkmcnt(0)
280 ; CI-NEXT: v_mov_b32_e32 v1, s2
281 ; CI-NEXT: v_rcp_f32_e32 v0, s0
282 ; CI-NEXT: v_mul_f32_e32 v0, s2, v0
283 ; CI-NEXT: v_trunc_f32_e32 v0, v0
284 ; CI-NEXT: v_fma_f32 v0, -v0, s0, v1
285 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
288 ; VI-LABEL: unsafe_frem_f32:
290 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
291 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
292 ; VI-NEXT: s_waitcnt lgkmcnt(0)
293 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
294 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
295 ; VI-NEXT: s_waitcnt lgkmcnt(0)
296 ; VI-NEXT: v_mov_b32_e32 v1, s2
297 ; VI-NEXT: v_rcp_f32_e32 v0, s0
298 ; VI-NEXT: v_mul_f32_e32 v0, s2, v0
299 ; VI-NEXT: v_trunc_f32_e32 v0, v0
300 ; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
301 ; VI-NEXT: v_mov_b32_e32 v0, s4
302 ; VI-NEXT: v_mov_b32_e32 v1, s5
303 ; VI-NEXT: flat_store_dword v[0:1], v2
; Test input: plain frem (no instruction-level flags); only the function
; attribute group differs from the first f32 test.
305 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
306 %r0 = load float, float addrspace(1)* %in1, align 4
307 %r1 = load float, float addrspace(1)* %gep2, align 4
308 %r2 = frem float %r0, %r1
309 store float %r2, float addrspace(1)* %out, align 4
; frem_f64: scalar double-precision frem with no fast-math flags on the
; instruction. Unlike the f16/f32 tests, both operands are loaded directly
; from %in1 and %in2 (no getelementptr), and the remainder is stored to %out.
; Both targets are expected to emit the same sequence: v_div_scale_f64 (x2),
; v_rcp_f64, v_fma_f64 refinement steps, v_mul_f64, v_div_fmas_f64,
; v_div_fixup_f64, v_trunc_f64 and a final v_fma_f64, followed by
; flat_store_dwordx2.
; Attribute group #0 is defined outside this excerpt.
313 define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 {
314 ; CI-LABEL: frem_f64:
316 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
317 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
318 ; CI-NEXT: s_waitcnt lgkmcnt(0)
319 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
320 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
321 ; CI-NEXT: s_waitcnt lgkmcnt(0)
322 ; CI-NEXT: v_mov_b32_e32 v0, s0
323 ; CI-NEXT: v_mov_b32_e32 v1, s1
324 ; CI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
325 ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
326 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
327 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
328 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
329 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
330 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
331 ; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
332 ; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
333 ; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
334 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
335 ; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
336 ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
337 ; CI-NEXT: v_mov_b32_e32 v2, s4
338 ; CI-NEXT: v_mov_b32_e32 v3, s5
339 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
342 ; VI-LABEL: frem_f64:
344 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
345 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
346 ; VI-NEXT: s_waitcnt lgkmcnt(0)
347 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
348 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
349 ; VI-NEXT: s_waitcnt lgkmcnt(0)
350 ; VI-NEXT: v_mov_b32_e32 v0, s0
351 ; VI-NEXT: v_mov_b32_e32 v1, s1
352 ; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
353 ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
354 ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
355 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
356 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
357 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
358 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
359 ; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
360 ; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
361 ; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
362 ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
363 ; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
364 ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
365 ; VI-NEXT: v_mov_b32_e32 v2, s4
366 ; VI-NEXT: v_mov_b32_e32 v3, s5
367 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Test input: both operands come from the base pointers, so both scalar loads
; in the checks above use offset 0x0.
369 %r0 = load double, double addrspace(1)* %in1, align 8
370 %r1 = load double, double addrspace(1)* %in2, align 8
371 %r2 = frem double %r0, %r1
372 store double %r2, double addrspace(1)* %out, align 8
; fast_frem_f64: double-precision frem carrying the `fast` flag. Both operands
; are loaded directly from %in1 and %in2, and the remainder is stored to %out.
; Both targets are expected to emit v_rcp_f64 followed by v_fma_f64 refinement
; steps, v_mul_f64, two more v_fma_f64, v_trunc_f64 and a final v_fma_f64,
; with no div_scale/div_fmas/div_fixup instructions.
; Attribute group #0 is defined outside this excerpt.
376 define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 {
377 ; CI-LABEL: fast_frem_f64:
379 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
380 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
381 ; CI-NEXT: s_waitcnt lgkmcnt(0)
382 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
383 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
384 ; CI-NEXT: s_waitcnt lgkmcnt(0)
385 ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
386 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
387 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
388 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
389 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
390 ; CI-NEXT: v_mov_b32_e32 v2, s2
391 ; CI-NEXT: v_mov_b32_e32 v3, s3
392 ; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
393 ; CI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
394 ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
395 ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
396 ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
397 ; CI-NEXT: v_mov_b32_e32 v2, s4
398 ; CI-NEXT: v_mov_b32_e32 v3, s5
399 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
402 ; VI-LABEL: fast_frem_f64:
404 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
405 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
406 ; VI-NEXT: s_waitcnt lgkmcnt(0)
407 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
408 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
409 ; VI-NEXT: s_waitcnt lgkmcnt(0)
410 ; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
411 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
412 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
413 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
414 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
415 ; VI-NEXT: v_mov_b32_e32 v2, s2
416 ; VI-NEXT: v_mov_b32_e32 v3, s3
417 ; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
418 ; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
419 ; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
420 ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
421 ; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
422 ; VI-NEXT: v_mov_b32_e32 v2, s4
423 ; VI-NEXT: v_mov_b32_e32 v3, s5
424 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Test input: the only difference from the non-fast f64 variant is the `fast`
; flag on the frem instruction.
426 %r0 = load double, double addrspace(1)* %in1, align 8
427 %r1 = load double, double addrspace(1)* %in2, align 8
428 %r2 = frem fast double %r0, %r1
429 store double %r2, double addrspace(1)* %out, align 8
; unsafe_frem_f64: double-precision frem with no flags on the instruction, but
; the function uses attribute group #1 instead of #0.
; Group #1 is defined outside this excerpt; presumably it enables unsafe FP
; math -- TODO confirm against the attribute definitions.
; The expected output on both targets matches the `fast` f64 variant:
; v_rcp_f64 with v_fma_f64 refinement, v_mul_f64, v_trunc_f64 and a final
; v_fma_f64, with no div_scale/div_fmas/div_fixup instructions.
; The signature is kept on a single line, like every other function in this
; file, so the autogenerated checks do not land in the middle of the
; parameter list.
433 define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #1 {
434 ; CI-LABEL: unsafe_frem_f64:
436 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
437 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
438 ; CI-NEXT: s_waitcnt lgkmcnt(0)
439 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
440 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
441 ; CI-NEXT: s_waitcnt lgkmcnt(0)
442 ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
443 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
444 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
445 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
446 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
447 ; CI-NEXT: v_mov_b32_e32 v2, s2
448 ; CI-NEXT: v_mov_b32_e32 v3, s3
449 ; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
450 ; CI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
451 ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
452 ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
453 ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
454 ; CI-NEXT: v_mov_b32_e32 v2, s4
455 ; CI-NEXT: v_mov_b32_e32 v3, s5
456 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
459 ; VI-LABEL: unsafe_frem_f64:
461 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
462 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
463 ; VI-NEXT: s_waitcnt lgkmcnt(0)
464 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
465 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
466 ; VI-NEXT: s_waitcnt lgkmcnt(0)
467 ; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
468 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
469 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
470 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
471 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
472 ; VI-NEXT: v_mov_b32_e32 v2, s2
473 ; VI-NEXT: v_mov_b32_e32 v3, s3
474 ; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
475 ; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
476 ; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
477 ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
478 ; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
479 ; VI-NEXT: v_mov_b32_e32 v2, s4
480 ; VI-NEXT: v_mov_b32_e32 v3, s5
481 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Test input: plain frem (no instruction-level flags); only the function
; attribute group differs from the first f64 test.
484 %r0 = load double, double addrspace(1)* %in1, align 8
485 %r1 = load double, double addrspace(1)* %in2, align 8
486 %r2 = frem double %r0, %r1
487 store double %r2, double addrspace(1)* %out, align 8
; frem_v2f16: frem on a <2 x half> vector with no fast-math flags. The
; dividend vector is loaded from %in1, the divisor vector from element 4 of
; %in2, and the result vector is stored to %out.
; The expected output handles each lane separately: the high half of each
; 32-bit input is extracted with s_lshr_b32 by 16, each lane goes through the
; same expansion as the scalar f16 test for that target, and the two results
; are packed back into one dword with a shift and an or before a
; flat_store_dword.
; Attribute group #0 is defined outside this excerpt.
491 define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, <2 x half> addrspace(1)* %in2) #0 {
492 ; CI-LABEL: frem_v2f16:
494 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
495 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
496 ; CI-NEXT: s_waitcnt lgkmcnt(0)
497 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
498 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
499 ; CI-NEXT: s_waitcnt lgkmcnt(0)
500 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
501 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
502 ; CI-NEXT: s_lshr_b32 s6, s0, 16
503 ; CI-NEXT: s_lshr_b32 s3, s2, 16
504 ; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
505 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
506 ; CI-NEXT: v_rcp_f32_e32 v4, v2
507 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
508 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
509 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
510 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
511 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
512 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
513 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
514 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
515 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
516 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
517 ; CI-NEXT: v_trunc_f32_e32 v2, v2
518 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
519 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
520 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
521 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
522 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
523 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v1
524 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
525 ; CI-NEXT: v_rcp_f32_e32 v5, v3
526 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
527 ; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
528 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
529 ; CI-NEXT: v_mul_f32_e32 v6, v4, v5
530 ; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
531 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
532 ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
533 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
534 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
535 ; CI-NEXT: v_bfe_u32 v0, v0, 0, 16
536 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
537 ; CI-NEXT: v_trunc_f32_e32 v3, v3
538 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
539 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
540 ; CI-NEXT: v_bfe_u32 v1, v1, 0, 16
541 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
542 ; CI-NEXT: v_or_b32_e32 v2, v0, v1
543 ; CI-NEXT: v_mov_b32_e32 v0, s4
544 ; CI-NEXT: v_mov_b32_e32 v1, s5
545 ; CI-NEXT: flat_store_dword v[0:1], v2
548 ; VI-LABEL: frem_v2f16:
550 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
551 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
552 ; VI-NEXT: s_waitcnt lgkmcnt(0)
553 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
554 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
555 ; VI-NEXT: s_waitcnt lgkmcnt(0)
556 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
557 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
558 ; VI-NEXT: s_lshr_b32 s3, s0, 16
559 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
560 ; VI-NEXT: v_mov_b32_e32 v1, s0
561 ; VI-NEXT: v_rcp_f32_e32 v2, v2
562 ; VI-NEXT: s_lshr_b32 s1, s2, 16
563 ; VI-NEXT: v_rcp_f32_e32 v3, v3
564 ; VI-NEXT: v_mul_f32_e32 v0, v0, v2
565 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
566 ; VI-NEXT: v_mov_b32_e32 v2, s3
567 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
568 ; VI-NEXT: v_trunc_f16_e32 v0, v0
569 ; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
570 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
571 ; VI-NEXT: v_mul_f32_e32 v1, v1, v3
572 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
573 ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
574 ; VI-NEXT: v_trunc_f16_e32 v1, v1
575 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s1
576 ; VI-NEXT: v_mov_b32_e32 v2, 16
577 ; VI-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
578 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
579 ; VI-NEXT: v_mov_b32_e32 v0, s4
580 ; VI-NEXT: v_mov_b32_e32 v1, s5
581 ; VI-NEXT: flat_store_dword v[0:1], v2
; Test input: the divisor vector is read from %in2[4]; the loads and the store
; are annotated align 8.
583 %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
584 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
585 %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
586 %r2 = frem <2 x half> %r0, %r1
587 store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
; frem_v4f16: frem on a <4 x half> vector with no fast-math flags. The
; dividend vector is loaded from %in1, the divisor vector from element 4 of
; %in2, and the result vector is stored to %out.
; The expected output handles the four lanes one at a time: the high halves of
; both 32-bit words of each operand are extracted with s_lshr_b32 by 16, each
; lane goes through the same expansion as the scalar f16 test for that target,
; and the four results are packed pairwise into two dwords with shift/or
; before a flat_store_dwordx2.
; Attribute group #0 is defined outside this excerpt.
591 define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1, <4 x half> addrspace(1)* %in2) #0 {
592 ; CI-LABEL: frem_v4f16:
594 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
595 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
596 ; CI-NEXT: s_waitcnt lgkmcnt(0)
597 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
598 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
599 ; CI-NEXT: s_waitcnt lgkmcnt(0)
600 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
601 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
602 ; CI-NEXT: s_lshr_b32 s8, s2, 16
603 ; CI-NEXT: s_lshr_b32 s9, s3, 16
604 ; CI-NEXT: s_lshr_b32 s10, s0, 16
605 ; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0
606 ; CI-NEXT: s_lshr_b32 s11, s1, 16
607 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
608 ; CI-NEXT: v_rcp_f32_e32 v4, v2
609 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
610 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
611 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
612 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
613 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
614 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
615 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
616 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
617 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
618 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
619 ; CI-NEXT: v_trunc_f32_e32 v2, v2
620 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
621 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s8
622 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s10
623 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
624 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
625 ; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1
626 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
627 ; CI-NEXT: v_rcp_f32_e32 v5, v3
628 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
629 ; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
630 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
631 ; CI-NEXT: v_mul_f32_e32 v6, v4, v5
632 ; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
633 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
634 ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
635 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
636 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
637 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
638 ; CI-NEXT: v_trunc_f32_e32 v3, v3
639 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
640 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
641 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s1
642 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
643 ; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2
644 ; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2
645 ; CI-NEXT: v_rcp_f32_e32 v6, v4
646 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
647 ; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
648 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
649 ; CI-NEXT: v_mul_f32_e32 v7, v5, v6
650 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
651 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
652 ; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
653 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
654 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
655 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2
656 ; CI-NEXT: v_trunc_f32_e32 v4, v4
657 ; CI-NEXT: v_fma_f32 v2, -v4, v3, v2
658 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s9
659 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s11
660 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
661 ; CI-NEXT: v_div_scale_f32 v5, s[0:1], v4, v4, v3
662 ; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3
663 ; CI-NEXT: v_rcp_f32_e32 v7, v5
664 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
665 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
666 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
667 ; CI-NEXT: v_mul_f32_e32 v8, v6, v7
668 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v6
669 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
670 ; CI-NEXT: v_fma_f32 v5, -v5, v8, v6
671 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
672 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
673 ; CI-NEXT: v_bfe_u32 v1, v1, 0, 16
674 ; CI-NEXT: v_bfe_u32 v0, v0, 0, 16
675 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
676 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
677 ; CI-NEXT: v_bfe_u32 v1, v2, 0, 16
678 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3
679 ; CI-NEXT: v_trunc_f32_e32 v5, v5
680 ; CI-NEXT: v_fma_f32 v3, -v5, v4, v3
681 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
682 ; CI-NEXT: v_bfe_u32 v2, v3, 0, 16
683 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
684 ; CI-NEXT: v_or_b32_e32 v1, v1, v2
685 ; CI-NEXT: v_mov_b32_e32 v2, s4
686 ; CI-NEXT: v_mov_b32_e32 v3, s5
687 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
690 ; VI-LABEL: frem_v4f16:
692 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
693 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
694 ; VI-NEXT: s_waitcnt lgkmcnt(0)
695 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
696 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
697 ; VI-NEXT: s_waitcnt lgkmcnt(0)
698 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
699 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
700 ; VI-NEXT: s_lshr_b32 s8, s0, 16
701 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
702 ; VI-NEXT: v_mov_b32_e32 v1, s0
703 ; VI-NEXT: v_rcp_f32_e32 v2, v2
704 ; VI-NEXT: s_lshr_b32 s6, s2, 16
705 ; VI-NEXT: v_rcp_f32_e32 v3, v3
706 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
707 ; VI-NEXT: v_mul_f32_e32 v0, v0, v2
708 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
709 ; VI-NEXT: v_mov_b32_e32 v2, s8
710 ; VI-NEXT: v_rcp_f32_e32 v4, v4
711 ; VI-NEXT: s_lshr_b32 s9, s1, 16
712 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
713 ; VI-NEXT: v_trunc_f16_e32 v0, v0
714 ; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
715 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
716 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
717 ; VI-NEXT: s_lshr_b32 s7, s3, 16
718 ; VI-NEXT: v_mul_f32_e32 v1, v1, v3
719 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
720 ; VI-NEXT: v_mov_b32_e32 v3, s1
721 ; VI-NEXT: v_rcp_f32_e32 v5, v5
722 ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
723 ; VI-NEXT: v_trunc_f16_e32 v1, v1
724 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
725 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
726 ; VI-NEXT: v_mul_f32_e32 v2, v2, v4
727 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
728 ; VI-NEXT: v_mov_b32_e32 v4, s9
729 ; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
730 ; VI-NEXT: v_trunc_f16_e32 v2, v2
731 ; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
732 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
733 ; VI-NEXT: v_mul_f32_e32 v3, v3, v5
734 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
735 ; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7
736 ; VI-NEXT: v_trunc_f16_e32 v3, v3
737 ; VI-NEXT: v_fma_f16 v3, -v3, v4, s7
738 ; VI-NEXT: v_mov_b32_e32 v4, 16
739 ; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
740 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
741 ; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
742 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
743 ; VI-NEXT: v_mov_b32_e32 v2, s4
744 ; VI-NEXT: v_mov_b32_e32 v3, s5
745 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Test input: the divisor vector is read from %in2[4]; the loads and the store
; are annotated align 16.
747 %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
748 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
749 %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
750 %r2 = frem <4 x half> %r0, %r1
751 store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
; frem_v2f32: kernel that loads one <2 x float> from %in1 and one from %in2
; at vector index 4, computes their frem, and stores the result to %out.
;
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the file header); regenerate them rather than editing them by hand.
;
; Expected code, once per lane (lane 0 uses s2/s0, lane 1 uses s3/s1):
;   v_div_scale x2, v_rcp, a chain of v_fma/v_mul, v_div_fmas, v_div_fixup
;   to form the quotient, then v_trunc and a final v_fma with the negated
;   truncated quotient, the divisor and the dividend.
; NOTE(review): the s_setreg_imm32_b32 pairs around the v_fma chain
; presumably switch the f32 denormal mode for the refinement steps -
; confirm against the ISA documentation.
755 define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, <2 x float> addrspace(1)* %in2) #0 {
; Expected output for the bonaire run line; the result is written with
; buffer_store_dwordx2 through s[4:7] after s6/s7 are set up.
756 ; CI-LABEL: frem_v2f32:
758 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
759 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
760 ; CI-NEXT: s_waitcnt lgkmcnt(0)
761 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
762 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
763 ; CI-NEXT: s_waitcnt lgkmcnt(0)
764 ; CI-NEXT: v_mov_b32_e32 v0, s0
765 ; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
766 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
767 ; CI-NEXT: v_rcp_f32_e32 v3, v1
768 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
769 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
770 ; CI-NEXT: v_fma_f32 v3, v4, v3, v3
771 ; CI-NEXT: v_mul_f32_e32 v4, v2, v3
772 ; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
773 ; CI-NEXT: v_fma_f32 v4, v5, v3, v4
774 ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
775 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
776 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
777 ; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
778 ; CI-NEXT: v_trunc_f32_e32 v1, v1
779 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s2
780 ; CI-NEXT: v_mov_b32_e32 v1, s1
781 ; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3
782 ; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
783 ; CI-NEXT: v_rcp_f32_e32 v4, v2
784 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
785 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
786 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
787 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
788 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
789 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
790 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
791 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
792 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
793 ; CI-NEXT: s_mov_b32 s6, -1
794 ; CI-NEXT: s_mov_b32 s7, 0xf000
795 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
796 ; CI-NEXT: v_trunc_f32_e32 v2, v2
797 ; CI-NEXT: v_fma_f32 v1, -v2, v1, s3
798 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; Expected output for the tonga run line; same arithmetic, but the result is
; written with flat_store_dwordx2 using an address copied from s[4:5].
801 ; VI-LABEL: frem_v2f32:
803 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
804 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
805 ; VI-NEXT: s_waitcnt lgkmcnt(0)
806 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
807 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
808 ; VI-NEXT: s_waitcnt lgkmcnt(0)
809 ; VI-NEXT: v_mov_b32_e32 v0, s0
810 ; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
811 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
812 ; VI-NEXT: v_rcp_f32_e32 v3, v1
813 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
814 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
815 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
816 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
817 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
818 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
819 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
820 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
821 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
822 ; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
823 ; VI-NEXT: v_trunc_f32_e32 v1, v1
824 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s2
825 ; VI-NEXT: v_mov_b32_e32 v1, s1
826 ; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3
827 ; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
828 ; VI-NEXT: v_rcp_f32_e32 v4, v2
829 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
830 ; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
831 ; VI-NEXT: v_fma_f32 v4, v5, v4, v4
832 ; VI-NEXT: v_mul_f32_e32 v5, v3, v4
833 ; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
834 ; VI-NEXT: v_fma_f32 v5, v6, v4, v5
835 ; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
836 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
837 ; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
838 ; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
839 ; VI-NEXT: v_trunc_f32_e32 v2, v2
840 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
841 ; VI-NEXT: v_mov_b32_e32 v2, s4
842 ; VI-NEXT: v_mov_b32_e32 v3, s5
843 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; IR under test. The divisor is read from %in2 advanced by 4 vectors, which
; appears as the 0x8 / 0x20 load offsets in the two sets of checks above.
845 %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
846 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
847 %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
848 %r2 = frem <2 x float> %r0, %r1
849 store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
; frem_v4f32: kernel that loads one <4 x float> from %in1 and one from %in2
; at vector index 4, computes their frem, and stores the result to %out.
;
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the file header); regenerate them rather than editing them by hand.
;
; Expected code: the same per-lane sequence repeated four times, with the
; dividend lanes in s0..s3 and the divisor lanes in s8..s11. Each lane does
; v_div_scale x2, v_rcp, a v_fma/v_mul chain, v_div_fmas and v_div_fixup to
; form the quotient, then v_trunc and a final v_fma with the negated
; truncated quotient, the divisor and the dividend.
; NOTE(review): the s_setreg_imm32_b32 pairs around each v_fma chain
; presumably switch the f32 denormal mode for the refinement steps -
; confirm against the ISA documentation.
853 define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, <4 x float> addrspace(1)* %in2) #0 {
; Expected output for the bonaire run line; the four results in v[0:3] are
; written with buffer_store_dwordx4 through s[4:7] after s6/s7 are set up.
854 ; CI-LABEL: frem_v4f32:
856 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
857 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
858 ; CI-NEXT: s_waitcnt lgkmcnt(0)
859 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
860 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
861 ; CI-NEXT: s_waitcnt lgkmcnt(0)
862 ; CI-NEXT: v_mov_b32_e32 v0, s8
863 ; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0
864 ; CI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0
865 ; CI-NEXT: v_rcp_f32_e32 v3, v1
866 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
867 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
868 ; CI-NEXT: v_fma_f32 v3, v4, v3, v3
869 ; CI-NEXT: v_mul_f32_e32 v4, v2, v3
870 ; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
871 ; CI-NEXT: v_fma_f32 v4, v5, v3, v4
872 ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
873 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
874 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
875 ; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s0
876 ; CI-NEXT: v_trunc_f32_e32 v1, v1
877 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s0
878 ; CI-NEXT: v_mov_b32_e32 v1, s9
879 ; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1
880 ; CI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1
881 ; CI-NEXT: v_rcp_f32_e32 v4, v2
882 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
883 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
884 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
885 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
886 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
887 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
888 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
889 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
890 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
891 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s1
892 ; CI-NEXT: v_trunc_f32_e32 v2, v2
893 ; CI-NEXT: v_fma_f32 v1, -v2, v1, s1
894 ; CI-NEXT: v_mov_b32_e32 v2, s10
895 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2
896 ; CI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2
897 ; CI-NEXT: v_rcp_f32_e32 v5, v3
898 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
899 ; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
900 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
901 ; CI-NEXT: v_mul_f32_e32 v6, v4, v5
902 ; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
903 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
904 ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
905 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
906 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
907 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s2
908 ; CI-NEXT: v_trunc_f32_e32 v3, v3
909 ; CI-NEXT: v_fma_f32 v2, -v3, v2, s2
910 ; CI-NEXT: v_mov_b32_e32 v3, s11
911 ; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3
912 ; CI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3
913 ; CI-NEXT: v_rcp_f32_e32 v6, v4
914 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
915 ; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
916 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
917 ; CI-NEXT: v_mul_f32_e32 v7, v5, v6
918 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
919 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
920 ; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
921 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
922 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
923 ; CI-NEXT: s_mov_b32 s6, -1
924 ; CI-NEXT: s_mov_b32 s7, 0xf000
925 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s3
926 ; CI-NEXT: v_trunc_f32_e32 v4, v4
927 ; CI-NEXT: v_fma_f32 v3, -v4, v3, s3
928 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; Expected output for the tonga run line; same arithmetic, but the result is
; written with flat_store_dwordx4 using an address copied from s[4:5].
931 ; VI-LABEL: frem_v4f32:
933 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
934 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
935 ; VI-NEXT: s_waitcnt lgkmcnt(0)
936 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
937 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
938 ; VI-NEXT: s_waitcnt lgkmcnt(0)
939 ; VI-NEXT: v_mov_b32_e32 v0, s8
940 ; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0
941 ; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0
942 ; VI-NEXT: v_rcp_f32_e32 v3, v1
943 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
944 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
945 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
946 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
947 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
948 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
949 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
950 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
951 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
952 ; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s0
953 ; VI-NEXT: v_trunc_f32_e32 v1, v1
954 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s0
955 ; VI-NEXT: v_mov_b32_e32 v1, s9
956 ; VI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1
957 ; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1
958 ; VI-NEXT: v_rcp_f32_e32 v4, v2
959 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
960 ; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
961 ; VI-NEXT: v_fma_f32 v4, v5, v4, v4
962 ; VI-NEXT: v_mul_f32_e32 v5, v3, v4
963 ; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
964 ; VI-NEXT: v_fma_f32 v5, v6, v4, v5
965 ; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
966 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
967 ; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
968 ; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s1
969 ; VI-NEXT: v_trunc_f32_e32 v2, v2
970 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s1
971 ; VI-NEXT: v_mov_b32_e32 v2, s10
972 ; VI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2
973 ; VI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2
974 ; VI-NEXT: v_rcp_f32_e32 v5, v3
975 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
976 ; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
977 ; VI-NEXT: v_fma_f32 v5, v6, v5, v5
978 ; VI-NEXT: v_mul_f32_e32 v6, v4, v5
979 ; VI-NEXT: v_fma_f32 v7, -v3, v6, v4
980 ; VI-NEXT: v_fma_f32 v6, v7, v5, v6
981 ; VI-NEXT: v_fma_f32 v3, -v3, v6, v4
982 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
983 ; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
984 ; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s2
985 ; VI-NEXT: v_trunc_f32_e32 v3, v3
986 ; VI-NEXT: v_fma_f32 v2, -v3, v2, s2
987 ; VI-NEXT: v_mov_b32_e32 v3, s11
988 ; VI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3
989 ; VI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3
990 ; VI-NEXT: v_rcp_f32_e32 v6, v4
991 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
992 ; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
993 ; VI-NEXT: v_fma_f32 v6, v7, v6, v6
994 ; VI-NEXT: v_mul_f32_e32 v7, v5, v6
995 ; VI-NEXT: v_fma_f32 v8, -v4, v7, v5
996 ; VI-NEXT: v_fma_f32 v7, v8, v6, v7
997 ; VI-NEXT: v_fma_f32 v4, -v4, v7, v5
998 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
999 ; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1000 ; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s3
1001 ; VI-NEXT: v_trunc_f32_e32 v4, v4
1002 ; VI-NEXT: v_fma_f32 v3, -v4, v3, s3
1003 ; VI-NEXT: v_mov_b32_e32 v4, s4
1004 ; VI-NEXT: v_mov_b32_e32 v5, s5
1005 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; IR under test. The divisor is read from %in2 advanced by 4 vectors, which
; appears as the 0x10 / 0x40 load offsets in the two sets of checks above.
1007 %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
1008 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
1009 %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
1010 %r2 = frem <4 x float> %r0, %r1
1011 store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
; frem_v2f64: kernel that loads one <2 x double> from %in1 and one from %in2
; at vector index 4, computes their frem, and stores the result to %out.
;
; The check lines below are autogenerated by utils/update_llc_test_checks.py
; (see the file header); regenerate them rather than editing them by hand.
;
; Expected code, once per lane (lane 0 uses s[0:1]/s[8:9], lane 1 uses
; s[2:3]/s[10:11]): v_div_scale_f64 x2, v_rcp_f64, two pairs of v_fma_f64
; refining the reciprocal, v_mul_f64, one more v_fma_f64, v_div_fmas_f64 and
; v_div_fixup_f64 to form the quotient, then v_trunc_f64 and a final
; v_fma_f64 with the negated truncated quotient, the divisor and the
; dividend. No s_setreg_imm32_b32 appears in this f64 sequence.
; For this kernel both run lines expect the store to be flat_store_dwordx4.
1015 define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) #0 {
; Expected output for the bonaire run line.
1016 ; CI-LABEL: frem_v2f64:
1018 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1019 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1020 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1021 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1022 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
1023 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1024 ; CI-NEXT: v_mov_b32_e32 v0, s8
1025 ; CI-NEXT: v_mov_b32_e32 v1, s9
1026 ; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
1027 ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
1028 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1029 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1030 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1031 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1032 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1033 ; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
1034 ; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
1035 ; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
1036 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
1037 ; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
1038 ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
1039 ; CI-NEXT: v_mov_b32_e32 v2, s10
1040 ; CI-NEXT: v_mov_b32_e32 v3, s11
1041 ; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3]
1042 ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3]
1043 ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1044 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1045 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1046 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1047 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1048 ; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
1049 ; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
1050 ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
1051 ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
1052 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1053 ; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
1054 ; CI-NEXT: v_mov_b32_e32 v4, s4
1055 ; CI-NEXT: v_mov_b32_e32 v5, s5
1056 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; Expected output for the tonga run line.
1059 ; VI-LABEL: frem_v2f64:
1061 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1062 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1063 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1064 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1065 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
1066 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1067 ; VI-NEXT: v_mov_b32_e32 v0, s8
1068 ; VI-NEXT: v_mov_b32_e32 v1, s9
1069 ; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
1070 ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
1071 ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1072 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1073 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1074 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1075 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1076 ; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
1077 ; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
1078 ; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
1079 ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
1080 ; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
1081 ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
1082 ; VI-NEXT: v_mov_b32_e32 v2, s10
1083 ; VI-NEXT: v_mov_b32_e32 v3, s11
1084 ; VI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3]
1085 ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3]
1086 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1087 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1088 ; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1089 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1090 ; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1091 ; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
1092 ; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
1093 ; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
1094 ; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
1095 ; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1096 ; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
1097 ; VI-NEXT: v_mov_b32_e32 v4, s4
1098 ; VI-NEXT: v_mov_b32_e32 v5, s5
1099 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; IR under test. The divisor is read from %in2 advanced by 4 vectors, which
; appears as the 0x10 / 0x40 load offsets in the two sets of checks above.
1101 %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
1102 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
1103 %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
1104 %r2 = frem <2 x double> %r0, %r1
1105 store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
; Function attribute groups. Both are nounwind and request
; "preserve-sign,preserve-sign" for "denormal-fp-math-f32"; they differ only
; in "unsafe-fp-math" (#0 = "false", #1 = "true"). The vector frem kernels
; directly above are all declared with #0.
; NOTE(review): no use of #1 is visible in this part of the file - presumably
; it is referenced by kernels earlier in the file; verify before removing.
1109 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1110 attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }