1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
3 ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; Scalar half-precision frem. Loads one half from %in1 and one half from %in2
; offset by 4 half elements, then stores frem(%r0, %r1) to %out.
; Expected code under the CI prefix (-mcpu=bonaire): both operands are widened
; with v_cvt_f32_f16, divided with the v_div_scale_f32 / v_rcp_f32 / v_fma_f32 /
; v_div_fmas_f32 / v_div_fixup_f32 sequence, truncated, and the remainder is
; formed by a final v_fma_f32 before converting back with v_cvt_f16_f32.
; Expected code under the VI prefix (-mcpu=tonga): the quotient comes from
; v_rcp_f32 + v_mul_f32, is corrected by v_div_fixup_f16, and v_trunc_f16 +
; v_fma_f16 form the remainder.
; The check lines below are autogenerated; regenerate them instead of editing.
5 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
8 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
9 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
10 ; CI-NEXT: s_waitcnt lgkmcnt(0)
11 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
12 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2
13 ; CI-NEXT: s_waitcnt lgkmcnt(0)
14 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
15 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
16 ; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
17 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
18 ; CI-NEXT: v_rcp_f32_e32 v4, v2
19 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
20 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
21 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
22 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
23 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
24 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
25 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
26 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
27 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
28 ; CI-NEXT: s_mov_b32 s6, -1
29 ; CI-NEXT: s_mov_b32 s7, 0xf000
30 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
31 ; CI-NEXT: v_trunc_f32_e32 v2, v2
32 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
33 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
34 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
35 ; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
40 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
41 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
42 ; VI-NEXT: s_waitcnt lgkmcnt(0)
43 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
44 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8
45 ; VI-NEXT: s_waitcnt lgkmcnt(0)
46 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
47 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
48 ; VI-NEXT: v_mov_b32_e32 v1, s0
49 ; VI-NEXT: v_rcp_f32_e32 v2, v2
50 ; VI-NEXT: v_mul_f32_e32 v0, v0, v2
51 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
52 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
53 ; VI-NEXT: v_trunc_f16_e32 v0, v0
54 ; VI-NEXT: v_fma_f16 v2, -v0, v1, s2
55 ; VI-NEXT: v_mov_b32_e32 v0, s4
56 ; VI-NEXT: v_mov_b32_e32 v1, s5
57 ; VI-NEXT: flat_store_short v[0:1], v2
59 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
60 %r0 = load half, ptr addrspace(1) %in1, align 4
61 %r1 = load half, ptr addrspace(1) %gep2, align 4
62 %r2 = frem half %r0, %r1
63 store half %r2, ptr addrspace(1) %out, align 4
; Half-precision frem carrying the `fast` flag on the instruction. Operands are
; loaded from %in1 and from %in2 offset by 4 half elements; the result is
; stored to %out.
; Under the CI prefix the checks expect a short f32 sequence (v_rcp_f32,
; v_mul_f32, v_trunc_f32, v_fma_f32) between the f16<->f32 conversions, with no
; v_div_scale / v_div_fmas / v_div_fixup instructions.
; Under the VI prefix the whole computation stays in f16: v_rcp_f16, v_mul_f16,
; v_trunc_f16, v_fma_f16.
; The check lines below are autogenerated; regenerate them instead of editing.
67 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
68 ; CI-LABEL: fast_frem_f16:
70 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
71 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
72 ; CI-NEXT: s_waitcnt lgkmcnt(0)
73 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
74 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2
75 ; CI-NEXT: s_mov_b32 s6, -1
76 ; CI-NEXT: s_mov_b32 s7, 0xf000
77 ; CI-NEXT: s_waitcnt lgkmcnt(0)
78 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
79 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
80 ; CI-NEXT: v_rcp_f32_e32 v2, v1
81 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
82 ; CI-NEXT: v_trunc_f32_e32 v2, v2
83 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
84 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
85 ; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
88 ; VI-LABEL: fast_frem_f16:
90 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
91 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
92 ; VI-NEXT: s_waitcnt lgkmcnt(0)
93 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
94 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8
95 ; VI-NEXT: s_waitcnt lgkmcnt(0)
96 ; VI-NEXT: v_mov_b32_e32 v1, s2
97 ; VI-NEXT: v_rcp_f16_e32 v0, s0
98 ; VI-NEXT: v_mul_f16_e32 v0, s2, v0
99 ; VI-NEXT: v_trunc_f16_e32 v0, v0
100 ; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
101 ; VI-NEXT: v_mov_b32_e32 v0, s4
102 ; VI-NEXT: v_mov_b32_e32 v1, s5
103 ; VI-NEXT: flat_store_short v[0:1], v2
105 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
106 %r0 = load half, ptr addrspace(1) %in1, align 4
107 %r1 = load half, ptr addrspace(1) %gep2, align 4
108 %r2 = frem fast half %r0, %r1
109 store half %r2, ptr addrspace(1) %out, align 4
; Half-precision frem with no fast-math flags on the instruction itself, but
; the kernel uses attribute group #1 rather than #0.
; NOTE(review): #1 is defined outside this excerpt - presumably it enables
; unsafe FP math for the function; confirm against the attribute list.
; The expected code is the reciprocal-based sequence: under the CI prefix
; v_rcp_f32 / v_mul_f32 / v_trunc_f32 / v_fma_f32 between f16<->f32 conversions,
; under the VI prefix v_rcp_f16 / v_mul_f16 / v_trunc_f16 / v_fma_f16.
; The check lines below are autogenerated; regenerate them instead of editing.
113 define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
114 ; CI-LABEL: unsafe_frem_f16:
116 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
117 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
118 ; CI-NEXT: s_waitcnt lgkmcnt(0)
119 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
120 ; CI-NEXT: s_load_dword s0, s[0:1], 0x2
121 ; CI-NEXT: s_mov_b32 s6, -1
122 ; CI-NEXT: s_mov_b32 s7, 0xf000
123 ; CI-NEXT: s_waitcnt lgkmcnt(0)
124 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
125 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
126 ; CI-NEXT: v_rcp_f32_e32 v2, v1
127 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
128 ; CI-NEXT: v_trunc_f32_e32 v2, v2
129 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
130 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
131 ; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
134 ; VI-LABEL: unsafe_frem_f16:
136 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
137 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
138 ; VI-NEXT: s_waitcnt lgkmcnt(0)
139 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
140 ; VI-NEXT: s_load_dword s0, s[0:1], 0x8
141 ; VI-NEXT: s_waitcnt lgkmcnt(0)
142 ; VI-NEXT: v_mov_b32_e32 v1, s2
143 ; VI-NEXT: v_rcp_f16_e32 v0, s0
144 ; VI-NEXT: v_mul_f16_e32 v0, s2, v0
145 ; VI-NEXT: v_trunc_f16_e32 v0, v0
146 ; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
147 ; VI-NEXT: v_mov_b32_e32 v0, s4
148 ; VI-NEXT: v_mov_b32_e32 v1, s5
149 ; VI-NEXT: flat_store_short v[0:1], v2
151 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
152 %r0 = load half, ptr addrspace(1) %in1, align 4
153 %r1 = load half, ptr addrspace(1) %gep2, align 4
154 %r2 = frem half %r0, %r1
155 store half %r2, ptr addrspace(1) %out, align 4
; Scalar float frem. Operands are loaded from %in1 and from %in2 offset by
; 4 float elements; the result is stored to %out.
; Both check prefixes expect the same arithmetic (register numbers aside): two
; v_div_scale_f32, v_rcp_f32, a v_fma_f32 / v_mul_f32 refinement bracketed by
; s_setreg_imm32_b32 writes to HW_REG_MODE, v_div_fmas_f32, v_div_fixup_f32,
; v_trunc_f32 and a final v_fma_f32 producing the remainder.
; The two targets differ in the scalar load offsets and in the store
; (buffer_store_dword versus flat_store_dword).
; The check lines below are autogenerated; regenerate them instead of editing.
159 define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
160 ; CI-LABEL: frem_f32:
162 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
163 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
164 ; CI-NEXT: s_waitcnt lgkmcnt(0)
165 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
166 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
167 ; CI-NEXT: s_waitcnt lgkmcnt(0)
168 ; CI-NEXT: v_mov_b32_e32 v0, s0
169 ; CI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
170 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
171 ; CI-NEXT: v_rcp_f32_e32 v3, v1
172 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
173 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
174 ; CI-NEXT: v_fma_f32 v3, v4, v3, v3
175 ; CI-NEXT: v_mul_f32_e32 v4, v2, v3
176 ; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
177 ; CI-NEXT: v_fma_f32 v4, v5, v3, v4
178 ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
179 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
180 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
181 ; CI-NEXT: s_mov_b32 s6, -1
182 ; CI-NEXT: s_mov_b32 s7, 0xf000
183 ; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
184 ; CI-NEXT: v_trunc_f32_e32 v1, v1
185 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s2
186 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
189 ; VI-LABEL: frem_f32:
191 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
192 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
193 ; VI-NEXT: s_waitcnt lgkmcnt(0)
194 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
195 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
196 ; VI-NEXT: s_waitcnt lgkmcnt(0)
197 ; VI-NEXT: v_mov_b32_e32 v0, s0
198 ; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
199 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
200 ; VI-NEXT: v_rcp_f32_e32 v3, v1
201 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
202 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
203 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
204 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
205 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
206 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
207 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
208 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
209 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
210 ; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
211 ; VI-NEXT: v_trunc_f32_e32 v1, v1
212 ; VI-NEXT: v_fma_f32 v2, -v1, v0, s2
213 ; VI-NEXT: v_mov_b32_e32 v0, s4
214 ; VI-NEXT: v_mov_b32_e32 v1, s5
215 ; VI-NEXT: flat_store_dword v[0:1], v2
217 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
218 %r0 = load float, ptr addrspace(1) %in1, align 4
219 %r1 = load float, ptr addrspace(1) %gep2, align 4
220 %r2 = frem float %r0, %r1
221 store float %r2, ptr addrspace(1) %out, align 4
; Float frem carrying the `fast` flag on the instruction. Operands are loaded
; from %in1 and from %in2 offset by 4 float elements; the result is stored to
; %out.
; Both check prefixes expect the short reciprocal-based sequence v_rcp_f32,
; v_mul_f32, v_trunc_f32, v_fma_f32, with no v_div_scale / v_div_fmas /
; v_div_fixup instructions and no s_setreg_imm32_b32.
; The check lines below are autogenerated; regenerate them instead of editing.
225 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
226 ; CI-LABEL: fast_frem_f32:
228 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
229 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
230 ; CI-NEXT: s_waitcnt lgkmcnt(0)
231 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
232 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
233 ; CI-NEXT: s_mov_b32 s6, -1
234 ; CI-NEXT: s_mov_b32 s7, 0xf000
235 ; CI-NEXT: s_waitcnt lgkmcnt(0)
236 ; CI-NEXT: v_mov_b32_e32 v1, s2
237 ; CI-NEXT: v_rcp_f32_e32 v0, s0
238 ; CI-NEXT: v_mul_f32_e32 v0, s2, v0
239 ; CI-NEXT: v_trunc_f32_e32 v0, v0
240 ; CI-NEXT: v_fma_f32 v0, -v0, s0, v1
241 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
244 ; VI-LABEL: fast_frem_f32:
246 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
247 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
248 ; VI-NEXT: s_waitcnt lgkmcnt(0)
249 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
250 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
251 ; VI-NEXT: s_waitcnt lgkmcnt(0)
252 ; VI-NEXT: v_mov_b32_e32 v1, s2
253 ; VI-NEXT: v_rcp_f32_e32 v0, s0
254 ; VI-NEXT: v_mul_f32_e32 v0, s2, v0
255 ; VI-NEXT: v_trunc_f32_e32 v0, v0
256 ; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
257 ; VI-NEXT: v_mov_b32_e32 v0, s4
258 ; VI-NEXT: v_mov_b32_e32 v1, s5
259 ; VI-NEXT: flat_store_dword v[0:1], v2
261 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
262 %r0 = load float, ptr addrspace(1) %in1, align 4
263 %r1 = load float, ptr addrspace(1) %gep2, align 4
264 %r2 = frem fast float %r0, %r1
265 store float %r2, ptr addrspace(1) %out, align 4
; Float frem with no fast-math flags on the instruction itself, but the kernel
; uses attribute group #1 rather than #0.
; NOTE(review): #1 is defined outside this excerpt - presumably it enables
; unsafe FP math for the function; confirm against the attribute list.
; Both check prefixes expect the short reciprocal-based sequence v_rcp_f32,
; v_mul_f32, v_trunc_f32, v_fma_f32 instead of the v_div_scale-based division.
; The check lines below are autogenerated; regenerate them instead of editing.
269 define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
270 ; CI-LABEL: unsafe_frem_f32:
272 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
273 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
274 ; CI-NEXT: s_waitcnt lgkmcnt(0)
275 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
276 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
277 ; CI-NEXT: s_mov_b32 s6, -1
278 ; CI-NEXT: s_mov_b32 s7, 0xf000
279 ; CI-NEXT: s_waitcnt lgkmcnt(0)
280 ; CI-NEXT: v_mov_b32_e32 v1, s2
281 ; CI-NEXT: v_rcp_f32_e32 v0, s0
282 ; CI-NEXT: v_mul_f32_e32 v0, s2, v0
283 ; CI-NEXT: v_trunc_f32_e32 v0, v0
284 ; CI-NEXT: v_fma_f32 v0, -v0, s0, v1
285 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
288 ; VI-LABEL: unsafe_frem_f32:
290 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
291 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
292 ; VI-NEXT: s_waitcnt lgkmcnt(0)
293 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
294 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
295 ; VI-NEXT: s_waitcnt lgkmcnt(0)
296 ; VI-NEXT: v_mov_b32_e32 v1, s2
297 ; VI-NEXT: v_rcp_f32_e32 v0, s0
298 ; VI-NEXT: v_mul_f32_e32 v0, s2, v0
299 ; VI-NEXT: v_trunc_f32_e32 v0, v0
300 ; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
301 ; VI-NEXT: v_mov_b32_e32 v0, s4
302 ; VI-NEXT: v_mov_b32_e32 v1, s5
303 ; VI-NEXT: flat_store_dword v[0:1], v2
305 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
306 %r0 = load float, ptr addrspace(1) %in1, align 4
307 %r1 = load float, ptr addrspace(1) %gep2, align 4
308 %r2 = frem float %r0, %r1
309 store float %r2, ptr addrspace(1) %out, align 4
; Scalar double frem. Unlike the half/float tests in this file, %in2 is loaded
; directly (no getelementptr offset); the result is stored to %out.
; Both check prefixes expect the same arithmetic: two v_div_scale_f64,
; v_rcp_f64, four v_fma_f64 refinement steps, v_mul_f64, one more v_fma_f64,
; v_div_fmas_f64, v_div_fixup_f64, v_trunc_f64 and a final v_fma_f64 producing
; the remainder. No s_setreg_imm32_b32 appears in the f64 sequence.
; The two targets differ in the scalar load offsets and in the store
; (buffer_store_dwordx2 versus flat_store_dwordx2).
; The check lines below are autogenerated; regenerate them instead of editing.
313 define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
314 ; CI-LABEL: frem_f64:
316 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
317 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
318 ; CI-NEXT: s_waitcnt lgkmcnt(0)
319 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
320 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
321 ; CI-NEXT: s_mov_b32 s6, -1
322 ; CI-NEXT: s_mov_b32 s7, 0xf000
323 ; CI-NEXT: s_waitcnt lgkmcnt(0)
324 ; CI-NEXT: v_mov_b32_e32 v0, s0
325 ; CI-NEXT: v_mov_b32_e32 v1, s1
326 ; CI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
327 ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
328 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
329 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
330 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
331 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
332 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
333 ; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
334 ; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
335 ; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
336 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
337 ; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
338 ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
339 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
342 ; VI-LABEL: frem_f64:
344 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
345 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
346 ; VI-NEXT: s_waitcnt lgkmcnt(0)
347 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
348 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
349 ; VI-NEXT: s_waitcnt lgkmcnt(0)
350 ; VI-NEXT: v_mov_b32_e32 v0, s0
351 ; VI-NEXT: v_mov_b32_e32 v1, s1
352 ; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
353 ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
354 ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
355 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
356 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
357 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
358 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
359 ; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
360 ; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
361 ; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
362 ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
363 ; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
364 ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
365 ; VI-NEXT: v_mov_b32_e32 v2, s4
366 ; VI-NEXT: v_mov_b32_e32 v3, s5
367 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
369 %r0 = load double, ptr addrspace(1) %in1, align 8
370 %r1 = load double, ptr addrspace(1) %in2, align 8
371 %r2 = frem double %r0, %r1
372 store double %r2, ptr addrspace(1) %out, align 8
; Double frem carrying the `fast` flag on the instruction. Operands are loaded
; directly from %in1 and %in2; the result is stored to %out.
; Both check prefixes expect a reciprocal-based sequence with no v_div_scale /
; v_div_fmas / v_div_fixup: v_rcp_f64, two rounds of paired v_fma_f64
; refinement, v_mul_f64, two more v_fma_f64, then v_trunc_f64 and a final
; v_fma_f64 producing the remainder.
; The check lines below are autogenerated; regenerate them instead of editing.
376 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
377 ; CI-LABEL: fast_frem_f64:
379 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
380 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
381 ; CI-NEXT: s_waitcnt lgkmcnt(0)
382 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
383 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
384 ; CI-NEXT: s_mov_b32 s6, -1
385 ; CI-NEXT: s_mov_b32 s7, 0xf000
386 ; CI-NEXT: s_waitcnt lgkmcnt(0)
387 ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
388 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
389 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
390 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
391 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
392 ; CI-NEXT: v_mov_b32_e32 v2, s2
393 ; CI-NEXT: v_mov_b32_e32 v3, s3
394 ; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
395 ; CI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
396 ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
397 ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
398 ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
399 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
402 ; VI-LABEL: fast_frem_f64:
404 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
405 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
406 ; VI-NEXT: s_waitcnt lgkmcnt(0)
407 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
408 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
409 ; VI-NEXT: s_waitcnt lgkmcnt(0)
410 ; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
411 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
412 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
413 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
414 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
415 ; VI-NEXT: v_mov_b32_e32 v2, s2
416 ; VI-NEXT: v_mov_b32_e32 v3, s3
417 ; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
418 ; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
419 ; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
420 ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
421 ; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
422 ; VI-NEXT: v_mov_b32_e32 v2, s4
423 ; VI-NEXT: v_mov_b32_e32 v3, s5
424 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
426 %r0 = load double, ptr addrspace(1) %in1, align 8
427 %r1 = load double, ptr addrspace(1) %in2, align 8
428 %r2 = frem fast double %r0, %r1
429 store double %r2, ptr addrspace(1) %out, align 8
; Double frem with no fast-math flags on the instruction itself, but the kernel
; uses attribute group #1 rather than #0.
; NOTE(review): #1 is defined outside this excerpt - presumably it enables
; unsafe FP math for the function; confirm against the attribute list.
; The define line is split across two source lines; the autogenerated check
; lines sit between the first half of the signature and its closing
; `ptr addrspace(1) %in2) #1 {` half, which is valid because they are comments.
; Both check prefixes expect the reciprocal-based sequence: v_rcp_f64, paired
; v_fma_f64 refinement, v_mul_f64, v_fma_f64, v_trunc_f64 and a final
; v_fma_f64, with no v_div_scale / v_div_fmas / v_div_fixup.
; The check lines below are autogenerated; regenerate them instead of editing.
433 define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
434 ; CI-LABEL: unsafe_frem_f64:
436 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
437 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
438 ; CI-NEXT: s_waitcnt lgkmcnt(0)
439 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
440 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
441 ; CI-NEXT: s_mov_b32 s6, -1
442 ; CI-NEXT: s_mov_b32 s7, 0xf000
443 ; CI-NEXT: s_waitcnt lgkmcnt(0)
444 ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
445 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
446 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
447 ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
448 ; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
449 ; CI-NEXT: v_mov_b32_e32 v2, s2
450 ; CI-NEXT: v_mov_b32_e32 v3, s3
451 ; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
452 ; CI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
453 ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
454 ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
455 ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
456 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
459 ; VI-LABEL: unsafe_frem_f64:
461 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
462 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
463 ; VI-NEXT: s_waitcnt lgkmcnt(0)
464 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
465 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
466 ; VI-NEXT: s_waitcnt lgkmcnt(0)
467 ; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
468 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
469 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
470 ; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
471 ; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
472 ; VI-NEXT: v_mov_b32_e32 v2, s2
473 ; VI-NEXT: v_mov_b32_e32 v3, s3
474 ; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
475 ; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
476 ; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
477 ; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
478 ; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
479 ; VI-NEXT: v_mov_b32_e32 v2, s4
480 ; VI-NEXT: v_mov_b32_e32 v3, s5
481 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
483 ptr addrspace(1) %in2) #1 {
484 %r0 = load double, ptr addrspace(1) %in1, align 8
485 %r1 = load double, ptr addrspace(1) %in2, align 8
486 %r2 = frem double %r0, %r1
487 store double %r2, ptr addrspace(1) %out, align 8
; <2 x half> frem. Operands are loaded from %in1 and from %in2 offset by
; 4 <2 x half> elements; the result vector is stored to %out.
; Under the CI prefix the checks expect each lane to be handled separately in
; f32: the high halves are extracted with s_lshr_b32 by 16, each lane runs the
; v_div_scale_f32 / v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 / v_div_fixup_f32 /
; v_trunc_f32 / v_fma_f32 sequence, and the two f16 results are repacked with
; v_lshlrev_b32 by 16 plus v_or_b32 before a single buffer_store_dword.
; Under the VI prefix each lane uses v_rcp_f32 + v_mul_f32, v_div_fixup_f16,
; v_trunc_f16 and v_fma_f16, and the lanes are repacked the same way before
; flat_store_dword.
; The check lines below are autogenerated; regenerate them instead of editing.
491 define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
492 ; CI-LABEL: frem_v2f16:
494 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
495 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
496 ; CI-NEXT: s_waitcnt lgkmcnt(0)
497 ; CI-NEXT: s_load_dword s2, s[6:7], 0x0
498 ; CI-NEXT: s_load_dword s0, s[0:1], 0x4
499 ; CI-NEXT: s_waitcnt lgkmcnt(0)
500 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
501 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
502 ; CI-NEXT: s_lshr_b32 s6, s0, 16
503 ; CI-NEXT: s_lshr_b32 s3, s2, 16
504 ; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
505 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
506 ; CI-NEXT: v_rcp_f32_e32 v4, v2
507 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
508 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
509 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
510 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
511 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
512 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
513 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
514 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
515 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
516 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
517 ; CI-NEXT: v_trunc_f32_e32 v2, v2
518 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
519 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
520 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
521 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
522 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
523 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v1
524 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
525 ; CI-NEXT: v_rcp_f32_e32 v5, v3
526 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
527 ; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
528 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
529 ; CI-NEXT: v_mul_f32_e32 v6, v4, v5
530 ; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
531 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
532 ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
533 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
534 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
535 ; CI-NEXT: s_mov_b32 s6, -1
536 ; CI-NEXT: s_mov_b32 s7, 0xf000
537 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
538 ; CI-NEXT: v_trunc_f32_e32 v3, v3
539 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
540 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
541 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
542 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
543 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
546 ; VI-LABEL: frem_v2f16:
548 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
549 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
550 ; VI-NEXT: s_waitcnt lgkmcnt(0)
551 ; VI-NEXT: s_load_dword s2, s[6:7], 0x0
552 ; VI-NEXT: s_load_dword s0, s[0:1], 0x10
553 ; VI-NEXT: s_waitcnt lgkmcnt(0)
554 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
555 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
556 ; VI-NEXT: s_lshr_b32 s3, s0, 16
557 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
558 ; VI-NEXT: v_mov_b32_e32 v1, s0
559 ; VI-NEXT: v_rcp_f32_e32 v2, v2
560 ; VI-NEXT: s_lshr_b32 s1, s2, 16
561 ; VI-NEXT: v_rcp_f32_e32 v3, v3
562 ; VI-NEXT: v_mul_f32_e32 v0, v0, v2
563 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
564 ; VI-NEXT: v_mov_b32_e32 v2, s3
565 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
566 ; VI-NEXT: v_trunc_f16_e32 v0, v0
567 ; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
568 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
569 ; VI-NEXT: v_mul_f32_e32 v1, v1, v3
570 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
571 ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
572 ; VI-NEXT: v_trunc_f16_e32 v1, v1
573 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s1
574 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
575 ; VI-NEXT: v_or_b32_e32 v2, v0, v1
576 ; VI-NEXT: v_mov_b32_e32 v0, s4
577 ; VI-NEXT: v_mov_b32_e32 v1, s5
578 ; VI-NEXT: flat_store_dword v[0:1], v2
580 %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
581 %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
582 %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8
583 %r2 = frem <2 x half> %r0, %r1
584 store <2 x half> %r2, ptr addrspace(1) %out, align 8
; <4 x half> frem. Operands are loaded from %in1 and from %in2 offset by
; 4 <4 x half> elements; the result vector is stored to %out.
; Under the CI prefix the checks expect four independent per-lane f32
; divisions (v_div_scale_f32 ... v_div_fixup_f32, v_trunc_f32, v_fma_f32), with
; high halves extracted by s_lshr_b32 by 16 and each pair of f16 results
; repacked with v_lshlrev_b32 by 16 plus v_or_b32, then one
; buffer_store_dwordx2.
; Under the VI prefix each of the four lanes uses v_rcp_f32 + v_mul_f32,
; v_div_fixup_f16, v_trunc_f16 and v_fma_f16, repacked the same way before
; flat_store_dwordx2.
; The check lines below are autogenerated; regenerate them instead of editing.
588 define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
589 ; CI-LABEL: frem_v4f16:
591 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
592 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
593 ; CI-NEXT: s_waitcnt lgkmcnt(0)
594 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
595 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
596 ; CI-NEXT: s_waitcnt lgkmcnt(0)
597 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
598 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
599 ; CI-NEXT: s_lshr_b32 s8, s2, 16
600 ; CI-NEXT: s_lshr_b32 s9, s3, 16
601 ; CI-NEXT: s_lshr_b32 s10, s0, 16
602 ; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0
603 ; CI-NEXT: s_lshr_b32 s11, s1, 16
604 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
605 ; CI-NEXT: v_rcp_f32_e32 v4, v2
606 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
607 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
608 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
609 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
610 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
611 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
612 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
613 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
614 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
615 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
616 ; CI-NEXT: v_trunc_f32_e32 v2, v2
617 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
618 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s8
619 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s10
620 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
621 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
622 ; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1
623 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
624 ; CI-NEXT: v_rcp_f32_e32 v5, v3
625 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
626 ; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
627 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
628 ; CI-NEXT: v_mul_f32_e32 v6, v4, v5
629 ; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
630 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
631 ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
632 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
633 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
634 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
635 ; CI-NEXT: v_trunc_f32_e32 v3, v3
636 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
637 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
638 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s1
639 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
640 ; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2
641 ; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2
642 ; CI-NEXT: v_rcp_f32_e32 v6, v4
643 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
644 ; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
645 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
646 ; CI-NEXT: v_mul_f32_e32 v7, v5, v6
647 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
648 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
649 ; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
650 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
651 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
652 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2
653 ; CI-NEXT: v_trunc_f32_e32 v4, v4
654 ; CI-NEXT: v_fma_f32 v2, -v4, v3, v2
655 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s9
656 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s11
657 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
658 ; CI-NEXT: v_div_scale_f32 v5, s[0:1], v4, v4, v3
659 ; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3
660 ; CI-NEXT: v_rcp_f32_e32 v7, v5
661 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
662 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
663 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
664 ; CI-NEXT: v_mul_f32_e32 v8, v6, v7
665 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v6
666 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
667 ; CI-NEXT: v_fma_f32 v5, -v5, v8, v6
668 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
669 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
670 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
671 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
672 ; CI-NEXT: s_mov_b32 s6, -1
673 ; CI-NEXT: s_mov_b32 s7, 0xf000
674 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3
675 ; CI-NEXT: v_trunc_f32_e32 v5, v5
676 ; CI-NEXT: v_fma_f32 v3, -v5, v4, v3
677 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
678 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
679 ; CI-NEXT: v_or_b32_e32 v1, v2, v1
680 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
683 ; VI-LABEL: frem_v4f16:
685 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
686 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
687 ; VI-NEXT: s_waitcnt lgkmcnt(0)
688 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
689 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
690 ; VI-NEXT: s_waitcnt lgkmcnt(0)
691 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
692 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
693 ; VI-NEXT: s_lshr_b32 s8, s0, 16
694 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
695 ; VI-NEXT: v_mov_b32_e32 v1, s0
696 ; VI-NEXT: v_rcp_f32_e32 v2, v2
697 ; VI-NEXT: s_lshr_b32 s6, s2, 16
698 ; VI-NEXT: v_rcp_f32_e32 v3, v3
699 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
700 ; VI-NEXT: v_mul_f32_e32 v0, v0, v2
701 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
702 ; VI-NEXT: v_mov_b32_e32 v2, s8
703 ; VI-NEXT: v_rcp_f32_e32 v4, v4
704 ; VI-NEXT: s_lshr_b32 s9, s1, 16
705 ; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
706 ; VI-NEXT: v_trunc_f16_e32 v0, v0
707 ; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
708 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
709 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
710 ; VI-NEXT: s_lshr_b32 s7, s3, 16
711 ; VI-NEXT: v_mul_f32_e32 v1, v1, v3
712 ; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
713 ; VI-NEXT: v_mov_b32_e32 v3, s1
714 ; VI-NEXT: v_rcp_f32_e32 v5, v5
715 ; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
716 ; VI-NEXT: v_trunc_f16_e32 v1, v1
717 ; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
718 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
719 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
720 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
721 ; VI-NEXT: v_mul_f32_e32 v2, v2, v4
722 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
723 ; VI-NEXT: v_mov_b32_e32 v4, s9
724 ; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
725 ; VI-NEXT: v_trunc_f16_e32 v2, v2
726 ; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
727 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
728 ; VI-NEXT: v_mul_f32_e32 v3, v3, v5
729 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
730 ; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7
731 ; VI-NEXT: v_trunc_f16_e32 v3, v3
732 ; VI-NEXT: v_fma_f16 v3, -v3, v4, s7
733 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
734 ; VI-NEXT: v_or_b32_e32 v1, v2, v1
735 ; VI-NEXT: v_mov_b32_e32 v2, s4
736 ; VI-NEXT: v_mov_b32_e32 v3, s5
737 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
739 %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
740 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
741 %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16
742 %r2 = frem <4 x half> %r0, %r1
743 store <4 x half> %r2, ptr addrspace(1) %out, align 16
; <2 x float> frem. Operands are loaded from %in1 and from %in2 offset by
; 4 <2 x float> elements; the result vector is stored to %out.
; Both check prefixes expect the full f32 division sequence once per lane:
; two v_div_scale_f32, v_rcp_f32, a v_fma_f32 / v_mul_f32 refinement bracketed
; by s_setreg_imm32_b32 writes to HW_REG_MODE, v_div_fmas_f32, v_div_fixup_f32,
; v_trunc_f32 and a final v_fma_f32. Lane 0 lands in v0 and lane 1 in v1, which
; are stored together with a dwordx2 store.
; The check lines below are autogenerated; regenerate them instead of editing.
747 define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
748 ; CI-LABEL: frem_v2f32:
750 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
751 ; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
752 ; CI-NEXT: s_waitcnt lgkmcnt(0)
753 ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
754 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
755 ; CI-NEXT: s_waitcnt lgkmcnt(0)
756 ; CI-NEXT: v_mov_b32_e32 v0, s0
757 ; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
758 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
759 ; CI-NEXT: v_rcp_f32_e32 v3, v1
760 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
761 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
762 ; CI-NEXT: v_fma_f32 v3, v4, v3, v3
763 ; CI-NEXT: v_mul_f32_e32 v4, v2, v3
764 ; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
765 ; CI-NEXT: v_fma_f32 v4, v5, v3, v4
766 ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
767 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
768 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
769 ; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
770 ; CI-NEXT: v_trunc_f32_e32 v1, v1
771 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s2
772 ; CI-NEXT: v_mov_b32_e32 v1, s1
773 ; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3
774 ; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
775 ; CI-NEXT: v_rcp_f32_e32 v4, v2
776 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
777 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
778 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
779 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
780 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
781 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
782 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
783 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
784 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
785 ; CI-NEXT: s_mov_b32 s6, -1
786 ; CI-NEXT: s_mov_b32 s7, 0xf000
787 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
788 ; CI-NEXT: v_trunc_f32_e32 v2, v2
789 ; CI-NEXT: v_fma_f32 v1, -v2, v1, s3
790 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
793 ; VI-LABEL: frem_v2f32:
795 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
796 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
797 ; VI-NEXT: s_waitcnt lgkmcnt(0)
798 ; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
799 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
800 ; VI-NEXT: s_waitcnt lgkmcnt(0)
801 ; VI-NEXT: v_mov_b32_e32 v0, s0
802 ; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
803 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
804 ; VI-NEXT: v_rcp_f32_e32 v3, v1
805 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
806 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
807 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
808 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
809 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
810 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
811 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
812 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
813 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
814 ; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
815 ; VI-NEXT: v_trunc_f32_e32 v1, v1
816 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s2
817 ; VI-NEXT: v_mov_b32_e32 v1, s1
818 ; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3
819 ; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
820 ; VI-NEXT: v_rcp_f32_e32 v4, v2
821 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
822 ; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
823 ; VI-NEXT: v_fma_f32 v4, v5, v4, v4
824 ; VI-NEXT: v_mul_f32_e32 v5, v3, v4
825 ; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
826 ; VI-NEXT: v_fma_f32 v5, v6, v4, v5
827 ; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
828 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
829 ; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
830 ; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
831 ; VI-NEXT: v_trunc_f32_e32 v2, v2
832 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
833 ; VI-NEXT: v_mov_b32_e32 v2, s4
834 ; VI-NEXT: v_mov_b32_e32 v3, s5
835 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
837 %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
838 %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
839 %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8
840 %r2 = frem <2 x float> %r0, %r1
841 store <2 x float> %r2, ptr addrspace(1) %out, align 8
; frem_v4f32: loads a <4 x float> from %in1 and a <4 x float> from %in2 at
; vector index 4, takes the element-wise frem, and stores the result to %out.
;
; The expected output below is the same sequence repeated once per lane
; (four times in total, one per scalar pair s0/s8, s1/s9, s2/s10, s3/s11):
;   v_div_scale_f32 x2 -> v_rcp_f32 -> v_fma/v_mul refinement chain ->
;   v_div_fmas_f32 -> v_div_fixup_f32   (the quotient)
;   v_trunc_f32                          (truncate the quotient)
;   v_fma_f32 -q, y, x                   (x - trunc(x/y) * y)
; Each refinement chain is bracketed by a pair of s_setreg_imm32_b32 writes to
; HW_REG_MODE bits (value 3, then 0) -- presumably switching the f32 denormal
; mode for the duration of the refinement; confirm against the target docs.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing by hand.
845 define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
846 ; CI-LABEL: frem_v4f32:
848 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
849 ; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
850 ; CI-NEXT: s_waitcnt lgkmcnt(0)
851 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
852 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
853 ; CI-NEXT: s_waitcnt lgkmcnt(0)
854 ; CI-NEXT: v_mov_b32_e32 v0, s8
855 ; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0
856 ; CI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0
857 ; CI-NEXT: v_rcp_f32_e32 v3, v1
858 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
859 ; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
860 ; CI-NEXT: v_fma_f32 v3, v4, v3, v3
861 ; CI-NEXT: v_mul_f32_e32 v4, v2, v3
862 ; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
863 ; CI-NEXT: v_fma_f32 v4, v5, v3, v4
864 ; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
865 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
866 ; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
867 ; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s0
868 ; CI-NEXT: v_trunc_f32_e32 v1, v1
869 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s0
870 ; CI-NEXT: v_mov_b32_e32 v1, s9
871 ; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1
872 ; CI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1
873 ; CI-NEXT: v_rcp_f32_e32 v4, v2
874 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
875 ; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
876 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
877 ; CI-NEXT: v_mul_f32_e32 v5, v3, v4
878 ; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
879 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
880 ; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
881 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
882 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
883 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s1
884 ; CI-NEXT: v_trunc_f32_e32 v2, v2
885 ; CI-NEXT: v_fma_f32 v1, -v2, v1, s1
886 ; CI-NEXT: v_mov_b32_e32 v2, s10
887 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2
888 ; CI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2
889 ; CI-NEXT: v_rcp_f32_e32 v5, v3
890 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
891 ; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
892 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
893 ; CI-NEXT: v_mul_f32_e32 v6, v4, v5
894 ; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
895 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
896 ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
897 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
898 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
899 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s2
900 ; CI-NEXT: v_trunc_f32_e32 v3, v3
901 ; CI-NEXT: v_fma_f32 v2, -v3, v2, s2
902 ; CI-NEXT: v_mov_b32_e32 v3, s11
903 ; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3
904 ; CI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3
905 ; CI-NEXT: v_rcp_f32_e32 v6, v4
906 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
907 ; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
908 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
909 ; CI-NEXT: v_mul_f32_e32 v7, v5, v6
910 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
911 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
912 ; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
913 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
914 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
915 ; CI-NEXT: s_mov_b32 s6, -1
916 ; CI-NEXT: s_mov_b32 s7, 0xf000
917 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s3
918 ; CI-NEXT: v_trunc_f32_e32 v4, v4
919 ; CI-NEXT: v_fma_f32 v3, -v4, v3, s3
920 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
923 ; VI-LABEL: frem_v4f32:
925 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
926 ; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34
927 ; VI-NEXT: s_waitcnt lgkmcnt(0)
928 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
929 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
930 ; VI-NEXT: s_waitcnt lgkmcnt(0)
931 ; VI-NEXT: v_mov_b32_e32 v0, s8
932 ; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0
933 ; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0
934 ; VI-NEXT: v_rcp_f32_e32 v3, v1
935 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
936 ; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
937 ; VI-NEXT: v_fma_f32 v3, v4, v3, v3
938 ; VI-NEXT: v_mul_f32_e32 v4, v2, v3
939 ; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
940 ; VI-NEXT: v_fma_f32 v4, v5, v3, v4
941 ; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
942 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
943 ; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
944 ; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s0
945 ; VI-NEXT: v_trunc_f32_e32 v1, v1
946 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s0
947 ; VI-NEXT: v_mov_b32_e32 v1, s9
948 ; VI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1
949 ; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1
950 ; VI-NEXT: v_rcp_f32_e32 v4, v2
951 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
952 ; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
953 ; VI-NEXT: v_fma_f32 v4, v5, v4, v4
954 ; VI-NEXT: v_mul_f32_e32 v5, v3, v4
955 ; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
956 ; VI-NEXT: v_fma_f32 v5, v6, v4, v5
957 ; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
958 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
959 ; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
960 ; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s1
961 ; VI-NEXT: v_trunc_f32_e32 v2, v2
962 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s1
963 ; VI-NEXT: v_mov_b32_e32 v2, s10
964 ; VI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2
965 ; VI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2
966 ; VI-NEXT: v_rcp_f32_e32 v5, v3
967 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
968 ; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
969 ; VI-NEXT: v_fma_f32 v5, v6, v5, v5
970 ; VI-NEXT: v_mul_f32_e32 v6, v4, v5
971 ; VI-NEXT: v_fma_f32 v7, -v3, v6, v4
972 ; VI-NEXT: v_fma_f32 v6, v7, v5, v6
973 ; VI-NEXT: v_fma_f32 v3, -v3, v6, v4
974 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
975 ; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
976 ; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s2
977 ; VI-NEXT: v_trunc_f32_e32 v3, v3
978 ; VI-NEXT: v_fma_f32 v2, -v3, v2, s2
979 ; VI-NEXT: v_mov_b32_e32 v3, s11
980 ; VI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3
981 ; VI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3
982 ; VI-NEXT: v_rcp_f32_e32 v6, v4
983 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
984 ; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
985 ; VI-NEXT: v_fma_f32 v6, v7, v6, v6
986 ; VI-NEXT: v_mul_f32_e32 v7, v5, v6
987 ; VI-NEXT: v_fma_f32 v8, -v4, v7, v5
988 ; VI-NEXT: v_fma_f32 v7, v8, v6, v7
989 ; VI-NEXT: v_fma_f32 v4, -v4, v7, v5
990 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
991 ; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
992 ; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s3
993 ; VI-NEXT: v_trunc_f32_e32 v4, v4
994 ; VI-NEXT: v_fma_f32 v3, -v4, v3, s3
995 ; VI-NEXT: v_mov_b32_e32 v4, s4
996 ; VI-NEXT: v_mov_b32_e32 v5, s5
997 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; The divisor is read from vector index 4 of %in2, i.e. 4 * 16 = 64 bytes past
; the base pointer. This is the 0x40 immediate on the second s_load_dwordx4
; expected for tonga; the 0x10 immediate expected for bonaire is presumably
; the same offset expressed in dwords (confirm against the ISA docs).
999 %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
1000 %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
1001 %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16
1002 %r2 = frem <4 x float> %r0, %r1
1003 store <4 x float> %r2, ptr addrspace(1) %out, align 16
; frem_v2f64: loads a <2 x double> from %in1 and a <2 x double> from %in2 at
; vector index 4, takes the element-wise frem, and stores the result to %out.
;
; The expected output below is the same sequence repeated once per lane
; (twice in total, for the register pairs s[0:1]/s[8:9] and s[2:3]/s[10:11]):
;   v_div_scale_f64 x2 -> v_rcp_f64 -> two (v_fma, v_fma) refinement steps ->
;   v_mul_f64 -> v_fma_f64 -> v_div_fmas_f64 -> v_div_fixup_f64  (the quotient)
;   v_trunc_f64                                                   (truncate)
;   v_fma_f64 -q, y, x                               (x - trunc(x/y) * y)
; Unlike the f32 kernels in this file, no s_setreg_imm32_b32 mode writes are
; expected anywhere in the f64 sequence.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing by hand.
1007 define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
1008 ; CI-LABEL: frem_v2f64:
1010 ; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
1011 ; CI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
1012 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1013 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1014 ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
1015 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1016 ; CI-NEXT: v_mov_b32_e32 v0, s8
1017 ; CI-NEXT: v_mov_b32_e32 v1, s9
1018 ; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
1019 ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
1020 ; CI-NEXT: s_mov_b32 s6, -1
1021 ; CI-NEXT: s_mov_b32 s7, 0xf000
1022 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1023 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1024 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1025 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1026 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1027 ; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
1028 ; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
1029 ; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
1030 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
1031 ; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
1032 ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
1033 ; CI-NEXT: v_mov_b32_e32 v2, s10
1034 ; CI-NEXT: v_mov_b32_e32 v3, s11
1035 ; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3]
1036 ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3]
1037 ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1038 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1039 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1040 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1041 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1042 ; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
1043 ; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
1044 ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
1045 ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
1046 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1047 ; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
1048 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1051 ; VI-LABEL: frem_v2f64:
1053 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1054 ; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x34
1055 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1056 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1057 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
1058 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1059 ; VI-NEXT: v_mov_b32_e32 v0, s8
1060 ; VI-NEXT: v_mov_b32_e32 v1, s9
1061 ; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
1062 ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
1063 ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1064 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1065 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1066 ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1067 ; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
1068 ; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
1069 ; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
1070 ; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
1071 ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
1072 ; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
1073 ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
1074 ; VI-NEXT: v_mov_b32_e32 v2, s10
1075 ; VI-NEXT: v_mov_b32_e32 v3, s11
1076 ; VI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3]
1077 ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3]
1078 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1079 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1080 ; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1081 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1082 ; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1083 ; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
1084 ; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
1085 ; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
1086 ; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
1087 ; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1088 ; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
1089 ; VI-NEXT: v_mov_b32_e32 v4, s4
1090 ; VI-NEXT: v_mov_b32_e32 v5, s5
1091 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; The divisor is read from vector index 4 of %in2, i.e. 4 * 16 = 64 bytes past
; the base pointer. This is the 0x40 immediate on the second s_load_dwordx4
; expected for tonga; the 0x10 immediate expected for bonaire is presumably
; the same offset expressed in dwords (confirm against the ISA docs).
1093 %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
1094 %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
1095 %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16
1096 %r2 = frem <2 x double> %r0, %r1
1097 store <2 x double> %r2, ptr addrspace(1) %out, align 16
; Attribute groups referenced by the kernels in this file.
; #0: nounwind, "unsafe-fp-math" disabled, and the f32 denormal mode set to
;     preserve-sign for both entries of "denormal-fp-math-f32". The vector
;     kernels frem_v4f32 and frem_v2f64 reference this group.
; #1: identical to #0 except that "unsafe-fp-math" is "true". NOTE(review): no
;     use of #1 is visible in this part of the file; presumably it is used by
;     fast/unsafe kernel variants defined earlier -- verify before removing.
1101 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1102 attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }