1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; frem_f16: kernel computing `frem half` with no fast-math flags on the
; instruction. Operands are loaded from %in1 and from element 4 of %in2; the
; remainder is stored to %out.
; What the assertions below pin down:
;  - SI and CI run lines: both operands are widened with v_cvt_f32_f16, the
;    quotient is formed with the v_div_scale / v_rcp / v_fma / v_div_fmas /
;    v_div_fixup f32 sequence, then v_trunc_f32 + v_fma_f32 produce the
;    remainder, which is narrowed back with v_cvt_f16_f32.
;  - VI, GFX9, GFX10 and GFX11 run lines: the quotient is approximated via
;    v_rcp_f32 and a multiply, corrected with v_div_fixup_f16, and the
;    remainder comes from v_trunc_f16 + v_fma_f16.
; Attribute group #0 is defined outside this excerpt.
9 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
12 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
14 ; SI-NEXT: s_mov_b32 s11, 0xf000
15 ; SI-NEXT: s_mov_b32 s10, -1
16 ; SI-NEXT: s_waitcnt lgkmcnt(0)
17 ; SI-NEXT: s_mov_b32 s8, s4
18 ; SI-NEXT: s_mov_b32 s9, s5
19 ; SI-NEXT: s_mov_b32 s4, s6
20 ; SI-NEXT: s_mov_b32 s5, s7
21 ; SI-NEXT: s_mov_b32 s6, s10
22 ; SI-NEXT: s_mov_b32 s7, s11
23 ; SI-NEXT: s_mov_b32 s2, s10
24 ; SI-NEXT: s_mov_b32 s3, s11
25 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
26 ; SI-NEXT: s_waitcnt vmcnt(0)
27 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
28 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
31 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
32 ; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
33 ; SI-NEXT: v_rcp_f32_e32 v4, v3
34 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
35 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
36 ; SI-NEXT: v_fma_f32 v4, v5, v4, v4
37 ; SI-NEXT: v_mul_f32_e32 v5, v2, v4
38 ; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
39 ; SI-NEXT: v_fma_f32 v5, v6, v4, v5
40 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
41 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
42 ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
43 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
44 ; SI-NEXT: v_trunc_f32_e32 v2, v2
45 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
46 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
47 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
48 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
53 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
54 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
55 ; CI-NEXT: s_mov_b32 s11, 0xf000
56 ; CI-NEXT: s_mov_b32 s10, -1
57 ; CI-NEXT: s_mov_b32 s2, s10
58 ; CI-NEXT: s_waitcnt lgkmcnt(0)
59 ; CI-NEXT: s_mov_b32 s8, s4
60 ; CI-NEXT: s_mov_b32 s9, s5
61 ; CI-NEXT: s_mov_b32 s4, s6
62 ; CI-NEXT: s_mov_b32 s5, s7
63 ; CI-NEXT: s_mov_b32 s6, s10
64 ; CI-NEXT: s_mov_b32 s7, s11
65 ; CI-NEXT: s_mov_b32 s3, s11
66 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
67 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
68 ; CI-NEXT: s_waitcnt vmcnt(1)
69 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
70 ; CI-NEXT: s_waitcnt vmcnt(0)
71 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
72 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
73 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
74 ; CI-NEXT: v_rcp_f32_e32 v4, v3
75 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
76 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
77 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
78 ; CI-NEXT: v_mul_f32_e32 v5, v2, v4
79 ; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
80 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
81 ; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
82 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
83 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
84 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
85 ; CI-NEXT: v_trunc_f32_e32 v2, v2
86 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
87 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
88 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
89 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
94 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
95 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
96 ; VI-NEXT: s_waitcnt lgkmcnt(0)
97 ; VI-NEXT: v_mov_b32_e32 v2, s6
98 ; VI-NEXT: s_add_u32 s0, s0, 8
99 ; VI-NEXT: v_mov_b32_e32 v3, s7
100 ; VI-NEXT: s_addc_u32 s1, s1, 0
101 ; VI-NEXT: flat_load_ushort v4, v[2:3]
102 ; VI-NEXT: v_mov_b32_e32 v3, s1
103 ; VI-NEXT: v_mov_b32_e32 v2, s0
104 ; VI-NEXT: flat_load_ushort v2, v[2:3]
105 ; VI-NEXT: v_mov_b32_e32 v0, s4
106 ; VI-NEXT: v_mov_b32_e32 v1, s5
107 ; VI-NEXT: s_waitcnt vmcnt(1)
108 ; VI-NEXT: v_cvt_f32_f16_e32 v3, v4
109 ; VI-NEXT: s_waitcnt vmcnt(0)
110 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v2
111 ; VI-NEXT: v_rcp_f32_e32 v5, v5
112 ; VI-NEXT: v_mul_f32_e32 v3, v3, v5
113 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
114 ; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4
115 ; VI-NEXT: v_trunc_f16_e32 v3, v3
116 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
117 ; VI-NEXT: flat_store_short v[0:1], v2
120 ; GFX9-LABEL: frem_f16:
122 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
123 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
124 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
125 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
126 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
127 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
128 ; GFX9-NEXT: s_waitcnt vmcnt(0)
129 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
130 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
131 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
132 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
133 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
134 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
135 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
136 ; GFX9-NEXT: s_endpgm
138 ; GFX10-LABEL: frem_f16:
140 ; GFX10-NEXT: s_clause 0x1
141 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
142 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
143 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
144 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
145 ; GFX10-NEXT: s_clause 0x1
146 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
147 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
148 ; GFX10-NEXT: s_waitcnt vmcnt(0)
149 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
150 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
151 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
152 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
153 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
154 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
155 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
156 ; GFX10-NEXT: s_endpgm
158 ; GFX11-LABEL: frem_f16:
160 ; GFX11-NEXT: s_clause 0x1
161 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
162 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
163 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
164 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX11-NEXT: s_clause 0x1
166 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
167 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
168 ; GFX11-NEXT: s_waitcnt vmcnt(0)
169 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
170 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
171 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
172 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
173 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
174 ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
175 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
176 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
177 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
178 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
179 ; GFX11-NEXT: s_nop 0
180 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
181 ; GFX11-NEXT: s_endpgm
182 ptr addrspace(1) %in2) #0 {
; The divisor is read from element 4 of %in2 (4 halfs = 8 bytes), which is the
; `offset:8` / `s_add_u32 ..., 8` seen in the expected loads above.
183 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
184 %r0 = load half, ptr addrspace(1) %in1, align 4
185 %r1 = load half, ptr addrspace(1) %gep2, align 4
; Plain frem: no fast-math flags on the instruction itself.
186 %r2 = frem half %r0, %r1
187 store half %r2, ptr addrspace(1) %out, align 4
; fast_frem_f16: same load/store shape as the plain half test, but the
; remainder is computed with `frem fast half`.
; What the assertions below pin down:
;  - SI and CI run lines: operands are widened with v_cvt_f32_f16; the quotient
;    is just v_rcp_f32 + v_mul_f32 (no v_div_scale / v_div_fmas / v_div_fixup),
;    followed by v_trunc_f32 + v_fma_f32 and a v_cvt_f16_f32 of the result.
;  - VI, GFX9, GFX10 and GFX11 run lines: everything stays in f16 using
;    v_rcp_f16, v_mul_f16, v_trunc_f16 and v_fma_f16.
; Attribute group #0 is defined outside this excerpt.
191 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
192 ; SI-LABEL: fast_frem_f16:
194 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
195 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
196 ; SI-NEXT: s_mov_b32 s11, 0xf000
197 ; SI-NEXT: s_mov_b32 s10, -1
198 ; SI-NEXT: s_waitcnt lgkmcnt(0)
199 ; SI-NEXT: s_mov_b32 s8, s4
200 ; SI-NEXT: s_mov_b32 s9, s5
201 ; SI-NEXT: s_mov_b32 s4, s6
202 ; SI-NEXT: s_mov_b32 s5, s7
203 ; SI-NEXT: s_mov_b32 s6, s10
204 ; SI-NEXT: s_mov_b32 s7, s11
205 ; SI-NEXT: s_mov_b32 s2, s10
206 ; SI-NEXT: s_mov_b32 s3, s11
207 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
208 ; SI-NEXT: s_waitcnt vmcnt(0)
209 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
210 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
211 ; SI-NEXT: s_waitcnt vmcnt(0)
212 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
213 ; SI-NEXT: v_rcp_f32_e32 v2, v1
214 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
215 ; SI-NEXT: v_trunc_f32_e32 v2, v2
216 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
217 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
218 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
221 ; CI-LABEL: fast_frem_f16:
223 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
224 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
225 ; CI-NEXT: s_mov_b32 s11, 0xf000
226 ; CI-NEXT: s_mov_b32 s10, -1
227 ; CI-NEXT: s_mov_b32 s2, s10
228 ; CI-NEXT: s_mov_b32 s3, s11
229 ; CI-NEXT: s_waitcnt lgkmcnt(0)
230 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
231 ; CI-NEXT: s_mov_b32 s8, s4
232 ; CI-NEXT: s_mov_b32 s9, s5
233 ; CI-NEXT: s_mov_b32 s4, s6
234 ; CI-NEXT: s_mov_b32 s5, s7
235 ; CI-NEXT: s_mov_b32 s6, s10
236 ; CI-NEXT: s_mov_b32 s7, s11
237 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
238 ; CI-NEXT: s_waitcnt vmcnt(1)
239 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
240 ; CI-NEXT: v_rcp_f32_e32 v2, v1
241 ; CI-NEXT: s_waitcnt vmcnt(0)
242 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
243 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
244 ; CI-NEXT: v_trunc_f32_e32 v2, v2
245 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
246 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
247 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
250 ; VI-LABEL: fast_frem_f16:
252 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
253 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
254 ; VI-NEXT: s_waitcnt lgkmcnt(0)
255 ; VI-NEXT: v_mov_b32_e32 v2, s6
256 ; VI-NEXT: s_add_u32 s0, s0, 8
257 ; VI-NEXT: v_mov_b32_e32 v3, s7
258 ; VI-NEXT: s_addc_u32 s1, s1, 0
259 ; VI-NEXT: flat_load_ushort v4, v[2:3]
260 ; VI-NEXT: v_mov_b32_e32 v3, s1
261 ; VI-NEXT: v_mov_b32_e32 v2, s0
262 ; VI-NEXT: flat_load_ushort v2, v[2:3]
263 ; VI-NEXT: v_mov_b32_e32 v0, s4
264 ; VI-NEXT: v_mov_b32_e32 v1, s5
265 ; VI-NEXT: s_waitcnt vmcnt(0)
266 ; VI-NEXT: v_rcp_f16_e32 v3, v2
267 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3
268 ; VI-NEXT: v_trunc_f16_e32 v3, v3
269 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
270 ; VI-NEXT: flat_store_short v[0:1], v2
273 ; GFX9-LABEL: fast_frem_f16:
275 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
276 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
277 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
278 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
279 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
280 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
281 ; GFX9-NEXT: s_waitcnt vmcnt(0)
282 ; GFX9-NEXT: v_rcp_f16_e32 v3, v2
283 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
284 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
285 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
286 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
287 ; GFX9-NEXT: s_endpgm
289 ; GFX10-LABEL: fast_frem_f16:
291 ; GFX10-NEXT: s_clause 0x1
292 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
293 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
294 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
295 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
296 ; GFX10-NEXT: s_clause 0x1
297 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
298 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
299 ; GFX10-NEXT: s_waitcnt vmcnt(0)
300 ; GFX10-NEXT: v_rcp_f16_e32 v3, v2
301 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
302 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
303 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
304 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
305 ; GFX10-NEXT: s_endpgm
307 ; GFX11-LABEL: fast_frem_f16:
309 ; GFX11-NEXT: s_clause 0x1
310 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
311 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
312 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
313 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
314 ; GFX11-NEXT: s_clause 0x1
315 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
316 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
317 ; GFX11-NEXT: s_waitcnt vmcnt(0)
318 ; GFX11-NEXT: v_rcp_f16_e32 v3, v2
319 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
320 ; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3
321 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
322 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
323 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
324 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
325 ; GFX11-NEXT: s_nop 0
326 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
327 ; GFX11-NEXT: s_endpgm
328 ptr addrspace(1) %in2) #0 {
; The divisor is read from element 4 of %in2 (4 halfs = 8 bytes), which is the
; `offset:8` / `s_add_u32 ..., 8` seen in the expected loads above.
329 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
330 %r0 = load half, ptr addrspace(1) %in1, align 4
331 %r1 = load half, ptr addrspace(1) %gep2, align 4
; The `fast` flag on the instruction is the only IR difference from the plain
; half test visible in this excerpt.
332 %r2 = frem fast half %r0, %r1
333 store half %r2, ptr addrspace(1) %out, align 4
; unsafe_frem_f16: half remainder computed with `frem afn half`, in a function
; that uses attribute group #1 (defined outside this excerpt; presumably the
; unsafe-fp-math variant given the test name -- confirm against the attribute
; definitions at the end of the file).
; The expected instruction sequences for every run line match those of the
; `frem fast half` test: v_rcp + v_mul + v_trunc + v_fma, with an f32
; round-trip (v_cvt_f32_f16 / v_cvt_f16_f32) on the SI and CI run lines and
; native f16 instructions on the VI, GFX9, GFX10 and GFX11 run lines.
337 define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
338 ; SI-LABEL: unsafe_frem_f16:
340 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
341 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
342 ; SI-NEXT: s_mov_b32 s11, 0xf000
343 ; SI-NEXT: s_mov_b32 s10, -1
344 ; SI-NEXT: s_waitcnt lgkmcnt(0)
345 ; SI-NEXT: s_mov_b32 s8, s4
346 ; SI-NEXT: s_mov_b32 s9, s5
347 ; SI-NEXT: s_mov_b32 s4, s6
348 ; SI-NEXT: s_mov_b32 s5, s7
349 ; SI-NEXT: s_mov_b32 s6, s10
350 ; SI-NEXT: s_mov_b32 s7, s11
351 ; SI-NEXT: s_mov_b32 s2, s10
352 ; SI-NEXT: s_mov_b32 s3, s11
353 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
354 ; SI-NEXT: s_waitcnt vmcnt(0)
355 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
356 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
357 ; SI-NEXT: s_waitcnt vmcnt(0)
358 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
359 ; SI-NEXT: v_rcp_f32_e32 v2, v1
360 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
361 ; SI-NEXT: v_trunc_f32_e32 v2, v2
362 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
363 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
364 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
367 ; CI-LABEL: unsafe_frem_f16:
369 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
370 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
371 ; CI-NEXT: s_mov_b32 s11, 0xf000
372 ; CI-NEXT: s_mov_b32 s10, -1
373 ; CI-NEXT: s_mov_b32 s2, s10
374 ; CI-NEXT: s_mov_b32 s3, s11
375 ; CI-NEXT: s_waitcnt lgkmcnt(0)
376 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
377 ; CI-NEXT: s_mov_b32 s8, s4
378 ; CI-NEXT: s_mov_b32 s9, s5
379 ; CI-NEXT: s_mov_b32 s4, s6
380 ; CI-NEXT: s_mov_b32 s5, s7
381 ; CI-NEXT: s_mov_b32 s6, s10
382 ; CI-NEXT: s_mov_b32 s7, s11
383 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
384 ; CI-NEXT: s_waitcnt vmcnt(1)
385 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
386 ; CI-NEXT: v_rcp_f32_e32 v2, v1
387 ; CI-NEXT: s_waitcnt vmcnt(0)
388 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
389 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
390 ; CI-NEXT: v_trunc_f32_e32 v2, v2
391 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
392 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
393 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
396 ; VI-LABEL: unsafe_frem_f16:
398 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
399 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
400 ; VI-NEXT: s_waitcnt lgkmcnt(0)
401 ; VI-NEXT: v_mov_b32_e32 v2, s6
402 ; VI-NEXT: s_add_u32 s0, s0, 8
403 ; VI-NEXT: v_mov_b32_e32 v3, s7
404 ; VI-NEXT: s_addc_u32 s1, s1, 0
405 ; VI-NEXT: flat_load_ushort v4, v[2:3]
406 ; VI-NEXT: v_mov_b32_e32 v3, s1
407 ; VI-NEXT: v_mov_b32_e32 v2, s0
408 ; VI-NEXT: flat_load_ushort v2, v[2:3]
409 ; VI-NEXT: v_mov_b32_e32 v0, s4
410 ; VI-NEXT: v_mov_b32_e32 v1, s5
411 ; VI-NEXT: s_waitcnt vmcnt(0)
412 ; VI-NEXT: v_rcp_f16_e32 v3, v2
413 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3
414 ; VI-NEXT: v_trunc_f16_e32 v3, v3
415 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
416 ; VI-NEXT: flat_store_short v[0:1], v2
419 ; GFX9-LABEL: unsafe_frem_f16:
421 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
422 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
423 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
424 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
425 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
426 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
427 ; GFX9-NEXT: s_waitcnt vmcnt(0)
428 ; GFX9-NEXT: v_rcp_f16_e32 v3, v2
429 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
430 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
431 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
432 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
433 ; GFX9-NEXT: s_endpgm
435 ; GFX10-LABEL: unsafe_frem_f16:
437 ; GFX10-NEXT: s_clause 0x1
438 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
439 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
440 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
441 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
442 ; GFX10-NEXT: s_clause 0x1
443 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
444 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
445 ; GFX10-NEXT: s_waitcnt vmcnt(0)
446 ; GFX10-NEXT: v_rcp_f16_e32 v3, v2
447 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
448 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
449 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
450 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
451 ; GFX10-NEXT: s_endpgm
453 ; GFX11-LABEL: unsafe_frem_f16:
455 ; GFX11-NEXT: s_clause 0x1
456 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
457 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
458 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
459 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
460 ; GFX11-NEXT: s_clause 0x1
461 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
462 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
463 ; GFX11-NEXT: s_waitcnt vmcnt(0)
464 ; GFX11-NEXT: v_rcp_f16_e32 v3, v2
465 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
466 ; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3
467 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
468 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
469 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
470 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
471 ; GFX11-NEXT: s_nop 0
472 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
473 ; GFX11-NEXT: s_endpgm
474 ptr addrspace(1) %in2) #1 {
; The divisor is read from element 4 of %in2 (4 halfs = 8 bytes), which is the
; `offset:8` / `s_add_u32 ..., 8` seen in the expected loads above.
475 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
476 %r0 = load half, ptr addrspace(1) %in1, align 4
477 %r1 = load half, ptr addrspace(1) %gep2, align 4
; Only the `afn` flag is set on the instruction here (not the full `fast` set).
478 %r2 = frem afn half %r0, %r1
479 store half %r2, ptr addrspace(1) %out, align 4
; frem_f32: kernel computing `frem float` with no fast-math flags on the
; instruction. Operands are loaded from %in1 and from element 4 of %in2; the
; remainder is stored to %out.
; What the assertions below pin down, for every run line:
;  - the quotient uses the full v_div_scale_f32 / v_rcp_f32 / v_fma_f32
;    refinement / v_div_fmas_f32 / v_div_fixup_f32 sequence;
;  - the refinement FMAs are bracketed by s_setreg_imm32_b32 writes to
;    HW_REG_MODE (SI, CI, VI, GFX9 run lines) or by s_denorm_mode 15 / 12
;    (GFX10, GFX11 run lines);
;  - the remainder is then v_trunc_f32 of the quotient followed by one
;    v_fma_f32 with the negated truncated quotient.
; Attribute group #0 is defined outside this excerpt.
483 define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
484 ; SI-LABEL: frem_f32:
486 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
487 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
488 ; SI-NEXT: s_mov_b32 s11, 0xf000
489 ; SI-NEXT: s_mov_b32 s10, -1
490 ; SI-NEXT: s_waitcnt lgkmcnt(0)
491 ; SI-NEXT: s_mov_b32 s8, s4
492 ; SI-NEXT: s_mov_b32 s9, s5
493 ; SI-NEXT: s_mov_b32 s4, s6
494 ; SI-NEXT: s_mov_b32 s5, s7
495 ; SI-NEXT: s_mov_b32 s6, s10
496 ; SI-NEXT: s_mov_b32 s7, s11
497 ; SI-NEXT: s_mov_b32 s2, s10
498 ; SI-NEXT: s_mov_b32 s3, s11
499 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
500 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
501 ; SI-NEXT: s_waitcnt vmcnt(0)
502 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
503 ; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
504 ; SI-NEXT: v_rcp_f32_e32 v4, v3
505 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
506 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
507 ; SI-NEXT: v_fma_f32 v4, v5, v4, v4
508 ; SI-NEXT: v_mul_f32_e32 v5, v2, v4
509 ; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
510 ; SI-NEXT: v_fma_f32 v5, v6, v4, v5
511 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
512 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
513 ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
514 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
515 ; SI-NEXT: v_trunc_f32_e32 v2, v2
516 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
517 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
520 ; CI-LABEL: frem_f32:
522 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
523 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
524 ; CI-NEXT: s_mov_b32 s11, 0xf000
525 ; CI-NEXT: s_mov_b32 s10, -1
526 ; CI-NEXT: s_mov_b32 s2, s10
527 ; CI-NEXT: s_waitcnt lgkmcnt(0)
528 ; CI-NEXT: s_mov_b32 s8, s4
529 ; CI-NEXT: s_mov_b32 s9, s5
530 ; CI-NEXT: s_mov_b32 s4, s6
531 ; CI-NEXT: s_mov_b32 s5, s7
532 ; CI-NEXT: s_mov_b32 s6, s10
533 ; CI-NEXT: s_mov_b32 s7, s11
534 ; CI-NEXT: s_mov_b32 s3, s11
535 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
536 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
537 ; CI-NEXT: s_waitcnt vmcnt(0)
538 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
539 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
540 ; CI-NEXT: v_rcp_f32_e32 v4, v3
541 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
542 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
543 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
544 ; CI-NEXT: v_mul_f32_e32 v5, v2, v4
545 ; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
546 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
547 ; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
548 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
549 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
550 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
551 ; CI-NEXT: v_trunc_f32_e32 v2, v2
552 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
553 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
556 ; VI-LABEL: frem_f32:
558 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
559 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
560 ; VI-NEXT: s_waitcnt lgkmcnt(0)
561 ; VI-NEXT: v_mov_b32_e32 v2, s6
562 ; VI-NEXT: s_add_u32 s0, s0, 16
563 ; VI-NEXT: v_mov_b32_e32 v3, s7
564 ; VI-NEXT: s_addc_u32 s1, s1, 0
565 ; VI-NEXT: flat_load_dword v4, v[2:3]
566 ; VI-NEXT: v_mov_b32_e32 v3, s1
567 ; VI-NEXT: v_mov_b32_e32 v2, s0
568 ; VI-NEXT: flat_load_dword v2, v[2:3]
569 ; VI-NEXT: v_mov_b32_e32 v0, s4
570 ; VI-NEXT: v_mov_b32_e32 v1, s5
571 ; VI-NEXT: s_waitcnt vmcnt(0)
572 ; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4
573 ; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4
574 ; VI-NEXT: v_rcp_f32_e32 v6, v5
575 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
576 ; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
577 ; VI-NEXT: v_fma_f32 v6, v7, v6, v6
578 ; VI-NEXT: v_mul_f32_e32 v7, v3, v6
579 ; VI-NEXT: v_fma_f32 v8, -v5, v7, v3
580 ; VI-NEXT: v_fma_f32 v7, v8, v6, v7
581 ; VI-NEXT: v_fma_f32 v3, -v5, v7, v3
582 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
583 ; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7
584 ; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4
585 ; VI-NEXT: v_trunc_f32_e32 v3, v3
586 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
587 ; VI-NEXT: flat_store_dword v[0:1], v2
590 ; GFX9-LABEL: frem_f32:
592 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
593 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
594 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
595 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
596 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
597 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
598 ; GFX9-NEXT: s_waitcnt vmcnt(0)
599 ; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1
600 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1
601 ; GFX9-NEXT: v_rcp_f32_e32 v5, v4
602 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
603 ; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0
604 ; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5
605 ; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5
606 ; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3
607 ; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
608 ; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3
609 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
610 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6
611 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1
612 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
613 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
614 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
615 ; GFX9-NEXT: s_endpgm
617 ; GFX10-LABEL: frem_f32:
619 ; GFX10-NEXT: s_clause 0x1
620 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
621 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
622 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
623 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
624 ; GFX10-NEXT: s_clause 0x1
625 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
626 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
627 ; GFX10-NEXT: s_waitcnt vmcnt(0)
628 ; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1
629 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
630 ; GFX10-NEXT: v_rcp_f32_e32 v5, v4
631 ; GFX10-NEXT: s_denorm_mode 15
632 ; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0
633 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5
634 ; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5
635 ; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3
636 ; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5
637 ; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3
638 ; GFX10-NEXT: s_denorm_mode 12
639 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6
640 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1
641 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
642 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
643 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
644 ; GFX10-NEXT: s_endpgm
646 ; GFX11-LABEL: frem_f32:
648 ; GFX11-NEXT: s_clause 0x1
649 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
650 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
651 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
652 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX11-NEXT: s_clause 0x1
654 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
655 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
656 ; GFX11-NEXT: s_waitcnt vmcnt(0)
657 ; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
658 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
659 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
660 ; GFX11-NEXT: v_rcp_f32_e32 v5, v4
661 ; GFX11-NEXT: s_denorm_mode 15
662 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
663 ; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0
664 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5
665 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
666 ; GFX11-NEXT: v_mul_f32_e32 v6, v3, v5
667 ; GFX11-NEXT: v_fma_f32 v7, -v4, v6, v3
668 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
669 ; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v5
670 ; GFX11-NEXT: v_fma_f32 v3, -v4, v6, v3
671 ; GFX11-NEXT: s_denorm_mode 12
672 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
673 ; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v6
674 ; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v1
675 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
676 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
677 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1
678 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
679 ; GFX11-NEXT: s_nop 0
680 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
681 ; GFX11-NEXT: s_endpgm
682 ptr addrspace(1) %in2) #0 {
; The divisor is read from element 4 of %in2 (4 floats = 16 bytes), which is
; the `offset:16` / `s_add_u32 ..., 16` seen in the expected loads above.
683 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
684 %r0 = load float, ptr addrspace(1) %in1, align 4
685 %r1 = load float, ptr addrspace(1) %gep2, align 4
; Plain frem: no fast-math flags on the instruction itself.
686 %r2 = frem float %r0, %r1
687 store float %r2, ptr addrspace(1) %out, align 4
; fast_frem_f32: same load/store shape as the plain float test, but the
; remainder is computed with `frem fast float`.
; What the assertions below pin down, for every run line: the quotient is a
; single v_rcp_f32 followed by v_mul_f32 (no v_div_scale / v_div_fmas /
; v_div_fixup and no mode-register writes), and the remainder is v_trunc_f32
; of that quotient followed by one v_fma_f32.
; Attribute group #0 is defined outside this excerpt.
691 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
692 ; SI-LABEL: fast_frem_f32:
694 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
695 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
696 ; SI-NEXT: s_mov_b32 s11, 0xf000
697 ; SI-NEXT: s_mov_b32 s10, -1
698 ; SI-NEXT: s_waitcnt lgkmcnt(0)
699 ; SI-NEXT: s_mov_b32 s8, s4
700 ; SI-NEXT: s_mov_b32 s9, s5
701 ; SI-NEXT: s_mov_b32 s4, s6
702 ; SI-NEXT: s_mov_b32 s5, s7
703 ; SI-NEXT: s_mov_b32 s6, s10
704 ; SI-NEXT: s_mov_b32 s7, s11
705 ; SI-NEXT: s_mov_b32 s2, s10
706 ; SI-NEXT: s_mov_b32 s3, s11
707 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
708 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
709 ; SI-NEXT: s_waitcnt vmcnt(0)
710 ; SI-NEXT: v_rcp_f32_e32 v2, v1
711 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
712 ; SI-NEXT: v_trunc_f32_e32 v2, v2
713 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
714 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
717 ; CI-LABEL: fast_frem_f32:
719 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
720 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
721 ; CI-NEXT: s_mov_b32 s11, 0xf000
722 ; CI-NEXT: s_mov_b32 s10, -1
723 ; CI-NEXT: s_mov_b32 s2, s10
724 ; CI-NEXT: s_waitcnt lgkmcnt(0)
725 ; CI-NEXT: s_mov_b32 s8, s4
726 ; CI-NEXT: s_mov_b32 s9, s5
727 ; CI-NEXT: s_mov_b32 s4, s6
728 ; CI-NEXT: s_mov_b32 s5, s7
729 ; CI-NEXT: s_mov_b32 s6, s10
730 ; CI-NEXT: s_mov_b32 s7, s11
731 ; CI-NEXT: s_mov_b32 s3, s11
732 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
733 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
734 ; CI-NEXT: s_waitcnt vmcnt(0)
735 ; CI-NEXT: v_rcp_f32_e32 v2, v1
736 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
737 ; CI-NEXT: v_trunc_f32_e32 v2, v2
738 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
739 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
742 ; VI-LABEL: fast_frem_f32:
744 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
745 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
746 ; VI-NEXT: s_waitcnt lgkmcnt(0)
747 ; VI-NEXT: v_mov_b32_e32 v2, s6
748 ; VI-NEXT: s_add_u32 s0, s0, 16
749 ; VI-NEXT: v_mov_b32_e32 v3, s7
750 ; VI-NEXT: s_addc_u32 s1, s1, 0
751 ; VI-NEXT: flat_load_dword v4, v[2:3]
752 ; VI-NEXT: v_mov_b32_e32 v3, s1
753 ; VI-NEXT: v_mov_b32_e32 v2, s0
754 ; VI-NEXT: flat_load_dword v2, v[2:3]
755 ; VI-NEXT: v_mov_b32_e32 v0, s4
756 ; VI-NEXT: v_mov_b32_e32 v1, s5
757 ; VI-NEXT: s_waitcnt vmcnt(0)
758 ; VI-NEXT: v_rcp_f32_e32 v3, v2
759 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3
760 ; VI-NEXT: v_trunc_f32_e32 v3, v3
761 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
762 ; VI-NEXT: flat_store_dword v[0:1], v2
765 ; GFX9-LABEL: fast_frem_f32:
767 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
768 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
769 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
770 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
771 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
772 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
773 ; GFX9-NEXT: s_waitcnt vmcnt(0)
774 ; GFX9-NEXT: v_rcp_f32_e32 v3, v2
775 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
776 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
777 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
778 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
779 ; GFX9-NEXT: s_endpgm
781 ; GFX10-LABEL: fast_frem_f32:
783 ; GFX10-NEXT: s_clause 0x1
784 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
785 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
786 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
787 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
788 ; GFX10-NEXT: s_clause 0x1
789 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
790 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
791 ; GFX10-NEXT: s_waitcnt vmcnt(0)
792 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
793 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
794 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
795 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
796 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
797 ; GFX10-NEXT: s_endpgm
799 ; GFX11-LABEL: fast_frem_f32:
801 ; GFX11-NEXT: s_clause 0x1
802 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
803 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
804 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
805 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
806 ; GFX11-NEXT: s_clause 0x1
807 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
808 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
809 ; GFX11-NEXT: s_waitcnt vmcnt(0)
810 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2
811 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
812 ; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3
813 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
814 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
815 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1
816 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
817 ; GFX11-NEXT: s_nop 0
818 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
819 ; GFX11-NEXT: s_endpgm
820 ptr addrspace(1) %in2) #0 {
; The divisor is read from element 4 of %in2 (4 floats = 16 bytes), which is
; the `offset:16` / `s_add_u32 ..., 16` seen in the expected loads above.
821 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
822 %r0 = load float, ptr addrspace(1) %in1, align 4
823 %r1 = load float, ptr addrspace(1) %gep2, align 4
; The `fast` flag on the instruction is the only IR difference from the plain
; float test visible in this excerpt.
824 %r2 = frem fast float %r0, %r1
825 store float %r2, ptr addrspace(1) %out, align 4
; unsafe_frem_f32: kernel computing `frem afn float` of *%in1 by %in2[4] and
; storing the result to %out.
; The assertions for every run line expect the approximate expansion
;   x - trunc(x * rcp(y)) * y
; (v_rcp_f32, v_mul_f32, v_trunc_f32, then v_fma_f32 with the truncated
; quotient negated); no v_div_scale / v_div_fmas / v_div_fixup sequence is
; expected for this function.
; NOTE(review): attribute group #1 is defined outside this excerpt; presumably
; it enables unsafe FP math -- confirm against the attributes at end of file.
829 define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
830 ; SI-LABEL: unsafe_frem_f32:
832 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
833 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
834 ; SI-NEXT: s_mov_b32 s11, 0xf000
835 ; SI-NEXT: s_mov_b32 s10, -1
836 ; SI-NEXT: s_waitcnt lgkmcnt(0)
837 ; SI-NEXT: s_mov_b32 s8, s4
838 ; SI-NEXT: s_mov_b32 s9, s5
839 ; SI-NEXT: s_mov_b32 s4, s6
840 ; SI-NEXT: s_mov_b32 s5, s7
841 ; SI-NEXT: s_mov_b32 s6, s10
842 ; SI-NEXT: s_mov_b32 s7, s11
843 ; SI-NEXT: s_mov_b32 s2, s10
844 ; SI-NEXT: s_mov_b32 s3, s11
845 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
846 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
847 ; SI-NEXT: s_waitcnt vmcnt(0)
848 ; SI-NEXT: v_rcp_f32_e32 v2, v1
849 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
850 ; SI-NEXT: v_trunc_f32_e32 v2, v2
851 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
852 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
855 ; CI-LABEL: unsafe_frem_f32:
857 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
858 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
859 ; CI-NEXT: s_mov_b32 s11, 0xf000
860 ; CI-NEXT: s_mov_b32 s10, -1
861 ; CI-NEXT: s_mov_b32 s2, s10
862 ; CI-NEXT: s_waitcnt lgkmcnt(0)
863 ; CI-NEXT: s_mov_b32 s8, s4
864 ; CI-NEXT: s_mov_b32 s9, s5
865 ; CI-NEXT: s_mov_b32 s4, s6
866 ; CI-NEXT: s_mov_b32 s5, s7
867 ; CI-NEXT: s_mov_b32 s6, s10
868 ; CI-NEXT: s_mov_b32 s7, s11
869 ; CI-NEXT: s_mov_b32 s3, s11
870 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
871 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
872 ; CI-NEXT: s_waitcnt vmcnt(0)
873 ; CI-NEXT: v_rcp_f32_e32 v2, v1
874 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
875 ; CI-NEXT: v_trunc_f32_e32 v2, v2
876 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
877 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
880 ; VI-LABEL: unsafe_frem_f32:
882 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
883 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
884 ; VI-NEXT: s_waitcnt lgkmcnt(0)
885 ; VI-NEXT: v_mov_b32_e32 v2, s6
886 ; VI-NEXT: s_add_u32 s0, s0, 16
887 ; VI-NEXT: v_mov_b32_e32 v3, s7
888 ; VI-NEXT: s_addc_u32 s1, s1, 0
889 ; VI-NEXT: flat_load_dword v4, v[2:3]
890 ; VI-NEXT: v_mov_b32_e32 v3, s1
891 ; VI-NEXT: v_mov_b32_e32 v2, s0
892 ; VI-NEXT: flat_load_dword v2, v[2:3]
893 ; VI-NEXT: v_mov_b32_e32 v0, s4
894 ; VI-NEXT: v_mov_b32_e32 v1, s5
895 ; VI-NEXT: s_waitcnt vmcnt(0)
896 ; VI-NEXT: v_rcp_f32_e32 v3, v2
897 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3
898 ; VI-NEXT: v_trunc_f32_e32 v3, v3
899 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
900 ; VI-NEXT: flat_store_dword v[0:1], v2
903 ; GFX9-LABEL: unsafe_frem_f32:
905 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
906 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
907 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
908 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
909 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
910 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
911 ; GFX9-NEXT: s_waitcnt vmcnt(0)
912 ; GFX9-NEXT: v_rcp_f32_e32 v3, v2
913 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
914 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
915 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
916 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
917 ; GFX9-NEXT: s_endpgm
919 ; GFX10-LABEL: unsafe_frem_f32:
921 ; GFX10-NEXT: s_clause 0x1
922 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
923 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
924 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
925 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
926 ; GFX10-NEXT: s_clause 0x1
927 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
928 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
929 ; GFX10-NEXT: s_waitcnt vmcnt(0)
930 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
931 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
932 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
933 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
934 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
935 ; GFX10-NEXT: s_endpgm
937 ; GFX11-LABEL: unsafe_frem_f32:
939 ; GFX11-NEXT: s_clause 0x1
940 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
941 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
942 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
943 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
944 ; GFX11-NEXT: s_clause 0x1
945 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
946 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
947 ; GFX11-NEXT: s_waitcnt vmcnt(0)
948 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2
949 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
950 ; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3
951 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
952 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
953 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1
954 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
955 ; GFX11-NEXT: s_nop 0
956 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
957 ; GFX11-NEXT: s_endpgm
958 ptr addrspace(1) %in2) #1 {
; The divisor is read from float index 4 of %in2, which is the 16-byte offset
; the assertions above expect on the second load.
959 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
960 %r0 = load float, ptr addrspace(1) %in1, align 4
961 %r1 = load float, ptr addrspace(1) %gep2, align 4
; `afn` (approximate functions) is the only fast-math flag on this frem.
962 %r2 = frem afn float %r0, %r1
963 store float %r2, ptr addrspace(1) %out, align 4
; frem_f64: kernel computing a plain `frem double` (no fast-math flags) of
; *%in1 by *%in2 and storing the result to %out.
; Expected lowering per the assertions below:
;   * quotient q via the scaled division sequence (v_div_scale_f64, v_rcp_f64
;     refined by v_fma_f64 steps, v_div_fmas_f64, v_div_fixup_f64);
;   * truncation of q with v_trunc_f64 for every run line except the first one,
;     whose checks instead expand the truncation by extracting the exponent
;     with s_bfe_u32, shifting a fraction mask and selecting with v_cndmask_b32;
;   * result = fma(-trunc(q), y, x) via the final v_fma_f64.
; The checks for the first run line also rebuild vcc with v_cmp_eq_u32 and
; s_xor_b64 rather than taking it from v_div_scale_f64.
; NOTE(review): presumably a hardware workaround for that target -- confirm.
967 define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
968 ; SI-LABEL: frem_f64:
970 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
971 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
972 ; SI-NEXT: s_mov_b32 s7, 0xf000
973 ; SI-NEXT: s_mov_b32 s6, -1
974 ; SI-NEXT: s_waitcnt lgkmcnt(0)
975 ; SI-NEXT: s_mov_b32 s4, s8
976 ; SI-NEXT: s_mov_b32 s5, s9
977 ; SI-NEXT: s_mov_b32 s8, s10
978 ; SI-NEXT: s_mov_b32 s9, s11
979 ; SI-NEXT: s_mov_b32 s10, s6
980 ; SI-NEXT: s_mov_b32 s11, s7
981 ; SI-NEXT: s_mov_b32 s2, s6
982 ; SI-NEXT: s_mov_b32 s3, s7
983 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
984 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
985 ; SI-NEXT: s_waitcnt vmcnt(0)
986 ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
987 ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
988 ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
989 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
990 ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
991 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
992 ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
993 ; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
994 ; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
995 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
996 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9
997 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
999 ; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
1000 ; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1001 ; SI-NEXT: v_readfirstlane_b32 s2, v5
1002 ; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014
1003 ; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01
1004 ; SI-NEXT: s_mov_b32 s1, 0xfffff
1005 ; SI-NEXT: s_mov_b32 s0, s6
1006 ; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
1007 ; SI-NEXT: v_not_b32_e32 v6, s0
1008 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1009 ; SI-NEXT: v_not_b32_e32 v7, s1
1010 ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1011 ; SI-NEXT: s_and_b32 s0, s2, 0x80000000
1012 ; SI-NEXT: s_cmp_lt_i32 s3, 0
1013 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1014 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1015 ; SI-NEXT: v_mov_b32_e32 v7, s0
1016 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1017 ; SI-NEXT: s_cmp_gt_i32 s3, 51
1018 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1019 ; SI-NEXT: v_mov_b32_e32 v7, s2
1020 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1021 ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1022 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1023 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1026 ; CI-LABEL: frem_f64:
1028 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1029 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1030 ; CI-NEXT: s_mov_b32 s11, 0xf000
1031 ; CI-NEXT: s_mov_b32 s10, -1
1032 ; CI-NEXT: s_mov_b32 s2, s10
1033 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1034 ; CI-NEXT: s_mov_b32 s8, s4
1035 ; CI-NEXT: s_mov_b32 s9, s5
1036 ; CI-NEXT: s_mov_b32 s4, s6
1037 ; CI-NEXT: s_mov_b32 s5, s7
1038 ; CI-NEXT: s_mov_b32 s6, s10
1039 ; CI-NEXT: s_mov_b32 s7, s11
1040 ; CI-NEXT: s_mov_b32 s3, s11
1041 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1042 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1043 ; CI-NEXT: s_waitcnt vmcnt(0)
1044 ; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1045 ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1046 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1047 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1048 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1049 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1050 ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1051 ; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1052 ; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1054 ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1055 ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1056 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1057 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1058 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1061 ; VI-LABEL: frem_f64:
1063 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1064 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1065 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1066 ; VI-NEXT: v_mov_b32_e32 v2, s6
1067 ; VI-NEXT: v_mov_b32_e32 v3, s7
1068 ; VI-NEXT: v_mov_b32_e32 v4, s0
1069 ; VI-NEXT: v_mov_b32_e32 v5, s1
1070 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1071 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1072 ; VI-NEXT: v_mov_b32_e32 v0, s4
1073 ; VI-NEXT: v_mov_b32_e32 v1, s5
1074 ; VI-NEXT: s_waitcnt vmcnt(0)
1075 ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
1076 ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
1077 ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1078 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1079 ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1080 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1081 ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
1082 ; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
1083 ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
1085 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
1086 ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
1087 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1088 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1089 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1092 ; GFX9-LABEL: frem_f64:
1094 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1095 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1096 ; GFX9-NEXT: v_mov_b32_e32 v12, 0
1097 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1098 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7]
1099 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3]
1100 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1101 ; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1102 ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1103 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1104 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1105 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1106 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1107 ; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1108 ; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1109 ; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1110 ; GFX9-NEXT: s_nop 1
1111 ; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1112 ; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1113 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1114 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1115 ; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5]
1116 ; GFX9-NEXT: s_endpgm
1118 ; GFX10-LABEL: frem_f64:
1120 ; GFX10-NEXT: s_clause 0x1
1121 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1122 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1123 ; GFX10-NEXT: v_mov_b32_e32 v12, 0
1124 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1125 ; GFX10-NEXT: s_clause 0x1
1126 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7]
1127 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3]
1128 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1129 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
1130 ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1131 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1132 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1133 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1134 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1135 ; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1136 ; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1137 ; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1138 ; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1139 ; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1140 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1141 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1142 ; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5]
1143 ; GFX10-NEXT: s_endpgm
1145 ; GFX11-LABEL: frem_f64:
1147 ; GFX11-NEXT: s_clause 0x1
1148 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1149 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1150 ; GFX11-NEXT: v_mov_b32_e32 v12, 0
1151 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1152 ; GFX11-NEXT: s_clause 0x1
1153 ; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7]
1154 ; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1]
1155 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1156 ; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
1157 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1158 ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1159 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1160 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1161 ; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1162 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1163 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1164 ; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1165 ; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1166 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1167 ; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1168 ; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1169 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1170 ; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1171 ; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1172 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1173 ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1174 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1175 ; GFX11-NEXT: global_store_b64 v12, v[0:1], s[4:5]
1176 ; GFX11-NEXT: s_nop 0
1177 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1178 ; GFX11-NEXT: s_endpgm
1179 ptr addrspace(1) %in2) #0 {
; Both operands are loaded directly from the kernel arguments (no offset).
1180 %r0 = load double, ptr addrspace(1) %in1, align 8
1181 %r1 = load double, ptr addrspace(1) %in2, align 8
; No fast-math flags here, hence the full division sequence in the assertions.
1182 %r2 = frem double %r0, %r1
1183 store double %r2, ptr addrspace(1) %out, align 8
; fast_frem_f64: kernel computing `frem fast double` of *%in1 by *%in2 and
; storing the result to %out.
; With the `fast` flag the assertions expect no v_div_scale / v_div_fmas /
; v_div_fixup instructions. Instead the quotient is built as:
;   r = v_rcp_f64(y), refined by two rounds of two v_fma_f64 each;
;   q = x * r (v_mul_f64), then corrected once with fma(fma(-y, q, x), r, q).
; q is then truncated (v_trunc_f64, or for the first run line the manual
; exponent/mask expansion using s_bfe_u32 and v_cndmask_b32) and the result is
; fma(-trunc(q), y, x).
1187 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1188 ; SI-LABEL: fast_frem_f64:
1190 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1191 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1192 ; SI-NEXT: s_mov_b32 s3, 0xf000
1193 ; SI-NEXT: s_mov_b32 s2, -1
1194 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1195 ; SI-NEXT: s_mov_b32 s0, s4
1196 ; SI-NEXT: s_mov_b32 s1, s5
1197 ; SI-NEXT: s_mov_b32 s4, s6
1198 ; SI-NEXT: s_mov_b32 s5, s7
1199 ; SI-NEXT: s_mov_b32 s6, s2
1200 ; SI-NEXT: s_mov_b32 s7, s3
1201 ; SI-NEXT: s_mov_b32 s10, s2
1202 ; SI-NEXT: s_mov_b32 s11, s3
1203 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1204 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1205 ; SI-NEXT: s_waitcnt vmcnt(0)
1206 ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1207 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1208 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1209 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1210 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1211 ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1212 ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1213 ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1214 ; SI-NEXT: v_readfirstlane_b32 s6, v5
1215 ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
1216 ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
1217 ; SI-NEXT: s_mov_b32 s5, 0xfffff
1218 ; SI-NEXT: s_mov_b32 s4, s2
1219 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
1220 ; SI-NEXT: v_not_b32_e32 v6, s4
1221 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1222 ; SI-NEXT: v_not_b32_e32 v7, s5
1223 ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1224 ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
1225 ; SI-NEXT: s_cmp_lt_i32 s7, 0
1226 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1227 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1228 ; SI-NEXT: v_mov_b32_e32 v7, s4
1229 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1230 ; SI-NEXT: s_cmp_gt_i32 s7, 51
1231 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1232 ; SI-NEXT: v_mov_b32_e32 v7, s6
1233 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1234 ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1235 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1236 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1239 ; CI-LABEL: fast_frem_f64:
1241 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1242 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1243 ; CI-NEXT: s_mov_b32 s11, 0xf000
1244 ; CI-NEXT: s_mov_b32 s10, -1
1245 ; CI-NEXT: s_mov_b32 s2, s10
1246 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1247 ; CI-NEXT: s_mov_b32 s8, s4
1248 ; CI-NEXT: s_mov_b32 s9, s5
1249 ; CI-NEXT: s_mov_b32 s4, s6
1250 ; CI-NEXT: s_mov_b32 s5, s7
1251 ; CI-NEXT: s_mov_b32 s6, s10
1252 ; CI-NEXT: s_mov_b32 s7, s11
1253 ; CI-NEXT: s_mov_b32 s3, s11
1254 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1255 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1256 ; CI-NEXT: s_waitcnt vmcnt(0)
1257 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1258 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1259 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1260 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1261 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1262 ; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1263 ; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1264 ; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1265 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1266 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1267 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1270 ; VI-LABEL: fast_frem_f64:
1272 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1273 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1274 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1275 ; VI-NEXT: v_mov_b32_e32 v2, s6
1276 ; VI-NEXT: v_mov_b32_e32 v3, s7
1277 ; VI-NEXT: v_mov_b32_e32 v4, s0
1278 ; VI-NEXT: v_mov_b32_e32 v5, s1
1279 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1280 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1281 ; VI-NEXT: v_mov_b32_e32 v0, s4
1282 ; VI-NEXT: v_mov_b32_e32 v1, s5
1283 ; VI-NEXT: s_waitcnt vmcnt(0)
1284 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1285 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1286 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1287 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1288 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1289 ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
1290 ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1291 ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1292 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1293 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1294 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1297 ; GFX9-LABEL: fast_frem_f64:
1299 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1300 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1301 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1303 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1304 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1305 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1306 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1307 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1308 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1309 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1310 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1311 ; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1312 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1313 ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1314 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1315 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1316 ; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1317 ; GFX9-NEXT: s_endpgm
1319 ; GFX10-LABEL: fast_frem_f64:
1321 ; GFX10-NEXT: s_clause 0x1
1322 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1323 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1324 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
1325 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1326 ; GFX10-NEXT: s_clause 0x1
1327 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1328 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1329 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1330 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1331 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1332 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1333 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1334 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1335 ; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1336 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1337 ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1338 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1339 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1340 ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1341 ; GFX10-NEXT: s_endpgm
1343 ; GFX11-LABEL: fast_frem_f64:
1345 ; GFX11-NEXT: s_clause 0x1
1346 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1347 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1348 ; GFX11-NEXT: v_mov_b32_e32 v10, 0
1349 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1350 ; GFX11-NEXT: s_clause 0x1
1351 ; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
1352 ; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
1353 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1354 ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1355 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1356 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1357 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1358 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1359 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1360 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1361 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1362 ; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1363 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1364 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1365 ; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1366 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1367 ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1368 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1369 ; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5]
1370 ; GFX11-NEXT: s_nop 0
1371 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1372 ; GFX11-NEXT: s_endpgm
1373 ptr addrspace(1) %in2) #0 {
; Both operands are loaded directly from the kernel arguments (no offset).
1374 %r0 = load double, ptr addrspace(1) %in1, align 8
1375 %r1 = load double, ptr addrspace(1) %in2, align 8
; The `fast` flag on the instruction itself is what this test exercises.
1376 %r2 = frem fast double %r0, %r1
1377 store double %r2, ptr addrspace(1) %out, align 8
; unsafe_frem_f64: kernel computing `frem afn double` of *%in1 by *%in2 and
; storing the result to %out.
; The assertions expect the reciprocal-based expansion with no v_div_scale /
; v_div_fmas / v_div_fixup instructions:
;   r = v_rcp_f64(y), refined by two rounds of two v_fma_f64 each;
;   q = x * r (v_mul_f64), then corrected once with fma(fma(-y, q, x), r, q);
;   result = fma(-trunc(q), y, x).
; Truncation is v_trunc_f64 everywhere except the first run line, whose checks
; expand it manually with s_bfe_u32, a shifted mask and v_cndmask_b32.
; NOTE(review): attribute group #1 is defined outside this excerpt; presumably
; it enables unsafe FP math -- confirm against the attributes at end of file.
1381 define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1382 ; SI-LABEL: unsafe_frem_f64:
1384 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1385 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1386 ; SI-NEXT: s_mov_b32 s3, 0xf000
1387 ; SI-NEXT: s_mov_b32 s2, -1
1388 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1389 ; SI-NEXT: s_mov_b32 s0, s4
1390 ; SI-NEXT: s_mov_b32 s1, s5
1391 ; SI-NEXT: s_mov_b32 s4, s6
1392 ; SI-NEXT: s_mov_b32 s5, s7
1393 ; SI-NEXT: s_mov_b32 s6, s2
1394 ; SI-NEXT: s_mov_b32 s7, s3
1395 ; SI-NEXT: s_mov_b32 s10, s2
1396 ; SI-NEXT: s_mov_b32 s11, s3
1397 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1398 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1399 ; SI-NEXT: s_waitcnt vmcnt(0)
1400 ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1401 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1402 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1403 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1404 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1405 ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1406 ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1407 ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1408 ; SI-NEXT: v_readfirstlane_b32 s6, v5
1409 ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
1410 ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
1411 ; SI-NEXT: s_mov_b32 s5, 0xfffff
1412 ; SI-NEXT: s_mov_b32 s4, s2
1413 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
1414 ; SI-NEXT: v_not_b32_e32 v6, s4
1415 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1416 ; SI-NEXT: v_not_b32_e32 v7, s5
1417 ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1418 ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
1419 ; SI-NEXT: s_cmp_lt_i32 s7, 0
1420 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1421 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1422 ; SI-NEXT: v_mov_b32_e32 v7, s4
1423 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1424 ; SI-NEXT: s_cmp_gt_i32 s7, 51
1425 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1426 ; SI-NEXT: v_mov_b32_e32 v7, s6
1427 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1428 ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1429 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1430 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1433 ; CI-LABEL: unsafe_frem_f64:
1435 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1436 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1437 ; CI-NEXT: s_mov_b32 s11, 0xf000
1438 ; CI-NEXT: s_mov_b32 s10, -1
1439 ; CI-NEXT: s_mov_b32 s2, s10
1440 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1441 ; CI-NEXT: s_mov_b32 s8, s4
1442 ; CI-NEXT: s_mov_b32 s9, s5
1443 ; CI-NEXT: s_mov_b32 s4, s6
1444 ; CI-NEXT: s_mov_b32 s5, s7
1445 ; CI-NEXT: s_mov_b32 s6, s10
1446 ; CI-NEXT: s_mov_b32 s7, s11
1447 ; CI-NEXT: s_mov_b32 s3, s11
1448 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1449 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1450 ; CI-NEXT: s_waitcnt vmcnt(0)
1451 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1452 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1453 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1454 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1455 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1456 ; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1457 ; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1458 ; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1459 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1460 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1461 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1464 ; VI-LABEL: unsafe_frem_f64:
1466 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1467 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1468 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1469 ; VI-NEXT: v_mov_b32_e32 v2, s6
1470 ; VI-NEXT: v_mov_b32_e32 v3, s7
1471 ; VI-NEXT: v_mov_b32_e32 v4, s0
1472 ; VI-NEXT: v_mov_b32_e32 v5, s1
1473 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1474 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1475 ; VI-NEXT: v_mov_b32_e32 v0, s4
1476 ; VI-NEXT: v_mov_b32_e32 v1, s5
1477 ; VI-NEXT: s_waitcnt vmcnt(0)
1478 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1479 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1480 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1481 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1482 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1483 ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
1484 ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1485 ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1486 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1487 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1488 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1491 ; GFX9-LABEL: unsafe_frem_f64:
1493 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1494 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1495 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1496 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1497 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1498 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1499 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1500 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1501 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1502 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1503 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1504 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1505 ; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1506 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1507 ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1508 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1509 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1510 ; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1511 ; GFX9-NEXT: s_endpgm
1513 ; GFX10-LABEL: unsafe_frem_f64:
1515 ; GFX10-NEXT: s_clause 0x1
1516 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1517 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1518 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
1519 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1520 ; GFX10-NEXT: s_clause 0x1
1521 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1522 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1523 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1524 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1525 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1526 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1527 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1528 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1529 ; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1530 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1531 ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1532 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1533 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1534 ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1535 ; GFX10-NEXT: s_endpgm
1537 ; GFX11-LABEL: unsafe_frem_f64:
1539 ; GFX11-NEXT: s_clause 0x1
1540 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1541 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1542 ; GFX11-NEXT: v_mov_b32_e32 v10, 0
1543 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1544 ; GFX11-NEXT: s_clause 0x1
1545 ; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
1546 ; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
1547 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1548 ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1549 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1550 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1551 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1552 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1553 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1554 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1555 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1556 ; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1557 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1558 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1559 ; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1560 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1561 ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1562 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1563 ; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5]
1564 ; GFX11-NEXT: s_nop 0
1565 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1566 ; GFX11-NEXT: s_endpgm
1567 ptr addrspace(1) %in2) #1 {
; Both operands are loaded directly from the kernel arguments (no offset).
1568 %r0 = load double, ptr addrspace(1) %in1, align 8
1569 %r1 = load double, ptr addrspace(1) %in2, align 8
; `afn` (approximate functions) is the only fast-math flag on this frem.
1570 %r2 = frem afn double %r0, %r1
1571 store double %r2, ptr addrspace(1) %out, align 8
; frem_v2f16: kernel that loads a <2 x half> from %in1 and a <2 x half> located
; four <2 x half> elements past %in2, computes their frem, and stores the
; result to %out.
;
; The SI/CI/VI/GFX9/GFX10/GFX11 assertions between the two halves of the
; signature pin the expected instruction sequence for each RUN line. They are
; autogenerated (see the NOTE at the top of the file); regenerate them instead
; of editing them by hand.
;
; Shape of the checked output, one lane at a time:
;   SI/CI:    lanes are converted to f32; quotient via v_div_scale_f32,
;             v_rcp_f32, v_div_fmas_f32 and v_div_fixup_f32; then v_trunc_f32
;             and v_fma_f32 form the remainder, which is converted back to f16.
;             The two halves are recombined with v_lshlrev_b32 + v_or_b32.
;   VI:       quotient estimate via v_rcp_f32 + v_mul_f32, then
;             v_div_fixup_f16, v_trunc_f16 and v_fma_f16.
;   GFX9:     quotient estimate via v_rcp_f32 + v_mad_mixlo_f16; result packed
;             with v_pack_b32_f16.
;   GFX10/11: same as GFX9 but with v_fma_mixlo_f16.
1575 define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1576 ; SI-LABEL: frem_v2f16:
1578 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1579 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1580 ; SI-NEXT: s_mov_b32 s3, 0xf000
1581 ; SI-NEXT: s_mov_b32 s2, -1
1582 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1583 ; SI-NEXT: s_mov_b32 s0, s4
1584 ; SI-NEXT: s_mov_b32 s1, s5
1585 ; SI-NEXT: s_mov_b32 s4, s6
1586 ; SI-NEXT: s_mov_b32 s5, s7
1587 ; SI-NEXT: s_mov_b32 s6, s2
1588 ; SI-NEXT: s_mov_b32 s7, s3
1589 ; SI-NEXT: s_mov_b32 s10, s2
1590 ; SI-NEXT: s_mov_b32 s11, s3
1591 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
1592 ; SI-NEXT: s_waitcnt vmcnt(0)
1593 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
1594 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1595 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1596 ; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16
1597 ; SI-NEXT: s_waitcnt vmcnt(0)
1598 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
1599 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1600 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1601 ; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
1602 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
1603 ; SI-NEXT: v_rcp_f32_e32 v6, v5
1604 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1605 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
1606 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6
1607 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6
1608 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
1609 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7
1610 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
1611 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1612 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1613 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
1614 ; SI-NEXT: v_trunc_f32_e32 v4, v4
1615 ; SI-NEXT: v_fma_f32 v0, -v4, v2, v0
1616 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1617 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1618 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1619 ; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
1620 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1
1621 ; SI-NEXT: v_rcp_f32_e32 v5, v4
1622 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1623 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
1624 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5
1625 ; SI-NEXT: v_mul_f32_e32 v6, v2, v5
1626 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v2
1627 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6
1628 ; SI-NEXT: v_fma_f32 v2, -v4, v6, v2
1629 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1630 ; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
1631 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
1632 ; SI-NEXT: v_trunc_f32_e32 v2, v2
1633 ; SI-NEXT: v_fma_f32 v1, -v2, v3, v1
1634 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1635 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1636 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1639 ; CI-LABEL: frem_v2f16:
1641 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1642 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1643 ; CI-NEXT: s_mov_b32 s3, 0xf000
1644 ; CI-NEXT: s_mov_b32 s2, -1
1645 ; CI-NEXT: s_mov_b32 s10, s2
1646 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1647 ; CI-NEXT: s_mov_b32 s0, s4
1648 ; CI-NEXT: s_mov_b32 s1, s5
1649 ; CI-NEXT: s_mov_b32 s4, s6
1650 ; CI-NEXT: s_mov_b32 s5, s7
1651 ; CI-NEXT: s_mov_b32 s6, s2
1652 ; CI-NEXT: s_mov_b32 s7, s3
1653 ; CI-NEXT: s_mov_b32 s11, s3
1654 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
1655 ; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16
1656 ; CI-NEXT: s_waitcnt vmcnt(1)
1657 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
1658 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1659 ; CI-NEXT: s_waitcnt vmcnt(0)
1660 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
1661 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1662 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1663 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
1664 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
1665 ; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
1666 ; CI-NEXT: v_rcp_f32_e32 v6, v5
1667 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1668 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
1669 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
1670 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6
1671 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
1672 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
1673 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
1674 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1675 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1676 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
1677 ; CI-NEXT: v_trunc_f32_e32 v4, v4
1678 ; CI-NEXT: v_fma_f32 v0, -v4, v2, v0
1679 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1
1680 ; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
1681 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1682 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1683 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1684 ; CI-NEXT: v_rcp_f32_e32 v5, v4
1685 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1686 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
1687 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
1688 ; CI-NEXT: v_mul_f32_e32 v6, v2, v5
1689 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v2
1690 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
1691 ; CI-NEXT: v_fma_f32 v2, -v4, v6, v2
1692 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1693 ; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
1694 ; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
1695 ; CI-NEXT: v_trunc_f32_e32 v2, v2
1696 ; CI-NEXT: v_fma_f32 v1, -v2, v3, v1
1697 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1698 ; CI-NEXT: v_or_b32_e32 v0, v1, v0
1699 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1702 ; VI-LABEL: frem_v2f16:
1704 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1705 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1706 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1707 ; VI-NEXT: v_mov_b32_e32 v2, s6
1708 ; VI-NEXT: s_add_u32 s0, s0, 16
1709 ; VI-NEXT: v_mov_b32_e32 v3, s7
1710 ; VI-NEXT: s_addc_u32 s1, s1, 0
1711 ; VI-NEXT: flat_load_dword v4, v[2:3]
1712 ; VI-NEXT: v_mov_b32_e32 v3, s1
1713 ; VI-NEXT: v_mov_b32_e32 v2, s0
1714 ; VI-NEXT: flat_load_dword v2, v[2:3]
1715 ; VI-NEXT: v_mov_b32_e32 v0, s4
1716 ; VI-NEXT: v_mov_b32_e32 v1, s5
1717 ; VI-NEXT: s_waitcnt vmcnt(1)
1718 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
1719 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v3
1720 ; VI-NEXT: s_waitcnt vmcnt(0)
1721 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
1722 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
1723 ; VI-NEXT: v_rcp_f32_e32 v7, v7
1724 ; VI-NEXT: v_mul_f32_e32 v5, v5, v7
1725 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
1726 ; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3
1727 ; VI-NEXT: v_trunc_f16_e32 v5, v5
1728 ; VI-NEXT: v_fma_f16 v3, -v5, v6, v3
1729 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
1730 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v4
1731 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1732 ; VI-NEXT: v_rcp_f32_e32 v6, v6
1733 ; VI-NEXT: v_mul_f32_e32 v5, v5, v6
1734 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
1735 ; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4
1736 ; VI-NEXT: v_trunc_f16_e32 v5, v5
1737 ; VI-NEXT: v_fma_f16 v2, -v5, v2, v4
1738 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1739 ; VI-NEXT: flat_store_dword v[0:1], v2
1742 ; GFX9-LABEL: frem_v2f16:
1744 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1745 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1746 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1747 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1748 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1749 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
1750 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1751 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
1752 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
1753 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
1754 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
1755 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
1756 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1
1757 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1758 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
1759 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4
1760 ; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
1761 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1762 ; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
1763 ; GFX9-NEXT: v_trunc_f16_e32 v4, v4
1764 ; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
1765 ; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
1766 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1767 ; GFX9-NEXT: s_endpgm
1769 ; GFX10-LABEL: frem_v2f16:
1771 ; GFX10-NEXT: s_clause 0x1
1772 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1773 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1774 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1775 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1776 ; GFX10-NEXT: s_clause 0x1
1777 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1778 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
1779 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1780 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
1781 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
1782 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
1783 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
1784 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
1785 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1
1786 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1787 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
1788 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
1789 ; GFX10-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
1790 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1791 ; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1
1792 ; GFX10-NEXT: v_trunc_f16_e32 v4, v4
1793 ; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1
1794 ; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
1795 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1796 ; GFX10-NEXT: s_endpgm
1798 ; GFX11-LABEL: frem_v2f16:
1800 ; GFX11-NEXT: s_clause 0x1
1801 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1802 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1803 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1804 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1805 ; GFX11-NEXT: s_clause 0x1
1806 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1807 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
1808 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1809 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
1810 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1811 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
1812 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1813 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
1814 ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
1815 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1816 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
1817 ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1
1818 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1819 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1820 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
1821 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4
1822 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1823 ; GFX11-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
1824 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1825 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1826 ; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1
1827 ; GFX11-NEXT: v_trunc_f16_e32 v4, v4
1828 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1829 ; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1
1830 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1
1831 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1832 ; GFX11-NEXT: s_nop 0
1833 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1834 ; GFX11-NEXT: s_endpgm
; IR body. %gep2 indexes four <2 x half> elements past %in2; the assertions
; above show this as a 16-byte offset on the second load.
1835 ptr addrspace(1) %in2) #0 {
1836 %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
1837 %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
1838 %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8
1839 %r2 = frem <2 x half> %r0, %r1
1840 store <2 x half> %r2, ptr addrspace(1) %out, align 8
; frem_v4f16: kernel that loads a <4 x half> from %in1 and a <4 x half> located
; four <4 x half> elements past %in2, computes their frem, and stores the
; result to %out.
;
; The SI/CI/VI/GFX9/GFX10/GFX11 assertions between the two halves of the
; signature pin the expected instruction sequence for each RUN line. They are
; autogenerated (see the NOTE at the top of the file); regenerate them instead
; of editing them by hand.
;
; Shape of the checked output: each of the four lanes is expanded separately,
; and the results are recombined into two 32-bit registers for a single
; 64-bit store.
;   SI/CI:    lanes are converted to f32; quotient via v_div_scale_f32,
;             v_rcp_f32, v_div_fmas_f32 and v_div_fixup_f32; then v_trunc_f32
;             and v_fma_f32 form the remainder, which is converted back to f16.
;             Halves are recombined with v_lshlrev_b32 + v_or_b32.
;   VI:       quotient estimate via v_rcp_f32 + v_mul_f32, then
;             v_div_fixup_f16, v_trunc_f16 and v_fma_f16.
;   GFX9:     quotient estimate via v_rcp_f32 + v_mad_mixlo_f16; halves packed
;             with v_pack_b32_f16.
;   GFX10/11: same as GFX9 but with v_fma_mixlo_f16.
1844 define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1845 ; SI-LABEL: frem_v4f16:
1847 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1848 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1849 ; SI-NEXT: s_mov_b32 s3, 0xf000
1850 ; SI-NEXT: s_mov_b32 s2, -1
1851 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1852 ; SI-NEXT: s_mov_b32 s0, s4
1853 ; SI-NEXT: s_mov_b32 s1, s5
1854 ; SI-NEXT: s_mov_b32 s4, s6
1855 ; SI-NEXT: s_mov_b32 s5, s7
1856 ; SI-NEXT: s_mov_b32 s6, s2
1857 ; SI-NEXT: s_mov_b32 s7, s3
1858 ; SI-NEXT: s_mov_b32 s10, s2
1859 ; SI-NEXT: s_mov_b32 s11, s3
1860 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1861 ; SI-NEXT: s_waitcnt vmcnt(0)
1862 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1863 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1864 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
1865 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
1866 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1867 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0
1868 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1869 ; SI-NEXT: s_waitcnt vmcnt(0)
1870 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0
1871 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1872 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1873 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
1874 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1875 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1876 ; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
1877 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
1878 ; SI-NEXT: v_rcp_f32_e32 v10, v9
1879 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1880 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
1881 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10
1882 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10
1883 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
1884 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11
1885 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
1886 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1887 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
1888 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
1889 ; SI-NEXT: v_trunc_f32_e32 v8, v8
1890 ; SI-NEXT: v_fma_f32 v1, -v8, v1, v5
1891 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1892 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1893 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1894 ; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4
1895 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4
1896 ; SI-NEXT: v_rcp_f32_e32 v9, v8
1897 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1898 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
1899 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9
1900 ; SI-NEXT: v_mul_f32_e32 v10, v5, v9
1901 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v5
1902 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10
1903 ; SI-NEXT: v_fma_f32 v5, -v8, v10, v5
1904 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1905 ; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
1906 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
1907 ; SI-NEXT: v_trunc_f32_e32 v5, v5
1908 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
1909 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
1910 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
1911 ; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
1912 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3
1913 ; SI-NEXT: v_rcp_f32_e32 v7, v5
1914 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1915 ; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
1916 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7
1917 ; SI-NEXT: v_mul_f32_e32 v8, v4, v7
1918 ; SI-NEXT: v_fma_f32 v9, -v5, v8, v4
1919 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8
1920 ; SI-NEXT: v_fma_f32 v4, -v5, v8, v4
1921 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1922 ; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
1923 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
1924 ; SI-NEXT: v_trunc_f32_e32 v4, v4
1925 ; SI-NEXT: v_fma_f32 v0, -v4, v0, v3
1926 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1927 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1928 ; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2
1929 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2
1930 ; SI-NEXT: v_rcp_f32_e32 v5, v4
1931 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1932 ; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
1933 ; SI-NEXT: v_fma_f32 v5, v7, v5, v5
1934 ; SI-NEXT: v_mul_f32_e32 v7, v3, v5
1935 ; SI-NEXT: v_fma_f32 v8, -v4, v7, v3
1936 ; SI-NEXT: v_fma_f32 v7, v8, v5, v7
1937 ; SI-NEXT: v_fma_f32 v3, -v4, v7, v3
1938 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1939 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
1940 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
1941 ; SI-NEXT: v_trunc_f32_e32 v3, v3
1942 ; SI-NEXT: v_fma_f32 v2, -v3, v6, v2
1943 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1944 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
1945 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1948 ; CI-LABEL: frem_v4f16:
1950 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1951 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1952 ; CI-NEXT: s_mov_b32 s3, 0xf000
1953 ; CI-NEXT: s_mov_b32 s2, -1
1954 ; CI-NEXT: s_mov_b32 s10, s2
1955 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1956 ; CI-NEXT: s_mov_b32 s0, s4
1957 ; CI-NEXT: s_mov_b32 s1, s5
1958 ; CI-NEXT: s_mov_b32 s4, s6
1959 ; CI-NEXT: s_mov_b32 s5, s7
1960 ; CI-NEXT: s_mov_b32 s6, s2
1961 ; CI-NEXT: s_mov_b32 s7, s3
1962 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1963 ; CI-NEXT: s_mov_b32 s11, s3
1964 ; CI-NEXT: s_waitcnt vmcnt(0)
1965 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0
1966 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1967 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v0
1968 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1969 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v1
1970 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v0
1971 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1972 ; CI-NEXT: s_waitcnt vmcnt(0)
1973 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
1974 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1975 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1976 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
1977 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1978 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1979 ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
1980 ; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
1981 ; CI-NEXT: v_rcp_f32_e32 v10, v9
1982 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1983 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
1984 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10
1985 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10
1986 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
1987 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11
1988 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
1989 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1990 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
1991 ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
1992 ; CI-NEXT: v_trunc_f32_e32 v8, v8
1993 ; CI-NEXT: v_fma_f32 v1, -v8, v1, v5
1994 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4
1995 ; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4
1996 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1997 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1998 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1999 ; CI-NEXT: v_rcp_f32_e32 v9, v8
2000 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2001 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
2002 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9
2003 ; CI-NEXT: v_mul_f32_e32 v10, v5, v9
2004 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v5
2005 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10
2006 ; CI-NEXT: v_fma_f32 v5, -v8, v10, v5
2007 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2008 ; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
2009 ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
2010 ; CI-NEXT: v_trunc_f32_e32 v5, v5
2011 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
2012 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3
2013 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
2014 ; CI-NEXT: v_or_b32_e32 v1, v4, v1
2015 ; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
2016 ; CI-NEXT: v_rcp_f32_e32 v7, v5
2017 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2018 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
2019 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
2020 ; CI-NEXT: v_mul_f32_e32 v8, v4, v7
2021 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v4
2022 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
2023 ; CI-NEXT: v_fma_f32 v4, -v5, v8, v4
2024 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2025 ; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
2026 ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
2027 ; CI-NEXT: v_trunc_f32_e32 v4, v4
2028 ; CI-NEXT: v_fma_f32 v0, -v4, v0, v3
2029 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2
2030 ; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2
2031 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2032 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2033 ; CI-NEXT: v_rcp_f32_e32 v5, v4
2034 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2035 ; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
2036 ; CI-NEXT: v_fma_f32 v5, v7, v5, v5
2037 ; CI-NEXT: v_mul_f32_e32 v7, v3, v5
2038 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v3
2039 ; CI-NEXT: v_fma_f32 v7, v8, v5, v7
2040 ; CI-NEXT: v_fma_f32 v3, -v4, v7, v3
2041 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2042 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
2043 ; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
2044 ; CI-NEXT: v_trunc_f32_e32 v3, v3
2045 ; CI-NEXT: v_fma_f32 v2, -v3, v6, v2
2046 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2047 ; CI-NEXT: v_or_b32_e32 v0, v2, v0
2048 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2051 ; VI-LABEL: frem_v4f16:
2053 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2054 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2055 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2056 ; VI-NEXT: v_mov_b32_e32 v2, s6
2057 ; VI-NEXT: s_add_u32 s0, s0, 32
2058 ; VI-NEXT: s_addc_u32 s1, s1, 0
2059 ; VI-NEXT: v_mov_b32_e32 v5, s1
2060 ; VI-NEXT: v_mov_b32_e32 v4, s0
2061 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
2062 ; VI-NEXT: v_mov_b32_e32 v3, s7
2063 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
2064 ; VI-NEXT: v_mov_b32_e32 v0, s4
2065 ; VI-NEXT: v_mov_b32_e32 v1, s5
2066 ; VI-NEXT: s_waitcnt vmcnt(1)
2067 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
2068 ; VI-NEXT: v_cvt_f32_f16_e32 v9, v8
2069 ; VI-NEXT: s_waitcnt vmcnt(0)
2070 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
2071 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
2072 ; VI-NEXT: v_rcp_f32_e32 v9, v9
2073 ; VI-NEXT: v_mul_f32_e32 v7, v7, v9
2074 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
2075 ; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6
2076 ; VI-NEXT: v_trunc_f16_e32 v7, v7
2077 ; VI-NEXT: v_fma_f16 v6, -v7, v8, v6
2078 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v5
2079 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
2080 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2081 ; VI-NEXT: v_rcp_f32_e32 v8, v8
2082 ; VI-NEXT: v_mul_f32_e32 v7, v7, v8
2083 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
2084 ; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3
2085 ; VI-NEXT: v_trunc_f16_e32 v7, v7
2086 ; VI-NEXT: v_fma_f16 v3, -v7, v5, v3
2087 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
2088 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
2089 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
2090 ; VI-NEXT: v_or_b32_e32 v3, v3, v6
2091 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
2092 ; VI-NEXT: v_rcp_f32_e32 v8, v8
2093 ; VI-NEXT: v_mul_f32_e32 v6, v6, v8
2094 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
2095 ; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5
2096 ; VI-NEXT: v_trunc_f16_e32 v6, v6
2097 ; VI-NEXT: v_fma_f16 v5, -v6, v7, v5
2098 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v4
2099 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
2100 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2101 ; VI-NEXT: v_rcp_f32_e32 v7, v7
2102 ; VI-NEXT: v_mul_f32_e32 v6, v6, v7
2103 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
2104 ; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2
2105 ; VI-NEXT: v_trunc_f16_e32 v6, v6
2106 ; VI-NEXT: v_fma_f16 v2, -v6, v4, v2
2107 ; VI-NEXT: v_or_b32_e32 v2, v2, v5
2108 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2111 ; GFX9-LABEL: frem_v4f16:
2113 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2114 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2115 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2116 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2117 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2118 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2119 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2120 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3
2121 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
2122 ; GFX9-NEXT: v_mad_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
2123 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1
2124 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5
2125 ; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1
2126 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2127 ; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3
2128 ; GFX9-NEXT: v_rcp_f32_e32 v6, v6
2129 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2130 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2131 ; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
2132 ; GFX9-NEXT: v_trunc_f16_e32 v6, v6
2133 ; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
2134 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
2135 ; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
2136 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
2137 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
2138 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
2139 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
2140 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
2141 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2142 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
2143 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
2144 ; GFX9-NEXT: v_mad_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2145 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2146 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
2147 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5
2148 ; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
2149 ; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
2150 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2151 ; GFX9-NEXT: s_endpgm
2153 ; GFX10-LABEL: frem_v4f16:
2155 ; GFX10-NEXT: s_clause 0x1
2156 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2157 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2158 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
2159 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2160 ; GFX10-NEXT: s_clause 0x1
2161 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2162 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2163 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2164 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3
2165 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5
2166 ; GFX10-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
2167 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
2168 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5
2169 ; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1
2170 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2171 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
2172 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6
2173 ; GFX10-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2174 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2175 ; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1
2176 ; GFX10-NEXT: v_trunc_f16_e32 v6, v6
2177 ; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1
2178 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
2179 ; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1
2180 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
2181 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
2182 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
2183 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
2184 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0
2185 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2186 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
2187 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5
2188 ; GFX10-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2189 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2190 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
2191 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5
2192 ; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0
2193 ; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
2194 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2195 ; GFX10-NEXT: s_endpgm
2197 ; GFX11-LABEL: frem_v4f16:
2199 ; GFX11-NEXT: s_clause 0x1
2200 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2201 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2202 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2203 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2204 ; GFX11-NEXT: s_clause 0x1
2205 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2206 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
2207 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2208 ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3
2209 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2210 ; GFX11-NEXT: v_rcp_f32_e32 v5, v5
2211 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2212 ; GFX11-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
2213 ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1
2214 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2215 ; GFX11-NEXT: v_trunc_f16_e32 v5, v5
2216 ; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1
2217 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2218 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2219 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3
2220 ; GFX11-NEXT: v_rcp_f32_e32 v6, v6
2221 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2222 ; GFX11-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2223 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2224 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2225 ; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1
2226 ; GFX11-NEXT: v_trunc_f16_e32 v6, v6
2227 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2228 ; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1
2229 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
2230 ; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1
2231 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2232 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
2233 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2234 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
2235 ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0
2236 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2237 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
2238 ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0
2239 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2240 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2241 ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2
2242 ; GFX11-NEXT: v_rcp_f32_e32 v5, v5
2243 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2244 ; GFX11-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2245 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2246 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2247 ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0
2248 ; GFX11-NEXT: v_trunc_f16_e32 v5, v5
2249 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2250 ; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0
2251 ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0
2252 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
2253 ; GFX11-NEXT: s_nop 0
2254 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2255 ; GFX11-NEXT: s_endpgm
; IR body. %gep2 indexes four <4 x half> elements past %in2; the assertions
; above show this as a 32-byte offset on the second load.
2256 ptr addrspace(1) %in2) #0 {
2257 %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
2258 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
2259 %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16
2260 %r2 = frem <4 x half> %r0, %r1
2261 store <4 x half> %r2, ptr addrspace(1) %out, align 16
; Checks the code generated for a <2 x float> frem in an amdgpu_kernel.
; The kernel loads one vector from %in1 and one from element 4 of %in2
; (a 32-byte offset, visible as "offset 32" on the second load, or as an
; s_add_u32 of 32 in the flat-addressing variant), and stores the
; remainder to %out.
; In the expected output each lane is computed as fma(-trunc(x / y), y, x),
; with x / y expanded into a div_scale, rcp, fma refinement, div_fmas and
; div_fixup sequence; lane 1 is processed before lane 0.
; Around the fma refinement the expected code changes a mode setting
; (s_setreg_imm32_b32 on HW_REG_MODE, or s_denorm_mode on the GFX10/GFX11
; targets) and restores it afterwards; presumably this enables f32
; denormals for the refinement -- TODO confirm against the backend.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script rather than editing by hand.
2265 define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
2266 ; SI-LABEL: frem_v2f32:
2268 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2269 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2270 ; SI-NEXT: s_mov_b32 s3, 0xf000
2271 ; SI-NEXT: s_mov_b32 s2, -1
2272 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2273 ; SI-NEXT: s_mov_b32 s0, s4
2274 ; SI-NEXT: s_mov_b32 s1, s5
2275 ; SI-NEXT: s_mov_b32 s4, s6
2276 ; SI-NEXT: s_mov_b32 s5, s7
2277 ; SI-NEXT: s_mov_b32 s6, s2
2278 ; SI-NEXT: s_mov_b32 s7, s3
2279 ; SI-NEXT: s_mov_b32 s10, s2
2280 ; SI-NEXT: s_mov_b32 s11, s3
2281 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2282 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2283 ; SI-NEXT: s_waitcnt vmcnt(0)
2284 ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
2285 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
2286 ; SI-NEXT: v_rcp_f32_e32 v6, v5
2287 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2288 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2289 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6
2290 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6
2291 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
2292 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7
2293 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
2294 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2295 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
2296 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
2297 ; SI-NEXT: v_trunc_f32_e32 v4, v4
2298 ; SI-NEXT: v_fma_f32 v1, -v4, v3, v1
2299 ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2300 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
2301 ; SI-NEXT: v_rcp_f32_e32 v5, v4
2302 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2303 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
2304 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5
2305 ; SI-NEXT: v_mul_f32_e32 v6, v3, v5
2306 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v3
2307 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6
2308 ; SI-NEXT: v_fma_f32 v3, -v4, v6, v3
2309 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2310 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
2311 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2312 ; SI-NEXT: v_trunc_f32_e32 v3, v3
2313 ; SI-NEXT: v_fma_f32 v0, -v3, v2, v0
2314 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2317 ; CI-LABEL: frem_v2f32:
2319 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2320 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2321 ; CI-NEXT: s_mov_b32 s3, 0xf000
2322 ; CI-NEXT: s_mov_b32 s2, -1
2323 ; CI-NEXT: s_mov_b32 s10, s2
2324 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2325 ; CI-NEXT: s_mov_b32 s0, s4
2326 ; CI-NEXT: s_mov_b32 s1, s5
2327 ; CI-NEXT: s_mov_b32 s4, s6
2328 ; CI-NEXT: s_mov_b32 s5, s7
2329 ; CI-NEXT: s_mov_b32 s6, s2
2330 ; CI-NEXT: s_mov_b32 s7, s3
2331 ; CI-NEXT: s_mov_b32 s11, s3
2332 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2333 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2334 ; CI-NEXT: s_waitcnt vmcnt(0)
2335 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
2336 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
2337 ; CI-NEXT: v_rcp_f32_e32 v6, v5
2338 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2339 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2340 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
2341 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6
2342 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
2343 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
2344 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
2345 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2346 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
2347 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
2348 ; CI-NEXT: v_trunc_f32_e32 v4, v4
2349 ; CI-NEXT: v_fma_f32 v1, -v4, v3, v1
2350 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
2351 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2352 ; CI-NEXT: v_rcp_f32_e32 v5, v4
2353 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2354 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
2355 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
2356 ; CI-NEXT: v_mul_f32_e32 v6, v3, v5
2357 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v3
2358 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
2359 ; CI-NEXT: v_fma_f32 v3, -v4, v6, v3
2360 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2361 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
2362 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2363 ; CI-NEXT: v_trunc_f32_e32 v3, v3
2364 ; CI-NEXT: v_fma_f32 v0, -v3, v2, v0
2365 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2368 ; VI-LABEL: frem_v2f32:
2370 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2371 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2372 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2373 ; VI-NEXT: v_mov_b32_e32 v2, s6
2374 ; VI-NEXT: s_add_u32 s0, s0, 32
2375 ; VI-NEXT: s_addc_u32 s1, s1, 0
2376 ; VI-NEXT: v_mov_b32_e32 v5, s1
2377 ; VI-NEXT: v_mov_b32_e32 v3, s7
2378 ; VI-NEXT: v_mov_b32_e32 v4, s0
2379 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
2380 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
2381 ; VI-NEXT: v_mov_b32_e32 v0, s4
2382 ; VI-NEXT: v_mov_b32_e32 v1, s5
2383 ; VI-NEXT: s_waitcnt vmcnt(0)
2384 ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3
2385 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3
2386 ; VI-NEXT: v_rcp_f32_e32 v8, v7
2387 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2388 ; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
2389 ; VI-NEXT: v_fma_f32 v8, v9, v8, v8
2390 ; VI-NEXT: v_mul_f32_e32 v9, v6, v8
2391 ; VI-NEXT: v_fma_f32 v10, -v7, v9, v6
2392 ; VI-NEXT: v_fma_f32 v9, v10, v8, v9
2393 ; VI-NEXT: v_fma_f32 v6, -v7, v9, v6
2394 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2395 ; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
2396 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3
2397 ; VI-NEXT: v_trunc_f32_e32 v6, v6
2398 ; VI-NEXT: v_fma_f32 v3, -v6, v5, v3
2399 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2
2400 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2
2401 ; VI-NEXT: v_rcp_f32_e32 v7, v6
2402 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2403 ; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2404 ; VI-NEXT: v_fma_f32 v7, v8, v7, v7
2405 ; VI-NEXT: v_mul_f32_e32 v8, v5, v7
2406 ; VI-NEXT: v_fma_f32 v9, -v6, v8, v5
2407 ; VI-NEXT: v_fma_f32 v8, v9, v7, v8
2408 ; VI-NEXT: v_fma_f32 v5, -v6, v8, v5
2409 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2410 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2411 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2
2412 ; VI-NEXT: v_trunc_f32_e32 v5, v5
2413 ; VI-NEXT: v_fma_f32 v2, -v5, v4, v2
2414 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2417 ; GFX9-LABEL: frem_v2f32:
2419 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2420 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2421 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2422 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2423 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2424 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2425 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2426 ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1
2427 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
2428 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6
2429 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2430 ; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2431 ; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7
2432 ; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7
2433 ; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5
2434 ; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8
2435 ; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5
2436 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2437 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2438 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2439 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
2440 ; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1
2441 ; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0
2442 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2443 ; GFX9-NEXT: v_rcp_f32_e32 v6, v5
2444 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2445 ; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2446 ; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6
2447 ; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6
2448 ; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3
2449 ; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7
2450 ; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3
2451 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2452 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2453 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2454 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
2455 ; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0
2456 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2457 ; GFX9-NEXT: s_endpgm
2459 ; GFX10-LABEL: frem_v2f32:
2461 ; GFX10-NEXT: s_clause 0x1
2462 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2463 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2464 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
2465 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2466 ; GFX10-NEXT: s_clause 0x1
2467 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2468 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2469 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2470 ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1
2471 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2472 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6
2473 ; GFX10-NEXT: s_denorm_mode 15
2474 ; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2475 ; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7
2476 ; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7
2477 ; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5
2478 ; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7
2479 ; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5
2480 ; GFX10-NEXT: s_denorm_mode 12
2481 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2482 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2483 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
2484 ; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1
2485 ; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0
2486 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2487 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5
2488 ; GFX10-NEXT: s_denorm_mode 15
2489 ; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2490 ; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6
2491 ; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6
2492 ; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3
2493 ; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6
2494 ; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3
2495 ; GFX10-NEXT: s_denorm_mode 12
2496 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2497 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2498 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
2499 ; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0
2500 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2501 ; GFX10-NEXT: s_endpgm
2503 ; GFX11-LABEL: frem_v2f32:
2505 ; GFX11-NEXT: s_clause 0x1
2506 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2507 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2508 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2509 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2510 ; GFX11-NEXT: s_clause 0x1
2511 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2512 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
2513 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2514 ; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
2515 ; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2516 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2517 ; GFX11-NEXT: v_rcp_f32_e32 v7, v6
2518 ; GFX11-NEXT: s_denorm_mode 15
2519 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2520 ; GFX11-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2521 ; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v7
2522 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2523 ; GFX11-NEXT: v_mul_f32_e32 v8, v5, v7
2524 ; GFX11-NEXT: v_fma_f32 v9, -v6, v8, v5
2525 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2526 ; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v7
2527 ; GFX11-NEXT: v_fma_f32 v5, -v6, v8, v5
2528 ; GFX11-NEXT: s_denorm_mode 12
2529 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2530 ; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2531 ; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2532 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2533 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
2534 ; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1
2535 ; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0
2536 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2537 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2538 ; GFX11-NEXT: v_rcp_f32_e32 v6, v5
2539 ; GFX11-NEXT: s_denorm_mode 15
2540 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2541 ; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2542 ; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v6
2543 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2544 ; GFX11-NEXT: v_mul_f32_e32 v7, v3, v6
2545 ; GFX11-NEXT: v_fma_f32 v8, -v5, v7, v3
2546 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2547 ; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v6
2548 ; GFX11-NEXT: v_fma_f32 v3, -v5, v7, v3
2549 ; GFX11-NEXT: s_denorm_mode 12
2550 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2551 ; GFX11-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2552 ; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2553 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2554 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
2555 ; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0
2556 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
2557 ; GFX11-NEXT: s_nop 0
2558 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2559 ; GFX11-NEXT: s_endpgm
2560 ptr addrspace(1) %in2) #0 {
; Element index 4 of <2 x float> (8 bytes each), i.e. 32 bytes past %in2.
2561 %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
; %r0 is the dividend vector, %r1 the divisor vector.
2562 %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
2563 %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8
; Per-lane floating-point remainder of %r0 by %r1; this is the operation under test.
2564 %r2 = frem <2 x float> %r0, %r1
2565 store <2 x float> %r2, ptr addrspace(1) %out, align 8
2569 define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
2570 ; SI-LABEL: frem_v4f32:
2572 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2573 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2574 ; SI-NEXT: s_mov_b32 s3, 0xf000
2575 ; SI-NEXT: s_mov_b32 s2, -1
2576 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2577 ; SI-NEXT: s_mov_b32 s0, s4
2578 ; SI-NEXT: s_mov_b32 s1, s5
2579 ; SI-NEXT: s_mov_b32 s4, s6
2580 ; SI-NEXT: s_mov_b32 s5, s7
2581 ; SI-NEXT: s_mov_b32 s6, s2
2582 ; SI-NEXT: s_mov_b32 s7, s3
2583 ; SI-NEXT: s_mov_b32 s10, s2
2584 ; SI-NEXT: s_mov_b32 s11, s3
2585 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2586 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2587 ; SI-NEXT: s_waitcnt vmcnt(0)
2588 ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
2589 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
2590 ; SI-NEXT: v_rcp_f32_e32 v10, v9
2591 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2592 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2593 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10
2594 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10
2595 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
2596 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11
2597 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
2598 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2599 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
2600 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
2601 ; SI-NEXT: v_trunc_f32_e32 v8, v8
2602 ; SI-NEXT: v_fma_f32 v3, -v8, v7, v3
2603 ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2604 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
2605 ; SI-NEXT: v_rcp_f32_e32 v9, v8
2606 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2607 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
2608 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9
2609 ; SI-NEXT: v_mul_f32_e32 v10, v7, v9
2610 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v7
2611 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10
2612 ; SI-NEXT: v_fma_f32 v7, -v8, v10, v7
2613 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2614 ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
2615 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2616 ; SI-NEXT: v_trunc_f32_e32 v7, v7
2617 ; SI-NEXT: v_fma_f32 v2, -v7, v6, v2
2618 ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2619 ; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
2620 ; SI-NEXT: v_rcp_f32_e32 v8, v7
2621 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2622 ; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
2623 ; SI-NEXT: v_fma_f32 v8, v9, v8, v8
2624 ; SI-NEXT: v_mul_f32_e32 v9, v6, v8
2625 ; SI-NEXT: v_fma_f32 v10, -v7, v9, v6
2626 ; SI-NEXT: v_fma_f32 v9, v10, v8, v9
2627 ; SI-NEXT: v_fma_f32 v6, -v7, v9, v6
2628 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2629 ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
2630 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2631 ; SI-NEXT: v_trunc_f32_e32 v6, v6
2632 ; SI-NEXT: v_fma_f32 v1, -v6, v5, v1
2633 ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2634 ; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
2635 ; SI-NEXT: v_rcp_f32_e32 v7, v6
2636 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2637 ; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2638 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7
2639 ; SI-NEXT: v_mul_f32_e32 v8, v5, v7
2640 ; SI-NEXT: v_fma_f32 v9, -v6, v8, v5
2641 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8
2642 ; SI-NEXT: v_fma_f32 v5, -v6, v8, v5
2643 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2644 ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2645 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2646 ; SI-NEXT: v_trunc_f32_e32 v5, v5
2647 ; SI-NEXT: v_fma_f32 v0, -v5, v4, v0
2648 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2651 ; CI-LABEL: frem_v4f32:
2653 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2654 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2655 ; CI-NEXT: s_mov_b32 s3, 0xf000
2656 ; CI-NEXT: s_mov_b32 s2, -1
2657 ; CI-NEXT: s_mov_b32 s10, s2
2658 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2659 ; CI-NEXT: s_mov_b32 s0, s4
2660 ; CI-NEXT: s_mov_b32 s1, s5
2661 ; CI-NEXT: s_mov_b32 s4, s6
2662 ; CI-NEXT: s_mov_b32 s5, s7
2663 ; CI-NEXT: s_mov_b32 s6, s2
2664 ; CI-NEXT: s_mov_b32 s7, s3
2665 ; CI-NEXT: s_mov_b32 s11, s3
2666 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2667 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2668 ; CI-NEXT: s_waitcnt vmcnt(0)
2669 ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
2670 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
2671 ; CI-NEXT: v_rcp_f32_e32 v10, v9
2672 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2673 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2674 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10
2675 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10
2676 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
2677 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11
2678 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
2679 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2680 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
2681 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
2682 ; CI-NEXT: v_trunc_f32_e32 v8, v8
2683 ; CI-NEXT: v_fma_f32 v3, -v8, v7, v3
2684 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
2685 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2686 ; CI-NEXT: v_rcp_f32_e32 v9, v8
2687 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2688 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
2689 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9
2690 ; CI-NEXT: v_mul_f32_e32 v10, v7, v9
2691 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v7
2692 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10
2693 ; CI-NEXT: v_fma_f32 v7, -v8, v10, v7
2694 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2695 ; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
2696 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2697 ; CI-NEXT: v_trunc_f32_e32 v7, v7
2698 ; CI-NEXT: v_fma_f32 v2, -v7, v6, v2
2699 ; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
2700 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2701 ; CI-NEXT: v_rcp_f32_e32 v8, v7
2702 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2703 ; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
2704 ; CI-NEXT: v_fma_f32 v8, v9, v8, v8
2705 ; CI-NEXT: v_mul_f32_e32 v9, v6, v8
2706 ; CI-NEXT: v_fma_f32 v10, -v7, v9, v6
2707 ; CI-NEXT: v_fma_f32 v9, v10, v8, v9
2708 ; CI-NEXT: v_fma_f32 v6, -v7, v9, v6
2709 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2710 ; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
2711 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2712 ; CI-NEXT: v_trunc_f32_e32 v6, v6
2713 ; CI-NEXT: v_fma_f32 v1, -v6, v5, v1
2714 ; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
2715 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2716 ; CI-NEXT: v_rcp_f32_e32 v7, v6
2717 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2718 ; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2719 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
2720 ; CI-NEXT: v_mul_f32_e32 v8, v5, v7
2721 ; CI-NEXT: v_fma_f32 v9, -v6, v8, v5
2722 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
2723 ; CI-NEXT: v_fma_f32 v5, -v6, v8, v5
2724 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2725 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2726 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2727 ; CI-NEXT: v_trunc_f32_e32 v5, v5
2728 ; CI-NEXT: v_fma_f32 v0, -v5, v4, v0
2729 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2732 ; VI-LABEL: frem_v4f32:
2734 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2735 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2736 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2737 ; VI-NEXT: v_mov_b32_e32 v0, s6
2738 ; VI-NEXT: s_add_u32 s0, s0, 64
2739 ; VI-NEXT: s_addc_u32 s1, s1, 0
2740 ; VI-NEXT: v_mov_b32_e32 v5, s1
2741 ; VI-NEXT: v_mov_b32_e32 v1, s7
2742 ; VI-NEXT: v_mov_b32_e32 v4, s0
2743 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2744 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2745 ; VI-NEXT: v_mov_b32_e32 v8, s4
2746 ; VI-NEXT: v_mov_b32_e32 v9, s5
2747 ; VI-NEXT: s_waitcnt vmcnt(0)
2748 ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3
2749 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3
2750 ; VI-NEXT: v_rcp_f32_e32 v12, v11
2751 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2752 ; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0
2753 ; VI-NEXT: v_fma_f32 v12, v13, v12, v12
2754 ; VI-NEXT: v_mul_f32_e32 v13, v10, v12
2755 ; VI-NEXT: v_fma_f32 v14, -v11, v13, v10
2756 ; VI-NEXT: v_fma_f32 v13, v14, v12, v13
2757 ; VI-NEXT: v_fma_f32 v10, -v11, v13, v10
2758 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2759 ; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13
2760 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3
2761 ; VI-NEXT: v_trunc_f32_e32 v10, v10
2762 ; VI-NEXT: v_fma_f32 v3, -v10, v7, v3
2763 ; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2
2764 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2765 ; VI-NEXT: v_rcp_f32_e32 v11, v10
2766 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2767 ; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0
2768 ; VI-NEXT: v_fma_f32 v11, v12, v11, v11
2769 ; VI-NEXT: v_mul_f32_e32 v12, v7, v11
2770 ; VI-NEXT: v_fma_f32 v13, -v10, v12, v7
2771 ; VI-NEXT: v_fma_f32 v12, v13, v11, v12
2772 ; VI-NEXT: v_fma_f32 v7, -v10, v12, v7
2773 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2774 ; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12
2775 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2776 ; VI-NEXT: v_trunc_f32_e32 v7, v7
2777 ; VI-NEXT: v_fma_f32 v2, -v7, v6, v2
2778 ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
2779 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2780 ; VI-NEXT: v_rcp_f32_e32 v10, v7
2781 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2782 ; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0
2783 ; VI-NEXT: v_fma_f32 v10, v11, v10, v10
2784 ; VI-NEXT: v_mul_f32_e32 v11, v6, v10
2785 ; VI-NEXT: v_fma_f32 v12, -v7, v11, v6
2786 ; VI-NEXT: v_fma_f32 v11, v12, v10, v11
2787 ; VI-NEXT: v_fma_f32 v6, -v7, v11, v6
2788 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2789 ; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11
2790 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2791 ; VI-NEXT: v_trunc_f32_e32 v6, v6
2792 ; VI-NEXT: v_fma_f32 v1, -v6, v5, v1
2793 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
2794 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2795 ; VI-NEXT: v_rcp_f32_e32 v7, v6
2796 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2797 ; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0
2798 ; VI-NEXT: v_fma_f32 v7, v10, v7, v7
2799 ; VI-NEXT: v_mul_f32_e32 v10, v5, v7
2800 ; VI-NEXT: v_fma_f32 v11, -v6, v10, v5
2801 ; VI-NEXT: v_fma_f32 v10, v11, v7, v10
2802 ; VI-NEXT: v_fma_f32 v5, -v6, v10, v5
2803 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2804 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10
2805 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2806 ; VI-NEXT: v_trunc_f32_e32 v5, v5
2807 ; VI-NEXT: v_fma_f32 v0, -v5, v4, v0
2808 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2811 ; GFX9-LABEL: frem_v4f32:
2813 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2814 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2815 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
2816 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2817 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
2818 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2819 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2820 ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3
2821 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3
2822 ; GFX9-NEXT: v_rcp_f32_e32 v11, v10
2823 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2824 ; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0
2825 ; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11
2826 ; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11
2827 ; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9
2828 ; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12
2829 ; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9
2830 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2831 ; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12
2832 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3
2833 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9
2834 ; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3
2835 ; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2
2836 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2837 ; GFX9-NEXT: v_rcp_f32_e32 v10, v9
2838 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2839 ; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2840 ; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10
2841 ; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10
2842 ; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7
2843 ; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11
2844 ; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7
2845 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2846 ; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11
2847 ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2848 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7
2849 ; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2
2850 ; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
2851 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2852 ; GFX9-NEXT: v_rcp_f32_e32 v9, v7
2853 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2854 ; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0
2855 ; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9
2856 ; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9
2857 ; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6
2858 ; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10
2859 ; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6
2860 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2861 ; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10
2862 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2863 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
2864 ; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1
2865 ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
2866 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2867 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6
2868 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2869 ; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0
2870 ; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7
2871 ; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7
2872 ; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5
2873 ; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9
2874 ; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5
2875 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2876 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9
2877 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2878 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
2879 ; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0
2880 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
2881 ; GFX9-NEXT: s_endpgm
2883 ; GFX10-LABEL: frem_v4f32:
2885 ; GFX10-NEXT: s_clause 0x1
2886 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2887 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2888 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
2889 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2890 ; GFX10-NEXT: s_clause 0x1
2891 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
2892 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2893 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2894 ; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3
2895 ; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
2896 ; GFX10-NEXT: v_rcp_f32_e32 v11, v10
2897 ; GFX10-NEXT: s_denorm_mode 15
2898 ; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0
2899 ; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11
2900 ; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11
2901 ; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9
2902 ; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11
2903 ; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9
2904 ; GFX10-NEXT: s_denorm_mode 12
2905 ; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12
2906 ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3
2907 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9
2908 ; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3
2909 ; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2
2910 ; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
2911 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9
2912 ; GFX10-NEXT: s_denorm_mode 15
2913 ; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2914 ; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10
2915 ; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10
2916 ; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7
2917 ; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10
2918 ; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7
2919 ; GFX10-NEXT: s_denorm_mode 12
2920 ; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11
2921 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2922 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7
2923 ; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2
2924 ; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1
2925 ; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
2926 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7
2927 ; GFX10-NEXT: s_denorm_mode 15
2928 ; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0
2929 ; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9
2930 ; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9
2931 ; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6
2932 ; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9
2933 ; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6
2934 ; GFX10-NEXT: s_denorm_mode 12
2935 ; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10
2936 ; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2937 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6
2938 ; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1
2939 ; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0
2940 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
2941 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6
2942 ; GFX10-NEXT: s_denorm_mode 15
2943 ; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0
2944 ; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7
2945 ; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7
2946 ; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5
2947 ; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7
2948 ; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5
2949 ; GFX10-NEXT: s_denorm_mode 12
2950 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9
2951 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2952 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
2953 ; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0
2954 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
2955 ; GFX10-NEXT: s_endpgm
2957 ; GFX11-LABEL: frem_v4f32:
2959 ; GFX11-NEXT: s_clause 0x1
2960 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2961 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2962 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
2963 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2964 ; GFX11-NEXT: s_clause 0x1
2965 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
2966 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
2967 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2968 ; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
2969 ; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
2970 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2971 ; GFX11-NEXT: v_rcp_f32_e32 v11, v10
2972 ; GFX11-NEXT: s_denorm_mode 15
2973 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2974 ; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0
2975 ; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11
2976 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2977 ; GFX11-NEXT: v_mul_f32_e32 v12, v9, v11
2978 ; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v9
2979 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2980 ; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11
2981 ; GFX11-NEXT: v_fma_f32 v9, -v10, v12, v9
2982 ; GFX11-NEXT: s_denorm_mode 12
2983 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2984 ; GFX11-NEXT: v_div_fmas_f32 v9, v9, v11, v12
2985 ; GFX11-NEXT: v_div_fixup_f32 v9, v9, v7, v3
2986 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2987 ; GFX11-NEXT: v_trunc_f32_e32 v9, v9
2988 ; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3
2989 ; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2
2990 ; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
2991 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2992 ; GFX11-NEXT: v_rcp_f32_e32 v10, v9
2993 ; GFX11-NEXT: s_denorm_mode 15
2994 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2995 ; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2996 ; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10
2997 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2998 ; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10
2999 ; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7
3000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3001 ; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10
3002 ; GFX11-NEXT: v_fma_f32 v7, -v9, v11, v7
3003 ; GFX11-NEXT: s_denorm_mode 12
3004 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3005 ; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11
3006 ; GFX11-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3007 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3008 ; GFX11-NEXT: v_trunc_f32_e32 v7, v7
3009 ; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2
3010 ; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1
3011 ; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3012 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3013 ; GFX11-NEXT: v_rcp_f32_e32 v9, v7
3014 ; GFX11-NEXT: s_denorm_mode 15
3015 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3016 ; GFX11-NEXT: v_fma_f32 v10, -v7, v9, 1.0
3017 ; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v9
3018 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3019 ; GFX11-NEXT: v_mul_f32_e32 v10, v6, v9
3020 ; GFX11-NEXT: v_fma_f32 v11, -v7, v10, v6
3021 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3022 ; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v9
3023 ; GFX11-NEXT: v_fma_f32 v6, -v7, v10, v6
3024 ; GFX11-NEXT: s_denorm_mode 12
3025 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3026 ; GFX11-NEXT: v_div_fmas_f32 v6, v6, v9, v10
3027 ; GFX11-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3028 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3029 ; GFX11-NEXT: v_trunc_f32_e32 v6, v6
3030 ; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1
3031 ; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0
3032 ; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3033 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3034 ; GFX11-NEXT: v_rcp_f32_e32 v7, v6
3035 ; GFX11-NEXT: s_denorm_mode 15
3036 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3037 ; GFX11-NEXT: v_fma_f32 v9, -v6, v7, 1.0
3038 ; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v7
3039 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3040 ; GFX11-NEXT: v_mul_f32_e32 v9, v5, v7
3041 ; GFX11-NEXT: v_fma_f32 v10, -v6, v9, v5
3042 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3043 ; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v7
3044 ; GFX11-NEXT: v_fma_f32 v5, -v6, v9, v5
3045 ; GFX11-NEXT: s_denorm_mode 12
3046 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3047 ; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v9
3048 ; GFX11-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3049 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3050 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
3051 ; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0
3052 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5]
3053 ; GFX11-NEXT: s_nop 0
3054 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3055 ; GFX11-NEXT: s_endpgm
3056 ptr addrspace(1) %in2) #0 {
3057 %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
3058 %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
3059 %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16
3060 %r2 = frem <4 x float> %r0, %r1
3061 store <4 x float> %r2, ptr addrspace(1) %out, align 16
; frem_v2f64: element-wise frem of two <2 x double> vectors.
; Loads the dividend from %in1 and the divisor from %in2 advanced by four
; <2 x double> elements (64 bytes, visible as "offset:64" or an explicit
; add of 64 in the expected output), and stores the remainder to %out.
;
; For each of the two elements the expected code forms a quotient
; (v_div_scale / v_rcp / v_fma refinement / v_div_fmas / v_div_fixup),
; truncates it, and finishes with a v_fma_f64 of the form x - trunc(x/y) * y.
; The first target block contains no v_trunc_f64; an exponent-extract and
; bit-mask sequence sits in its place (presumably the expansion of trunc for
; that target -- confirm against the backend).
;
; The assertion lines below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
3065 define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
3066 ; SI-LABEL: frem_v2f64:
3068 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
3069 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3070 ; SI-NEXT: s_mov_b32 s7, 0xf000
3071 ; SI-NEXT: s_mov_b32 s6, -1
3072 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3073 ; SI-NEXT: s_mov_b32 s4, s8
3074 ; SI-NEXT: s_mov_b32 s5, s9
3075 ; SI-NEXT: s_mov_b32 s8, s10
3076 ; SI-NEXT: s_mov_b32 s9, s11
3077 ; SI-NEXT: s_mov_b32 s10, s6
3078 ; SI-NEXT: s_mov_b32 s11, s7
3079 ; SI-NEXT: s_mov_b32 s2, s6
3080 ; SI-NEXT: s_mov_b32 s3, s7
3081 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3082 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
3083 ; SI-NEXT: s_waitcnt vmcnt(0)
3084 ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
3085 ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3086 ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3087 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3088 ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3089 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3090 ; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
3091 ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3092 ; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
3093 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9
3094 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13
3095 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
3097 ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
3098 ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3099 ; SI-NEXT: v_readfirstlane_b32 s8, v9
3100 ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
3101 ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
3102 ; SI-NEXT: s_mov_b32 s3, 0xfffff
3103 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
3104 ; SI-NEXT: v_not_b32_e32 v10, s0
3105 ; SI-NEXT: v_and_b32_e32 v10, v8, v10
3106 ; SI-NEXT: v_not_b32_e32 v11, s1
3107 ; SI-NEXT: v_and_b32_e32 v9, v9, v11
3108 ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
3109 ; SI-NEXT: s_cmp_lt_i32 s9, 0
3110 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3111 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
3112 ; SI-NEXT: v_mov_b32_e32 v11, s0
3113 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
3114 ; SI-NEXT: s_cmp_gt_i32 s9, 51
3115 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3116 ; SI-NEXT: v_mov_b32_e32 v11, s8
3117 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
3118 ; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
3119 ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3120 ; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3121 ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3122 ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3123 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3124 ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3125 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3126 ; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
3127 ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3128 ; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
3129 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
3130 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v11
3131 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
3133 ; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
3134 ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3135 ; SI-NEXT: v_readfirstlane_b32 s8, v7
3136 ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
3137 ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
3138 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
3139 ; SI-NEXT: v_not_b32_e32 v8, s0
3140 ; SI-NEXT: v_and_b32_e32 v8, v6, v8
3141 ; SI-NEXT: v_not_b32_e32 v9, s1
3142 ; SI-NEXT: v_and_b32_e32 v7, v7, v9
3143 ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
3144 ; SI-NEXT: s_cmp_lt_i32 s9, 0
3145 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3146 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
3147 ; SI-NEXT: v_mov_b32_e32 v9, s0
3148 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
3149 ; SI-NEXT: s_cmp_gt_i32 s9, 51
3150 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3151 ; SI-NEXT: v_mov_b32_e32 v9, s8
3152 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
3153 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
3154 ; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3155 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
3158 ; CI-LABEL: frem_v2f64:
3160 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3161 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
3162 ; CI-NEXT: s_mov_b32 s3, 0xf000
3163 ; CI-NEXT: s_mov_b32 s2, -1
3164 ; CI-NEXT: s_mov_b32 s10, s2
3165 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3166 ; CI-NEXT: s_mov_b32 s0, s4
3167 ; CI-NEXT: s_mov_b32 s1, s5
3168 ; CI-NEXT: s_mov_b32 s4, s6
3169 ; CI-NEXT: s_mov_b32 s5, s7
3170 ; CI-NEXT: s_mov_b32 s6, s2
3171 ; CI-NEXT: s_mov_b32 s7, s3
3172 ; CI-NEXT: s_mov_b32 s11, s3
3173 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
3174 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
3175 ; CI-NEXT: s_waitcnt vmcnt(0)
3176 ; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
3177 ; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3178 ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3179 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3180 ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3181 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3182 ; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
3183 ; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3184 ; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3186 ; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3187 ; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3188 ; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3189 ; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3190 ; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
3191 ; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3192 ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3193 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3194 ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3195 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3196 ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
3197 ; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3198 ; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3200 ; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3201 ; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3202 ; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3203 ; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3204 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3207 ; VI-LABEL: frem_v2f64:
3209 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3210 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3211 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3212 ; VI-NEXT: v_mov_b32_e32 v0, s6
3213 ; VI-NEXT: s_add_u32 s0, s0, 64
3214 ; VI-NEXT: s_addc_u32 s1, s1, 0
3215 ; VI-NEXT: v_mov_b32_e32 v5, s1
3216 ; VI-NEXT: v_mov_b32_e32 v1, s7
3217 ; VI-NEXT: v_mov_b32_e32 v4, s0
3218 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
3219 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
3220 ; VI-NEXT: v_mov_b32_e32 v8, s4
3221 ; VI-NEXT: v_mov_b32_e32 v9, s5
3222 ; VI-NEXT: s_waitcnt vmcnt(0)
3223 ; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
3224 ; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
3225 ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
3226 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
3227 ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
3228 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
3229 ; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
3230 ; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13]
3231 ; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
3233 ; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
3234 ; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
3235 ; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11]
3236 ; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
3237 ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3238 ; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
3239 ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
3240 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3241 ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
3242 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3243 ; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
3244 ; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3245 ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
3247 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
3248 ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3249 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3250 ; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3251 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
3254 ; GFX9-LABEL: frem_v2f64:
3256 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3257 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3258 ; GFX9-NEXT: v_mov_b32_e32 v16, 0
3259 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3260 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
3261 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
3262 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3263 ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
3264 ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3265 ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3266 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3267 ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3268 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3269 ; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
3270 ; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3271 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3272 ; GFX9-NEXT: s_nop 1
3273 ; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3274 ; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3275 ; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3276 ; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3277 ; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3278 ; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3279 ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3280 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3281 ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3282 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3283 ; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
3284 ; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3285 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3286 ; GFX9-NEXT: s_nop 1
3287 ; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3288 ; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3289 ; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3290 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3291 ; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
3292 ; GFX9-NEXT: s_endpgm
3294 ; GFX10-LABEL: frem_v2f64:
3296 ; GFX10-NEXT: s_clause 0x1
3297 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3298 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3299 ; GFX10-NEXT: v_mov_b32_e32 v16, 0
3300 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3301 ; GFX10-NEXT: s_clause 0x1
3302 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
3303 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
3304 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3305 ; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
3306 ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3307 ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3308 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3309 ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3310 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3311 ; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
3312 ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3313 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3314 ; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3315 ; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3316 ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3317 ; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3318 ; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
3319 ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3320 ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3321 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3322 ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3323 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3324 ; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
3325 ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3326 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3327 ; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3328 ; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3329 ; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3330 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3331 ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
3332 ; GFX10-NEXT: s_endpgm
3334 ; GFX11-LABEL: frem_v2f64:
3336 ; GFX11-NEXT: s_clause 0x1
3337 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
3338 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
3339 ; GFX11-NEXT: v_mov_b32_e32 v16, 0
3340 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3341 ; GFX11-NEXT: s_clause 0x1
3342 ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7]
3343 ; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
3344 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3345 ; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
3346 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3347 ; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3348 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3349 ; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3350 ; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3351 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3352 ; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3353 ; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3354 ; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
3355 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3356 ; GFX11-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3357 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3358 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3359 ; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3360 ; GFX11-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3361 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3362 ; GFX11-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3363 ; GFX11-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3364 ; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
3365 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3366 ; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3367 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3368 ; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3369 ; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3370 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3371 ; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3372 ; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3373 ; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
3374 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3375 ; GFX11-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3376 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3377 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3378 ; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3379 ; GFX11-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3380 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3381 ; GFX11-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3382 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3383 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[4:5]
3384 ; GFX11-NEXT: s_nop 0
3385 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3386 ; GFX11-NEXT: s_endpgm
; Remainder of the signature: third kernel argument and attribute group #0.
3387 ptr addrspace(1) %in2) #0 {
; Divisor address: %in2 advanced by 4 elements of <2 x double> (16 bytes each).
3388 %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
; %r0 = dividend vector, %r1 = divisor vector; both loads are 16-byte aligned.
3389 %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
3390 %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16
; Operation under test: element-wise floating-point remainder, no fast-math flags.
3391 %r2 = frem <2 x double> %r0, %r1
3392 store <2 x double> %r2, ptr addrspace(1) %out, align 16
; Function attribute groups referenced by the kernels via "#N".
; #0: nounwind, "unsafe-fp-math" disabled, and "denormal-fp-math-f32" set to
; preserve-sign for both components of the pair.
3396 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #1: identical to #0 except that "unsafe-fp-math" is enabled.
; NOTE(review): presumably used by the unsafe/fast frem kernels earlier in the
; file -- confirm against their definitions.
3397 attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }