1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
; Test kernel for `frem half` with no fast-math flags on the instruction.
; The dividend is loaded from %in1, the divisor from %in2 advanced by 4 half
; elements (the checks show this as a byte offset of 8), and the remainder is
; stored to %out.
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
; What they record, per check prefix:
;  * SI and CI prefixes - both operands are widened with v_cvt_f32_f16, then a
;    full f32 division sequence is expected (two v_div_scale_f32, v_rcp_f32,
;    a v_fma_f32 refinement chain bracketed by s_setreg_imm32_b32 writes to
;    HW_REG_MODE, v_div_fmas_f32, v_div_fixup_f32), followed by v_trunc_f32,
;    a v_fma_f32 with the negated quotient, and v_cvt_f16_f32.
;  * VI prefix - v_rcp_f32 of the widened divisor, v_mul_f32, conversion back
;    to f16, then v_div_fixup_f16, v_trunc_f16 and v_fma_f16.
;  * GFX9, GFX10 and GFX11 prefixes - same shape as VI, but the multiply and
;    narrowing are a single mixed-precision op (v_mad_mixlo_f16 on GFX9,
;    v_fma_mixlo_f16 on GFX10 and GFX11).
;  * GFX1150 prefix - the final step is v_xor_b32 with 0x8000 followed by
;    v_fmac_f16 rather than a v_fma_f16 with a negated source operand.
10 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
13 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
14 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
15 ; SI-NEXT: s_mov_b32 s11, 0xf000
16 ; SI-NEXT: s_mov_b32 s10, -1
17 ; SI-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NEXT: s_mov_b32 s8, s4
19 ; SI-NEXT: s_mov_b32 s9, s5
20 ; SI-NEXT: s_mov_b32 s4, s6
21 ; SI-NEXT: s_mov_b32 s5, s7
22 ; SI-NEXT: s_mov_b32 s6, s10
23 ; SI-NEXT: s_mov_b32 s7, s11
24 ; SI-NEXT: s_mov_b32 s2, s10
25 ; SI-NEXT: s_mov_b32 s3, s11
26 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
27 ; SI-NEXT: s_waitcnt vmcnt(0)
28 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
29 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
30 ; SI-NEXT: s_waitcnt vmcnt(0)
31 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
32 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
33 ; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
34 ; SI-NEXT: v_rcp_f32_e32 v4, v3
35 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
36 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
37 ; SI-NEXT: v_fma_f32 v4, v5, v4, v4
38 ; SI-NEXT: v_mul_f32_e32 v5, v2, v4
39 ; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
40 ; SI-NEXT: v_fma_f32 v5, v6, v4, v5
41 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
42 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
43 ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
44 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
45 ; SI-NEXT: v_trunc_f32_e32 v2, v2
46 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
47 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
48 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
49 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
54 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
55 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
56 ; CI-NEXT: s_mov_b32 s11, 0xf000
57 ; CI-NEXT: s_mov_b32 s10, -1
58 ; CI-NEXT: s_mov_b32 s2, s10
59 ; CI-NEXT: s_waitcnt lgkmcnt(0)
60 ; CI-NEXT: s_mov_b32 s8, s4
61 ; CI-NEXT: s_mov_b32 s9, s5
62 ; CI-NEXT: s_mov_b32 s4, s6
63 ; CI-NEXT: s_mov_b32 s5, s7
64 ; CI-NEXT: s_mov_b32 s6, s10
65 ; CI-NEXT: s_mov_b32 s7, s11
66 ; CI-NEXT: s_mov_b32 s3, s11
67 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
68 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
69 ; CI-NEXT: s_waitcnt vmcnt(1)
70 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
71 ; CI-NEXT: s_waitcnt vmcnt(0)
72 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
73 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
74 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
75 ; CI-NEXT: v_rcp_f32_e32 v4, v3
76 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
77 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
78 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
79 ; CI-NEXT: v_mul_f32_e32 v5, v2, v4
80 ; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
81 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
82 ; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
83 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
84 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
85 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
86 ; CI-NEXT: v_trunc_f32_e32 v2, v2
87 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
88 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
89 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
90 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
95 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
96 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
97 ; VI-NEXT: s_waitcnt lgkmcnt(0)
98 ; VI-NEXT: v_mov_b32_e32 v2, s6
99 ; VI-NEXT: s_add_u32 s0, s0, 8
100 ; VI-NEXT: v_mov_b32_e32 v3, s7
101 ; VI-NEXT: s_addc_u32 s1, s1, 0
102 ; VI-NEXT: flat_load_ushort v4, v[2:3]
103 ; VI-NEXT: v_mov_b32_e32 v3, s1
104 ; VI-NEXT: v_mov_b32_e32 v2, s0
105 ; VI-NEXT: flat_load_ushort v2, v[2:3]
106 ; VI-NEXT: v_mov_b32_e32 v0, s4
107 ; VI-NEXT: v_mov_b32_e32 v1, s5
108 ; VI-NEXT: s_waitcnt vmcnt(1)
109 ; VI-NEXT: v_cvt_f32_f16_e32 v3, v4
110 ; VI-NEXT: s_waitcnt vmcnt(0)
111 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v2
112 ; VI-NEXT: v_rcp_f32_e32 v5, v5
113 ; VI-NEXT: v_mul_f32_e32 v3, v3, v5
114 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
115 ; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4
116 ; VI-NEXT: v_trunc_f16_e32 v3, v3
117 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
118 ; VI-NEXT: flat_store_short v[0:1], v2
121 ; GFX9-LABEL: frem_f16:
123 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
124 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
125 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
126 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
127 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
128 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
129 ; GFX9-NEXT: s_waitcnt vmcnt(0)
130 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
131 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
132 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
133 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
134 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
135 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
136 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
137 ; GFX9-NEXT: s_endpgm
139 ; GFX10-LABEL: frem_f16:
141 ; GFX10-NEXT: s_clause 0x1
142 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
143 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
144 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
145 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
146 ; GFX10-NEXT: s_clause 0x1
147 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
148 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
149 ; GFX10-NEXT: s_waitcnt vmcnt(0)
150 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
151 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
152 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
153 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
154 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
155 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
156 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
157 ; GFX10-NEXT: s_endpgm
159 ; GFX11-LABEL: frem_f16:
161 ; GFX11-NEXT: s_clause 0x1
162 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
163 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
164 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
165 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
166 ; GFX11-NEXT: s_clause 0x1
167 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
168 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
169 ; GFX11-NEXT: s_waitcnt vmcnt(0)
170 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
171 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
172 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
173 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
174 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
175 ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
176 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
177 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
178 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
179 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
180 ; GFX11-NEXT: s_nop 0
181 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
182 ; GFX11-NEXT: s_endpgm
184 ; GFX1150-LABEL: frem_f16:
186 ; GFX1150-NEXT: s_clause 0x1
187 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
188 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
189 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0
190 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
191 ; GFX1150-NEXT: s_clause 0x1
192 ; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
193 ; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
194 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
195 ; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
196 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
197 ; GFX1150-NEXT: v_rcp_f32_e32 v3, v3
198 ; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
199 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
200 ; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
201 ; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
202 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
203 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
204 ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
205 ; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5]
206 ; GFX1150-NEXT: s_nop 0
207 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
208 ; GFX1150-NEXT: s_endpgm
209 ptr addrspace(1) %in2) #0 {
210 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
211 %r0 = load half, ptr addrspace(1) %in1, align 4
212 %r1 = load half, ptr addrspace(1) %gep2, align 4
213 %r2 = frem half %r0, %r1
214 store half %r2, ptr addrspace(1) %out, align 4
; Test kernel for `frem fast half`, i.e. the remainder with the `fast`
; fast-math flag on the instruction. The dividend is loaded from %in1, the
; divisor from %in2 advanced by 4 half elements (byte offset 8 in the checks),
; and the result is stored to %out.
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file). For every check prefix they record a reciprocal-and-multiply
; quotient with no v_div_scale / v_div_fmas / v_div_fixup instructions:
;  * SI and CI prefixes - operands are widened with v_cvt_f32_f16, then
;    v_rcp_f32, v_mul_f32, v_trunc_f32, v_fma_f32 with the negated quotient,
;    and v_cvt_f16_f32.
;  * VI, GFX9, GFX10 and GFX11 prefixes - v_rcp_f16, v_mul_f16, v_trunc_f16
;    and v_fma_f16, all directly in f16.
;  * GFX1150 prefix - as above, but ending in v_xor_b32 with 0x8000 followed
;    by v_fmac_f16 rather than a v_fma_f16 with a negated source operand.
218 define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
219 ; SI-LABEL: fast_frem_f16:
221 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
222 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
223 ; SI-NEXT: s_mov_b32 s11, 0xf000
224 ; SI-NEXT: s_mov_b32 s10, -1
225 ; SI-NEXT: s_waitcnt lgkmcnt(0)
226 ; SI-NEXT: s_mov_b32 s8, s4
227 ; SI-NEXT: s_mov_b32 s9, s5
228 ; SI-NEXT: s_mov_b32 s4, s6
229 ; SI-NEXT: s_mov_b32 s5, s7
230 ; SI-NEXT: s_mov_b32 s6, s10
231 ; SI-NEXT: s_mov_b32 s7, s11
232 ; SI-NEXT: s_mov_b32 s2, s10
233 ; SI-NEXT: s_mov_b32 s3, s11
234 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
235 ; SI-NEXT: s_waitcnt vmcnt(0)
236 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
237 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
238 ; SI-NEXT: s_waitcnt vmcnt(0)
239 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
240 ; SI-NEXT: v_rcp_f32_e32 v2, v1
241 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
242 ; SI-NEXT: v_trunc_f32_e32 v2, v2
243 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
244 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
245 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
248 ; CI-LABEL: fast_frem_f16:
250 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
251 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
252 ; CI-NEXT: s_mov_b32 s11, 0xf000
253 ; CI-NEXT: s_mov_b32 s10, -1
254 ; CI-NEXT: s_mov_b32 s2, s10
255 ; CI-NEXT: s_mov_b32 s3, s11
256 ; CI-NEXT: s_waitcnt lgkmcnt(0)
257 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
258 ; CI-NEXT: s_mov_b32 s8, s4
259 ; CI-NEXT: s_mov_b32 s9, s5
260 ; CI-NEXT: s_mov_b32 s4, s6
261 ; CI-NEXT: s_mov_b32 s5, s7
262 ; CI-NEXT: s_mov_b32 s6, s10
263 ; CI-NEXT: s_mov_b32 s7, s11
264 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
265 ; CI-NEXT: s_waitcnt vmcnt(1)
266 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
267 ; CI-NEXT: v_rcp_f32_e32 v2, v1
268 ; CI-NEXT: s_waitcnt vmcnt(0)
269 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
270 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
271 ; CI-NEXT: v_trunc_f32_e32 v2, v2
272 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
273 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
274 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
277 ; VI-LABEL: fast_frem_f16:
279 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
280 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
281 ; VI-NEXT: s_waitcnt lgkmcnt(0)
282 ; VI-NEXT: v_mov_b32_e32 v2, s6
283 ; VI-NEXT: s_add_u32 s0, s0, 8
284 ; VI-NEXT: v_mov_b32_e32 v3, s7
285 ; VI-NEXT: s_addc_u32 s1, s1, 0
286 ; VI-NEXT: flat_load_ushort v4, v[2:3]
287 ; VI-NEXT: v_mov_b32_e32 v3, s1
288 ; VI-NEXT: v_mov_b32_e32 v2, s0
289 ; VI-NEXT: flat_load_ushort v2, v[2:3]
290 ; VI-NEXT: v_mov_b32_e32 v0, s4
291 ; VI-NEXT: v_mov_b32_e32 v1, s5
292 ; VI-NEXT: s_waitcnt vmcnt(0)
293 ; VI-NEXT: v_rcp_f16_e32 v3, v2
294 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3
295 ; VI-NEXT: v_trunc_f16_e32 v3, v3
296 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
297 ; VI-NEXT: flat_store_short v[0:1], v2
300 ; GFX9-LABEL: fast_frem_f16:
302 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
303 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
304 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
305 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
307 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
308 ; GFX9-NEXT: s_waitcnt vmcnt(0)
309 ; GFX9-NEXT: v_rcp_f16_e32 v3, v2
310 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
311 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
312 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
313 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
314 ; GFX9-NEXT: s_endpgm
316 ; GFX10-LABEL: fast_frem_f16:
318 ; GFX10-NEXT: s_clause 0x1
319 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
320 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
321 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
322 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX10-NEXT: s_clause 0x1
324 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
325 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
326 ; GFX10-NEXT: s_waitcnt vmcnt(0)
327 ; GFX10-NEXT: v_rcp_f16_e32 v3, v2
328 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
329 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
330 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
331 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
332 ; GFX10-NEXT: s_endpgm
334 ; GFX11-LABEL: fast_frem_f16:
336 ; GFX11-NEXT: s_clause 0x1
337 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
338 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
339 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
340 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
341 ; GFX11-NEXT: s_clause 0x1
342 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
343 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
344 ; GFX11-NEXT: s_waitcnt vmcnt(0)
345 ; GFX11-NEXT: v_rcp_f16_e32 v3, v2
346 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
347 ; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3
348 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
349 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
350 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
351 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
352 ; GFX11-NEXT: s_nop 0
353 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
354 ; GFX11-NEXT: s_endpgm
356 ; GFX1150-LABEL: fast_frem_f16:
358 ; GFX1150-NEXT: s_clause 0x1
359 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
360 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
361 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0
362 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX1150-NEXT: s_clause 0x1
364 ; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
365 ; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
366 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
367 ; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
368 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
369 ; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
370 ; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
371 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
372 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
373 ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
374 ; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5]
375 ; GFX1150-NEXT: s_nop 0
376 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
377 ; GFX1150-NEXT: s_endpgm
378 ptr addrspace(1) %in2) #0 {
379 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
380 %r0 = load half, ptr addrspace(1) %in1, align 4
381 %r1 = load half, ptr addrspace(1) %gep2, align 4
382 %r2 = frem fast half %r0, %r1
383 store half %r2, ptr addrspace(1) %out, align 4
; Test kernel for `frem afn half`, i.e. the remainder with only the `afn`
; (approximate functions) fast-math flag on the instruction. Unlike the other
; f16 kernels visible in this file, this one uses attribute group #1.
; NOTE(review): the contents of attribute group #1 are defined elsewhere in
; the file and are not visible here - confirm what it enables.
; The dividend is loaded from %in1, the divisor from %in2 advanced by 4 half
; elements (byte offset 8 in the checks), and the result is stored to %out.
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file). For every check prefix they record a reciprocal-and-multiply
; quotient with no v_div_scale / v_div_fmas / v_div_fixup instructions:
;  * SI and CI prefixes - operands are widened with v_cvt_f32_f16, then
;    v_rcp_f32, v_mul_f32, v_trunc_f32, v_fma_f32 with the negated quotient,
;    and v_cvt_f16_f32.
;  * VI, GFX9, GFX10 and GFX11 prefixes - v_rcp_f16, v_mul_f16, v_trunc_f16
;    and v_fma_f16, all directly in f16.
;  * GFX1150 prefix - as above, but ending in v_xor_b32 with 0x8000 followed
;    by v_fmac_f16 rather than a v_fma_f16 with a negated source operand.
387 define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
388 ; SI-LABEL: unsafe_frem_f16:
390 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
391 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
392 ; SI-NEXT: s_mov_b32 s11, 0xf000
393 ; SI-NEXT: s_mov_b32 s10, -1
394 ; SI-NEXT: s_waitcnt lgkmcnt(0)
395 ; SI-NEXT: s_mov_b32 s8, s4
396 ; SI-NEXT: s_mov_b32 s9, s5
397 ; SI-NEXT: s_mov_b32 s4, s6
398 ; SI-NEXT: s_mov_b32 s5, s7
399 ; SI-NEXT: s_mov_b32 s6, s10
400 ; SI-NEXT: s_mov_b32 s7, s11
401 ; SI-NEXT: s_mov_b32 s2, s10
402 ; SI-NEXT: s_mov_b32 s3, s11
403 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
404 ; SI-NEXT: s_waitcnt vmcnt(0)
405 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
406 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
407 ; SI-NEXT: s_waitcnt vmcnt(0)
408 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
409 ; SI-NEXT: v_rcp_f32_e32 v2, v1
410 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
411 ; SI-NEXT: v_trunc_f32_e32 v2, v2
412 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
413 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
414 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
417 ; CI-LABEL: unsafe_frem_f16:
419 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
420 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
421 ; CI-NEXT: s_mov_b32 s11, 0xf000
422 ; CI-NEXT: s_mov_b32 s10, -1
423 ; CI-NEXT: s_mov_b32 s2, s10
424 ; CI-NEXT: s_mov_b32 s3, s11
425 ; CI-NEXT: s_waitcnt lgkmcnt(0)
426 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
427 ; CI-NEXT: s_mov_b32 s8, s4
428 ; CI-NEXT: s_mov_b32 s9, s5
429 ; CI-NEXT: s_mov_b32 s4, s6
430 ; CI-NEXT: s_mov_b32 s5, s7
431 ; CI-NEXT: s_mov_b32 s6, s10
432 ; CI-NEXT: s_mov_b32 s7, s11
433 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
434 ; CI-NEXT: s_waitcnt vmcnt(1)
435 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
436 ; CI-NEXT: v_rcp_f32_e32 v2, v1
437 ; CI-NEXT: s_waitcnt vmcnt(0)
438 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
439 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
440 ; CI-NEXT: v_trunc_f32_e32 v2, v2
441 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
442 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
443 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
446 ; VI-LABEL: unsafe_frem_f16:
448 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
449 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
450 ; VI-NEXT: s_waitcnt lgkmcnt(0)
451 ; VI-NEXT: v_mov_b32_e32 v2, s6
452 ; VI-NEXT: s_add_u32 s0, s0, 8
453 ; VI-NEXT: v_mov_b32_e32 v3, s7
454 ; VI-NEXT: s_addc_u32 s1, s1, 0
455 ; VI-NEXT: flat_load_ushort v4, v[2:3]
456 ; VI-NEXT: v_mov_b32_e32 v3, s1
457 ; VI-NEXT: v_mov_b32_e32 v2, s0
458 ; VI-NEXT: flat_load_ushort v2, v[2:3]
459 ; VI-NEXT: v_mov_b32_e32 v0, s4
460 ; VI-NEXT: v_mov_b32_e32 v1, s5
461 ; VI-NEXT: s_waitcnt vmcnt(0)
462 ; VI-NEXT: v_rcp_f16_e32 v3, v2
463 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3
464 ; VI-NEXT: v_trunc_f16_e32 v3, v3
465 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
466 ; VI-NEXT: flat_store_short v[0:1], v2
469 ; GFX9-LABEL: unsafe_frem_f16:
471 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
472 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
473 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
474 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
476 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
477 ; GFX9-NEXT: s_waitcnt vmcnt(0)
478 ; GFX9-NEXT: v_rcp_f16_e32 v3, v2
479 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
480 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
481 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
482 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
483 ; GFX9-NEXT: s_endpgm
485 ; GFX10-LABEL: unsafe_frem_f16:
487 ; GFX10-NEXT: s_clause 0x1
488 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
489 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
490 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
491 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
492 ; GFX10-NEXT: s_clause 0x1
493 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
494 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
495 ; GFX10-NEXT: s_waitcnt vmcnt(0)
496 ; GFX10-NEXT: v_rcp_f16_e32 v3, v2
497 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
498 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
499 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
500 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
501 ; GFX10-NEXT: s_endpgm
503 ; GFX11-LABEL: unsafe_frem_f16:
505 ; GFX11-NEXT: s_clause 0x1
506 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
507 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
508 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
509 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
510 ; GFX11-NEXT: s_clause 0x1
511 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
512 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
513 ; GFX11-NEXT: s_waitcnt vmcnt(0)
514 ; GFX11-NEXT: v_rcp_f16_e32 v3, v2
515 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
516 ; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3
517 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
518 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
519 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
520 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
521 ; GFX11-NEXT: s_nop 0
522 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
523 ; GFX11-NEXT: s_endpgm
525 ; GFX1150-LABEL: unsafe_frem_f16:
527 ; GFX1150-NEXT: s_clause 0x1
528 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
529 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
530 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0
531 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
532 ; GFX1150-NEXT: s_clause 0x1
533 ; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
534 ; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
535 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
536 ; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
537 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
538 ; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
539 ; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
540 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
541 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
542 ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
543 ; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5]
544 ; GFX1150-NEXT: s_nop 0
545 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
546 ; GFX1150-NEXT: s_endpgm
547 ptr addrspace(1) %in2) #1 {
548 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
549 %r0 = load half, ptr addrspace(1) %in1, align 4
550 %r1 = load half, ptr addrspace(1) %gep2, align 4
551 %r2 = frem afn half %r0, %r1
552 store half %r2, ptr addrspace(1) %out, align 4
; Test kernel for `frem float` with no fast-math flags on the instruction.
; The dividend is loaded from %in1, the divisor from %in2 advanced by 4 float
; elements (the checks show this as a byte offset of 16), and the remainder
; is stored to %out.
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file). For every check prefix they record a full f32 division sequence
; (two v_div_scale_f32, v_rcp_f32, an FMA refinement chain, v_div_fmas_f32,
; v_div_fixup_f32) followed by v_trunc_f32 and a final multiply-add of the
; negated quotient with the divisor onto the dividend. Differences recorded
; between prefixes:
;  * SI, CI, VI and GFX9 prefixes - the refinement chain is bracketed by
;    s_setreg_imm32_b32 writes to HW_REG_MODE.
;  * GFX10, GFX11 and GFX1150 prefixes - the chain is bracketed by
;    s_denorm_mode 15 / s_denorm_mode 12 instead, and part of the chain uses
;    v_fmac_f32.
;  * GFX1150 prefix - the final step is v_xor_b32 with 0x80000000 followed by
;    v_fmac_f32 rather than a v_fma_f32 with a negated source operand.
556 define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
557 ; SI-LABEL: frem_f32:
559 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
560 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
561 ; SI-NEXT: s_mov_b32 s11, 0xf000
562 ; SI-NEXT: s_mov_b32 s10, -1
563 ; SI-NEXT: s_waitcnt lgkmcnt(0)
564 ; SI-NEXT: s_mov_b32 s8, s4
565 ; SI-NEXT: s_mov_b32 s9, s5
566 ; SI-NEXT: s_mov_b32 s4, s6
567 ; SI-NEXT: s_mov_b32 s5, s7
568 ; SI-NEXT: s_mov_b32 s6, s10
569 ; SI-NEXT: s_mov_b32 s7, s11
570 ; SI-NEXT: s_mov_b32 s2, s10
571 ; SI-NEXT: s_mov_b32 s3, s11
572 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
573 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
574 ; SI-NEXT: s_waitcnt vmcnt(0)
575 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
576 ; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
577 ; SI-NEXT: v_rcp_f32_e32 v4, v3
578 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
579 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
580 ; SI-NEXT: v_fma_f32 v4, v5, v4, v4
581 ; SI-NEXT: v_mul_f32_e32 v5, v2, v4
582 ; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
583 ; SI-NEXT: v_fma_f32 v5, v6, v4, v5
584 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
585 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
586 ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
587 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
588 ; SI-NEXT: v_trunc_f32_e32 v2, v2
589 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
590 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
593 ; CI-LABEL: frem_f32:
595 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
596 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
597 ; CI-NEXT: s_mov_b32 s11, 0xf000
598 ; CI-NEXT: s_mov_b32 s10, -1
599 ; CI-NEXT: s_mov_b32 s2, s10
600 ; CI-NEXT: s_waitcnt lgkmcnt(0)
601 ; CI-NEXT: s_mov_b32 s8, s4
602 ; CI-NEXT: s_mov_b32 s9, s5
603 ; CI-NEXT: s_mov_b32 s4, s6
604 ; CI-NEXT: s_mov_b32 s5, s7
605 ; CI-NEXT: s_mov_b32 s6, s10
606 ; CI-NEXT: s_mov_b32 s7, s11
607 ; CI-NEXT: s_mov_b32 s3, s11
608 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
609 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
610 ; CI-NEXT: s_waitcnt vmcnt(0)
611 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
612 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
613 ; CI-NEXT: v_rcp_f32_e32 v4, v3
614 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
615 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
616 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
617 ; CI-NEXT: v_mul_f32_e32 v5, v2, v4
618 ; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
619 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
620 ; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
621 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
622 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
623 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
624 ; CI-NEXT: v_trunc_f32_e32 v2, v2
625 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
626 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
629 ; VI-LABEL: frem_f32:
631 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
632 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
633 ; VI-NEXT: s_waitcnt lgkmcnt(0)
634 ; VI-NEXT: v_mov_b32_e32 v2, s6
635 ; VI-NEXT: s_add_u32 s0, s0, 16
636 ; VI-NEXT: v_mov_b32_e32 v3, s7
637 ; VI-NEXT: s_addc_u32 s1, s1, 0
638 ; VI-NEXT: flat_load_dword v4, v[2:3]
639 ; VI-NEXT: v_mov_b32_e32 v3, s1
640 ; VI-NEXT: v_mov_b32_e32 v2, s0
641 ; VI-NEXT: flat_load_dword v2, v[2:3]
642 ; VI-NEXT: v_mov_b32_e32 v0, s4
643 ; VI-NEXT: v_mov_b32_e32 v1, s5
644 ; VI-NEXT: s_waitcnt vmcnt(0)
645 ; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4
646 ; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4
647 ; VI-NEXT: v_rcp_f32_e32 v6, v5
648 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
649 ; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
650 ; VI-NEXT: v_fma_f32 v6, v7, v6, v6
651 ; VI-NEXT: v_mul_f32_e32 v7, v3, v6
652 ; VI-NEXT: v_fma_f32 v8, -v5, v7, v3
653 ; VI-NEXT: v_fma_f32 v7, v8, v6, v7
654 ; VI-NEXT: v_fma_f32 v3, -v5, v7, v3
655 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
656 ; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7
657 ; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4
658 ; VI-NEXT: v_trunc_f32_e32 v3, v3
659 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
660 ; VI-NEXT: flat_store_dword v[0:1], v2
663 ; GFX9-LABEL: frem_f32:
665 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
666 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
667 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
668 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
669 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
670 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
671 ; GFX9-NEXT: s_waitcnt vmcnt(0)
672 ; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1
673 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1
674 ; GFX9-NEXT: v_rcp_f32_e32 v5, v4
675 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
676 ; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0
677 ; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5
678 ; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5
679 ; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3
680 ; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
681 ; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3
682 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
683 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6
684 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1
685 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
686 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
687 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
688 ; GFX9-NEXT: s_endpgm
690 ; GFX10-LABEL: frem_f32:
692 ; GFX10-NEXT: s_clause 0x1
693 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
694 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
695 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
696 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
697 ; GFX10-NEXT: s_clause 0x1
698 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
699 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
700 ; GFX10-NEXT: s_waitcnt vmcnt(0)
701 ; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1
702 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
703 ; GFX10-NEXT: v_rcp_f32_e32 v5, v4
704 ; GFX10-NEXT: s_denorm_mode 15
705 ; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0
706 ; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5
707 ; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5
708 ; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3
709 ; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5
710 ; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3
711 ; GFX10-NEXT: s_denorm_mode 12
712 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6
713 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1
714 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
715 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
716 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
717 ; GFX10-NEXT: s_endpgm
719 ; GFX11-LABEL: frem_f32:
721 ; GFX11-NEXT: s_clause 0x1
722 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
723 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
724 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
725 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
726 ; GFX11-NEXT: s_clause 0x1
727 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
728 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
729 ; GFX11-NEXT: s_waitcnt vmcnt(0)
730 ; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
731 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
732 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
733 ; GFX11-NEXT: v_rcp_f32_e32 v5, v4
734 ; GFX11-NEXT: s_denorm_mode 15
735 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
736 ; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0
737 ; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5
738 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
739 ; GFX11-NEXT: v_mul_f32_e32 v6, v3, v5
740 ; GFX11-NEXT: v_fma_f32 v7, -v4, v6, v3
741 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
742 ; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v5
743 ; GFX11-NEXT: v_fma_f32 v3, -v4, v6, v3
744 ; GFX11-NEXT: s_denorm_mode 12
745 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
746 ; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v6
747 ; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v1
748 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
749 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
750 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1
751 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
752 ; GFX11-NEXT: s_nop 0
753 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
754 ; GFX11-NEXT: s_endpgm
756 ; GFX1150-LABEL: frem_f32:
758 ; GFX1150-NEXT: s_clause 0x1
759 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
760 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
761 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0
762 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
763 ; GFX1150-NEXT: s_clause 0x1
764 ; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
765 ; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
766 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
767 ; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
768 ; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
769 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
770 ; GFX1150-NEXT: v_rcp_f32_e32 v5, v4
771 ; GFX1150-NEXT: s_denorm_mode 15
772 ; GFX1150-NEXT: v_fma_f32 v6, -v4, v5, 1.0
773 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
774 ; GFX1150-NEXT: v_fmac_f32_e32 v5, v6, v5
775 ; GFX1150-NEXT: v_mul_f32_e32 v6, v3, v5
776 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
777 ; GFX1150-NEXT: v_fma_f32 v7, -v4, v6, v3
778 ; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v5
779 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
780 ; GFX1150-NEXT: v_fma_f32 v3, -v4, v6, v3
781 ; GFX1150-NEXT: s_denorm_mode 12
782 ; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v5, v6
783 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
784 ; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v1
785 ; GFX1150-NEXT: v_trunc_f32_e32 v3, v3
786 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
787 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
788 ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
789 ; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5]
790 ; GFX1150-NEXT: s_nop 0
791 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
792 ; GFX1150-NEXT: s_endpgm
793 ptr addrspace(1) %in2) #0 {
794 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
795 %r0 = load float, ptr addrspace(1) %in1, align 4
796 %r1 = load float, ptr addrspace(1) %gep2, align 4
797 %r2 = frem float %r0, %r1
798 store float %r2, ptr addrspace(1) %out, align 4
; fast_frem_f32 -- f32 frem carrying the `fast` flag.
; The IR loads the dividend from %in1 and the divisor from element 4 of %in2,
; then stores the `frem fast` result to %out.
; For every target the expected code is the short expansion: v_rcp_f32 of the
; divisor, v_mul_f32 by the dividend, v_trunc_f32, then a fused multiply-add
; that subtracts trunc(x/y)*y from x. No v_div_scale / v_div_fmas /
; v_div_fixup sequence is expected here.
; The gfx1150 checks flip the sign with v_xor_b32 0x80000000 and use
; v_fmac_f32, where the other targets use v_fma_f32 with a negated operand.
802 define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
803 ; SI-LABEL: fast_frem_f32:
805 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
806 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
807 ; SI-NEXT: s_mov_b32 s11, 0xf000
808 ; SI-NEXT: s_mov_b32 s10, -1
809 ; SI-NEXT: s_waitcnt lgkmcnt(0)
810 ; SI-NEXT: s_mov_b32 s8, s4
811 ; SI-NEXT: s_mov_b32 s9, s5
812 ; SI-NEXT: s_mov_b32 s4, s6
813 ; SI-NEXT: s_mov_b32 s5, s7
814 ; SI-NEXT: s_mov_b32 s6, s10
815 ; SI-NEXT: s_mov_b32 s7, s11
816 ; SI-NEXT: s_mov_b32 s2, s10
817 ; SI-NEXT: s_mov_b32 s3, s11
818 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
819 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
820 ; SI-NEXT: s_waitcnt vmcnt(0)
821 ; SI-NEXT: v_rcp_f32_e32 v2, v1
822 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
823 ; SI-NEXT: v_trunc_f32_e32 v2, v2
824 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
825 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
828 ; CI-LABEL: fast_frem_f32:
830 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
831 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
832 ; CI-NEXT: s_mov_b32 s11, 0xf000
833 ; CI-NEXT: s_mov_b32 s10, -1
834 ; CI-NEXT: s_mov_b32 s2, s10
835 ; CI-NEXT: s_waitcnt lgkmcnt(0)
836 ; CI-NEXT: s_mov_b32 s8, s4
837 ; CI-NEXT: s_mov_b32 s9, s5
838 ; CI-NEXT: s_mov_b32 s4, s6
839 ; CI-NEXT: s_mov_b32 s5, s7
840 ; CI-NEXT: s_mov_b32 s6, s10
841 ; CI-NEXT: s_mov_b32 s7, s11
842 ; CI-NEXT: s_mov_b32 s3, s11
843 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
844 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
845 ; CI-NEXT: s_waitcnt vmcnt(0)
846 ; CI-NEXT: v_rcp_f32_e32 v2, v1
847 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
848 ; CI-NEXT: v_trunc_f32_e32 v2, v2
849 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
850 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
853 ; VI-LABEL: fast_frem_f32:
855 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
856 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
857 ; VI-NEXT: s_waitcnt lgkmcnt(0)
858 ; VI-NEXT: v_mov_b32_e32 v2, s6
859 ; VI-NEXT: s_add_u32 s0, s0, 16
860 ; VI-NEXT: v_mov_b32_e32 v3, s7
861 ; VI-NEXT: s_addc_u32 s1, s1, 0
862 ; VI-NEXT: flat_load_dword v4, v[2:3]
863 ; VI-NEXT: v_mov_b32_e32 v3, s1
864 ; VI-NEXT: v_mov_b32_e32 v2, s0
865 ; VI-NEXT: flat_load_dword v2, v[2:3]
866 ; VI-NEXT: v_mov_b32_e32 v0, s4
867 ; VI-NEXT: v_mov_b32_e32 v1, s5
868 ; VI-NEXT: s_waitcnt vmcnt(0)
869 ; VI-NEXT: v_rcp_f32_e32 v3, v2
870 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3
871 ; VI-NEXT: v_trunc_f32_e32 v3, v3
872 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
873 ; VI-NEXT: flat_store_dword v[0:1], v2
876 ; GFX9-LABEL: fast_frem_f32:
878 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
879 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
880 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
881 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
882 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
883 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
884 ; GFX9-NEXT: s_waitcnt vmcnt(0)
885 ; GFX9-NEXT: v_rcp_f32_e32 v3, v2
886 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
887 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
888 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
889 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
890 ; GFX9-NEXT: s_endpgm
892 ; GFX10-LABEL: fast_frem_f32:
894 ; GFX10-NEXT: s_clause 0x1
895 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
896 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
897 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
898 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
899 ; GFX10-NEXT: s_clause 0x1
900 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
901 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
902 ; GFX10-NEXT: s_waitcnt vmcnt(0)
903 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
904 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
905 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
906 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
907 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
908 ; GFX10-NEXT: s_endpgm
910 ; GFX11-LABEL: fast_frem_f32:
912 ; GFX11-NEXT: s_clause 0x1
913 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
914 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
915 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
916 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
917 ; GFX11-NEXT: s_clause 0x1
918 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
919 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
920 ; GFX11-NEXT: s_waitcnt vmcnt(0)
921 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2
922 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
923 ; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3
924 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
925 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
926 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1
927 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
928 ; GFX11-NEXT: s_nop 0
929 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
930 ; GFX11-NEXT: s_endpgm
932 ; GFX1150-LABEL: fast_frem_f32:
934 ; GFX1150-NEXT: s_clause 0x1
935 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
936 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
937 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0
938 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
939 ; GFX1150-NEXT: s_clause 0x1
940 ; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
941 ; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
942 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
943 ; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
944 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
945 ; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3
946 ; GFX1150-NEXT: v_trunc_f32_e32 v3, v3
947 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
948 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
949 ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
950 ; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5]
951 ; GFX1150-NEXT: s_nop 0
952 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
953 ; GFX1150-NEXT: s_endpgm
954 ptr addrspace(1) %in2) #0 {
; %gep2 addresses float element 4 of %in2, which is the 16-byte offset that
; shows up on the divisor load in the checks above.
955 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
956 %r0 = load float, ptr addrspace(1) %in1, align 4
957 %r1 = load float, ptr addrspace(1) %gep2, align 4
; The `fast` flag on frem is the property under test in this function.
958 %r2 = frem fast float %r0, %r1
959 store float %r2, ptr addrspace(1) %out, align 4
; unsafe_frem_f32 -- f32 frem carrying only the `afn` flag, in a function that
; uses attribute group #1 (the neighbouring frem tests use #0).
; NOTE(review): the attribute group definitions are not visible in this part
; of the file -- confirm what #1 enables before relying on it.
; The IR loads the dividend from %in1 and the divisor from element 4 of %in2,
; then stores the `frem afn` result to %out.
; For every target the expected code is the short expansion: v_rcp_f32 of the
; divisor, v_mul_f32 by the dividend, v_trunc_f32, then a fused multiply-add
; that subtracts trunc(x/y)*y from x. No v_div_scale / v_div_fmas /
; v_div_fixup sequence is expected here.
; The gfx1150 checks flip the sign with v_xor_b32 0x80000000 and use
; v_fmac_f32, where the other targets use v_fma_f32 with a negated operand.
963 define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
964 ; SI-LABEL: unsafe_frem_f32:
966 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
967 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
968 ; SI-NEXT: s_mov_b32 s11, 0xf000
969 ; SI-NEXT: s_mov_b32 s10, -1
970 ; SI-NEXT: s_waitcnt lgkmcnt(0)
971 ; SI-NEXT: s_mov_b32 s8, s4
972 ; SI-NEXT: s_mov_b32 s9, s5
973 ; SI-NEXT: s_mov_b32 s4, s6
974 ; SI-NEXT: s_mov_b32 s5, s7
975 ; SI-NEXT: s_mov_b32 s6, s10
976 ; SI-NEXT: s_mov_b32 s7, s11
977 ; SI-NEXT: s_mov_b32 s2, s10
978 ; SI-NEXT: s_mov_b32 s3, s11
979 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
980 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
981 ; SI-NEXT: s_waitcnt vmcnt(0)
982 ; SI-NEXT: v_rcp_f32_e32 v2, v1
983 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
984 ; SI-NEXT: v_trunc_f32_e32 v2, v2
985 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
986 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
989 ; CI-LABEL: unsafe_frem_f32:
991 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
992 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
993 ; CI-NEXT: s_mov_b32 s11, 0xf000
994 ; CI-NEXT: s_mov_b32 s10, -1
995 ; CI-NEXT: s_mov_b32 s2, s10
996 ; CI-NEXT: s_waitcnt lgkmcnt(0)
997 ; CI-NEXT: s_mov_b32 s8, s4
998 ; CI-NEXT: s_mov_b32 s9, s5
999 ; CI-NEXT: s_mov_b32 s4, s6
1000 ; CI-NEXT: s_mov_b32 s5, s7
1001 ; CI-NEXT: s_mov_b32 s6, s10
1002 ; CI-NEXT: s_mov_b32 s7, s11
1003 ; CI-NEXT: s_mov_b32 s3, s11
1004 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
1005 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
1006 ; CI-NEXT: s_waitcnt vmcnt(0)
1007 ; CI-NEXT: v_rcp_f32_e32 v2, v1
1008 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
1009 ; CI-NEXT: v_trunc_f32_e32 v2, v2
1010 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
1011 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1014 ; VI-LABEL: unsafe_frem_f32:
1016 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1017 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1018 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1019 ; VI-NEXT: v_mov_b32_e32 v2, s6
1020 ; VI-NEXT: s_add_u32 s0, s0, 16
1021 ; VI-NEXT: v_mov_b32_e32 v3, s7
1022 ; VI-NEXT: s_addc_u32 s1, s1, 0
1023 ; VI-NEXT: flat_load_dword v4, v[2:3]
1024 ; VI-NEXT: v_mov_b32_e32 v3, s1
1025 ; VI-NEXT: v_mov_b32_e32 v2, s0
1026 ; VI-NEXT: flat_load_dword v2, v[2:3]
1027 ; VI-NEXT: v_mov_b32_e32 v0, s4
1028 ; VI-NEXT: v_mov_b32_e32 v1, s5
1029 ; VI-NEXT: s_waitcnt vmcnt(0)
1030 ; VI-NEXT: v_rcp_f32_e32 v3, v2
1031 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3
1032 ; VI-NEXT: v_trunc_f32_e32 v3, v3
1033 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
1034 ; VI-NEXT: flat_store_dword v[0:1], v2
1037 ; GFX9-LABEL: unsafe_frem_f32:
1039 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1040 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1041 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1042 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1043 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1044 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
1045 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1046 ; GFX9-NEXT: v_rcp_f32_e32 v3, v2
1047 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
1048 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
1049 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
1050 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1051 ; GFX9-NEXT: s_endpgm
1053 ; GFX10-LABEL: unsafe_frem_f32:
1055 ; GFX10-NEXT: s_clause 0x1
1056 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1057 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1058 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1059 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1060 ; GFX10-NEXT: s_clause 0x1
1061 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1062 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
1063 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1064 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
1065 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
1066 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
1067 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
1068 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1069 ; GFX10-NEXT: s_endpgm
1071 ; GFX11-LABEL: unsafe_frem_f32:
1073 ; GFX11-NEXT: s_clause 0x1
1074 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1075 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1076 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1077 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1078 ; GFX11-NEXT: s_clause 0x1
1079 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1080 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
1081 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1082 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2
1083 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1084 ; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3
1085 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1086 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
1087 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1
1088 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1089 ; GFX11-NEXT: s_nop 0
1090 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1091 ; GFX11-NEXT: s_endpgm
1093 ; GFX1150-LABEL: unsafe_frem_f32:
1095 ; GFX1150-NEXT: s_clause 0x1
1096 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1097 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1098 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0
1099 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
1100 ; GFX1150-NEXT: s_clause 0x1
1101 ; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
1102 ; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
1103 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
1104 ; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
1105 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1106 ; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3
1107 ; GFX1150-NEXT: v_trunc_f32_e32 v3, v3
1108 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1109 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
1110 ; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
1111 ; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5]
1112 ; GFX1150-NEXT: s_nop 0
1113 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1114 ; GFX1150-NEXT: s_endpgm
1115 ptr addrspace(1) %in2) #1 {
; %gep2 addresses float element 4 of %in2, which is the 16-byte offset that
; shows up on the divisor load in the checks above.
1116 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
1117 %r0 = load float, ptr addrspace(1) %in1, align 4
1118 %r1 = load float, ptr addrspace(1) %gep2, align 4
; Only `afn` is set on the instruction itself.
1119 %r2 = frem afn float %r0, %r1
1120 store float %r2, ptr addrspace(1) %out, align 4
; frem_f64 -- f64 frem with no fast-math flags on the instruction.
; The IR loads the dividend from %in1 and the divisor from %in2 (no offset),
; then stores the frem result to %out.
; Every target is expected to emit the full division sequence
; (v_div_scale_f64 twice, v_rcp_f64, v_fma_f64 refinement steps, v_mul_f64,
; v_div_fmas_f64, v_div_fixup_f64) followed by a final v_fma_f64 that
; subtracts trunc(x/y)*y from x.
; All targets except SI truncate the quotient with v_trunc_f64.
; The SI checks contain no v_trunc_f64: between v_div_fixup_f64 and the final
; fma they expect an s_bfe_u32 / s_lshr_b64 / v_not / v_and / v_cndmask
; sequence instead -- presumably an open-coded truncation.
; NOTE(review): confirm the purpose of that SI sequence against the backend.
; The SI checks also build vcc from two v_cmp_eq_u32 and an s_xor_b64 instead
; of taking it from the second v_div_scale_f64 as the other targets do.
1124 define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1125 ; SI-LABEL: frem_f64:
1127 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
1128 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1129 ; SI-NEXT: s_mov_b32 s7, 0xf000
1130 ; SI-NEXT: s_mov_b32 s6, -1
1131 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1132 ; SI-NEXT: s_mov_b32 s4, s8
1133 ; SI-NEXT: s_mov_b32 s5, s9
1134 ; SI-NEXT: s_mov_b32 s8, s10
1135 ; SI-NEXT: s_mov_b32 s9, s11
1136 ; SI-NEXT: s_mov_b32 s10, s6
1137 ; SI-NEXT: s_mov_b32 s11, s7
1138 ; SI-NEXT: s_mov_b32 s2, s6
1139 ; SI-NEXT: s_mov_b32 s3, s7
1140 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1141 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1142 ; SI-NEXT: s_waitcnt vmcnt(0)
1143 ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1144 ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1145 ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1146 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1147 ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1148 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1149 ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
1150 ; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1151 ; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
1152 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
1153 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9
1154 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
1156 ; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
1157 ; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1158 ; SI-NEXT: v_readfirstlane_b32 s2, v5
1159 ; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014
1160 ; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01
1161 ; SI-NEXT: s_mov_b32 s1, 0xfffff
1162 ; SI-NEXT: s_mov_b32 s0, s6
1163 ; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
1164 ; SI-NEXT: v_not_b32_e32 v6, s0
1165 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1166 ; SI-NEXT: v_not_b32_e32 v7, s1
1167 ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1168 ; SI-NEXT: s_and_b32 s0, s2, 0x80000000
1169 ; SI-NEXT: s_cmp_lt_i32 s3, 0
1170 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1171 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1172 ; SI-NEXT: v_mov_b32_e32 v7, s0
1173 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1174 ; SI-NEXT: s_cmp_gt_i32 s3, 51
1175 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1176 ; SI-NEXT: v_mov_b32_e32 v7, s2
1177 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1178 ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1179 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1180 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1183 ; CI-LABEL: frem_f64:
1185 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1186 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1187 ; CI-NEXT: s_mov_b32 s11, 0xf000
1188 ; CI-NEXT: s_mov_b32 s10, -1
1189 ; CI-NEXT: s_mov_b32 s2, s10
1190 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1191 ; CI-NEXT: s_mov_b32 s8, s4
1192 ; CI-NEXT: s_mov_b32 s9, s5
1193 ; CI-NEXT: s_mov_b32 s4, s6
1194 ; CI-NEXT: s_mov_b32 s5, s7
1195 ; CI-NEXT: s_mov_b32 s6, s10
1196 ; CI-NEXT: s_mov_b32 s7, s11
1197 ; CI-NEXT: s_mov_b32 s3, s11
1198 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1199 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1200 ; CI-NEXT: s_waitcnt vmcnt(0)
1201 ; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1202 ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1203 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1204 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1205 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1206 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1207 ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1208 ; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1209 ; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1211 ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1212 ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1213 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1214 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1215 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1218 ; VI-LABEL: frem_f64:
1220 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1221 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1222 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1223 ; VI-NEXT: v_mov_b32_e32 v2, s6
1224 ; VI-NEXT: v_mov_b32_e32 v3, s7
1225 ; VI-NEXT: v_mov_b32_e32 v4, s0
1226 ; VI-NEXT: v_mov_b32_e32 v5, s1
1227 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1228 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1229 ; VI-NEXT: v_mov_b32_e32 v0, s4
1230 ; VI-NEXT: v_mov_b32_e32 v1, s5
1231 ; VI-NEXT: s_waitcnt vmcnt(0)
1232 ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
1233 ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
1234 ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1235 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1236 ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1237 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1238 ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
1239 ; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
1240 ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
1242 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
1243 ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
1244 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1245 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1246 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1249 ; GFX9-LABEL: frem_f64:
1251 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1252 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1253 ; GFX9-NEXT: v_mov_b32_e32 v12, 0
1254 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1255 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7]
1256 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3]
1257 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1258 ; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1259 ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1260 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1261 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1262 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1263 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1264 ; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1265 ; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1266 ; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1267 ; GFX9-NEXT: s_nop 1
1268 ; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1269 ; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1270 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1271 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1272 ; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5]
1273 ; GFX9-NEXT: s_endpgm
1275 ; GFX10-LABEL: frem_f64:
1277 ; GFX10-NEXT: s_clause 0x1
1278 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1279 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1280 ; GFX10-NEXT: v_mov_b32_e32 v12, 0
1281 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1282 ; GFX10-NEXT: s_clause 0x1
1283 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7]
1284 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3]
1285 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1286 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
1287 ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1288 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1289 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1290 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1291 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1292 ; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1293 ; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1294 ; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1295 ; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1296 ; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1297 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1298 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1299 ; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5]
1300 ; GFX10-NEXT: s_endpgm
1302 ; GFX11-LABEL: frem_f64:
1304 ; GFX11-NEXT: s_clause 0x1
1305 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1306 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1307 ; GFX11-NEXT: v_mov_b32_e32 v12, 0
1308 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1309 ; GFX11-NEXT: s_clause 0x1
1310 ; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7]
1311 ; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1]
1312 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1313 ; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
1314 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1315 ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1316 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1317 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1318 ; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1319 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1320 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1321 ; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1322 ; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1323 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1324 ; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1325 ; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1326 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1327 ; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1328 ; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1329 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1330 ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1331 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1332 ; GFX11-NEXT: global_store_b64 v12, v[0:1], s[4:5]
1333 ; GFX11-NEXT: s_nop 0
1334 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1335 ; GFX11-NEXT: s_endpgm
1337 ; GFX1150-LABEL: frem_f64:
1339 ; GFX1150-NEXT: s_clause 0x1
1340 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1341 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1342 ; GFX1150-NEXT: v_mov_b32_e32 v12, 0
1343 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
1344 ; GFX1150-NEXT: s_clause 0x1
1345 ; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[6:7]
1346 ; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[0:1]
1347 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
1348 ; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
1349 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
1350 ; GFX1150-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1351 ; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1352 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1353 ; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1354 ; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1355 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1356 ; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1357 ; GFX1150-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1358 ; GFX1150-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
1359 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1360 ; GFX1150-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1361 ; GFX1150-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1362 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1363 ; GFX1150-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1364 ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1365 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
1366 ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1367 ; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[4:5]
1368 ; GFX1150-NEXT: s_nop 0
1369 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1370 ; GFX1150-NEXT: s_endpgm
1371 ptr addrspace(1) %in2) #0 {
; Both operands are read at offset 0; unlike the f32 tests there is no gep.
1372 %r0 = load double, ptr addrspace(1) %in1, align 8
1373 %r1 = load double, ptr addrspace(1) %in2, align 8
; Plain frem with no fast-math flags on the instruction.
1374 %r2 = frem double %r0, %r1
1375 store double %r2, ptr addrspace(1) %out, align 8
1379 define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1380 ; SI-LABEL: fast_frem_f64:
1382 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1383 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1384 ; SI-NEXT: s_mov_b32 s3, 0xf000
1385 ; SI-NEXT: s_mov_b32 s2, -1
1386 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1387 ; SI-NEXT: s_mov_b32 s0, s4
1388 ; SI-NEXT: s_mov_b32 s1, s5
1389 ; SI-NEXT: s_mov_b32 s4, s6
1390 ; SI-NEXT: s_mov_b32 s5, s7
1391 ; SI-NEXT: s_mov_b32 s6, s2
1392 ; SI-NEXT: s_mov_b32 s7, s3
1393 ; SI-NEXT: s_mov_b32 s10, s2
1394 ; SI-NEXT: s_mov_b32 s11, s3
1395 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1396 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1397 ; SI-NEXT: s_waitcnt vmcnt(0)
1398 ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1399 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1400 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1401 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1402 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1403 ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1404 ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1405 ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1406 ; SI-NEXT: v_readfirstlane_b32 s6, v5
1407 ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
1408 ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
1409 ; SI-NEXT: s_mov_b32 s5, 0xfffff
1410 ; SI-NEXT: s_mov_b32 s4, s2
1411 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
1412 ; SI-NEXT: v_not_b32_e32 v6, s4
1413 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1414 ; SI-NEXT: v_not_b32_e32 v7, s5
1415 ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1416 ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
1417 ; SI-NEXT: s_cmp_lt_i32 s7, 0
1418 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1419 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1420 ; SI-NEXT: v_mov_b32_e32 v7, s4
1421 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1422 ; SI-NEXT: s_cmp_gt_i32 s7, 51
1423 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1424 ; SI-NEXT: v_mov_b32_e32 v7, s6
1425 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1426 ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1427 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1428 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1431 ; CI-LABEL: fast_frem_f64:
1433 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1434 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1435 ; CI-NEXT: s_mov_b32 s11, 0xf000
1436 ; CI-NEXT: s_mov_b32 s10, -1
1437 ; CI-NEXT: s_mov_b32 s2, s10
1438 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1439 ; CI-NEXT: s_mov_b32 s8, s4
1440 ; CI-NEXT: s_mov_b32 s9, s5
1441 ; CI-NEXT: s_mov_b32 s4, s6
1442 ; CI-NEXT: s_mov_b32 s5, s7
1443 ; CI-NEXT: s_mov_b32 s6, s10
1444 ; CI-NEXT: s_mov_b32 s7, s11
1445 ; CI-NEXT: s_mov_b32 s3, s11
1446 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1447 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1448 ; CI-NEXT: s_waitcnt vmcnt(0)
1449 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1450 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1451 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1452 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1453 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1454 ; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1455 ; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1456 ; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1457 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1458 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1459 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1462 ; VI-LABEL: fast_frem_f64:
1464 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1465 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1466 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1467 ; VI-NEXT: v_mov_b32_e32 v2, s6
1468 ; VI-NEXT: v_mov_b32_e32 v3, s7
1469 ; VI-NEXT: v_mov_b32_e32 v4, s0
1470 ; VI-NEXT: v_mov_b32_e32 v5, s1
1471 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1472 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1473 ; VI-NEXT: v_mov_b32_e32 v0, s4
1474 ; VI-NEXT: v_mov_b32_e32 v1, s5
1475 ; VI-NEXT: s_waitcnt vmcnt(0)
1476 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1477 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1478 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1479 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1480 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1481 ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
1482 ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1483 ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1484 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1485 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1486 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1489 ; GFX9-LABEL: fast_frem_f64:
1491 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1492 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1493 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1494 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1495 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1496 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1497 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1498 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1499 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1500 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1501 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1502 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1503 ; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1504 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1505 ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1506 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1507 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1508 ; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1509 ; GFX9-NEXT: s_endpgm
1511 ; GFX10-LABEL: fast_frem_f64:
1513 ; GFX10-NEXT: s_clause 0x1
1514 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1515 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1516 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
1517 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1518 ; GFX10-NEXT: s_clause 0x1
1519 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1520 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1521 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1522 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1523 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1524 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1525 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1526 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1527 ; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1528 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1529 ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1530 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1531 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1532 ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1533 ; GFX10-NEXT: s_endpgm
1535 ; GFX11-LABEL: fast_frem_f64:
1537 ; GFX11-NEXT: s_clause 0x1
1538 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1539 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1540 ; GFX11-NEXT: v_mov_b32_e32 v10, 0
1541 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1542 ; GFX11-NEXT: s_clause 0x1
1543 ; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
1544 ; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
1545 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1546 ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1547 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1548 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1549 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1550 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1551 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1552 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1553 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1554 ; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1555 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1556 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1557 ; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1558 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1559 ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1560 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1561 ; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5]
1562 ; GFX11-NEXT: s_nop 0
1563 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1564 ; GFX11-NEXT: s_endpgm
1566 ; GFX1150-LABEL: fast_frem_f64:
1568 ; GFX1150-NEXT: s_clause 0x1
1569 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1570 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1571 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0
1572 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
1573 ; GFX1150-NEXT: s_clause 0x1
1574 ; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
1575 ; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
1576 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
1577 ; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1578 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1579 ; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1580 ; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1581 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1582 ; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1583 ; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1584 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1585 ; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1586 ; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1587 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1588 ; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1589 ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1590 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
1591 ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1592 ; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[4:5]
1593 ; GFX1150-NEXT: s_nop 0
1594 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1595 ; GFX1150-NEXT: s_endpgm
1596 ptr addrspace(1) %in2) #0 {
1597 %r0 = load double, ptr addrspace(1) %in1, align 8
1598 %r1 = load double, ptr addrspace(1) %in2, align 8
1599 %r2 = frem fast double %r0, %r1
1600 store double %r2, ptr addrspace(1) %out, align 8
; unsafe_frem_f64 - frem on double where 'afn' is the only fast-math flag.
; Loads one double from each of %in1 and %in2, takes the remainder and stores
; it to %out.
; For every target checked below the division is expanded inline, starting
; from v_rcp_f64 and continuing with a chain of v_fma_f64 / v_mul_f64 steps;
; the stored value comes from a final v_fma_f64 that uses the negated,
; truncated quotient.
; The CI, VI and GFX9-and-newer checks truncate with v_trunc_f64. The SI checks
; contain no v_trunc_f64 and show a longer scalar/vector bit-manipulation
; sequence between the quotient and the final fma instead.
; NOTE(review) - attribute group #1 is defined outside this excerpt; presumably
; it requests unsafe FP math, going by the function name - confirm.
1604 define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1605 ; SI-LABEL: unsafe_frem_f64:
1607 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1608 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1609 ; SI-NEXT: s_mov_b32 s3, 0xf000
1610 ; SI-NEXT: s_mov_b32 s2, -1
1611 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1612 ; SI-NEXT: s_mov_b32 s0, s4
1613 ; SI-NEXT: s_mov_b32 s1, s5
1614 ; SI-NEXT: s_mov_b32 s4, s6
1615 ; SI-NEXT: s_mov_b32 s5, s7
1616 ; SI-NEXT: s_mov_b32 s6, s2
1617 ; SI-NEXT: s_mov_b32 s7, s3
1618 ; SI-NEXT: s_mov_b32 s10, s2
1619 ; SI-NEXT: s_mov_b32 s11, s3
1620 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1621 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1622 ; SI-NEXT: s_waitcnt vmcnt(0)
1623 ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1624 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1625 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1626 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1627 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1628 ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1629 ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1630 ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1631 ; SI-NEXT: v_readfirstlane_b32 s6, v5
1632 ; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
1633 ; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
1634 ; SI-NEXT: s_mov_b32 s5, 0xfffff
1635 ; SI-NEXT: s_mov_b32 s4, s2
1636 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
1637 ; SI-NEXT: v_not_b32_e32 v6, s4
1638 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1639 ; SI-NEXT: v_not_b32_e32 v7, s5
1640 ; SI-NEXT: v_and_b32_e32 v5, v5, v7
1641 ; SI-NEXT: s_and_b32 s4, s6, 0x80000000
1642 ; SI-NEXT: s_cmp_lt_i32 s7, 0
1643 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1644 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1645 ; SI-NEXT: v_mov_b32_e32 v7, s4
1646 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1647 ; SI-NEXT: s_cmp_gt_i32 s7, 51
1648 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1649 ; SI-NEXT: v_mov_b32_e32 v7, s6
1650 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
1651 ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
1652 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1653 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1656 ; CI-LABEL: unsafe_frem_f64:
1658 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1659 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1660 ; CI-NEXT: s_mov_b32 s11, 0xf000
1661 ; CI-NEXT: s_mov_b32 s10, -1
1662 ; CI-NEXT: s_mov_b32 s2, s10
1663 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1664 ; CI-NEXT: s_mov_b32 s8, s4
1665 ; CI-NEXT: s_mov_b32 s9, s5
1666 ; CI-NEXT: s_mov_b32 s4, s6
1667 ; CI-NEXT: s_mov_b32 s5, s7
1668 ; CI-NEXT: s_mov_b32 s6, s10
1669 ; CI-NEXT: s_mov_b32 s7, s11
1670 ; CI-NEXT: s_mov_b32 s3, s11
1671 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1672 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1673 ; CI-NEXT: s_waitcnt vmcnt(0)
1674 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1675 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1676 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1677 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1678 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1679 ; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1680 ; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1681 ; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1682 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1683 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1684 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1687 ; VI-LABEL: unsafe_frem_f64:
1689 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1690 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1691 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1692 ; VI-NEXT: v_mov_b32_e32 v2, s6
1693 ; VI-NEXT: v_mov_b32_e32 v3, s7
1694 ; VI-NEXT: v_mov_b32_e32 v4, s0
1695 ; VI-NEXT: v_mov_b32_e32 v5, s1
1696 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1697 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1698 ; VI-NEXT: v_mov_b32_e32 v0, s4
1699 ; VI-NEXT: v_mov_b32_e32 v1, s5
1700 ; VI-NEXT: s_waitcnt vmcnt(0)
1701 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1702 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1703 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1704 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1705 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1706 ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
1707 ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1708 ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1709 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1710 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1711 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1714 ; GFX9-LABEL: unsafe_frem_f64:
1716 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1717 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1718 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1719 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1720 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1721 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1722 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1723 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1724 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1725 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1726 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1727 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1728 ; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1729 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1730 ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1731 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1732 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1733 ; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1734 ; GFX9-NEXT: s_endpgm
1736 ; GFX10-LABEL: unsafe_frem_f64:
1738 ; GFX10-NEXT: s_clause 0x1
1739 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1740 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1741 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
1742 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1743 ; GFX10-NEXT: s_clause 0x1
1744 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1745 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1746 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1747 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1748 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1749 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1750 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1751 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1752 ; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1753 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1754 ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1755 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1756 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1757 ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1758 ; GFX10-NEXT: s_endpgm
1760 ; GFX11-LABEL: unsafe_frem_f64:
1762 ; GFX11-NEXT: s_clause 0x1
1763 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1764 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1765 ; GFX11-NEXT: v_mov_b32_e32 v10, 0
1766 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1767 ; GFX11-NEXT: s_clause 0x1
1768 ; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
1769 ; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
1770 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1771 ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1772 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
1773 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1774 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1775 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1776 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1777 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1778 ; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1779 ; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1780 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1781 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1782 ; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1783 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1784 ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1785 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1786 ; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5]
1787 ; GFX11-NEXT: s_nop 0
1788 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1789 ; GFX11-NEXT: s_endpgm
1791 ; GFX1150-LABEL: unsafe_frem_f64:
1793 ; GFX1150-NEXT: s_clause 0x1
1794 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1795 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1796 ; GFX1150-NEXT: v_mov_b32_e32 v10, 0
1797 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
1798 ; GFX1150-NEXT: s_clause 0x1
1799 ; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
1800 ; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
1801 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
1802 ; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1803 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1804 ; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1805 ; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1806 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1807 ; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1808 ; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1809 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1810 ; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1811 ; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1812 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1813 ; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1814 ; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1815 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
1816 ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1817 ; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[4:5]
1818 ; GFX1150-NEXT: s_nop 0
1819 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1820 ; GFX1150-NEXT: s_endpgm
1821 ptr addrspace(1) %in2) #1 {
1822 %r0 = load double, ptr addrspace(1) %in1, align 8
1823 %r1 = load double, ptr addrspace(1) %in2, align 8
; The operation under test - 'afn' is the only fast-math flag on this frem.
1824 %r2 = frem afn double %r0, %r1
1825 store double %r2, ptr addrspace(1) %out, align 8
; frem_v2f16 - frem on <2 x half> with no fast-math flags.
; The first operand is loaded directly from %in1; the second is loaded from
; %in2 advanced by four <2 x half> elements, which the checks show as a
; 16-byte offset. The result is stored to %out.
; The SI and CI checks convert each half lane to f32 and run the
; v_div_scale_f32 / v_rcp_f32 / v_div_fmas_f32 / v_div_fixup_f32 sequence,
; then v_trunc_f32 and v_fma_f32, and convert back with v_cvt_f16_f32.
; The VI and newer checks take v_rcp_f32 of the converted denominator and then
; use v_div_fixup_f16 and v_trunc_f16. The remainder is formed with v_fma_f16,
; except in the GFX1150 checks, which flip the sign bit with v_xor_b32 0x8000
; and use v_fmac_f16.
; The SI, CI and VI checks combine the two lanes with a 16-bit shift and
; v_or_b32; the GFX9 and newer checks use v_pack_b32_f16.
; NOTE(review) - attribute group #0 is defined outside this excerpt.
1829 define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
1830 ; SI-LABEL: frem_v2f16:
1832 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1833 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1834 ; SI-NEXT: s_mov_b32 s3, 0xf000
1835 ; SI-NEXT: s_mov_b32 s2, -1
1836 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1837 ; SI-NEXT: s_mov_b32 s0, s4
1838 ; SI-NEXT: s_mov_b32 s1, s5
1839 ; SI-NEXT: s_mov_b32 s4, s6
1840 ; SI-NEXT: s_mov_b32 s5, s7
1841 ; SI-NEXT: s_mov_b32 s6, s2
1842 ; SI-NEXT: s_mov_b32 s7, s3
1843 ; SI-NEXT: s_mov_b32 s10, s2
1844 ; SI-NEXT: s_mov_b32 s11, s3
1845 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
1846 ; SI-NEXT: s_waitcnt vmcnt(0)
1847 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
1848 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1849 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1850 ; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16
1851 ; SI-NEXT: s_waitcnt vmcnt(0)
1852 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
1853 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1854 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1855 ; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
1856 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
1857 ; SI-NEXT: v_rcp_f32_e32 v6, v5
1858 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1859 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
1860 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6
1861 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6
1862 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
1863 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7
1864 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
1865 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1866 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1867 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
1868 ; SI-NEXT: v_trunc_f32_e32 v4, v4
1869 ; SI-NEXT: v_fma_f32 v0, -v4, v2, v0
1870 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1871 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1872 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1873 ; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
1874 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1
1875 ; SI-NEXT: v_rcp_f32_e32 v5, v4
1876 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1877 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
1878 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5
1879 ; SI-NEXT: v_mul_f32_e32 v6, v2, v5
1880 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v2
1881 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6
1882 ; SI-NEXT: v_fma_f32 v2, -v4, v6, v2
1883 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1884 ; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
1885 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
1886 ; SI-NEXT: v_trunc_f32_e32 v2, v2
1887 ; SI-NEXT: v_fma_f32 v1, -v2, v3, v1
1888 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1889 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1890 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1893 ; CI-LABEL: frem_v2f16:
1895 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1896 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1897 ; CI-NEXT: s_mov_b32 s3, 0xf000
1898 ; CI-NEXT: s_mov_b32 s2, -1
1899 ; CI-NEXT: s_mov_b32 s10, s2
1900 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1901 ; CI-NEXT: s_mov_b32 s0, s4
1902 ; CI-NEXT: s_mov_b32 s1, s5
1903 ; CI-NEXT: s_mov_b32 s4, s6
1904 ; CI-NEXT: s_mov_b32 s5, s7
1905 ; CI-NEXT: s_mov_b32 s6, s2
1906 ; CI-NEXT: s_mov_b32 s7, s3
1907 ; CI-NEXT: s_mov_b32 s11, s3
1908 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
1909 ; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16
1910 ; CI-NEXT: s_waitcnt vmcnt(1)
1911 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
1912 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1913 ; CI-NEXT: s_waitcnt vmcnt(0)
1914 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
1915 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1916 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1917 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
1918 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
1919 ; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
1920 ; CI-NEXT: v_rcp_f32_e32 v6, v5
1921 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1922 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
1923 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
1924 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6
1925 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
1926 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
1927 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
1928 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1929 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1930 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
1931 ; CI-NEXT: v_trunc_f32_e32 v4, v4
1932 ; CI-NEXT: v_fma_f32 v0, -v4, v2, v0
1933 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1
1934 ; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
1935 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1936 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1937 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1938 ; CI-NEXT: v_rcp_f32_e32 v5, v4
1939 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1940 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
1941 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
1942 ; CI-NEXT: v_mul_f32_e32 v6, v2, v5
1943 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v2
1944 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
1945 ; CI-NEXT: v_fma_f32 v2, -v4, v6, v2
1946 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1947 ; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
1948 ; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
1949 ; CI-NEXT: v_trunc_f32_e32 v2, v2
1950 ; CI-NEXT: v_fma_f32 v1, -v2, v3, v1
1951 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1952 ; CI-NEXT: v_or_b32_e32 v0, v1, v0
1953 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1956 ; VI-LABEL: frem_v2f16:
1958 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1959 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1960 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1961 ; VI-NEXT: v_mov_b32_e32 v2, s6
1962 ; VI-NEXT: s_add_u32 s0, s0, 16
1963 ; VI-NEXT: v_mov_b32_e32 v3, s7
1964 ; VI-NEXT: s_addc_u32 s1, s1, 0
1965 ; VI-NEXT: flat_load_dword v4, v[2:3]
1966 ; VI-NEXT: v_mov_b32_e32 v3, s1
1967 ; VI-NEXT: v_mov_b32_e32 v2, s0
1968 ; VI-NEXT: flat_load_dword v2, v[2:3]
1969 ; VI-NEXT: v_mov_b32_e32 v0, s4
1970 ; VI-NEXT: v_mov_b32_e32 v1, s5
1971 ; VI-NEXT: s_waitcnt vmcnt(1)
1972 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
1973 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v3
1974 ; VI-NEXT: s_waitcnt vmcnt(0)
1975 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
1976 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
1977 ; VI-NEXT: v_rcp_f32_e32 v7, v7
1978 ; VI-NEXT: v_mul_f32_e32 v5, v5, v7
1979 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
1980 ; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3
1981 ; VI-NEXT: v_trunc_f16_e32 v5, v5
1982 ; VI-NEXT: v_fma_f16 v3, -v5, v6, v3
1983 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
1984 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v4
1985 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1986 ; VI-NEXT: v_rcp_f32_e32 v6, v6
1987 ; VI-NEXT: v_mul_f32_e32 v5, v5, v6
1988 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
1989 ; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4
1990 ; VI-NEXT: v_trunc_f16_e32 v5, v5
1991 ; VI-NEXT: v_fma_f16 v2, -v5, v2, v4
1992 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1993 ; VI-NEXT: flat_store_dword v[0:1], v2
1996 ; GFX9-LABEL: frem_v2f16:
1998 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1999 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2002 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
2003 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
2004 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2005 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
2006 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
2007 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
2008 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
2009 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
2010 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1
2011 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2012 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
2013 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4
2014 ; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2015 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2016 ; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
2017 ; GFX9-NEXT: v_trunc_f16_e32 v4, v4
2018 ; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
2019 ; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
2020 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
2021 ; GFX9-NEXT: s_endpgm
2023 ; GFX10-LABEL: frem_v2f16:
2025 ; GFX10-NEXT: s_clause 0x1
2026 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2027 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2028 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2029 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2030 ; GFX10-NEXT: s_clause 0x1
2031 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
2032 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
2033 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2034 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
2035 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
2036 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
2037 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
2038 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
2039 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1
2040 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2041 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
2042 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
2043 ; GFX10-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2044 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2045 ; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1
2046 ; GFX10-NEXT: v_trunc_f16_e32 v4, v4
2047 ; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1
2048 ; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
2049 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
2050 ; GFX10-NEXT: s_endpgm
2052 ; GFX11-LABEL: frem_v2f16:
2054 ; GFX11-NEXT: s_clause 0x1
2055 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2056 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2057 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2058 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2059 ; GFX11-NEXT: s_clause 0x1
2060 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
2061 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
2062 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2063 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
2064 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2065 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
2066 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2067 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
2068 ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
2069 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2070 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
2071 ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1
2072 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2073 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2074 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
2075 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4
2076 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2077 ; GFX11-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2078 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2079 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2080 ; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1
2081 ; GFX11-NEXT: v_trunc_f16_e32 v4, v4
2082 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2083 ; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1
2084 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1
2085 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
2086 ; GFX11-NEXT: s_nop 0
2087 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2088 ; GFX11-NEXT: s_endpgm
2090 ; GFX1150-LABEL: frem_v2f16:
2092 ; GFX1150-NEXT: s_clause 0x1
2093 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2094 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2095 ; GFX1150-NEXT: v_mov_b32_e32 v0, 0
2096 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
2097 ; GFX1150-NEXT: s_clause 0x1
2098 ; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
2099 ; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
2100 ; GFX1150-NEXT: s_waitcnt vmcnt(1)
2101 ; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1
2102 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
2103 ; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2104 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2105 ; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3
2106 ; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
2107 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2108 ; GFX1150-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2109 ; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v3, v5
2110 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2111 ; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
2112 ; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
2113 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2114 ; GFX1150-NEXT: v_fmac_f16_e32 v5, v4, v3
2115 ; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
2116 ; GFX1150-NEXT: v_rcp_f32_e32 v3, v3
2117 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2118 ; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
2119 ; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
2120 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2121 ; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
2122 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
2123 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2124 ; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
2125 ; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v5
2126 ; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5]
2127 ; GFX1150-NEXT: s_nop 0
2128 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2129 ; GFX1150-NEXT: s_endpgm
2130 ptr addrspace(1) %in2) #0 {
; Index 4 over <2 x half> elements - this is the 16-byte offset seen on the
; second load in the checks above.
2131 %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
2132 %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
2133 %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8
; The operation under test - this frem carries no fast-math flags.
2134 %r2 = frem <2 x half> %r0, %r1
2135 store <2 x half> %r2, ptr addrspace(1) %out, align 8
2139 define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
2140 ; SI-LABEL: frem_v4f16:
2142 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2143 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2144 ; SI-NEXT: s_mov_b32 s3, 0xf000
2145 ; SI-NEXT: s_mov_b32 s2, -1
2146 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2147 ; SI-NEXT: s_mov_b32 s0, s4
2148 ; SI-NEXT: s_mov_b32 s1, s5
2149 ; SI-NEXT: s_mov_b32 s4, s6
2150 ; SI-NEXT: s_mov_b32 s5, s7
2151 ; SI-NEXT: s_mov_b32 s6, s2
2152 ; SI-NEXT: s_mov_b32 s7, s3
2153 ; SI-NEXT: s_mov_b32 s10, s2
2154 ; SI-NEXT: s_mov_b32 s11, s3
2155 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2156 ; SI-NEXT: s_waitcnt vmcnt(0)
2157 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
2158 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2159 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
2160 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
2161 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
2162 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0
2163 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
2164 ; SI-NEXT: s_waitcnt vmcnt(0)
2165 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0
2166 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2167 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
2168 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
2169 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2170 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2171 ; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
2172 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
2173 ; SI-NEXT: v_rcp_f32_e32 v10, v9
2174 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2175 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2176 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10
2177 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10
2178 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
2179 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11
2180 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
2181 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2182 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
2183 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
2184 ; SI-NEXT: v_trunc_f32_e32 v8, v8
2185 ; SI-NEXT: v_fma_f32 v1, -v8, v1, v5
2186 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2187 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
2188 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2189 ; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4
2190 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4
2191 ; SI-NEXT: v_rcp_f32_e32 v9, v8
2192 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2193 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
2194 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9
2195 ; SI-NEXT: v_mul_f32_e32 v10, v5, v9
2196 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v5
2197 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10
2198 ; SI-NEXT: v_fma_f32 v5, -v8, v10, v5
2199 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2200 ; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
2201 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
2202 ; SI-NEXT: v_trunc_f32_e32 v5, v5
2203 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
2204 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
2205 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
2206 ; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
2207 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3
2208 ; SI-NEXT: v_rcp_f32_e32 v7, v5
2209 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2210 ; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
2211 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7
2212 ; SI-NEXT: v_mul_f32_e32 v8, v4, v7
2213 ; SI-NEXT: v_fma_f32 v9, -v5, v8, v4
2214 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8
2215 ; SI-NEXT: v_fma_f32 v4, -v5, v8, v4
2216 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2217 ; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
2218 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
2219 ; SI-NEXT: v_trunc_f32_e32 v4, v4
2220 ; SI-NEXT: v_fma_f32 v0, -v4, v0, v3
2221 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
2222 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2223 ; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2
2224 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2
2225 ; SI-NEXT: v_rcp_f32_e32 v5, v4
2226 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2227 ; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
2228 ; SI-NEXT: v_fma_f32 v5, v7, v5, v5
2229 ; SI-NEXT: v_mul_f32_e32 v7, v3, v5
2230 ; SI-NEXT: v_fma_f32 v8, -v4, v7, v3
2231 ; SI-NEXT: v_fma_f32 v7, v8, v5, v7
2232 ; SI-NEXT: v_fma_f32 v3, -v4, v7, v3
2233 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2234 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
2235 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
2236 ; SI-NEXT: v_trunc_f32_e32 v3, v3
2237 ; SI-NEXT: v_fma_f32 v2, -v3, v6, v2
2238 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
2239 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
2240 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2243 ; CI-LABEL: frem_v4f16:
2245 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2246 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2247 ; CI-NEXT: s_mov_b32 s3, 0xf000
2248 ; CI-NEXT: s_mov_b32 s2, -1
2249 ; CI-NEXT: s_mov_b32 s10, s2
2250 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2251 ; CI-NEXT: s_mov_b32 s0, s4
2252 ; CI-NEXT: s_mov_b32 s1, s5
2253 ; CI-NEXT: s_mov_b32 s4, s6
2254 ; CI-NEXT: s_mov_b32 s5, s7
2255 ; CI-NEXT: s_mov_b32 s6, s2
2256 ; CI-NEXT: s_mov_b32 s7, s3
2257 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2258 ; CI-NEXT: s_mov_b32 s11, s3
2259 ; CI-NEXT: s_waitcnt vmcnt(0)
2260 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0
2261 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2262 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v0
2263 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
2264 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v1
2265 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v0
2266 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
2267 ; CI-NEXT: s_waitcnt vmcnt(0)
2268 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
2269 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2270 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2271 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
2272 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2273 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2274 ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
2275 ; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
2276 ; CI-NEXT: v_rcp_f32_e32 v10, v9
2277 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2278 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2279 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10
2280 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10
2281 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
2282 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11
2283 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
2284 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2285 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
2286 ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
2287 ; CI-NEXT: v_trunc_f32_e32 v8, v8
2288 ; CI-NEXT: v_fma_f32 v1, -v8, v1, v5
2289 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4
2290 ; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4
2291 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
2292 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2293 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2294 ; CI-NEXT: v_rcp_f32_e32 v9, v8
2295 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2296 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
2297 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9
2298 ; CI-NEXT: v_mul_f32_e32 v10, v5, v9
2299 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v5
2300 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10
2301 ; CI-NEXT: v_fma_f32 v5, -v8, v10, v5
2302 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2303 ; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
2304 ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
2305 ; CI-NEXT: v_trunc_f32_e32 v5, v5
2306 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
2307 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3
2308 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
2309 ; CI-NEXT: v_or_b32_e32 v1, v4, v1
2310 ; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
2311 ; CI-NEXT: v_rcp_f32_e32 v7, v5
2312 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2313 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
2314 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
2315 ; CI-NEXT: v_mul_f32_e32 v8, v4, v7
2316 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v4
2317 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
2318 ; CI-NEXT: v_fma_f32 v4, -v5, v8, v4
2319 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2320 ; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
2321 ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
2322 ; CI-NEXT: v_trunc_f32_e32 v4, v4
2323 ; CI-NEXT: v_fma_f32 v0, -v4, v0, v3
2324 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2
2325 ; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2
2326 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2327 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2328 ; CI-NEXT: v_rcp_f32_e32 v5, v4
2329 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2330 ; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
2331 ; CI-NEXT: v_fma_f32 v5, v7, v5, v5
2332 ; CI-NEXT: v_mul_f32_e32 v7, v3, v5
2333 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v3
2334 ; CI-NEXT: v_fma_f32 v7, v8, v5, v7
2335 ; CI-NEXT: v_fma_f32 v3, -v4, v7, v3
2336 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2337 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
2338 ; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
2339 ; CI-NEXT: v_trunc_f32_e32 v3, v3
2340 ; CI-NEXT: v_fma_f32 v2, -v3, v6, v2
2341 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2342 ; CI-NEXT: v_or_b32_e32 v0, v2, v0
2343 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2346 ; VI-LABEL: frem_v4f16:
2348 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2349 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2350 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2351 ; VI-NEXT: v_mov_b32_e32 v2, s6
2352 ; VI-NEXT: s_add_u32 s0, s0, 32
2353 ; VI-NEXT: s_addc_u32 s1, s1, 0
2354 ; VI-NEXT: v_mov_b32_e32 v5, s1
2355 ; VI-NEXT: v_mov_b32_e32 v4, s0
2356 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
2357 ; VI-NEXT: v_mov_b32_e32 v3, s7
2358 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
2359 ; VI-NEXT: v_mov_b32_e32 v0, s4
2360 ; VI-NEXT: v_mov_b32_e32 v1, s5
2361 ; VI-NEXT: s_waitcnt vmcnt(1)
2362 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
2363 ; VI-NEXT: v_cvt_f32_f16_e32 v9, v8
2364 ; VI-NEXT: s_waitcnt vmcnt(0)
2365 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
2366 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
2367 ; VI-NEXT: v_rcp_f32_e32 v9, v9
2368 ; VI-NEXT: v_mul_f32_e32 v7, v7, v9
2369 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
2370 ; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6
2371 ; VI-NEXT: v_trunc_f16_e32 v7, v7
2372 ; VI-NEXT: v_fma_f16 v6, -v7, v8, v6
2373 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v5
2374 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
2375 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2376 ; VI-NEXT: v_rcp_f32_e32 v8, v8
2377 ; VI-NEXT: v_mul_f32_e32 v7, v7, v8
2378 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
2379 ; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3
2380 ; VI-NEXT: v_trunc_f16_e32 v7, v7
2381 ; VI-NEXT: v_fma_f16 v3, -v7, v5, v3
2382 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
2383 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
2384 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
2385 ; VI-NEXT: v_or_b32_e32 v3, v3, v6
2386 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
2387 ; VI-NEXT: v_rcp_f32_e32 v8, v8
2388 ; VI-NEXT: v_mul_f32_e32 v6, v6, v8
2389 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
2390 ; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5
2391 ; VI-NEXT: v_trunc_f16_e32 v6, v6
2392 ; VI-NEXT: v_fma_f16 v5, -v6, v7, v5
2393 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v4
2394 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
2395 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2396 ; VI-NEXT: v_rcp_f32_e32 v7, v7
2397 ; VI-NEXT: v_mul_f32_e32 v6, v6, v7
2398 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
2399 ; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2
2400 ; VI-NEXT: v_trunc_f16_e32 v6, v6
2401 ; VI-NEXT: v_fma_f16 v2, -v6, v4, v2
2402 ; VI-NEXT: v_or_b32_e32 v2, v2, v5
2403 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2406 ; GFX9-LABEL: frem_v4f16:
2408 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2409 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2410 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2411 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2412 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2413 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2414 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2415 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3
2416 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
2417 ; GFX9-NEXT: v_mad_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
2418 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1
2419 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5
2420 ; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1
2421 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2422 ; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3
2423 ; GFX9-NEXT: v_rcp_f32_e32 v6, v6
2424 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2425 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2426 ; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
2427 ; GFX9-NEXT: v_trunc_f16_e32 v6, v6
2428 ; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
2429 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
2430 ; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
2431 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3
2432 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
2433 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
2434 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
2435 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
2436 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2437 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
2438 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
2439 ; GFX9-NEXT: v_mad_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2440 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2441 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
2442 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5
2443 ; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
2444 ; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
2445 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2446 ; GFX9-NEXT: s_endpgm
2448 ; GFX10-LABEL: frem_v4f16:
2450 ; GFX10-NEXT: s_clause 0x1
2451 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2452 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2453 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
2454 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2455 ; GFX10-NEXT: s_clause 0x1
2456 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2457 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2458 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2459 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3
2460 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5
2461 ; GFX10-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
2462 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
2463 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5
2464 ; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1
2465 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2466 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
2467 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6
2468 ; GFX10-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2469 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2470 ; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1
2471 ; GFX10-NEXT: v_trunc_f16_e32 v6, v6
2472 ; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1
2473 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
2474 ; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1
2475 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3
2476 ; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
2477 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
2478 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
2479 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0
2480 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2481 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
2482 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5
2483 ; GFX10-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2484 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2485 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
2486 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5
2487 ; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0
2488 ; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
2489 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2490 ; GFX10-NEXT: s_endpgm
2492 ; GFX11-LABEL: frem_v4f16:
2494 ; GFX11-NEXT: s_clause 0x1
2495 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2496 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2497 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2498 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2499 ; GFX11-NEXT: s_clause 0x1
2500 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2501 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
2502 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2503 ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3
2504 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2505 ; GFX11-NEXT: v_rcp_f32_e32 v5, v5
2506 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2507 ; GFX11-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
2508 ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1
2509 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2510 ; GFX11-NEXT: v_trunc_f16_e32 v5, v5
2511 ; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1
2512 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2513 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2514 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3
2515 ; GFX11-NEXT: v_rcp_f32_e32 v6, v6
2516 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2517 ; GFX11-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2518 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2519 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2520 ; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1
2521 ; GFX11-NEXT: v_trunc_f16_e32 v6, v6
2522 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2523 ; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1
2524 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
2525 ; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1
2526 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2527 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3
2528 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2529 ; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
2530 ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0
2531 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2532 ; GFX11-NEXT: v_trunc_f16_e32 v3, v3
2533 ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0
2534 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2535 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2536 ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2
2537 ; GFX11-NEXT: v_rcp_f32_e32 v5, v5
2538 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2539 ; GFX11-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2540 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2541 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2542 ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0
2543 ; GFX11-NEXT: v_trunc_f16_e32 v5, v5
2544 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2545 ; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0
2546 ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0
2547 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
2548 ; GFX11-NEXT: s_nop 0
2549 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2550 ; GFX11-NEXT: s_endpgm
2552 ; GFX1150-LABEL: frem_v4f16:
2554 ; GFX1150-NEXT: s_clause 0x1
2555 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2556 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2557 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0
2558 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
2559 ; GFX1150-NEXT: s_clause 0x1
2560 ; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2561 ; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
2562 ; GFX1150-NEXT: s_waitcnt vmcnt(1)
2563 ; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0
2564 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
2565 ; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
2566 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2567 ; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
2568 ; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
2569 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2570 ; GFX1150-NEXT: v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2571 ; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v5, v7
2572 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2573 ; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
2574 ; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
2575 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2576 ; GFX1150-NEXT: v_fmac_f16_e32 v7, v6, v5
2577 ; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
2578 ; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v1
2579 ; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
2580 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2581 ; GFX1150-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0]
2582 ; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v0
2583 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2584 ; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
2585 ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
2586 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2587 ; GFX1150-NEXT: v_fma_f16 v0, v5, v2, v0
2588 ; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v3
2589 ; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v7
2590 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2591 ; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
2592 ; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
2593 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2594 ; GFX1150-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
2595 ; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v6
2596 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2597 ; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
2598 ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
2599 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2600 ; GFX1150-NEXT: v_fmac_f16_e32 v6, v5, v2
2601 ; GFX1150-NEXT: v_cvt_f32_f16_e32 v2, v3
2602 ; GFX1150-NEXT: v_rcp_f32_e32 v2, v2
2603 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2604 ; GFX1150-NEXT: v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0]
2605 ; GFX1150-NEXT: v_div_fixup_f16 v2, v2, v3, v1
2606 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2607 ; GFX1150-NEXT: v_trunc_f16_e32 v2, v2
2608 ; GFX1150-NEXT: v_xor_b32_e32 v2, 0x8000, v2
2609 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2610 ; GFX1150-NEXT: v_fmac_f16_e32 v1, v2, v3
2611 ; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v6
2612 ; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5]
2613 ; GFX1150-NEXT: s_nop 0
2614 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2615 ; GFX1150-NEXT: s_endpgm
2616 ptr addrspace(1) %in2) #0 {
2617 %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
2618 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
2619 %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16
2620 %r2 = frem <4 x half> %r0, %r1
2621 store <4 x half> %r2, ptr addrspace(1) %out, align 16
; frem_v2f32: kernel computing the frem of two <2 x float> vectors.
; The dividend is loaded from %in1, the divisor from %in2 advanced by four
; <2 x float> elements, and the remainder is stored to %out.
;
; The per-target CHECK lines inside this function are autogenerated (see the
; NOTE on the first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; In every expected sequence both lanes are expanded the same way: a quotient
; is produced by v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup, it is
; truncated with v_trunc, and a final fma subtracts trunc(quotient) * divisor
; from the dividend. SI through GFX11 negate the truncated quotient with a
; source modifier on that fma; the GFX1150 checks instead flip the sign bit
; with v_xor_b32 0x80000000 before the fma/fmac.
; NOTE(review): the s_setreg_imm32_b32 / s_denorm_mode pairs bracket the
; refinement steps -- presumably an FP32 denormal-mode toggle; confirm against
; the ISA documentation.
2625 define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
2626 ; SI-LABEL: frem_v2f32:
2628 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2629 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2630 ; SI-NEXT: s_mov_b32 s3, 0xf000
2631 ; SI-NEXT: s_mov_b32 s2, -1
2632 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2633 ; SI-NEXT: s_mov_b32 s0, s4
2634 ; SI-NEXT: s_mov_b32 s1, s5
2635 ; SI-NEXT: s_mov_b32 s4, s6
2636 ; SI-NEXT: s_mov_b32 s5, s7
2637 ; SI-NEXT: s_mov_b32 s6, s2
2638 ; SI-NEXT: s_mov_b32 s7, s3
2639 ; SI-NEXT: s_mov_b32 s10, s2
2640 ; SI-NEXT: s_mov_b32 s11, s3
2641 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2642 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2643 ; SI-NEXT: s_waitcnt vmcnt(0)
2644 ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
2645 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
2646 ; SI-NEXT: v_rcp_f32_e32 v6, v5
2647 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2648 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2649 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6
2650 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6
2651 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
2652 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7
2653 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
2654 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2655 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
2656 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
2657 ; SI-NEXT: v_trunc_f32_e32 v4, v4
2658 ; SI-NEXT: v_fma_f32 v1, -v4, v3, v1
2659 ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2660 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
2661 ; SI-NEXT: v_rcp_f32_e32 v5, v4
2662 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2663 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
2664 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5
2665 ; SI-NEXT: v_mul_f32_e32 v6, v3, v5
2666 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v3
2667 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6
2668 ; SI-NEXT: v_fma_f32 v3, -v4, v6, v3
2669 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2670 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
2671 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2672 ; SI-NEXT: v_trunc_f32_e32 v3, v3
2673 ; SI-NEXT: v_fma_f32 v0, -v3, v2, v0
2674 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2677 ; CI-LABEL: frem_v2f32:
2679 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2680 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2681 ; CI-NEXT: s_mov_b32 s3, 0xf000
2682 ; CI-NEXT: s_mov_b32 s2, -1
2683 ; CI-NEXT: s_mov_b32 s10, s2
2684 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2685 ; CI-NEXT: s_mov_b32 s0, s4
2686 ; CI-NEXT: s_mov_b32 s1, s5
2687 ; CI-NEXT: s_mov_b32 s4, s6
2688 ; CI-NEXT: s_mov_b32 s5, s7
2689 ; CI-NEXT: s_mov_b32 s6, s2
2690 ; CI-NEXT: s_mov_b32 s7, s3
2691 ; CI-NEXT: s_mov_b32 s11, s3
2692 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2693 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2694 ; CI-NEXT: s_waitcnt vmcnt(0)
2695 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
2696 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
2697 ; CI-NEXT: v_rcp_f32_e32 v6, v5
2698 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2699 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2700 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
2701 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6
2702 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
2703 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
2704 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
2705 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2706 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
2707 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
2708 ; CI-NEXT: v_trunc_f32_e32 v4, v4
2709 ; CI-NEXT: v_fma_f32 v1, -v4, v3, v1
2710 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
2711 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2712 ; CI-NEXT: v_rcp_f32_e32 v5, v4
2713 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2714 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
2715 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
2716 ; CI-NEXT: v_mul_f32_e32 v6, v3, v5
2717 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v3
2718 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
2719 ; CI-NEXT: v_fma_f32 v3, -v4, v6, v3
2720 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2721 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
2722 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2723 ; CI-NEXT: v_trunc_f32_e32 v3, v3
2724 ; CI-NEXT: v_fma_f32 v0, -v3, v2, v0
2725 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2728 ; VI-LABEL: frem_v2f32:
2730 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2731 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2732 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2733 ; VI-NEXT: v_mov_b32_e32 v2, s6
2734 ; VI-NEXT: s_add_u32 s0, s0, 32
2735 ; VI-NEXT: s_addc_u32 s1, s1, 0
2736 ; VI-NEXT: v_mov_b32_e32 v5, s1
2737 ; VI-NEXT: v_mov_b32_e32 v3, s7
2738 ; VI-NEXT: v_mov_b32_e32 v4, s0
2739 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
2740 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
2741 ; VI-NEXT: v_mov_b32_e32 v0, s4
2742 ; VI-NEXT: v_mov_b32_e32 v1, s5
2743 ; VI-NEXT: s_waitcnt vmcnt(0)
2744 ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3
2745 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3
2746 ; VI-NEXT: v_rcp_f32_e32 v8, v7
2747 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2748 ; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
2749 ; VI-NEXT: v_fma_f32 v8, v9, v8, v8
2750 ; VI-NEXT: v_mul_f32_e32 v9, v6, v8
2751 ; VI-NEXT: v_fma_f32 v10, -v7, v9, v6
2752 ; VI-NEXT: v_fma_f32 v9, v10, v8, v9
2753 ; VI-NEXT: v_fma_f32 v6, -v7, v9, v6
2754 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2755 ; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
2756 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3
2757 ; VI-NEXT: v_trunc_f32_e32 v6, v6
2758 ; VI-NEXT: v_fma_f32 v3, -v6, v5, v3
2759 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2
2760 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2
2761 ; VI-NEXT: v_rcp_f32_e32 v7, v6
2762 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2763 ; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2764 ; VI-NEXT: v_fma_f32 v7, v8, v7, v7
2765 ; VI-NEXT: v_mul_f32_e32 v8, v5, v7
2766 ; VI-NEXT: v_fma_f32 v9, -v6, v8, v5
2767 ; VI-NEXT: v_fma_f32 v8, v9, v7, v8
2768 ; VI-NEXT: v_fma_f32 v5, -v6, v8, v5
2769 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2770 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2771 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2
2772 ; VI-NEXT: v_trunc_f32_e32 v5, v5
2773 ; VI-NEXT: v_fma_f32 v2, -v5, v4, v2
2774 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2777 ; GFX9-LABEL: frem_v2f32:
2779 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2780 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2781 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2782 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2783 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2784 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2785 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2786 ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1
2787 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
2788 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6
2789 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2790 ; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2791 ; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7
2792 ; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7
2793 ; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5
2794 ; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8
2795 ; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5
2796 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2797 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2798 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2799 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
2800 ; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1
2801 ; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0
2802 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2803 ; GFX9-NEXT: v_rcp_f32_e32 v6, v5
2804 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2805 ; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2806 ; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6
2807 ; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6
2808 ; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3
2809 ; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7
2810 ; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3
2811 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2812 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2813 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2814 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
2815 ; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0
2816 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2817 ; GFX9-NEXT: s_endpgm
2819 ; GFX10-LABEL: frem_v2f32:
2821 ; GFX10-NEXT: s_clause 0x1
2822 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2823 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2824 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
2825 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2826 ; GFX10-NEXT: s_clause 0x1
2827 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2828 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2829 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2830 ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1
2831 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2832 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6
2833 ; GFX10-NEXT: s_denorm_mode 15
2834 ; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2835 ; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7
2836 ; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7
2837 ; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5
2838 ; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7
2839 ; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5
2840 ; GFX10-NEXT: s_denorm_mode 12
2841 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2842 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2843 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
2844 ; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1
2845 ; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0
2846 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2847 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5
2848 ; GFX10-NEXT: s_denorm_mode 15
2849 ; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2850 ; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6
2851 ; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6
2852 ; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3
2853 ; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6
2854 ; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3
2855 ; GFX10-NEXT: s_denorm_mode 12
2856 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2857 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2858 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
2859 ; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0
2860 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2861 ; GFX10-NEXT: s_endpgm
2863 ; GFX11-LABEL: frem_v2f32:
2865 ; GFX11-NEXT: s_clause 0x1
2866 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2867 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2868 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2869 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2870 ; GFX11-NEXT: s_clause 0x1
2871 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2872 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
2873 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2874 ; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
2875 ; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2876 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2877 ; GFX11-NEXT: v_rcp_f32_e32 v7, v6
2878 ; GFX11-NEXT: s_denorm_mode 15
2879 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2880 ; GFX11-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2881 ; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v7
2882 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2883 ; GFX11-NEXT: v_mul_f32_e32 v8, v5, v7
2884 ; GFX11-NEXT: v_fma_f32 v9, -v6, v8, v5
2885 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2886 ; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v7
2887 ; GFX11-NEXT: v_fma_f32 v5, -v6, v8, v5
2888 ; GFX11-NEXT: s_denorm_mode 12
2889 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2890 ; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2891 ; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2892 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2893 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
2894 ; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1
2895 ; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0
2896 ; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2897 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2898 ; GFX11-NEXT: v_rcp_f32_e32 v6, v5
2899 ; GFX11-NEXT: s_denorm_mode 15
2900 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
2901 ; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2902 ; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v6
2903 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2904 ; GFX11-NEXT: v_mul_f32_e32 v7, v3, v6
2905 ; GFX11-NEXT: v_fma_f32 v8, -v5, v7, v3
2906 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2907 ; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v6
2908 ; GFX11-NEXT: v_fma_f32 v3, -v5, v7, v3
2909 ; GFX11-NEXT: s_denorm_mode 12
2910 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2911 ; GFX11-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2912 ; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2913 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2914 ; GFX11-NEXT: v_trunc_f32_e32 v3, v3
2915 ; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0
2916 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
2917 ; GFX11-NEXT: s_nop 0
2918 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2919 ; GFX11-NEXT: s_endpgm
2921 ; GFX1150-LABEL: frem_v2f32:
2923 ; GFX1150-NEXT: s_clause 0x1
2924 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
2925 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
2926 ; GFX1150-NEXT: v_mov_b32_e32 v4, 0
2927 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
2928 ; GFX1150-NEXT: s_clause 0x1
2929 ; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2930 ; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
2931 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
2932 ; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
2933 ; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2934 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
2935 ; GFX1150-NEXT: v_rcp_f32_e32 v7, v6
2936 ; GFX1150-NEXT: s_denorm_mode 15
2937 ; GFX1150-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2938 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2939 ; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v7
2940 ; GFX1150-NEXT: v_mul_f32_e32 v8, v5, v7
2941 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2942 ; GFX1150-NEXT: v_fma_f32 v9, -v6, v8, v5
2943 ; GFX1150-NEXT: v_fmac_f32_e32 v8, v9, v7
2944 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2945 ; GFX1150-NEXT: v_fma_f32 v5, -v6, v8, v5
2946 ; GFX1150-NEXT: s_denorm_mode 12
2947 ; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2948 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2949 ; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2950 ; GFX1150-NEXT: v_trunc_f32_e32 v5, v5
2951 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2952 ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
2953 ; GFX1150-NEXT: v_fma_f32 v1, v5, v3, v1
2954 ; GFX1150-NEXT: v_div_scale_f32 v5, null, v2, v2, v0
2955 ; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2956 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
2957 ; GFX1150-NEXT: v_rcp_f32_e32 v6, v5
2958 ; GFX1150-NEXT: s_denorm_mode 15
2959 ; GFX1150-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2960 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2961 ; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v6
2962 ; GFX1150-NEXT: v_mul_f32_e32 v7, v3, v6
2963 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2964 ; GFX1150-NEXT: v_fma_f32 v8, -v5, v7, v3
2965 ; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v6
2966 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2967 ; GFX1150-NEXT: v_fma_f32 v3, -v5, v7, v3
2968 ; GFX1150-NEXT: s_denorm_mode 12
2969 ; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2970 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2971 ; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2972 ; GFX1150-NEXT: v_trunc_f32_e32 v3, v3
2973 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2974 ; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
2975 ; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2
2976 ; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5]
2977 ; GFX1150-NEXT: s_nop 0
2978 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2979 ; GFX1150-NEXT: s_endpgm
2980 ptr addrspace(1) %in2) #0 {
; GEP index 4 over <2 x float> places the divisor 32 bytes past %in2; this is
; the "offset:32" on the second load (or the "s_add_u32 ..., 32" on VI) in the
; CHECK lines of this function.
2981 %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
2982 %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
2983 %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8
2984 %r2 = frem <2 x float> %r0, %r1
2985 store <2 x float> %r2, ptr addrspace(1) %out, align 8
; frem_v4f32 - element-wise frem on <4 x float>.
;
; The kernel loads one <4 x float> from %in1 and a second one from %in2
; advanced by four vectors, takes the frem of the two, and stores the result
; to %out. The advance shows up in the checks below either as "offset:64" on
; the second load or, for the flat-load target, as an explicit add of 64 to
; the address.
;
; Shape of the expected code, as recorded by the checks below:
;   * The four lanes are expanded one after another, highest lane first
;     (v3 with v7, then v2 with v6, v1 with v5, v0 with v4).
;   * Per lane the sequence is two v_div_scale_f32, v_rcp_f32, a chain of
;     FMAs, v_div_fmas_f32, v_div_fixup_f32, v_trunc_f32 on the quotient, and
;     a final FMA computing dividend - trunc(quotient) * divisor.
;   * The FMA chain is bracketed by a mode switch. Targets up to and including
;     GFX9 write HW_REG_MODE with s_setreg_imm32_b32; GFX10 and newer use
;     s_denorm_mode.
;   * The GFX1150 checks differ in the last step. The truncated quotient is
;     negated with v_xor_b32 0x80000000 and then fed to a plain v_fma_f32 or
;     v_fmac_f32, instead of using an FMA with a negated source operand.
;
; The check lines are autogenerated (see the NOTE at the top of the file).
; Regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
2989 define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
2990 ; SI-LABEL: frem_v4f32:
2992 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2993 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2994 ; SI-NEXT: s_mov_b32 s3, 0xf000
2995 ; SI-NEXT: s_mov_b32 s2, -1
2996 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2997 ; SI-NEXT: s_mov_b32 s0, s4
2998 ; SI-NEXT: s_mov_b32 s1, s5
2999 ; SI-NEXT: s_mov_b32 s4, s6
3000 ; SI-NEXT: s_mov_b32 s5, s7
3001 ; SI-NEXT: s_mov_b32 s6, s2
3002 ; SI-NEXT: s_mov_b32 s7, s3
3003 ; SI-NEXT: s_mov_b32 s10, s2
3004 ; SI-NEXT: s_mov_b32 s11, s3
3005 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
3006 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
3007 ; SI-NEXT: s_waitcnt vmcnt(0)
3008 ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
3009 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
3010 ; SI-NEXT: v_rcp_f32_e32 v10, v9
3011 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3012 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
3013 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10
3014 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10
3015 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
3016 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11
3017 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
3018 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3019 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
3020 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
3021 ; SI-NEXT: v_trunc_f32_e32 v8, v8
3022 ; SI-NEXT: v_fma_f32 v3, -v8, v7, v3
3023 ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
3024 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
3025 ; SI-NEXT: v_rcp_f32_e32 v9, v8
3026 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3027 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
3028 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9
3029 ; SI-NEXT: v_mul_f32_e32 v10, v7, v9
3030 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v7
3031 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10
3032 ; SI-NEXT: v_fma_f32 v7, -v8, v10, v7
3033 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3034 ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
3035 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3036 ; SI-NEXT: v_trunc_f32_e32 v7, v7
3037 ; SI-NEXT: v_fma_f32 v2, -v7, v6, v2
3038 ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
3039 ; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
3040 ; SI-NEXT: v_rcp_f32_e32 v8, v7
3041 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3042 ; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
3043 ; SI-NEXT: v_fma_f32 v8, v9, v8, v8
3044 ; SI-NEXT: v_mul_f32_e32 v9, v6, v8
3045 ; SI-NEXT: v_fma_f32 v10, -v7, v9, v6
3046 ; SI-NEXT: v_fma_f32 v9, v10, v8, v9
3047 ; SI-NEXT: v_fma_f32 v6, -v7, v9, v6
3048 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3049 ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
3050 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3051 ; SI-NEXT: v_trunc_f32_e32 v6, v6
3052 ; SI-NEXT: v_fma_f32 v1, -v6, v5, v1
3053 ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
3054 ; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
3055 ; SI-NEXT: v_rcp_f32_e32 v7, v6
3056 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3057 ; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
3058 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7
3059 ; SI-NEXT: v_mul_f32_e32 v8, v5, v7
3060 ; SI-NEXT: v_fma_f32 v9, -v6, v8, v5
3061 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8
3062 ; SI-NEXT: v_fma_f32 v5, -v6, v8, v5
3063 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3064 ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
3065 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3066 ; SI-NEXT: v_trunc_f32_e32 v5, v5
3067 ; SI-NEXT: v_fma_f32 v0, -v5, v4, v0
3068 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3071 ; CI-LABEL: frem_v4f32:
3073 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3074 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
3075 ; CI-NEXT: s_mov_b32 s3, 0xf000
3076 ; CI-NEXT: s_mov_b32 s2, -1
3077 ; CI-NEXT: s_mov_b32 s10, s2
3078 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3079 ; CI-NEXT: s_mov_b32 s0, s4
3080 ; CI-NEXT: s_mov_b32 s1, s5
3081 ; CI-NEXT: s_mov_b32 s4, s6
3082 ; CI-NEXT: s_mov_b32 s5, s7
3083 ; CI-NEXT: s_mov_b32 s6, s2
3084 ; CI-NEXT: s_mov_b32 s7, s3
3085 ; CI-NEXT: s_mov_b32 s11, s3
3086 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
3087 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
3088 ; CI-NEXT: s_waitcnt vmcnt(0)
3089 ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
3090 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
3091 ; CI-NEXT: v_rcp_f32_e32 v10, v9
3092 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3093 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
3094 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10
3095 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10
3096 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
3097 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11
3098 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
3099 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3100 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
3101 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
3102 ; CI-NEXT: v_trunc_f32_e32 v8, v8
3103 ; CI-NEXT: v_fma_f32 v3, -v8, v7, v3
3104 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
3105 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
3106 ; CI-NEXT: v_rcp_f32_e32 v9, v8
3107 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3108 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
3109 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9
3110 ; CI-NEXT: v_mul_f32_e32 v10, v7, v9
3111 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v7
3112 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10
3113 ; CI-NEXT: v_fma_f32 v7, -v8, v10, v7
3114 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3115 ; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
3116 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3117 ; CI-NEXT: v_trunc_f32_e32 v7, v7
3118 ; CI-NEXT: v_fma_f32 v2, -v7, v6, v2
3119 ; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
3120 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
3121 ; CI-NEXT: v_rcp_f32_e32 v8, v7
3122 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3123 ; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
3124 ; CI-NEXT: v_fma_f32 v8, v9, v8, v8
3125 ; CI-NEXT: v_mul_f32_e32 v9, v6, v8
3126 ; CI-NEXT: v_fma_f32 v10, -v7, v9, v6
3127 ; CI-NEXT: v_fma_f32 v9, v10, v8, v9
3128 ; CI-NEXT: v_fma_f32 v6, -v7, v9, v6
3129 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3130 ; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
3131 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3132 ; CI-NEXT: v_trunc_f32_e32 v6, v6
3133 ; CI-NEXT: v_fma_f32 v1, -v6, v5, v1
3134 ; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
3135 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
3136 ; CI-NEXT: v_rcp_f32_e32 v7, v6
3137 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3138 ; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
3139 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
3140 ; CI-NEXT: v_mul_f32_e32 v8, v5, v7
3141 ; CI-NEXT: v_fma_f32 v9, -v6, v8, v5
3142 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
3143 ; CI-NEXT: v_fma_f32 v5, -v6, v8, v5
3144 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3145 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
3146 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3147 ; CI-NEXT: v_trunc_f32_e32 v5, v5
3148 ; CI-NEXT: v_fma_f32 v0, -v5, v4, v0
3149 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3152 ; VI-LABEL: frem_v4f32:
3154 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3155 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3156 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3157 ; VI-NEXT: v_mov_b32_e32 v0, s6
3158 ; VI-NEXT: s_add_u32 s0, s0, 64
3159 ; VI-NEXT: s_addc_u32 s1, s1, 0
3160 ; VI-NEXT: v_mov_b32_e32 v5, s1
3161 ; VI-NEXT: v_mov_b32_e32 v1, s7
3162 ; VI-NEXT: v_mov_b32_e32 v4, s0
3163 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
3164 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
3165 ; VI-NEXT: v_mov_b32_e32 v8, s4
3166 ; VI-NEXT: v_mov_b32_e32 v9, s5
3167 ; VI-NEXT: s_waitcnt vmcnt(0)
3168 ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3
3169 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3
3170 ; VI-NEXT: v_rcp_f32_e32 v12, v11
3171 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3172 ; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0
3173 ; VI-NEXT: v_fma_f32 v12, v13, v12, v12
3174 ; VI-NEXT: v_mul_f32_e32 v13, v10, v12
3175 ; VI-NEXT: v_fma_f32 v14, -v11, v13, v10
3176 ; VI-NEXT: v_fma_f32 v13, v14, v12, v13
3177 ; VI-NEXT: v_fma_f32 v10, -v11, v13, v10
3178 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3179 ; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13
3180 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3
3181 ; VI-NEXT: v_trunc_f32_e32 v10, v10
3182 ; VI-NEXT: v_fma_f32 v3, -v10, v7, v3
3183 ; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2
3184 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
3185 ; VI-NEXT: v_rcp_f32_e32 v11, v10
3186 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3187 ; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0
3188 ; VI-NEXT: v_fma_f32 v11, v12, v11, v11
3189 ; VI-NEXT: v_mul_f32_e32 v12, v7, v11
3190 ; VI-NEXT: v_fma_f32 v13, -v10, v12, v7
3191 ; VI-NEXT: v_fma_f32 v12, v13, v11, v12
3192 ; VI-NEXT: v_fma_f32 v7, -v10, v12, v7
3193 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3194 ; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12
3195 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3196 ; VI-NEXT: v_trunc_f32_e32 v7, v7
3197 ; VI-NEXT: v_fma_f32 v2, -v7, v6, v2
3198 ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
3199 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
3200 ; VI-NEXT: v_rcp_f32_e32 v10, v7
3201 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3202 ; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0
3203 ; VI-NEXT: v_fma_f32 v10, v11, v10, v10
3204 ; VI-NEXT: v_mul_f32_e32 v11, v6, v10
3205 ; VI-NEXT: v_fma_f32 v12, -v7, v11, v6
3206 ; VI-NEXT: v_fma_f32 v11, v12, v10, v11
3207 ; VI-NEXT: v_fma_f32 v6, -v7, v11, v6
3208 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3209 ; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11
3210 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3211 ; VI-NEXT: v_trunc_f32_e32 v6, v6
3212 ; VI-NEXT: v_fma_f32 v1, -v6, v5, v1
3213 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
3214 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
3215 ; VI-NEXT: v_rcp_f32_e32 v7, v6
3216 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3217 ; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0
3218 ; VI-NEXT: v_fma_f32 v7, v10, v7, v7
3219 ; VI-NEXT: v_mul_f32_e32 v10, v5, v7
3220 ; VI-NEXT: v_fma_f32 v11, -v6, v10, v5
3221 ; VI-NEXT: v_fma_f32 v10, v11, v7, v10
3222 ; VI-NEXT: v_fma_f32 v5, -v6, v10, v5
3223 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3224 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10
3225 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3226 ; VI-NEXT: v_trunc_f32_e32 v5, v5
3227 ; VI-NEXT: v_fma_f32 v0, -v5, v4, v0
3228 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
3231 ; GFX9-LABEL: frem_v4f32:
3233 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3234 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3235 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
3236 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3237 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
3238 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
3239 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3240 ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3
3241 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3
3242 ; GFX9-NEXT: v_rcp_f32_e32 v11, v10
3243 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3244 ; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0
3245 ; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11
3246 ; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11
3247 ; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9
3248 ; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12
3249 ; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9
3250 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3251 ; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12
3252 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3
3253 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9
3254 ; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3
3255 ; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2
3256 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
3257 ; GFX9-NEXT: v_rcp_f32_e32 v10, v9
3258 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3259 ; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0
3260 ; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10
3261 ; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10
3262 ; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7
3263 ; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11
3264 ; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7
3265 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3266 ; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11
3267 ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3268 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7
3269 ; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2
3270 ; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
3271 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
3272 ; GFX9-NEXT: v_rcp_f32_e32 v9, v7
3273 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3274 ; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0
3275 ; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9
3276 ; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9
3277 ; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6
3278 ; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10
3279 ; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6
3280 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3281 ; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10
3282 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3283 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
3284 ; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1
3285 ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
3286 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
3287 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6
3288 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
3289 ; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0
3290 ; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7
3291 ; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7
3292 ; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5
3293 ; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9
3294 ; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5
3295 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
3296 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9
3297 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3298 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
3299 ; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0
3300 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
3301 ; GFX9-NEXT: s_endpgm
3303 ; GFX10-LABEL: frem_v4f32:
3305 ; GFX10-NEXT: s_clause 0x1
3306 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3307 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3308 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
3309 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3310 ; GFX10-NEXT: s_clause 0x1
3311 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
3312 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
3313 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3314 ; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3
3315 ; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
3316 ; GFX10-NEXT: v_rcp_f32_e32 v11, v10
3317 ; GFX10-NEXT: s_denorm_mode 15
3318 ; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0
3319 ; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11
3320 ; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11
3321 ; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9
3322 ; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11
3323 ; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9
3324 ; GFX10-NEXT: s_denorm_mode 12
3325 ; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12
3326 ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3
3327 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9
3328 ; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3
3329 ; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2
3330 ; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
3331 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9
3332 ; GFX10-NEXT: s_denorm_mode 15
3333 ; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0
3334 ; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10
3335 ; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10
3336 ; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7
3337 ; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10
3338 ; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7
3339 ; GFX10-NEXT: s_denorm_mode 12
3340 ; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11
3341 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3342 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7
3343 ; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2
3344 ; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1
3345 ; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3346 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7
3347 ; GFX10-NEXT: s_denorm_mode 15
3348 ; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0
3349 ; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9
3350 ; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9
3351 ; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6
3352 ; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9
3353 ; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6
3354 ; GFX10-NEXT: s_denorm_mode 12
3355 ; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10
3356 ; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3357 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6
3358 ; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1
3359 ; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0
3360 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3361 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6
3362 ; GFX10-NEXT: s_denorm_mode 15
3363 ; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0
3364 ; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7
3365 ; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7
3366 ; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5
3367 ; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7
3368 ; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5
3369 ; GFX10-NEXT: s_denorm_mode 12
3370 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9
3371 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3372 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
3373 ; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0
3374 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
3375 ; GFX10-NEXT: s_endpgm
3377 ; GFX11-LABEL: frem_v4f32:
3379 ; GFX11-NEXT: s_clause 0x1
3380 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
3381 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
3382 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
3383 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3384 ; GFX11-NEXT: s_clause 0x1
3385 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
3386 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
3387 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3388 ; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
3389 ; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
3390 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3391 ; GFX11-NEXT: v_rcp_f32_e32 v11, v10
3392 ; GFX11-NEXT: s_denorm_mode 15
3393 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3394 ; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0
3395 ; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11
3396 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3397 ; GFX11-NEXT: v_mul_f32_e32 v12, v9, v11
3398 ; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v9
3399 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3400 ; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11
3401 ; GFX11-NEXT: v_fma_f32 v9, -v10, v12, v9
3402 ; GFX11-NEXT: s_denorm_mode 12
3403 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3404 ; GFX11-NEXT: v_div_fmas_f32 v9, v9, v11, v12
3405 ; GFX11-NEXT: v_div_fixup_f32 v9, v9, v7, v3
3406 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3407 ; GFX11-NEXT: v_trunc_f32_e32 v9, v9
3408 ; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3
3409 ; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2
3410 ; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
3411 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3412 ; GFX11-NEXT: v_rcp_f32_e32 v10, v9
3413 ; GFX11-NEXT: s_denorm_mode 15
3414 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3415 ; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0
3416 ; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10
3417 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3418 ; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10
3419 ; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7
3420 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3421 ; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10
3422 ; GFX11-NEXT: v_fma_f32 v7, -v9, v11, v7
3423 ; GFX11-NEXT: s_denorm_mode 12
3424 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3425 ; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11
3426 ; GFX11-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3427 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3428 ; GFX11-NEXT: v_trunc_f32_e32 v7, v7
3429 ; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2
3430 ; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1
3431 ; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3432 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3433 ; GFX11-NEXT: v_rcp_f32_e32 v9, v7
3434 ; GFX11-NEXT: s_denorm_mode 15
3435 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3436 ; GFX11-NEXT: v_fma_f32 v10, -v7, v9, 1.0
3437 ; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v9
3438 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3439 ; GFX11-NEXT: v_mul_f32_e32 v10, v6, v9
3440 ; GFX11-NEXT: v_fma_f32 v11, -v7, v10, v6
3441 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3442 ; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v9
3443 ; GFX11-NEXT: v_fma_f32 v6, -v7, v10, v6
3444 ; GFX11-NEXT: s_denorm_mode 12
3445 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3446 ; GFX11-NEXT: v_div_fmas_f32 v6, v6, v9, v10
3447 ; GFX11-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3448 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3449 ; GFX11-NEXT: v_trunc_f32_e32 v6, v6
3450 ; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1
3451 ; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0
3452 ; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3453 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3454 ; GFX11-NEXT: v_rcp_f32_e32 v7, v6
3455 ; GFX11-NEXT: s_denorm_mode 15
3456 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3457 ; GFX11-NEXT: v_fma_f32 v9, -v6, v7, 1.0
3458 ; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v7
3459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3460 ; GFX11-NEXT: v_mul_f32_e32 v9, v5, v7
3461 ; GFX11-NEXT: v_fma_f32 v10, -v6, v9, v5
3462 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3463 ; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v7
3464 ; GFX11-NEXT: v_fma_f32 v5, -v6, v9, v5
3465 ; GFX11-NEXT: s_denorm_mode 12
3466 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3467 ; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v9
3468 ; GFX11-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3469 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3470 ; GFX11-NEXT: v_trunc_f32_e32 v5, v5
3471 ; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0
3472 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5]
3473 ; GFX11-NEXT: s_nop 0
3474 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3475 ; GFX11-NEXT: s_endpgm
3477 ; GFX1150-LABEL: frem_v4f32:
3479 ; GFX1150-NEXT: s_clause 0x1
3480 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
3481 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
3482 ; GFX1150-NEXT: v_mov_b32_e32 v8, 0
3483 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
3484 ; GFX1150-NEXT: s_clause 0x1
3485 ; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[6:7]
3486 ; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
3487 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
3488 ; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
3489 ; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
3490 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3491 ; GFX1150-NEXT: v_rcp_f32_e32 v11, v10
3492 ; GFX1150-NEXT: s_denorm_mode 15
3493 ; GFX1150-NEXT: v_fma_f32 v12, -v10, v11, 1.0
3494 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3495 ; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v11
3496 ; GFX1150-NEXT: v_mul_f32_e32 v12, v9, v11
3497 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3498 ; GFX1150-NEXT: v_fma_f32 v13, -v10, v12, v9
3499 ; GFX1150-NEXT: v_fmac_f32_e32 v12, v13, v11
3500 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3501 ; GFX1150-NEXT: v_fma_f32 v9, -v10, v12, v9
3502 ; GFX1150-NEXT: s_denorm_mode 12
3503 ; GFX1150-NEXT: v_div_fmas_f32 v9, v9, v11, v12
3504 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3505 ; GFX1150-NEXT: v_div_fixup_f32 v9, v9, v7, v3
3506 ; GFX1150-NEXT: v_trunc_f32_e32 v9, v9
3507 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3508 ; GFX1150-NEXT: v_xor_b32_e32 v9, 0x80000000, v9
3509 ; GFX1150-NEXT: v_fma_f32 v3, v9, v7, v3
3510 ; GFX1150-NEXT: v_div_scale_f32 v9, null, v6, v6, v2
3511 ; GFX1150-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
3512 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3513 ; GFX1150-NEXT: v_rcp_f32_e32 v10, v9
3514 ; GFX1150-NEXT: s_denorm_mode 15
3515 ; GFX1150-NEXT: v_fma_f32 v11, -v9, v10, 1.0
3516 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3517 ; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v10
3518 ; GFX1150-NEXT: v_mul_f32_e32 v11, v7, v10
3519 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3520 ; GFX1150-NEXT: v_fma_f32 v12, -v9, v11, v7
3521 ; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v10
3522 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3523 ; GFX1150-NEXT: v_fma_f32 v7, -v9, v11, v7
3524 ; GFX1150-NEXT: s_denorm_mode 12
3525 ; GFX1150-NEXT: v_div_fmas_f32 v7, v7, v10, v11
3526 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3527 ; GFX1150-NEXT: v_div_fixup_f32 v7, v7, v6, v2
3528 ; GFX1150-NEXT: v_trunc_f32_e32 v7, v7
3529 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3530 ; GFX1150-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
3531 ; GFX1150-NEXT: v_fma_f32 v2, v7, v6, v2
3532 ; GFX1150-NEXT: v_div_scale_f32 v7, null, v5, v5, v1
3533 ; GFX1150-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3534 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3535 ; GFX1150-NEXT: v_rcp_f32_e32 v9, v7
3536 ; GFX1150-NEXT: s_denorm_mode 15
3537 ; GFX1150-NEXT: v_fma_f32 v10, -v7, v9, 1.0
3538 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3539 ; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v9
3540 ; GFX1150-NEXT: v_mul_f32_e32 v10, v6, v9
3541 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3542 ; GFX1150-NEXT: v_fma_f32 v11, -v7, v10, v6
3543 ; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v9
3544 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3545 ; GFX1150-NEXT: v_fma_f32 v6, -v7, v10, v6
3546 ; GFX1150-NEXT: s_denorm_mode 12
3547 ; GFX1150-NEXT: v_div_fmas_f32 v6, v6, v9, v10
3548 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3549 ; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v5, v1
3550 ; GFX1150-NEXT: v_trunc_f32_e32 v6, v6
3551 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3552 ; GFX1150-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
3553 ; GFX1150-NEXT: v_fma_f32 v1, v6, v5, v1
3554 ; GFX1150-NEXT: v_div_scale_f32 v6, null, v4, v4, v0
3555 ; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3556 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
3557 ; GFX1150-NEXT: v_rcp_f32_e32 v7, v6
3558 ; GFX1150-NEXT: s_denorm_mode 15
3559 ; GFX1150-NEXT: v_fma_f32 v9, -v6, v7, 1.0
3560 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3561 ; GFX1150-NEXT: v_fmac_f32_e32 v7, v9, v7
3562 ; GFX1150-NEXT: v_mul_f32_e32 v9, v5, v7
3563 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3564 ; GFX1150-NEXT: v_fma_f32 v10, -v6, v9, v5
3565 ; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v7
3566 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3567 ; GFX1150-NEXT: v_fma_f32 v5, -v6, v9, v5
3568 ; GFX1150-NEXT: s_denorm_mode 12
3569 ; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v7, v9
3570 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3571 ; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v4, v0
3572 ; GFX1150-NEXT: v_trunc_f32_e32 v5, v5
3573 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3574 ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
3575 ; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4
3576 ; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[4:5]
3577 ; GFX1150-NEXT: s_nop 0
3578 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3579 ; GFX1150-NEXT: s_endpgm
3580 ptr addrspace(1) %in2) #0 {
; Advance %in2 by four <4 x float> elements so the second load is taken at a
; non-zero offset from its base pointer (64 bytes in the checks above).
3581 %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
3582 %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
3583 %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16
; The operation under test.
3584 %r2 = frem <4 x float> %r0, %r1
3585 store <4 x float> %r2, ptr addrspace(1) %out, align 16
; frem_v2f64: kernel that loads one <2 x double> from %in1 and one from %in2
; advanced by 4 vectors, computes their element-wise frem, and stores the
; result to %out. It uses attribute group #0.
;
; The argument list is split in two: the per-target check lines sit between
; the first line of the signature and the line that closes it and opens the
; IR body. Those check lines are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
;
; What the expected code shows for each vector element (element 1, held in
; v[2:3] / v[6:7], is handled first; element 0, held in v[0:1] / v[4:5], second):
;   1. an f64 divide sequence: v_div_scale_f64 of the divisor, v_rcp_f64, two
;      rounds of a v_fma_f64 pair that refine the reciprocal, v_div_scale_f64
;      of the dividend, v_mul_f64, v_fma_f64, v_div_fmas_f64, v_div_fixup_f64;
;   2. truncation of the quotient: v_trunc_f64 on every target except the
;      default (SI) one;
;   3. v_fma_f64 of (-quotient, divisor, dividend), which yields the remainder.
;
; Differences visible in the checks for the default (SI) target:
;   - there is no v_trunc_f64; in its place is an integer sequence
;     (v_readfirstlane_b32 of the high dword, s_bfe_u32 extracting 11 bits at
;     bit 20, s_lshr_b64 of a mask, v_not/v_and, s_cmp + v_cndmask selects).
;     Presumably this is the expansion of the same truncation for a target
;     without that instruction -- confirm against the backend.
;   - the second v_div_scale_f64 writes s[0:1] instead of vcc, and vcc for
;     v_div_fmas_f64 is rebuilt from two v_cmp_eq_u32 plus s_xor_b64.
3589 define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
3590 ; SI-LABEL: frem_v2f64:
3592 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
3593 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3594 ; SI-NEXT: s_mov_b32 s7, 0xf000
3595 ; SI-NEXT: s_mov_b32 s6, -1
3596 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3597 ; SI-NEXT: s_mov_b32 s4, s8
3598 ; SI-NEXT: s_mov_b32 s5, s9
3599 ; SI-NEXT: s_mov_b32 s8, s10
3600 ; SI-NEXT: s_mov_b32 s9, s11
3601 ; SI-NEXT: s_mov_b32 s10, s6
3602 ; SI-NEXT: s_mov_b32 s11, s7
3603 ; SI-NEXT: s_mov_b32 s2, s6
3604 ; SI-NEXT: s_mov_b32 s3, s7
3605 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3606 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
3607 ; SI-NEXT: s_waitcnt vmcnt(0)
3608 ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
3609 ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3610 ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3611 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3612 ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3613 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3614 ; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
3615 ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3616 ; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
3617 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9
3618 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13
3619 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
3621 ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
3622 ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3623 ; SI-NEXT: v_readfirstlane_b32 s8, v9
3624 ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
3625 ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
3626 ; SI-NEXT: s_mov_b32 s3, 0xfffff
3627 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
3628 ; SI-NEXT: v_not_b32_e32 v10, s0
3629 ; SI-NEXT: v_and_b32_e32 v10, v8, v10
3630 ; SI-NEXT: v_not_b32_e32 v11, s1
3631 ; SI-NEXT: v_and_b32_e32 v9, v9, v11
3632 ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
3633 ; SI-NEXT: s_cmp_lt_i32 s9, 0
3634 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3635 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
3636 ; SI-NEXT: v_mov_b32_e32 v11, s0
3637 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
3638 ; SI-NEXT: s_cmp_gt_i32 s9, 51
3639 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3640 ; SI-NEXT: v_mov_b32_e32 v11, s8
3641 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
3642 ; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
3643 ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3644 ; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3645 ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3646 ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3647 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3648 ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3649 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3650 ; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
3651 ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3652 ; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
3653 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
3654 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v11
3655 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
3657 ; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
3658 ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3659 ; SI-NEXT: v_readfirstlane_b32 s8, v7
3660 ; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
3661 ; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
3662 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
3663 ; SI-NEXT: v_not_b32_e32 v8, s0
3664 ; SI-NEXT: v_and_b32_e32 v8, v6, v8
3665 ; SI-NEXT: v_not_b32_e32 v9, s1
3666 ; SI-NEXT: v_and_b32_e32 v7, v7, v9
3667 ; SI-NEXT: s_and_b32 s0, s8, 0x80000000
3668 ; SI-NEXT: s_cmp_lt_i32 s9, 0
3669 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3670 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
3671 ; SI-NEXT: v_mov_b32_e32 v9, s0
3672 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
3673 ; SI-NEXT: s_cmp_gt_i32 s9, 51
3674 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
3675 ; SI-NEXT: v_mov_b32_e32 v9, s8
3676 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
3677 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
3678 ; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3679 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
3682 ; CI-LABEL: frem_v2f64:
3684 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3685 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
3686 ; CI-NEXT: s_mov_b32 s3, 0xf000
3687 ; CI-NEXT: s_mov_b32 s2, -1
3688 ; CI-NEXT: s_mov_b32 s10, s2
3689 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3690 ; CI-NEXT: s_mov_b32 s0, s4
3691 ; CI-NEXT: s_mov_b32 s1, s5
3692 ; CI-NEXT: s_mov_b32 s4, s6
3693 ; CI-NEXT: s_mov_b32 s5, s7
3694 ; CI-NEXT: s_mov_b32 s6, s2
3695 ; CI-NEXT: s_mov_b32 s7, s3
3696 ; CI-NEXT: s_mov_b32 s11, s3
3697 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
3698 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
3699 ; CI-NEXT: s_waitcnt vmcnt(0)
3700 ; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
3701 ; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3702 ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3703 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3704 ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3705 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3706 ; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
3707 ; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3708 ; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3710 ; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3711 ; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3712 ; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3713 ; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3714 ; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
3715 ; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3716 ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3717 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3718 ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3719 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3720 ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
3721 ; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3722 ; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3724 ; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3725 ; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3726 ; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3727 ; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3728 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3731 ; VI-LABEL: frem_v2f64:
3733 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3734 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3735 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3736 ; VI-NEXT: v_mov_b32_e32 v0, s6
3737 ; VI-NEXT: s_add_u32 s0, s0, 64
3738 ; VI-NEXT: s_addc_u32 s1, s1, 0
3739 ; VI-NEXT: v_mov_b32_e32 v5, s1
3740 ; VI-NEXT: v_mov_b32_e32 v1, s7
3741 ; VI-NEXT: v_mov_b32_e32 v4, s0
3742 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
3743 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
3744 ; VI-NEXT: v_mov_b32_e32 v8, s4
3745 ; VI-NEXT: v_mov_b32_e32 v9, s5
3746 ; VI-NEXT: s_waitcnt vmcnt(0)
3747 ; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
3748 ; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
3749 ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
3750 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
3751 ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
3752 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
3753 ; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
3754 ; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13]
3755 ; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
3757 ; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
3758 ; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
3759 ; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11]
3760 ; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
3761 ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3762 ; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
3763 ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
3764 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3765 ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
3766 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3767 ; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
3768 ; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3769 ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
3771 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
3772 ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3773 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3774 ; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3775 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
3778 ; GFX9-LABEL: frem_v2f64:
3780 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3781 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3782 ; GFX9-NEXT: v_mov_b32_e32 v16, 0
3783 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3784 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
3785 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
3786 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3787 ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
3788 ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3789 ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3790 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3791 ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3792 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3793 ; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
3794 ; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3795 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3796 ; GFX9-NEXT: s_nop 1
3797 ; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3798 ; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3799 ; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3800 ; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3801 ; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3802 ; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3803 ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3804 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3805 ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3806 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3807 ; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
3808 ; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3809 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3810 ; GFX9-NEXT: s_nop 1
3811 ; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3812 ; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3813 ; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3814 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3815 ; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
3816 ; GFX9-NEXT: s_endpgm
3818 ; GFX10-LABEL: frem_v2f64:
3820 ; GFX10-NEXT: s_clause 0x1
3821 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3822 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3823 ; GFX10-NEXT: v_mov_b32_e32 v16, 0
3824 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3825 ; GFX10-NEXT: s_clause 0x1
3826 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
3827 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
3828 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3829 ; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
3830 ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3831 ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3832 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3833 ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3834 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3835 ; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
3836 ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3837 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3838 ; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3839 ; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3840 ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3841 ; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3842 ; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
3843 ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3844 ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3845 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3846 ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3847 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3848 ; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
3849 ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3850 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3851 ; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3852 ; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3853 ; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3854 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3855 ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
3856 ; GFX10-NEXT: s_endpgm
3858 ; GFX11-LABEL: frem_v2f64:
3860 ; GFX11-NEXT: s_clause 0x1
3861 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
3862 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
3863 ; GFX11-NEXT: v_mov_b32_e32 v16, 0
3864 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3865 ; GFX11-NEXT: s_clause 0x1
3866 ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7]
3867 ; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
3868 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3869 ; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
3870 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3871 ; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3872 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3873 ; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3874 ; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3875 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3876 ; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3877 ; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3878 ; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
3879 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3880 ; GFX11-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3881 ; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3882 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3883 ; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3884 ; GFX11-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3885 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3886 ; GFX11-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3887 ; GFX11-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3888 ; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
3889 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3890 ; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3891 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
3892 ; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3893 ; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3894 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3895 ; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3896 ; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3897 ; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
3898 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3899 ; GFX11-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3900 ; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3901 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3902 ; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3903 ; GFX11-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3904 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3905 ; GFX11-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3906 ; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3907 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[4:5]
3908 ; GFX11-NEXT: s_nop 0
3909 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3910 ; GFX11-NEXT: s_endpgm
3912 ; GFX1150-LABEL: frem_v2f64:
3914 ; GFX1150-NEXT: s_clause 0x1
3915 ; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
3916 ; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
3917 ; GFX1150-NEXT: v_mov_b32_e32 v16, 0
3918 ; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
3919 ; GFX1150-NEXT: s_clause 0x1
3920 ; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[6:7]
3921 ; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
3922 ; GFX1150-NEXT: s_waitcnt vmcnt(0)
3923 ; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
3924 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
3925 ; GFX1150-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
3926 ; GFX1150-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3927 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3928 ; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3929 ; GFX1150-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3930 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3931 ; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3932 ; GFX1150-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
3933 ; GFX1150-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
3934 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3935 ; GFX1150-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3936 ; GFX1150-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3937 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3938 ; GFX1150-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3939 ; GFX1150-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
3940 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3941 ; GFX1150-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3942 ; GFX1150-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
3943 ; GFX1150-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
3944 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3945 ; GFX1150-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3946 ; GFX1150-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3947 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3948 ; GFX1150-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3949 ; GFX1150-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3950 ; GFX1150-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
3951 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3952 ; GFX1150-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
3953 ; GFX1150-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3954 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3955 ; GFX1150-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3956 ; GFX1150-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3957 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3958 ; GFX1150-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
3959 ; GFX1150-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3960 ; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[4:5]
3961 ; GFX1150-NEXT: s_nop 0
3962 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3963 ; GFX1150-NEXT: s_endpgm
; Remainder of the argument list, then the IR body under test.
3964 ptr addrspace(1) %in2) #0 {
; Advance %in2 by 4 whole <2 x double> vectors (4 * 16 = 64 bytes); this is the
; 64-byte offset seen on the second load in the expected code above.
3965 %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
; Dividend vector comes from %in1, divisor vector from the offset %in2 pointer.
3966 %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
3967 %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16
; Element-wise floating-point remainder of the two vectors.
3968 %r2 = frem <2 x double> %r0, %r1
3969 store <2 x double> %r2, ptr addrspace(1) %out, align 16
; Function attribute groups. Both are nounwind and set "denormal-fp-math-f32"
; to preserve-sign for outputs and inputs; they differ only in "unsafe-fp-math":
;   #0 - "unsafe-fp-math"="false"; used by @frem_v2f64 above.
;   #1 - "unsafe-fp-math"="true"; its users are not visible near these lines
;        (presumably the unsafe/fast frem variants earlier in the file --
;        verify before changing or removing it).
3973 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
3974 attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }