1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; frem on half with no fast-math flags. %r0 is loaded from %in1 and %r1 from
; %in2 advanced by 4 elements (an 8-byte offset in the checks below); the
; remainder is stored to %out.
; Expected code per the checks below: the SI and CI targets convert both
; operands to f32 and expand the division with v_div_scale_f32, v_rcp_f32,
; v_div_fmas_f32 and v_div_fixup_f32, while the VI, GFX9 and GFX10 targets use
; v_rcp_f32 followed by v_div_fixup_f16. Every target then truncates the
; quotient and folds it back with one fma (v_fmac_f16 on GFX10).
; NOTE(review): attribute group #0 is not visible from here; confirm what it
; enables before relying on these sequences.
8 define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
11 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
12 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
13 ; SI-NEXT: s_mov_b32 s11, 0xf000
14 ; SI-NEXT: s_mov_b32 s10, -1
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b32 s8, s4
17 ; SI-NEXT: s_mov_b32 s9, s5
18 ; SI-NEXT: s_mov_b32 s4, s6
19 ; SI-NEXT: s_mov_b32 s5, s7
20 ; SI-NEXT: s_mov_b32 s6, s10
21 ; SI-NEXT: s_mov_b32 s7, s11
22 ; SI-NEXT: s_mov_b32 s2, s10
23 ; SI-NEXT: s_mov_b32 s3, s11
24 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
25 ; SI-NEXT: s_waitcnt vmcnt(0)
26 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
27 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
28 ; SI-NEXT: s_waitcnt vmcnt(0)
29 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
30 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
31 ; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
32 ; SI-NEXT: v_rcp_f32_e32 v4, v3
33 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
34 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
35 ; SI-NEXT: v_fma_f32 v4, v5, v4, v4
36 ; SI-NEXT: v_mul_f32_e32 v5, v2, v4
37 ; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
38 ; SI-NEXT: v_fma_f32 v5, v6, v4, v5
39 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
40 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
41 ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
42 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
43 ; SI-NEXT: v_trunc_f32_e32 v2, v2
44 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
45 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
46 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
47 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
52 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
53 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
54 ; CI-NEXT: s_mov_b32 s11, 0xf000
55 ; CI-NEXT: s_mov_b32 s10, -1
56 ; CI-NEXT: s_mov_b32 s2, s10
57 ; CI-NEXT: s_waitcnt lgkmcnt(0)
58 ; CI-NEXT: s_mov_b32 s8, s4
59 ; CI-NEXT: s_mov_b32 s9, s5
60 ; CI-NEXT: s_mov_b32 s4, s6
61 ; CI-NEXT: s_mov_b32 s5, s7
62 ; CI-NEXT: s_mov_b32 s3, s11
63 ; CI-NEXT: s_mov_b32 s6, s10
64 ; CI-NEXT: s_mov_b32 s7, s11
65 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
66 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
67 ; CI-NEXT: s_waitcnt vmcnt(1)
68 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
69 ; CI-NEXT: s_waitcnt vmcnt(0)
70 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
71 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
72 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
73 ; CI-NEXT: v_rcp_f32_e32 v4, v3
74 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
75 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
76 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
77 ; CI-NEXT: v_mul_f32_e32 v5, v2, v4
78 ; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
79 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
80 ; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
81 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
82 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
83 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
84 ; CI-NEXT: v_trunc_f32_e32 v2, v2
85 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
86 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
87 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
88 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
93 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
94 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
95 ; VI-NEXT: s_waitcnt lgkmcnt(0)
96 ; VI-NEXT: v_mov_b32_e32 v2, s6
97 ; VI-NEXT: s_add_u32 s0, s0, 8
98 ; VI-NEXT: v_mov_b32_e32 v3, s7
99 ; VI-NEXT: s_addc_u32 s1, s1, 0
100 ; VI-NEXT: flat_load_ushort v4, v[2:3]
101 ; VI-NEXT: v_mov_b32_e32 v3, s1
102 ; VI-NEXT: v_mov_b32_e32 v2, s0
103 ; VI-NEXT: flat_load_ushort v2, v[2:3]
104 ; VI-NEXT: v_mov_b32_e32 v0, s4
105 ; VI-NEXT: v_mov_b32_e32 v1, s5
106 ; VI-NEXT: s_waitcnt vmcnt(1)
107 ; VI-NEXT: v_cvt_f32_f16_e32 v3, v4
108 ; VI-NEXT: s_waitcnt vmcnt(0)
109 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v2
110 ; VI-NEXT: v_rcp_f32_e32 v5, v5
111 ; VI-NEXT: v_mul_f32_e32 v3, v3, v5
112 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
113 ; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4
114 ; VI-NEXT: v_trunc_f16_e32 v3, v3
115 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
116 ; VI-NEXT: flat_store_short v[0:1], v2
119 ; GFX9-LABEL: frem_f16:
121 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
122 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
123 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
124 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
125 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
126 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
127 ; GFX9-NEXT: s_waitcnt vmcnt(1)
128 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
129 ; GFX9-NEXT: s_waitcnt vmcnt(0)
130 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
131 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4
132 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
133 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
134 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
135 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
136 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
137 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
138 ; GFX9-NEXT: s_endpgm
140 ; GFX10-LABEL: frem_f16:
142 ; GFX10-NEXT: s_clause 0x1
143 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
144 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
145 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
146 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
147 ; GFX10-NEXT: s_clause 0x1
148 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
149 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
150 ; GFX10-NEXT: s_waitcnt vmcnt(1)
151 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
152 ; GFX10-NEXT: s_waitcnt vmcnt(0)
153 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
154 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
155 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
156 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
157 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
158 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
159 ; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
160 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
161 ; GFX10-NEXT: s_endpgm
162 half addrspace(1)* %in2) #0 {
163 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
164 %r0 = load half, half addrspace(1)* %in1, align 4
165 %r1 = load half, half addrspace(1)* %gep2, align 4
166 %r2 = frem half %r0, %r1
167 store half %r2, half addrspace(1)* %out, align 4
; frem on half carrying the `fast` flag. Operands are loaded exactly as in the
; IR below: %r0 from %in1, %r1 from %in2 advanced by 4 elements (8 bytes in
; the checks); the remainder is stored to %out.
; Expected code per the checks below: no v_div_scale or v_div_fixup appears on
; any target. The SI and CI targets convert to f32 and use v_rcp_f32,
; v_mul_f32, v_trunc_f32 and v_fma_f32 before converting back to f16; the VI,
; GFX9 and GFX10 targets stay in f16 with v_rcp_f16, v_mul_f16, v_trunc_f16
; and one fma (v_fmac_f16 on GFX10).
; NOTE(review): attribute group #0 is not visible from here; confirm what it
; enables before relying on these sequences.
171 define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
172 ; SI-LABEL: fast_frem_f16:
174 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
175 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
176 ; SI-NEXT: s_mov_b32 s11, 0xf000
177 ; SI-NEXT: s_mov_b32 s10, -1
178 ; SI-NEXT: s_waitcnt lgkmcnt(0)
179 ; SI-NEXT: s_mov_b32 s8, s4
180 ; SI-NEXT: s_mov_b32 s9, s5
181 ; SI-NEXT: s_mov_b32 s4, s6
182 ; SI-NEXT: s_mov_b32 s5, s7
183 ; SI-NEXT: s_mov_b32 s6, s10
184 ; SI-NEXT: s_mov_b32 s7, s11
185 ; SI-NEXT: s_mov_b32 s2, s10
186 ; SI-NEXT: s_mov_b32 s3, s11
187 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
188 ; SI-NEXT: s_waitcnt vmcnt(0)
189 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
190 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
191 ; SI-NEXT: s_waitcnt vmcnt(0)
192 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
193 ; SI-NEXT: v_rcp_f32_e32 v2, v1
194 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
195 ; SI-NEXT: v_trunc_f32_e32 v2, v2
196 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
197 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
198 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
201 ; CI-LABEL: fast_frem_f16:
203 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
204 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
205 ; CI-NEXT: s_mov_b32 s11, 0xf000
206 ; CI-NEXT: s_mov_b32 s10, -1
207 ; CI-NEXT: s_mov_b32 s2, s10
208 ; CI-NEXT: s_mov_b32 s3, s11
209 ; CI-NEXT: s_waitcnt lgkmcnt(0)
210 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
211 ; CI-NEXT: s_mov_b32 s8, s4
212 ; CI-NEXT: s_mov_b32 s9, s5
213 ; CI-NEXT: s_mov_b32 s4, s6
214 ; CI-NEXT: s_mov_b32 s5, s7
215 ; CI-NEXT: s_mov_b32 s6, s10
216 ; CI-NEXT: s_mov_b32 s7, s11
217 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
218 ; CI-NEXT: s_waitcnt vmcnt(1)
219 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
220 ; CI-NEXT: v_rcp_f32_e32 v2, v1
221 ; CI-NEXT: s_waitcnt vmcnt(0)
222 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
223 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
224 ; CI-NEXT: v_trunc_f32_e32 v2, v2
225 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
226 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
227 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
230 ; VI-LABEL: fast_frem_f16:
232 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
233 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
234 ; VI-NEXT: s_waitcnt lgkmcnt(0)
235 ; VI-NEXT: v_mov_b32_e32 v2, s6
236 ; VI-NEXT: s_add_u32 s0, s0, 8
237 ; VI-NEXT: v_mov_b32_e32 v3, s7
238 ; VI-NEXT: s_addc_u32 s1, s1, 0
239 ; VI-NEXT: flat_load_ushort v4, v[2:3]
240 ; VI-NEXT: v_mov_b32_e32 v3, s1
241 ; VI-NEXT: v_mov_b32_e32 v2, s0
242 ; VI-NEXT: flat_load_ushort v2, v[2:3]
243 ; VI-NEXT: v_mov_b32_e32 v0, s4
244 ; VI-NEXT: v_mov_b32_e32 v1, s5
245 ; VI-NEXT: s_waitcnt vmcnt(0)
246 ; VI-NEXT: v_rcp_f16_e32 v3, v2
247 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3
248 ; VI-NEXT: v_trunc_f16_e32 v3, v3
249 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
250 ; VI-NEXT: flat_store_short v[0:1], v2
253 ; GFX9-LABEL: fast_frem_f16:
255 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
256 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
257 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
258 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
259 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
260 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
261 ; GFX9-NEXT: s_waitcnt vmcnt(0)
262 ; GFX9-NEXT: v_rcp_f16_e32 v3, v2
263 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
264 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
265 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
266 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
267 ; GFX9-NEXT: s_endpgm
269 ; GFX10-LABEL: fast_frem_f16:
271 ; GFX10-NEXT: s_clause 0x1
272 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
273 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
274 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
275 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
276 ; GFX10-NEXT: s_clause 0x1
277 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
278 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
279 ; GFX10-NEXT: s_waitcnt vmcnt(0)
280 ; GFX10-NEXT: v_rcp_f16_e32 v3, v2
281 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
282 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
283 ; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
284 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
285 ; GFX10-NEXT: s_endpgm
286 half addrspace(1)* %in2) #0 {
287 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
288 %r0 = load half, half addrspace(1)* %in1, align 4
289 %r1 = load half, half addrspace(1)* %gep2, align 4
290 %r2 = frem fast half %r0, %r1
291 store half %r2, half addrspace(1)* %out, align 4
; frem on half carrying only the `afn` flag, in a function that uses attribute
; group #1. %r0 is loaded from %in1 and %r1 from %in2 advanced by 4 elements
; (8 bytes in the checks); the remainder is stored to %out.
; Expected code per the checks below: no v_div_scale or v_div_fixup appears on
; any target. The SI and CI targets convert to f32 and use v_rcp_f32,
; v_mul_f32, v_trunc_f32 and v_fma_f32 before converting back to f16; the VI,
; GFX9 and GFX10 targets stay in f16 with v_rcp_f16, v_mul_f16, v_trunc_f16
; and one fma (v_fmac_f16 on GFX10).
; NOTE(review): attribute group #1 is not visible from here; presumably it
; turns on unsafe FP math, given the function name -- confirm.
295 define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
296 ; SI-LABEL: unsafe_frem_f16:
298 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
299 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
300 ; SI-NEXT: s_mov_b32 s11, 0xf000
301 ; SI-NEXT: s_mov_b32 s10, -1
302 ; SI-NEXT: s_waitcnt lgkmcnt(0)
303 ; SI-NEXT: s_mov_b32 s8, s4
304 ; SI-NEXT: s_mov_b32 s9, s5
305 ; SI-NEXT: s_mov_b32 s4, s6
306 ; SI-NEXT: s_mov_b32 s5, s7
307 ; SI-NEXT: s_mov_b32 s6, s10
308 ; SI-NEXT: s_mov_b32 s7, s11
309 ; SI-NEXT: s_mov_b32 s2, s10
310 ; SI-NEXT: s_mov_b32 s3, s11
311 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
312 ; SI-NEXT: s_waitcnt vmcnt(0)
313 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
314 ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
315 ; SI-NEXT: s_waitcnt vmcnt(0)
316 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
317 ; SI-NEXT: v_rcp_f32_e32 v2, v1
318 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
319 ; SI-NEXT: v_trunc_f32_e32 v2, v2
320 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
321 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
322 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
325 ; CI-LABEL: unsafe_frem_f16:
327 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
328 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
329 ; CI-NEXT: s_mov_b32 s11, 0xf000
330 ; CI-NEXT: s_mov_b32 s10, -1
331 ; CI-NEXT: s_mov_b32 s2, s10
332 ; CI-NEXT: s_mov_b32 s3, s11
333 ; CI-NEXT: s_waitcnt lgkmcnt(0)
334 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
335 ; CI-NEXT: s_mov_b32 s8, s4
336 ; CI-NEXT: s_mov_b32 s9, s5
337 ; CI-NEXT: s_mov_b32 s4, s6
338 ; CI-NEXT: s_mov_b32 s5, s7
339 ; CI-NEXT: s_mov_b32 s6, s10
340 ; CI-NEXT: s_mov_b32 s7, s11
341 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
342 ; CI-NEXT: s_waitcnt vmcnt(1)
343 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
344 ; CI-NEXT: v_rcp_f32_e32 v2, v1
345 ; CI-NEXT: s_waitcnt vmcnt(0)
346 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
347 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
348 ; CI-NEXT: v_trunc_f32_e32 v2, v2
349 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
350 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
351 ; CI-NEXT: buffer_store_short v0, off, s[8:11], 0
354 ; VI-LABEL: unsafe_frem_f16:
356 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
357 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
358 ; VI-NEXT: s_waitcnt lgkmcnt(0)
359 ; VI-NEXT: v_mov_b32_e32 v2, s6
360 ; VI-NEXT: s_add_u32 s0, s0, 8
361 ; VI-NEXT: v_mov_b32_e32 v3, s7
362 ; VI-NEXT: s_addc_u32 s1, s1, 0
363 ; VI-NEXT: flat_load_ushort v4, v[2:3]
364 ; VI-NEXT: v_mov_b32_e32 v3, s1
365 ; VI-NEXT: v_mov_b32_e32 v2, s0
366 ; VI-NEXT: flat_load_ushort v2, v[2:3]
367 ; VI-NEXT: v_mov_b32_e32 v0, s4
368 ; VI-NEXT: v_mov_b32_e32 v1, s5
369 ; VI-NEXT: s_waitcnt vmcnt(0)
370 ; VI-NEXT: v_rcp_f16_e32 v3, v2
371 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3
372 ; VI-NEXT: v_trunc_f16_e32 v3, v3
373 ; VI-NEXT: v_fma_f16 v2, -v3, v2, v4
374 ; VI-NEXT: flat_store_short v[0:1], v2
377 ; GFX9-LABEL: unsafe_frem_f16:
379 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
380 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
381 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
382 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
383 ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
384 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
385 ; GFX9-NEXT: s_waitcnt vmcnt(0)
386 ; GFX9-NEXT: v_rcp_f16_e32 v3, v2
387 ; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3
388 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
389 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
390 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
391 ; GFX9-NEXT: s_endpgm
393 ; GFX10-LABEL: unsafe_frem_f16:
395 ; GFX10-NEXT: s_clause 0x1
396 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
397 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
398 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
399 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
400 ; GFX10-NEXT: s_clause 0x1
401 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
402 ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8
403 ; GFX10-NEXT: s_waitcnt vmcnt(0)
404 ; GFX10-NEXT: v_rcp_f16_e32 v3, v2
405 ; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
406 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
407 ; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
408 ; GFX10-NEXT: global_store_short v0, v1, s[4:5]
409 ; GFX10-NEXT: s_endpgm
410 half addrspace(1)* %in2) #1 {
411 %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
412 %r0 = load half, half addrspace(1)* %in1, align 4
413 %r1 = load half, half addrspace(1)* %gep2, align 4
414 %r2 = frem afn half %r0, %r1
415 store half %r2, half addrspace(1)* %out, align 4
; frem on float with no fast-math flags. %r0 is loaded from %in1 and %r1 from
; %in2 advanced by 4 elements (a 16-byte offset in the checks below); the
; remainder is stored to %out.
; Expected code per the checks below: every target expands the division with
; two v_div_scale_f32, v_rcp_f32, a chain of v_fma_f32 refinements,
; v_div_fmas_f32 and v_div_fixup_f32. The refinement chain is bracketed by
; s_setreg_imm32_b32 writes to HW_REG_MODE (s_denorm_mode on GFX10). The
; quotient is then truncated and folded back with one fma (v_fmac_f32 on
; GFX10).
; NOTE(review): attribute group #0 is not visible from here; confirm what it
; enables before relying on these sequences.
419 define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
420 ; SI-LABEL: frem_f32:
422 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
423 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
424 ; SI-NEXT: s_mov_b32 s11, 0xf000
425 ; SI-NEXT: s_mov_b32 s10, -1
426 ; SI-NEXT: s_waitcnt lgkmcnt(0)
427 ; SI-NEXT: s_mov_b32 s8, s4
428 ; SI-NEXT: s_mov_b32 s9, s5
429 ; SI-NEXT: s_mov_b32 s4, s6
430 ; SI-NEXT: s_mov_b32 s5, s7
431 ; SI-NEXT: s_mov_b32 s6, s10
432 ; SI-NEXT: s_mov_b32 s7, s11
433 ; SI-NEXT: s_mov_b32 s2, s10
434 ; SI-NEXT: s_mov_b32 s3, s11
435 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
436 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
437 ; SI-NEXT: s_waitcnt vmcnt(0)
438 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
439 ; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
440 ; SI-NEXT: v_rcp_f32_e32 v4, v3
441 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
442 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
443 ; SI-NEXT: v_fma_f32 v4, v5, v4, v4
444 ; SI-NEXT: v_mul_f32_e32 v5, v2, v4
445 ; SI-NEXT: v_fma_f32 v6, -v3, v5, v2
446 ; SI-NEXT: v_fma_f32 v5, v6, v4, v5
447 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2
448 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
449 ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
450 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
451 ; SI-NEXT: v_trunc_f32_e32 v2, v2
452 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
453 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
456 ; CI-LABEL: frem_f32:
458 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
459 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
460 ; CI-NEXT: s_mov_b32 s11, 0xf000
461 ; CI-NEXT: s_mov_b32 s10, -1
462 ; CI-NEXT: s_mov_b32 s2, s10
463 ; CI-NEXT: s_waitcnt lgkmcnt(0)
464 ; CI-NEXT: s_mov_b32 s8, s4
465 ; CI-NEXT: s_mov_b32 s9, s5
466 ; CI-NEXT: s_mov_b32 s4, s6
467 ; CI-NEXT: s_mov_b32 s5, s7
468 ; CI-NEXT: s_mov_b32 s6, s10
469 ; CI-NEXT: s_mov_b32 s7, s11
470 ; CI-NEXT: s_mov_b32 s3, s11
471 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
472 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
473 ; CI-NEXT: s_waitcnt vmcnt(0)
474 ; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0
475 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0
476 ; CI-NEXT: v_rcp_f32_e32 v4, v3
477 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
478 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0
479 ; CI-NEXT: v_fma_f32 v4, v5, v4, v4
480 ; CI-NEXT: v_mul_f32_e32 v5, v2, v4
481 ; CI-NEXT: v_fma_f32 v6, -v3, v5, v2
482 ; CI-NEXT: v_fma_f32 v5, v6, v4, v5
483 ; CI-NEXT: v_fma_f32 v2, -v3, v5, v2
484 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
485 ; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
486 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
487 ; CI-NEXT: v_trunc_f32_e32 v2, v2
488 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
489 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
492 ; VI-LABEL: frem_f32:
494 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
495 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
496 ; VI-NEXT: s_waitcnt lgkmcnt(0)
497 ; VI-NEXT: v_mov_b32_e32 v2, s6
498 ; VI-NEXT: s_add_u32 s0, s0, 16
499 ; VI-NEXT: v_mov_b32_e32 v3, s7
500 ; VI-NEXT: s_addc_u32 s1, s1, 0
501 ; VI-NEXT: flat_load_dword v4, v[2:3]
502 ; VI-NEXT: v_mov_b32_e32 v3, s1
503 ; VI-NEXT: v_mov_b32_e32 v2, s0
504 ; VI-NEXT: flat_load_dword v2, v[2:3]
505 ; VI-NEXT: v_mov_b32_e32 v0, s4
506 ; VI-NEXT: v_mov_b32_e32 v1, s5
507 ; VI-NEXT: s_waitcnt vmcnt(0)
508 ; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4
509 ; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4
510 ; VI-NEXT: v_rcp_f32_e32 v6, v5
511 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
512 ; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
513 ; VI-NEXT: v_fma_f32 v6, v7, v6, v6
514 ; VI-NEXT: v_mul_f32_e32 v7, v3, v6
515 ; VI-NEXT: v_fma_f32 v8, -v5, v7, v3
516 ; VI-NEXT: v_fma_f32 v7, v8, v6, v7
517 ; VI-NEXT: v_fma_f32 v3, -v5, v7, v3
518 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
519 ; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7
520 ; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4
521 ; VI-NEXT: v_trunc_f32_e32 v3, v3
522 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
523 ; VI-NEXT: flat_store_dword v[0:1], v2
526 ; GFX9-LABEL: frem_f32:
528 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
529 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
530 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
531 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
532 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
533 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
534 ; GFX9-NEXT: s_waitcnt vmcnt(0)
535 ; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1
536 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1
537 ; GFX9-NEXT: v_rcp_f32_e32 v5, v4
538 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
539 ; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0
540 ; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5
541 ; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5
542 ; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3
543 ; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
544 ; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3
545 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
546 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6
547 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1
548 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
549 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
550 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
551 ; GFX9-NEXT: s_endpgm
553 ; GFX10-LABEL: frem_f32:
555 ; GFX10-NEXT: s_clause 0x1
556 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
557 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
558 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
559 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
560 ; GFX10-NEXT: s_clause 0x1
561 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
562 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
563 ; GFX10-NEXT: s_waitcnt vmcnt(0)
564 ; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1
565 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
566 ; GFX10-NEXT: v_rcp_f32_e32 v5, v4
567 ; GFX10-NEXT: s_denorm_mode 15
568 ; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0
569 ; GFX10-NEXT: v_fma_f32 v5, v6, v5, v5
570 ; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5
571 ; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3
572 ; GFX10-NEXT: v_fma_f32 v6, v7, v5, v6
573 ; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3
574 ; GFX10-NEXT: s_denorm_mode 12
575 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6
576 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1
577 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
578 ; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
579 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
580 ; GFX10-NEXT: s_endpgm
581 float addrspace(1)* %in2) #0 {
582 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
583 %r0 = load float, float addrspace(1)* %in1, align 4
584 %r1 = load float, float addrspace(1)* %gep2, align 4
585 %r2 = frem float %r0, %r1
586 store float %r2, float addrspace(1)* %out, align 4
; frem on float carrying the `fast` flag. %r0 is loaded from %in1 and %r1 from
; %in2 advanced by 4 elements (16 bytes in the checks); the remainder is
; stored to %out.
; Expected code per the checks below: every target emits only v_rcp_f32,
; v_mul_f32, v_trunc_f32 and one fma (v_fmac_f32 on GFX10) between the loads
; and the store; no v_div_scale, v_div_fmas or v_div_fixup appears.
; NOTE(review): attribute group #0 is not visible from here; confirm what it
; enables before relying on these sequences.
590 define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
591 ; SI-LABEL: fast_frem_f32:
593 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
594 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
595 ; SI-NEXT: s_mov_b32 s11, 0xf000
596 ; SI-NEXT: s_mov_b32 s10, -1
597 ; SI-NEXT: s_waitcnt lgkmcnt(0)
598 ; SI-NEXT: s_mov_b32 s8, s4
599 ; SI-NEXT: s_mov_b32 s9, s5
600 ; SI-NEXT: s_mov_b32 s4, s6
601 ; SI-NEXT: s_mov_b32 s5, s7
602 ; SI-NEXT: s_mov_b32 s6, s10
603 ; SI-NEXT: s_mov_b32 s7, s11
604 ; SI-NEXT: s_mov_b32 s2, s10
605 ; SI-NEXT: s_mov_b32 s3, s11
606 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
607 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
608 ; SI-NEXT: s_waitcnt vmcnt(0)
609 ; SI-NEXT: v_rcp_f32_e32 v2, v1
610 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
611 ; SI-NEXT: v_trunc_f32_e32 v2, v2
612 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
613 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
616 ; CI-LABEL: fast_frem_f32:
618 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
619 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
620 ; CI-NEXT: s_mov_b32 s11, 0xf000
621 ; CI-NEXT: s_mov_b32 s10, -1
622 ; CI-NEXT: s_mov_b32 s2, s10
623 ; CI-NEXT: s_waitcnt lgkmcnt(0)
624 ; CI-NEXT: s_mov_b32 s8, s4
625 ; CI-NEXT: s_mov_b32 s9, s5
626 ; CI-NEXT: s_mov_b32 s4, s6
627 ; CI-NEXT: s_mov_b32 s5, s7
628 ; CI-NEXT: s_mov_b32 s6, s10
629 ; CI-NEXT: s_mov_b32 s7, s11
630 ; CI-NEXT: s_mov_b32 s3, s11
631 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
632 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
633 ; CI-NEXT: s_waitcnt vmcnt(0)
634 ; CI-NEXT: v_rcp_f32_e32 v2, v1
635 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
636 ; CI-NEXT: v_trunc_f32_e32 v2, v2
637 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
638 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
641 ; VI-LABEL: fast_frem_f32:
643 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
644 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
645 ; VI-NEXT: s_waitcnt lgkmcnt(0)
646 ; VI-NEXT: v_mov_b32_e32 v2, s6
647 ; VI-NEXT: s_add_u32 s0, s0, 16
648 ; VI-NEXT: v_mov_b32_e32 v3, s7
649 ; VI-NEXT: s_addc_u32 s1, s1, 0
650 ; VI-NEXT: flat_load_dword v4, v[2:3]
651 ; VI-NEXT: v_mov_b32_e32 v3, s1
652 ; VI-NEXT: v_mov_b32_e32 v2, s0
653 ; VI-NEXT: flat_load_dword v2, v[2:3]
654 ; VI-NEXT: v_mov_b32_e32 v0, s4
655 ; VI-NEXT: v_mov_b32_e32 v1, s5
656 ; VI-NEXT: s_waitcnt vmcnt(0)
657 ; VI-NEXT: v_rcp_f32_e32 v3, v2
658 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3
659 ; VI-NEXT: v_trunc_f32_e32 v3, v3
660 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
661 ; VI-NEXT: flat_store_dword v[0:1], v2
664 ; GFX9-LABEL: fast_frem_f32:
666 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
667 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
668 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
669 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
670 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
671 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
672 ; GFX9-NEXT: s_waitcnt vmcnt(0)
673 ; GFX9-NEXT: v_rcp_f32_e32 v3, v2
674 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
675 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
676 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
677 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
678 ; GFX9-NEXT: s_endpgm
680 ; GFX10-LABEL: fast_frem_f32:
682 ; GFX10-NEXT: s_clause 0x1
683 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
684 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
685 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
686 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
687 ; GFX10-NEXT: s_clause 0x1
688 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
689 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
690 ; GFX10-NEXT: s_waitcnt vmcnt(0)
691 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
692 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
693 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
694 ; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
695 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
696 ; GFX10-NEXT: s_endpgm
697 float addrspace(1)* %in2) #0 {
698 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
699 %r0 = load float, float addrspace(1)* %in1, align 4
700 %r1 = load float, float addrspace(1)* %gep2, align 4
701 %r2 = frem fast float %r0, %r1
702 store float %r2, float addrspace(1)* %out, align 4
; frem on float carrying only the `afn` flag, in a function that uses
; attribute group #1. %r0 is loaded from %in1 and %r1 from %in2 advanced by 4
; elements (16 bytes in the checks); the remainder is stored to %out.
; Expected code per the checks below: every target emits only v_rcp_f32,
; v_mul_f32, v_trunc_f32 and one fma (v_fmac_f32 on GFX10) between the loads
; and the store; no v_div_scale, v_div_fmas or v_div_fixup appears.
; NOTE(review): attribute group #1 is not visible from here; presumably it
; turns on unsafe FP math, given the function name -- confirm.
706 define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
707 ; SI-LABEL: unsafe_frem_f32:
709 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
710 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
711 ; SI-NEXT: s_mov_b32 s11, 0xf000
712 ; SI-NEXT: s_mov_b32 s10, -1
713 ; SI-NEXT: s_waitcnt lgkmcnt(0)
714 ; SI-NEXT: s_mov_b32 s8, s4
715 ; SI-NEXT: s_mov_b32 s9, s5
716 ; SI-NEXT: s_mov_b32 s4, s6
717 ; SI-NEXT: s_mov_b32 s5, s7
718 ; SI-NEXT: s_mov_b32 s6, s10
719 ; SI-NEXT: s_mov_b32 s7, s11
720 ; SI-NEXT: s_mov_b32 s2, s10
721 ; SI-NEXT: s_mov_b32 s3, s11
722 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
723 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
724 ; SI-NEXT: s_waitcnt vmcnt(0)
725 ; SI-NEXT: v_rcp_f32_e32 v2, v1
726 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2
727 ; SI-NEXT: v_trunc_f32_e32 v2, v2
728 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0
729 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
732 ; CI-LABEL: unsafe_frem_f32:
734 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
735 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
736 ; CI-NEXT: s_mov_b32 s11, 0xf000
737 ; CI-NEXT: s_mov_b32 s10, -1
738 ; CI-NEXT: s_mov_b32 s2, s10
739 ; CI-NEXT: s_waitcnt lgkmcnt(0)
740 ; CI-NEXT: s_mov_b32 s8, s4
741 ; CI-NEXT: s_mov_b32 s9, s5
742 ; CI-NEXT: s_mov_b32 s4, s6
743 ; CI-NEXT: s_mov_b32 s5, s7
744 ; CI-NEXT: s_mov_b32 s6, s10
745 ; CI-NEXT: s_mov_b32 s7, s11
746 ; CI-NEXT: s_mov_b32 s3, s11
747 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
748 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16
749 ; CI-NEXT: s_waitcnt vmcnt(0)
750 ; CI-NEXT: v_rcp_f32_e32 v2, v1
751 ; CI-NEXT: v_mul_f32_e32 v2, v0, v2
752 ; CI-NEXT: v_trunc_f32_e32 v2, v2
753 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
754 ; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0
757 ; VI-LABEL: unsafe_frem_f32:
759 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
760 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
761 ; VI-NEXT: s_waitcnt lgkmcnt(0)
762 ; VI-NEXT: v_mov_b32_e32 v2, s6
763 ; VI-NEXT: s_add_u32 s0, s0, 16
764 ; VI-NEXT: v_mov_b32_e32 v3, s7
765 ; VI-NEXT: s_addc_u32 s1, s1, 0
766 ; VI-NEXT: flat_load_dword v4, v[2:3]
767 ; VI-NEXT: v_mov_b32_e32 v3, s1
768 ; VI-NEXT: v_mov_b32_e32 v2, s0
769 ; VI-NEXT: flat_load_dword v2, v[2:3]
770 ; VI-NEXT: v_mov_b32_e32 v0, s4
771 ; VI-NEXT: v_mov_b32_e32 v1, s5
772 ; VI-NEXT: s_waitcnt vmcnt(0)
773 ; VI-NEXT: v_rcp_f32_e32 v3, v2
774 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3
775 ; VI-NEXT: v_trunc_f32_e32 v3, v3
776 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4
777 ; VI-NEXT: flat_store_dword v[0:1], v2
780 ; GFX9-LABEL: unsafe_frem_f32:
782 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
783 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
784 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
785 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
786 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
787 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
788 ; GFX9-NEXT: s_waitcnt vmcnt(0)
789 ; GFX9-NEXT: v_rcp_f32_e32 v3, v2
790 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
791 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
792 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1
793 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
794 ; GFX9-NEXT: s_endpgm
796 ; GFX10-LABEL: unsafe_frem_f32:
798 ; GFX10-NEXT: s_clause 0x1
799 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
800 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
801 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
802 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
803 ; GFX10-NEXT: s_clause 0x1
804 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
805 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
806 ; GFX10-NEXT: s_waitcnt vmcnt(0)
807 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2
808 ; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
809 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
810 ; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
811 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
812 ; GFX10-NEXT: s_endpgm
813 float addrspace(1)* %in2) #1 {
814 %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
815 %r0 = load float, float addrspace(1)* %in1, align 4
816 %r1 = load float, float addrspace(1)* %gep2, align 4
817 %r2 = frem afn float %r0, %r1
818 store float %r2, float addrspace(1)* %out, align 4
; frem_f64: remainder of two doubles with no fast-math flags on the frem.
; Both operands are loaded from global memory (%in1 and %in2) and the result
; is stored to %out. The checks below expect every target to expand the
; division with v_div_scale_f64, v_rcp_f64, v_div_fmas_f64 and
; v_div_fixup_f64, and to finish with a v_fma_f64 of the negated quotient.
; Between those two steps the CI, VI, GFX9 and GFX10 checks expect
; v_trunc_f64_e32; the SI checks contain no v_trunc_f64 and expect a
; v_bfe_u32 / v_lshr_b64 / v_cndmask_b32 sequence in that position instead
; (presumably a manual truncation - TODO confirm against the backend).
; NOTE(review): attribute group #0 is defined outside this part of the file.
822 define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
823 ; SI-LABEL: frem_f64:
825 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
826 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
827 ; SI-NEXT: s_mov_b32 s7, 0xf000
828 ; SI-NEXT: s_mov_b32 s6, -1
829 ; SI-NEXT: s_waitcnt lgkmcnt(0)
830 ; SI-NEXT: s_mov_b32 s4, s8
831 ; SI-NEXT: s_mov_b32 s5, s9
832 ; SI-NEXT: s_mov_b32 s8, s10
833 ; SI-NEXT: s_mov_b32 s9, s11
834 ; SI-NEXT: s_mov_b32 s10, s6
835 ; SI-NEXT: s_mov_b32 s11, s7
836 ; SI-NEXT: s_mov_b32 s2, s6
837 ; SI-NEXT: s_mov_b32 s3, s7
838 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
839 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
840 ; SI-NEXT: s_waitcnt vmcnt(0)
841 ; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
842 ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
843 ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
844 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
845 ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
846 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
847 ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
848 ; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
849 ; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
850 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
851 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9
852 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
854 ; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
855 ; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
856 ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
857 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
858 ; SI-NEXT: s_mov_b32 s1, 0xfffff
859 ; SI-NEXT: s_mov_b32 s0, s6
860 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8
861 ; SI-NEXT: v_not_b32_e32 v6, v6
862 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
863 ; SI-NEXT: v_not_b32_e32 v7, v7
864 ; SI-NEXT: v_and_b32_e32 v7, v5, v7
865 ; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5
866 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8
867 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
868 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8
869 ; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1]
870 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
871 ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1]
872 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
873 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
876 ; CI-LABEL: frem_f64:
878 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
879 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
880 ; CI-NEXT: s_mov_b32 s11, 0xf000
881 ; CI-NEXT: s_mov_b32 s10, -1
882 ; CI-NEXT: s_mov_b32 s2, s10
883 ; CI-NEXT: s_waitcnt lgkmcnt(0)
884 ; CI-NEXT: s_mov_b32 s8, s4
885 ; CI-NEXT: s_mov_b32 s9, s5
886 ; CI-NEXT: s_mov_b32 s4, s6
887 ; CI-NEXT: s_mov_b32 s5, s7
888 ; CI-NEXT: s_mov_b32 s6, s10
889 ; CI-NEXT: s_mov_b32 s7, s11
890 ; CI-NEXT: s_mov_b32 s3, s11
891 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
892 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
893 ; CI-NEXT: s_waitcnt vmcnt(0)
894 ; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
895 ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
896 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
897 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
898 ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
899 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
900 ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
901 ; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
902 ; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
904 ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
905 ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
906 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
907 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
908 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
911 ; VI-LABEL: frem_f64:
913 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
914 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
915 ; VI-NEXT: s_waitcnt lgkmcnt(0)
916 ; VI-NEXT: v_mov_b32_e32 v2, s6
917 ; VI-NEXT: v_mov_b32_e32 v3, s7
918 ; VI-NEXT: v_mov_b32_e32 v4, s0
919 ; VI-NEXT: v_mov_b32_e32 v5, s1
920 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
921 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
922 ; VI-NEXT: v_mov_b32_e32 v0, s4
923 ; VI-NEXT: v_mov_b32_e32 v1, s5
924 ; VI-NEXT: s_waitcnt vmcnt(0)
925 ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
926 ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
927 ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
928 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
929 ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
930 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
931 ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
932 ; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
933 ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
935 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
936 ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
937 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
938 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
939 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
942 ; GFX9-LABEL: frem_f64:
944 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
945 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
946 ; GFX9-NEXT: v_mov_b32_e32 v12, 0
947 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
948 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7]
949 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3]
950 ; GFX9-NEXT: s_waitcnt vmcnt(0)
951 ; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
952 ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
953 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
954 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
955 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
956 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
957 ; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
958 ; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
959 ; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
961 ; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
962 ; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
963 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
964 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
965 ; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5]
966 ; GFX9-NEXT: s_endpgm
968 ; GFX10-LABEL: frem_f64:
970 ; GFX10-NEXT: s_clause 0x1
971 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
972 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
973 ; GFX10-NEXT: v_mov_b32_e32 v12, 0
974 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
975 ; GFX10-NEXT: s_clause 0x1
976 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7]
977 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3]
978 ; GFX10-NEXT: s_waitcnt vmcnt(0)
979 ; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
980 ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
981 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
982 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
983 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
984 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
985 ; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
986 ; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7]
987 ; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
988 ; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
989 ; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
990 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
991 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
992 ; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5]
993 ; GFX10-NEXT: s_endpgm
994 double addrspace(1)* %in2) #0 {
; %r0 (from %in1) is the dividend and %r1 (from %in2) is the divisor.
995 %r0 = load double, double addrspace(1)* %in1, align 8
996 %r1 = load double, double addrspace(1)* %in2, align 8
; Plain frem with no fast-math flags; this is the form the v_div_scale /
; v_div_fmas / v_div_fixup expectations above were generated for.
997 %r2 = frem double %r0, %r1
998 store double %r2, double addrspace(1)* %out, align 8
; fast_frem_f64: remainder of two doubles where the frem carries the `fast`
; flag. Operands are loaded from %in1 and %in2 and the result is stored to
; %out. The checks below expect a reciprocal-based quotient on every target:
; v_rcp_f64 applied directly to the divisor, two pairs of v_fma_f64
; refinement steps, a v_mul_f64 by the dividend and one more v_fma_f64 pair
; of correction steps. No v_div_scale_f64, v_div_fmas_f64 or v_div_fixup_f64
; appears in these checks. The quotient is then truncated (v_trunc_f64_e32 on
; CI, VI, GFX9 and GFX10; a v_bfe_u32 / v_lshr_b64 / v_cndmask_b32 sequence
; in the SI checks) and folded back with a final v_fma_f64.
; NOTE(review): attribute group #0 is defined outside this part of the file.
1002 define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1003 ; SI-LABEL: fast_frem_f64:
1005 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1006 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1007 ; SI-NEXT: s_mov_b32 s11, 0xf000
1008 ; SI-NEXT: s_mov_b32 s10, -1
1009 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1010 ; SI-NEXT: s_mov_b32 s8, s4
1011 ; SI-NEXT: s_mov_b32 s9, s5
1012 ; SI-NEXT: s_mov_b32 s4, s6
1013 ; SI-NEXT: s_mov_b32 s5, s7
1014 ; SI-NEXT: s_mov_b32 s6, s10
1015 ; SI-NEXT: s_mov_b32 s7, s11
1016 ; SI-NEXT: s_mov_b32 s2, s10
1017 ; SI-NEXT: s_mov_b32 s3, s11
1018 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1019 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1020 ; SI-NEXT: s_waitcnt vmcnt(0)
1021 ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1022 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1023 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1024 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1025 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1026 ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1027 ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1028 ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1029 ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
1030 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1031 ; SI-NEXT: s_mov_b32 s1, 0xfffff
1032 ; SI-NEXT: s_mov_b32 s0, s10
1033 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8
1034 ; SI-NEXT: v_not_b32_e32 v6, v6
1035 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1036 ; SI-NEXT: v_not_b32_e32 v7, v7
1037 ; SI-NEXT: v_and_b32_e32 v7, v5, v7
1038 ; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5
1039 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8
1040 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
1041 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8
1042 ; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1043 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1044 ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1045 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1046 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1049 ; CI-LABEL: fast_frem_f64:
1051 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1052 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1053 ; CI-NEXT: s_mov_b32 s11, 0xf000
1054 ; CI-NEXT: s_mov_b32 s10, -1
1055 ; CI-NEXT: s_mov_b32 s2, s10
1056 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1057 ; CI-NEXT: s_mov_b32 s8, s4
1058 ; CI-NEXT: s_mov_b32 s9, s5
1059 ; CI-NEXT: s_mov_b32 s4, s6
1060 ; CI-NEXT: s_mov_b32 s5, s7
1061 ; CI-NEXT: s_mov_b32 s6, s10
1062 ; CI-NEXT: s_mov_b32 s7, s11
1063 ; CI-NEXT: s_mov_b32 s3, s11
1064 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1065 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1066 ; CI-NEXT: s_waitcnt vmcnt(0)
1067 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1068 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1069 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1070 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1071 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1072 ; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1073 ; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1074 ; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1075 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1076 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1077 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1080 ; VI-LABEL: fast_frem_f64:
1082 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1083 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1084 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1085 ; VI-NEXT: v_mov_b32_e32 v2, s6
1086 ; VI-NEXT: v_mov_b32_e32 v3, s7
1087 ; VI-NEXT: v_mov_b32_e32 v4, s0
1088 ; VI-NEXT: v_mov_b32_e32 v5, s1
1089 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1090 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1091 ; VI-NEXT: v_mov_b32_e32 v0, s4
1092 ; VI-NEXT: v_mov_b32_e32 v1, s5
1093 ; VI-NEXT: s_waitcnt vmcnt(0)
1094 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1095 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1096 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1097 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1098 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1099 ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
1100 ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1101 ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1102 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1103 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1104 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1107 ; GFX9-LABEL: fast_frem_f64:
1109 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1110 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1111 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1112 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1113 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1114 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1115 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1116 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1117 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1118 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1119 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1120 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1121 ; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1122 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1123 ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1124 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1125 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1126 ; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1127 ; GFX9-NEXT: s_endpgm
1129 ; GFX10-LABEL: fast_frem_f64:
1131 ; GFX10-NEXT: s_clause 0x1
1132 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1133 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1134 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
1135 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1136 ; GFX10-NEXT: s_clause 0x1
1137 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1138 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1139 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1140 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1141 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1142 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1143 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1144 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1145 ; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1146 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1147 ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1148 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1149 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1150 ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1151 ; GFX10-NEXT: s_endpgm
1152 double addrspace(1)* %in2) #0 {
; %r0 (from %in1) is the dividend and %r1 (from %in2) is the divisor.
1153 %r0 = load double, double addrspace(1)* %in1, align 8
1154 %r1 = load double, double addrspace(1)* %in2, align 8
; The `fast` flag on the instruction itself is what distinguishes this test;
; the expectations above use v_rcp_f64 instead of the v_div_* sequence.
1155 %r2 = frem fast double %r0, %r1
1156 store double %r2, double addrspace(1)* %out, align 8
; unsafe_frem_f64: remainder of two doubles where the frem carries only the
; `afn` flag and the kernel uses attribute group #1. Operands are loaded from
; %in1 and %in2 and the result is stored to %out. The checks below expect a
; reciprocal-based quotient on every target: v_rcp_f64 on the divisor,
; v_fma_f64 refinement steps, v_mul_f64 by the dividend and a v_fma_f64
; correction, with no v_div_scale_f64, v_div_fmas_f64 or v_div_fixup_f64.
; The quotient is then truncated (v_trunc_f64_e32 on CI, VI, GFX9 and GFX10;
; a v_bfe_u32 / v_lshr_b64 / v_cndmask_b32 sequence in the SI checks) and
; folded back with a final v_fma_f64.
; NOTE(review): attribute group #1 is defined outside this part of the file;
; the kernel name suggests it enables unsafe FP math - confirm there.
1160 define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1161 ; SI-LABEL: unsafe_frem_f64:
1163 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1164 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1165 ; SI-NEXT: s_mov_b32 s11, 0xf000
1166 ; SI-NEXT: s_mov_b32 s10, -1
1167 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1168 ; SI-NEXT: s_mov_b32 s8, s4
1169 ; SI-NEXT: s_mov_b32 s9, s5
1170 ; SI-NEXT: s_mov_b32 s4, s6
1171 ; SI-NEXT: s_mov_b32 s5, s7
1172 ; SI-NEXT: s_mov_b32 s6, s10
1173 ; SI-NEXT: s_mov_b32 s7, s11
1174 ; SI-NEXT: s_mov_b32 s2, s10
1175 ; SI-NEXT: s_mov_b32 s3, s11
1176 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1177 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1178 ; SI-NEXT: s_waitcnt vmcnt(0)
1179 ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1180 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1181 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1182 ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1183 ; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1184 ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1185 ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1186 ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1187 ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11
1188 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1189 ; SI-NEXT: s_mov_b32 s1, 0xfffff
1190 ; SI-NEXT: s_mov_b32 s0, s10
1191 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8
1192 ; SI-NEXT: v_not_b32_e32 v6, v6
1193 ; SI-NEXT: v_and_b32_e32 v6, v4, v6
1194 ; SI-NEXT: v_not_b32_e32 v7, v7
1195 ; SI-NEXT: v_and_b32_e32 v7, v5, v7
1196 ; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5
1197 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8
1198 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
1199 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8
1200 ; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1201 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
1202 ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1203 ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1204 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1207 ; CI-LABEL: unsafe_frem_f64:
1209 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1210 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1211 ; CI-NEXT: s_mov_b32 s11, 0xf000
1212 ; CI-NEXT: s_mov_b32 s10, -1
1213 ; CI-NEXT: s_mov_b32 s2, s10
1214 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1215 ; CI-NEXT: s_mov_b32 s8, s4
1216 ; CI-NEXT: s_mov_b32 s9, s5
1217 ; CI-NEXT: s_mov_b32 s4, s6
1218 ; CI-NEXT: s_mov_b32 s5, s7
1219 ; CI-NEXT: s_mov_b32 s6, s10
1220 ; CI-NEXT: s_mov_b32 s7, s11
1221 ; CI-NEXT: s_mov_b32 s3, s11
1222 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1223 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1224 ; CI-NEXT: s_waitcnt vmcnt(0)
1225 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1226 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1227 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1228 ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1229 ; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1230 ; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1231 ; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1232 ; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1233 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1234 ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1235 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1238 ; VI-LABEL: unsafe_frem_f64:
1240 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1241 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1242 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1243 ; VI-NEXT: v_mov_b32_e32 v2, s6
1244 ; VI-NEXT: v_mov_b32_e32 v3, s7
1245 ; VI-NEXT: v_mov_b32_e32 v4, s0
1246 ; VI-NEXT: v_mov_b32_e32 v5, s1
1247 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1248 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1249 ; VI-NEXT: v_mov_b32_e32 v0, s4
1250 ; VI-NEXT: v_mov_b32_e32 v1, s5
1251 ; VI-NEXT: s_waitcnt vmcnt(0)
1252 ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
1253 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1254 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1255 ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1256 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1257 ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7]
1258 ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1259 ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1260 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
1261 ; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1262 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1265 ; GFX9-LABEL: unsafe_frem_f64:
1267 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1268 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1269 ; GFX9-NEXT: v_mov_b32_e32 v10, 0
1270 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1271 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1272 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1273 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1274 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1275 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1276 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1277 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1278 ; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1279 ; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1280 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1281 ; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1282 ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1283 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1284 ; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1285 ; GFX9-NEXT: s_endpgm
1287 ; GFX10-LABEL: unsafe_frem_f64:
1289 ; GFX10-NEXT: s_clause 0x1
1290 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1291 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1292 ; GFX10-NEXT: v_mov_b32_e32 v10, 0
1293 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1294 ; GFX10-NEXT: s_clause 0x1
1295 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[6:7]
1296 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[2:3]
1297 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1298 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1299 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1300 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1301 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1302 ; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1303 ; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
1304 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1305 ; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1306 ; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1307 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1308 ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
1309 ; GFX10-NEXT: s_endpgm
1310 double addrspace(1)* %in2) #1 {
; %r0 (from %in1) is the dividend and %r1 (from %in2) is the divisor.
1311 %r0 = load double, double addrspace(1)* %in1, align 8
1312 %r1 = load double, double addrspace(1)* %in2, align 8
; Only `afn` is set on the instruction; the expectations above still use the
; v_rcp_f64-based quotient rather than the v_div_* sequence.
1313 %r2 = frem afn double %r0, %r1
1314 store double %r2, double addrspace(1)* %out, align 8
; frem_v2f16: remainder of two <2 x half> vectors with no fast-math flags.
; The first operand is loaded from %in1, the second from element 4 past %in2,
; and the result is stored to %out. The checks below handle the two lanes
; separately on every target:
;  - The SI and CI checks convert each half to f32 (v_cvt_f32_f16), run the
;    f32 v_div_scale / v_rcp / v_div_fmas / v_div_fixup expansion bracketed
;    by s_setreg writes to HW_REG_MODE, apply v_trunc_f32 and v_fma_f32,
;    convert back with v_cvt_f16_f32 and combine the lanes with
;    v_lshlrev_b32 and v_or_b32.
;  - The VI, GFX9 and GFX10 checks compute the quotient with v_rcp_f32 and
;    v_mul_f32, convert it to f16, then apply v_div_fixup_f16, v_trunc_f16
;    and v_fma_f16 (v_fmac_f16 on GFX10). Lanes are combined with v_or_b32
;    on VI and v_pack_b32_f16 on GFX9 and GFX10.
; NOTE(review): attribute group #0 is defined outside this part of the file.
1318 define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
1319 ; SI-LABEL: frem_v2f16:
1321 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1322 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1323 ; SI-NEXT: s_mov_b32 s3, 0xf000
1324 ; SI-NEXT: s_mov_b32 s2, -1
1325 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1326 ; SI-NEXT: s_mov_b32 s0, s4
1327 ; SI-NEXT: s_mov_b32 s1, s5
1328 ; SI-NEXT: s_mov_b32 s4, s6
1329 ; SI-NEXT: s_mov_b32 s5, s7
1330 ; SI-NEXT: s_mov_b32 s6, s2
1331 ; SI-NEXT: s_mov_b32 s7, s3
1332 ; SI-NEXT: s_mov_b32 s10, s2
1333 ; SI-NEXT: s_mov_b32 s11, s3
1334 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
1335 ; SI-NEXT: s_waitcnt vmcnt(0)
1336 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
1337 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1338 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1339 ; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16
1340 ; SI-NEXT: s_waitcnt vmcnt(0)
1341 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
1342 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1343 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1344 ; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
1345 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
1346 ; SI-NEXT: v_rcp_f32_e32 v6, v5
1347 ; SI-NEXT: s_mov_b32 s6, 3
1348 ; SI-NEXT: s_mov_b32 s7, 0
1349 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1350 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
1351 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6
1352 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6
1353 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
1354 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7
1355 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
1356 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1357 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1358 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
1359 ; SI-NEXT: v_trunc_f32_e32 v4, v4
1360 ; SI-NEXT: v_fma_f32 v0, -v4, v2, v0
1361 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1362 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1363 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1364 ; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
1365 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1
1366 ; SI-NEXT: v_rcp_f32_e32 v5, v4
1367 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1368 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
1369 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5
1370 ; SI-NEXT: v_mul_f32_e32 v6, v2, v5
1371 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v2
1372 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6
1373 ; SI-NEXT: v_fma_f32 v2, -v4, v6, v2
1374 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1375 ; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
1376 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
1377 ; SI-NEXT: v_trunc_f32_e32 v2, v2
1378 ; SI-NEXT: v_fma_f32 v1, -v2, v3, v1
1379 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1380 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1381 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1384 ; CI-LABEL: frem_v2f16:
1386 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1387 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1388 ; CI-NEXT: s_mov_b32 s3, 0xf000
1389 ; CI-NEXT: s_mov_b32 s2, -1
1390 ; CI-NEXT: s_mov_b32 s10, s2
1391 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1392 ; CI-NEXT: s_mov_b32 s0, s4
1393 ; CI-NEXT: s_mov_b32 s1, s5
1394 ; CI-NEXT: s_mov_b32 s4, s6
1395 ; CI-NEXT: s_mov_b32 s5, s7
1396 ; CI-NEXT: s_mov_b32 s11, s3
1397 ; CI-NEXT: s_mov_b32 s6, s2
1398 ; CI-NEXT: s_mov_b32 s7, s3
1399 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
1400 ; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16
1401 ; CI-NEXT: s_mov_b32 s6, 3
1402 ; CI-NEXT: s_mov_b32 s7, 0
1403 ; CI-NEXT: s_waitcnt vmcnt(1)
1404 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
1405 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1406 ; CI-NEXT: s_waitcnt vmcnt(0)
1407 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
1408 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1409 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1410 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
1411 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
1412 ; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
1413 ; CI-NEXT: v_rcp_f32_e32 v6, v5
1414 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1415 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
1416 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
1417 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6
1418 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
1419 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
1420 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
1421 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1422 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1423 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
1424 ; CI-NEXT: v_trunc_f32_e32 v4, v4
1425 ; CI-NEXT: v_fma_f32 v0, -v4, v2, v0
1426 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1
1427 ; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
1428 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1429 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1430 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1431 ; CI-NEXT: v_rcp_f32_e32 v5, v4
1432 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1433 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
1434 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
1435 ; CI-NEXT: v_mul_f32_e32 v6, v2, v5
1436 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v2
1437 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
1438 ; CI-NEXT: v_fma_f32 v2, -v4, v6, v2
1439 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1440 ; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
1441 ; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
1442 ; CI-NEXT: v_trunc_f32_e32 v2, v2
1443 ; CI-NEXT: v_fma_f32 v1, -v2, v3, v1
1444 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1445 ; CI-NEXT: v_or_b32_e32 v0, v1, v0
1446 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1449 ; VI-LABEL: frem_v2f16:
1451 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1452 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1453 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1454 ; VI-NEXT: v_mov_b32_e32 v2, s6
1455 ; VI-NEXT: s_add_u32 s0, s0, 16
1456 ; VI-NEXT: v_mov_b32_e32 v3, s7
1457 ; VI-NEXT: s_addc_u32 s1, s1, 0
1458 ; VI-NEXT: flat_load_dword v4, v[2:3]
1459 ; VI-NEXT: v_mov_b32_e32 v3, s1
1460 ; VI-NEXT: v_mov_b32_e32 v2, s0
1461 ; VI-NEXT: flat_load_dword v2, v[2:3]
1462 ; VI-NEXT: v_mov_b32_e32 v0, s4
1463 ; VI-NEXT: v_mov_b32_e32 v1, s5
1464 ; VI-NEXT: s_waitcnt vmcnt(1)
1465 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
1466 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v3
1467 ; VI-NEXT: s_waitcnt vmcnt(0)
1468 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
1469 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
1470 ; VI-NEXT: v_rcp_f32_e32 v7, v7
1471 ; VI-NEXT: v_mul_f32_e32 v5, v5, v7
1472 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
1473 ; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3
1474 ; VI-NEXT: v_trunc_f16_e32 v5, v5
1475 ; VI-NEXT: v_fma_f16 v3, -v5, v6, v3
1476 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
1477 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v4
1478 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1479 ; VI-NEXT: v_rcp_f32_e32 v6, v6
1480 ; VI-NEXT: v_mul_f32_e32 v5, v5, v6
1481 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
1482 ; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4
1483 ; VI-NEXT: v_trunc_f16_e32 v5, v5
1484 ; VI-NEXT: v_fma_f16 v2, -v5, v2, v4
1485 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
1486 ; VI-NEXT: flat_store_dword v[0:1], v2
1489 ; GFX9-LABEL: frem_v2f16:
1491 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1492 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1493 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1494 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1495 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1496 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16
1497 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1498 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
1499 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1500 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
1501 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4
1502 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
1503 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
1504 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
1505 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
1506 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1
1507 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1508 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
1509 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1510 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1
1511 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
1512 ; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5
1513 ; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
1514 ; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
1515 ; GFX9-NEXT: v_trunc_f16_e32 v4, v4
1516 ; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
1517 ; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
1518 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
1519 ; GFX9-NEXT: s_endpgm
1521 ; GFX10-LABEL: frem_v2f16:
1523 ; GFX10-NEXT: s_clause 0x1
1524 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1525 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1526 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1527 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1528 ; GFX10-NEXT: s_clause 0x1
1529 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
1530 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16
1531 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1532 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
1533 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1534 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
1535 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4
1536 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
1537 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
1538 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
1539 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
1540 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1541 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
1542 ; GFX10-NEXT: v_fmac_f16_e64 v4, -v3, v2
1543 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1544 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
1545 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
1546 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5
1547 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
1548 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
1549 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
1550 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
1551 ; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
1552 ; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1
1553 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1554 ; GFX10-NEXT: s_endpgm
1555 <2 x half> addrspace(1)* %in2) #0 {
; The divisor is read from index 4 past %in2. With 4-byte <2 x half> elements
; that is a 16-byte displacement, which appears as offset:16 on the second
; load in the SI, CI, GFX9 and GFX10 checks and as s_add_u32 ..., 16 on VI.
1556 %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
1557 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
1558 %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
; Vector frem with no fast-math flags; the checks above handle each lane
; separately.
1559 %r2 = frem <2 x half> %r0, %r1
1560 store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
; Test kernel frem_v4f16 - frem of two <4 x half> vectors held in global memory.
; The dividend is loaded straight from %in1, the divisor is loaded 4 vectors
; past %in2, and the remainder is stored to %out.
;
; The assertion lines that sit between the two halves of the signature are
; autogenerated (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; What the expected output shows, per run configuration -
;  * the SI and CI runs convert every half lane to f32, run the
;    v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup f32 sequence,
;    truncate, and convert back with v_cvt_f16_f32 before repacking lanes
;    with shift + or;
;  * the VI, GFX9 and GFX10 runs use v_rcp_f32 + v_mul_f32 for the quotient
;    estimate, then v_div_fixup_f16, v_trunc_f16 and an f16 fma (fmac on
;    GFX10) per lane.
1564 define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1,
1565 ; SI-LABEL: frem_v4f16:
1567 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1568 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1569 ; SI-NEXT: s_mov_b32 s3, 0xf000
1570 ; SI-NEXT: s_mov_b32 s2, -1
1571 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1572 ; SI-NEXT: s_mov_b32 s0, s4
1573 ; SI-NEXT: s_mov_b32 s1, s5
1574 ; SI-NEXT: s_mov_b32 s4, s6
1575 ; SI-NEXT: s_mov_b32 s5, s7
1576 ; SI-NEXT: s_mov_b32 s6, s2
1577 ; SI-NEXT: s_mov_b32 s7, s3
1578 ; SI-NEXT: s_mov_b32 s10, s2
1579 ; SI-NEXT: s_mov_b32 s11, s3
1580 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1581 ; SI-NEXT: s_waitcnt vmcnt(0)
1582 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
1583 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1584 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
1585 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
1586 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1587 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0
1588 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1589 ; SI-NEXT: s_waitcnt vmcnt(0)
1590 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0
1591 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1592 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1593 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
1594 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1595 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1596 ; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
1597 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
1598 ; SI-NEXT: v_rcp_f32_e32 v10, v9
1599 ; SI-NEXT: s_mov_b32 s6, 3
1600 ; SI-NEXT: s_mov_b32 s7, 0
1601 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1602 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
1603 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10
1604 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10
1605 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
1606 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11
1607 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
1608 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1609 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
1610 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
1611 ; SI-NEXT: v_trunc_f32_e32 v8, v8
1612 ; SI-NEXT: v_fma_f32 v1, -v8, v1, v5
1613 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1614 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1615 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1616 ; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4
1617 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4
1618 ; SI-NEXT: v_rcp_f32_e32 v9, v8
1619 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1620 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
1621 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9
1622 ; SI-NEXT: v_mul_f32_e32 v10, v5, v9
1623 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v5
1624 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10
1625 ; SI-NEXT: v_fma_f32 v5, -v8, v10, v5
1626 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1627 ; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
1628 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
1629 ; SI-NEXT: v_trunc_f32_e32 v5, v5
1630 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
1631 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
1632 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
1633 ; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
1634 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3
1635 ; SI-NEXT: v_rcp_f32_e32 v7, v5
1636 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1637 ; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
1638 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7
1639 ; SI-NEXT: v_mul_f32_e32 v8, v4, v7
1640 ; SI-NEXT: v_fma_f32 v9, -v5, v8, v4
1641 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8
1642 ; SI-NEXT: v_fma_f32 v4, -v5, v8, v4
1643 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1644 ; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
1645 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
1646 ; SI-NEXT: v_trunc_f32_e32 v4, v4
1647 ; SI-NEXT: v_fma_f32 v0, -v4, v0, v3
1648 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1649 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1650 ; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2
1651 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2
1652 ; SI-NEXT: v_rcp_f32_e32 v5, v4
1653 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1654 ; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
1655 ; SI-NEXT: v_fma_f32 v5, v7, v5, v5
1656 ; SI-NEXT: v_mul_f32_e32 v7, v3, v5
1657 ; SI-NEXT: v_fma_f32 v8, -v4, v7, v3
1658 ; SI-NEXT: v_fma_f32 v7, v8, v5, v7
1659 ; SI-NEXT: v_fma_f32 v3, -v4, v7, v3
1660 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1661 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
1662 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
1663 ; SI-NEXT: v_trunc_f32_e32 v3, v3
1664 ; SI-NEXT: v_fma_f32 v2, -v3, v6, v2
1665 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1666 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
1667 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1670 ; CI-LABEL: frem_v4f16:
1672 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1673 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1674 ; CI-NEXT: s_mov_b32 s3, 0xf000
1675 ; CI-NEXT: s_mov_b32 s2, -1
1676 ; CI-NEXT: s_mov_b32 s10, s2
1677 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1678 ; CI-NEXT: s_mov_b32 s0, s4
1679 ; CI-NEXT: s_mov_b32 s1, s5
1680 ; CI-NEXT: s_mov_b32 s4, s6
1681 ; CI-NEXT: s_mov_b32 s5, s7
1682 ; CI-NEXT: s_mov_b32 s6, s2
1683 ; CI-NEXT: s_mov_b32 s7, s3
1684 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1685 ; CI-NEXT: s_mov_b32 s11, s3
1686 ; CI-NEXT: s_mov_b32 s6, 3
1687 ; CI-NEXT: s_mov_b32 s7, 0
1688 ; CI-NEXT: s_waitcnt vmcnt(0)
1689 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0
1690 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1691 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v0
1692 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
1693 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v1
1694 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v0
1695 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1696 ; CI-NEXT: s_waitcnt vmcnt(0)
1697 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
1698 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1699 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1700 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
1701 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1702 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1703 ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
1704 ; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
1705 ; CI-NEXT: v_rcp_f32_e32 v10, v9
1706 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1707 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
1708 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10
1709 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10
1710 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
1711 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11
1712 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
1713 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1714 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
1715 ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
1716 ; CI-NEXT: v_trunc_f32_e32 v8, v8
1717 ; CI-NEXT: v_fma_f32 v1, -v8, v1, v5
1718 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4
1719 ; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4
1720 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1721 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
1722 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1723 ; CI-NEXT: v_rcp_f32_e32 v9, v8
1724 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1725 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
1726 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9
1727 ; CI-NEXT: v_mul_f32_e32 v10, v5, v9
1728 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v5
1729 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10
1730 ; CI-NEXT: v_fma_f32 v5, -v8, v10, v5
1731 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1732 ; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
1733 ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
1734 ; CI-NEXT: v_trunc_f32_e32 v5, v5
1735 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
1736 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3
1737 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
1738 ; CI-NEXT: v_or_b32_e32 v1, v4, v1
1739 ; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
1740 ; CI-NEXT: v_rcp_f32_e32 v7, v5
1741 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1742 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
1743 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
1744 ; CI-NEXT: v_mul_f32_e32 v8, v4, v7
1745 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v4
1746 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
1747 ; CI-NEXT: v_fma_f32 v4, -v5, v8, v4
1748 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1749 ; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
1750 ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
1751 ; CI-NEXT: v_trunc_f32_e32 v4, v4
1752 ; CI-NEXT: v_fma_f32 v0, -v4, v0, v3
1753 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2
1754 ; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2
1755 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
1756 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1757 ; CI-NEXT: v_rcp_f32_e32 v5, v4
1758 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1759 ; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
1760 ; CI-NEXT: v_fma_f32 v5, v7, v5, v5
1761 ; CI-NEXT: v_mul_f32_e32 v7, v3, v5
1762 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v3
1763 ; CI-NEXT: v_fma_f32 v7, v8, v5, v7
1764 ; CI-NEXT: v_fma_f32 v3, -v4, v7, v3
1765 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1766 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
1767 ; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
1768 ; CI-NEXT: v_trunc_f32_e32 v3, v3
1769 ; CI-NEXT: v_fma_f32 v2, -v3, v6, v2
1770 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
1771 ; CI-NEXT: v_or_b32_e32 v0, v2, v0
1772 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1775 ; VI-LABEL: frem_v4f16:
1777 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1778 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1779 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1780 ; VI-NEXT: v_mov_b32_e32 v2, s6
1781 ; VI-NEXT: s_add_u32 s0, s0, 32
1782 ; VI-NEXT: s_addc_u32 s1, s1, 0
1783 ; VI-NEXT: v_mov_b32_e32 v5, s1
1784 ; VI-NEXT: v_mov_b32_e32 v4, s0
1785 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
1786 ; VI-NEXT: v_mov_b32_e32 v3, s7
1787 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1788 ; VI-NEXT: v_mov_b32_e32 v0, s4
1789 ; VI-NEXT: v_mov_b32_e32 v1, s5
1790 ; VI-NEXT: s_waitcnt vmcnt(1)
1791 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
1792 ; VI-NEXT: v_cvt_f32_f16_e32 v9, v8
1793 ; VI-NEXT: s_waitcnt vmcnt(0)
1794 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1795 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
1796 ; VI-NEXT: v_rcp_f32_e32 v9, v9
1797 ; VI-NEXT: v_mul_f32_e32 v7, v7, v9
1798 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
1799 ; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6
1800 ; VI-NEXT: v_trunc_f16_e32 v7, v7
1801 ; VI-NEXT: v_fma_f16 v6, -v7, v8, v6
1802 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v5
1803 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
1804 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1805 ; VI-NEXT: v_rcp_f32_e32 v8, v8
1806 ; VI-NEXT: v_mul_f32_e32 v7, v7, v8
1807 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
1808 ; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3
1809 ; VI-NEXT: v_trunc_f16_e32 v7, v7
1810 ; VI-NEXT: v_fma_f16 v3, -v7, v5, v3
1811 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
1812 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
1813 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1814 ; VI-NEXT: v_or_b32_e32 v3, v3, v6
1815 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
1816 ; VI-NEXT: v_rcp_f32_e32 v8, v8
1817 ; VI-NEXT: v_mul_f32_e32 v6, v6, v8
1818 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
1819 ; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5
1820 ; VI-NEXT: v_trunc_f16_e32 v6, v6
1821 ; VI-NEXT: v_fma_f16 v5, -v6, v7, v5
1822 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v4
1823 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
1824 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
1825 ; VI-NEXT: v_rcp_f32_e32 v7, v7
1826 ; VI-NEXT: v_mul_f32_e32 v6, v6, v7
1827 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
1828 ; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2
1829 ; VI-NEXT: v_trunc_f16_e32 v6, v6
1830 ; VI-NEXT: v_fma_f16 v2, -v6, v4, v2
1831 ; VI-NEXT: v_or_b32_e32 v2, v2, v5
1832 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1835 ; GFX9-LABEL: frem_v4f16:
1837 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1838 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1839 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
1840 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1841 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
1842 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
1843 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1844 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1
1845 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1846 ; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3
1847 ; GFX9-NEXT: v_rcp_f32_e32 v6, v6
1848 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6
1849 ; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
1850 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1
1851 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5
1852 ; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1
1853 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1854 ; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3
1855 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1856 ; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1
1857 ; GFX9-NEXT: v_rcp_f32_e32 v7, v7
1858 ; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7
1859 ; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6
1860 ; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
1861 ; GFX9-NEXT: v_trunc_f16_e32 v6, v6
1862 ; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
1863 ; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
1864 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
1865 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
1866 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5
1867 ; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
1868 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
1869 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
1870 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3
1871 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
1872 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1873 ; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2
1874 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1875 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0
1876 ; GFX9-NEXT: v_rcp_f32_e32 v6, v6
1877 ; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6
1878 ; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
1879 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
1880 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5
1881 ; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
1882 ; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
1883 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
1884 ; GFX9-NEXT: s_endpgm
1886 ; GFX10-LABEL: frem_v4f16:
1888 ; GFX10-NEXT: s_clause 0x1
1889 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1890 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1891 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
1892 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1893 ; GFX10-NEXT: s_clause 0x1
1894 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
1895 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
1896 ; GFX10-NEXT: s_waitcnt vmcnt(1)
1897 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
1898 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1899 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
1900 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6
1901 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
1902 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
1903 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
1904 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
1905 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1906 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5
1907 ; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3
1908 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1909 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
1910 ; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3
1911 ; GFX10-NEXT: v_rcp_f32_e32 v7, v7
1912 ; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
1913 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
1914 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
1915 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5
1916 ; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
1917 ; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
1918 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
1919 ; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
1920 ; GFX10-NEXT: v_rcp_f32_e32 v5, v5
1921 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
1922 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1923 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
1924 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
1925 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1926 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
1927 ; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2
1928 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1929 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
1930 ; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
1931 ; GFX10-NEXT: v_rcp_f32_e32 v6, v6
1932 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
1933 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
1934 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
1935 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3
1936 ; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2
1937 ; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0
1938 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
1939 ; GFX10-NEXT: s_endpgm
1940 <4 x half> addrspace(1)* %in2) #0 {
; Index 4 on a <4 x half> pointer steps 4 whole vectors past %in2; the
; expected output above shows this as a 32-byte offset on the second load.
1941 %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
1942 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
1943 %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
1944 %r2 = frem <4 x half> %r0, %r1
1945 store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
; Test kernel frem_v2f32 - frem of two <2 x float> vectors held in global memory.
; The dividend is loaded straight from %in1, the divisor is loaded 4 vectors
; past %in2, and the remainder is stored to %out.
;
; The assertion lines that sit between the two halves of the signature are
; autogenerated (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
;
; What the expected output shows - every run configuration handles the two
; lanes one after the other (element 1 first, then element 0) with the
; v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup sequence, followed by
; v_trunc_f32 and a final fma (fmac for the last lane on GFX10). The fma chain
; is bracketed by s_setreg_b32 writes to HW_REG_MODE on the SI, CI, VI and
; GFX9 runs, and by s_denorm_mode 15 / s_denorm_mode 12 on the GFX10 run.
1949 define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
1950 ; SI-LABEL: frem_v2f32:
1952 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1953 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1954 ; SI-NEXT: s_mov_b32 s3, 0xf000
1955 ; SI-NEXT: s_mov_b32 s2, -1
1956 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1957 ; SI-NEXT: s_mov_b32 s0, s4
1958 ; SI-NEXT: s_mov_b32 s1, s5
1959 ; SI-NEXT: s_mov_b32 s4, s6
1960 ; SI-NEXT: s_mov_b32 s5, s7
1961 ; SI-NEXT: s_mov_b32 s6, s2
1962 ; SI-NEXT: s_mov_b32 s7, s3
1963 ; SI-NEXT: s_mov_b32 s10, s2
1964 ; SI-NEXT: s_mov_b32 s11, s3
1965 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1966 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
1967 ; SI-NEXT: s_waitcnt vmcnt(0)
1968 ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
1969 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
1970 ; SI-NEXT: v_rcp_f32_e32 v6, v5
1971 ; SI-NEXT: s_mov_b32 s6, 3
1972 ; SI-NEXT: s_mov_b32 s7, 0
1973 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1974 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
1975 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6
1976 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6
1977 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
1978 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7
1979 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
1980 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1981 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
1982 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
1983 ; SI-NEXT: v_trunc_f32_e32 v4, v4
1984 ; SI-NEXT: v_fma_f32 v1, -v4, v3, v1
1985 ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
1986 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
1987 ; SI-NEXT: v_rcp_f32_e32 v5, v4
1988 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1989 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
1990 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5
1991 ; SI-NEXT: v_mul_f32_e32 v6, v3, v5
1992 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v3
1993 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6
1994 ; SI-NEXT: v_fma_f32 v3, -v4, v6, v3
1995 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1996 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
1997 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
1998 ; SI-NEXT: v_trunc_f32_e32 v3, v3
1999 ; SI-NEXT: v_fma_f32 v0, -v3, v2, v0
2000 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2003 ; CI-LABEL: frem_v2f32:
2005 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2006 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2007 ; CI-NEXT: s_mov_b32 s3, 0xf000
2008 ; CI-NEXT: s_mov_b32 s2, -1
2009 ; CI-NEXT: s_mov_b32 s10, s2
2010 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2011 ; CI-NEXT: s_mov_b32 s0, s4
2012 ; CI-NEXT: s_mov_b32 s1, s5
2013 ; CI-NEXT: s_mov_b32 s4, s6
2014 ; CI-NEXT: s_mov_b32 s5, s7
2015 ; CI-NEXT: s_mov_b32 s6, s2
2016 ; CI-NEXT: s_mov_b32 s7, s3
2017 ; CI-NEXT: s_mov_b32 s11, s3
2018 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2019 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2020 ; CI-NEXT: s_mov_b32 s6, 3
2021 ; CI-NEXT: s_mov_b32 s7, 0
2022 ; CI-NEXT: s_waitcnt vmcnt(0)
2023 ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
2024 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
2025 ; CI-NEXT: v_rcp_f32_e32 v6, v5
2026 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2027 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2028 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6
2029 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6
2030 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
2031 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7
2032 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
2033 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2034 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
2035 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
2036 ; CI-NEXT: v_trunc_f32_e32 v4, v4
2037 ; CI-NEXT: v_fma_f32 v1, -v4, v3, v1
2038 ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
2039 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2040 ; CI-NEXT: v_rcp_f32_e32 v5, v4
2041 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2042 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
2043 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5
2044 ; CI-NEXT: v_mul_f32_e32 v6, v3, v5
2045 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v3
2046 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6
2047 ; CI-NEXT: v_fma_f32 v3, -v4, v6, v3
2048 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2049 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
2050 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2051 ; CI-NEXT: v_trunc_f32_e32 v3, v3
2052 ; CI-NEXT: v_fma_f32 v0, -v3, v2, v0
2053 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2056 ; VI-LABEL: frem_v2f32:
2058 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2059 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2060 ; VI-NEXT: s_mov_b32 s2, 3
2061 ; VI-NEXT: s_mov_b32 s3, 0
2062 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2063 ; VI-NEXT: v_mov_b32_e32 v2, s6
2064 ; VI-NEXT: s_add_u32 s0, s0, 32
2065 ; VI-NEXT: s_addc_u32 s1, s1, 0
2066 ; VI-NEXT: v_mov_b32_e32 v5, s1
2067 ; VI-NEXT: v_mov_b32_e32 v3, s7
2068 ; VI-NEXT: v_mov_b32_e32 v4, s0
2069 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
2070 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
2071 ; VI-NEXT: v_mov_b32_e32 v0, s4
2072 ; VI-NEXT: v_mov_b32_e32 v1, s5
2073 ; VI-NEXT: s_waitcnt vmcnt(0)
2074 ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3
2075 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3
2076 ; VI-NEXT: v_rcp_f32_e32 v8, v7
2077 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2078 ; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
2079 ; VI-NEXT: v_fma_f32 v8, v9, v8, v8
2080 ; VI-NEXT: v_mul_f32_e32 v9, v6, v8
2081 ; VI-NEXT: v_fma_f32 v10, -v7, v9, v6
2082 ; VI-NEXT: v_fma_f32 v9, v10, v8, v9
2083 ; VI-NEXT: v_fma_f32 v6, -v7, v9, v6
2084 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2085 ; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
2086 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3
2087 ; VI-NEXT: v_trunc_f32_e32 v6, v6
2088 ; VI-NEXT: v_fma_f32 v3, -v6, v5, v3
2089 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2
2090 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2
2091 ; VI-NEXT: v_rcp_f32_e32 v7, v6
2092 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2093 ; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2094 ; VI-NEXT: v_fma_f32 v7, v8, v7, v7
2095 ; VI-NEXT: v_mul_f32_e32 v8, v5, v7
2096 ; VI-NEXT: v_fma_f32 v9, -v6, v8, v5
2097 ; VI-NEXT: v_fma_f32 v8, v9, v7, v8
2098 ; VI-NEXT: v_fma_f32 v5, -v6, v8, v5
2099 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2100 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2101 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2
2102 ; VI-NEXT: v_trunc_f32_e32 v5, v5
2103 ; VI-NEXT: v_fma_f32 v2, -v5, v4, v2
2104 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2107 ; GFX9-LABEL: frem_v2f32:
2109 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2110 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2111 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2112 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2113 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2114 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2115 ; GFX9-NEXT: s_mov_b32 s2, 3
2116 ; GFX9-NEXT: s_mov_b32 s3, 0
2117 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2118 ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1
2119 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
2120 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6
2121 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2122 ; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2123 ; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7
2124 ; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7
2125 ; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5
2126 ; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8
2127 ; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5
2128 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2129 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2130 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2131 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
2132 ; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1
2133 ; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0
2134 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
2135 ; GFX9-NEXT: v_rcp_f32_e32 v6, v5
2136 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2137 ; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2138 ; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6
2139 ; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6
2140 ; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3
2141 ; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7
2142 ; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3
2143 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2144 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2145 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2146 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3
2147 ; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0
2148 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2149 ; GFX9-NEXT: s_endpgm
2151 ; GFX10-LABEL: frem_v2f32:
2153 ; GFX10-NEXT: s_clause 0x1
2154 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2155 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2156 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
2157 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2158 ; GFX10-NEXT: s_clause 0x1
2159 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
2160 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2161 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2162 ; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1
2163 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2164 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6
2165 ; GFX10-NEXT: s_denorm_mode 15
2166 ; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2167 ; GFX10-NEXT: v_fma_f32 v7, v8, v7, v7
2168 ; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7
2169 ; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5
2170 ; GFX10-NEXT: v_fma_f32 v8, v9, v7, v8
2171 ; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5
2172 ; GFX10-NEXT: s_denorm_mode 12
2173 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2174 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1
2175 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
2176 ; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1
2177 ; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0
2178 ; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2179 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5
2180 ; GFX10-NEXT: s_denorm_mode 15
2181 ; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0
2182 ; GFX10-NEXT: v_fma_f32 v6, v7, v6, v6
2183 ; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6
2184 ; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3
2185 ; GFX10-NEXT: v_fma_f32 v7, v8, v6, v7
2186 ; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3
2187 ; GFX10-NEXT: s_denorm_mode 12
2188 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7
2189 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0
2190 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3
2191 ; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2
2192 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
2193 ; GFX10-NEXT: s_endpgm
2194 <2 x float> addrspace(1)* %in2) #0 {
; Index 4 on a <2 x float> pointer steps 4 whole vectors past %in2; the
; expected output above shows this as a 32-byte offset on the second load.
2195 %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
2196 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
2197 %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
2198 %r2 = frem <2 x float> %r0, %r1
2199 store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
2203 define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
2204 ; SI-LABEL: frem_v4f32:
2206 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2207 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2208 ; SI-NEXT: s_mov_b32 s3, 0xf000
2209 ; SI-NEXT: s_mov_b32 s2, -1
2210 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2211 ; SI-NEXT: s_mov_b32 s0, s4
2212 ; SI-NEXT: s_mov_b32 s1, s5
2213 ; SI-NEXT: s_mov_b32 s4, s6
2214 ; SI-NEXT: s_mov_b32 s5, s7
2215 ; SI-NEXT: s_mov_b32 s6, s2
2216 ; SI-NEXT: s_mov_b32 s7, s3
2217 ; SI-NEXT: s_mov_b32 s10, s2
2218 ; SI-NEXT: s_mov_b32 s11, s3
2219 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2220 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2221 ; SI-NEXT: s_waitcnt vmcnt(0)
2222 ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
2223 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
2224 ; SI-NEXT: v_rcp_f32_e32 v10, v9
2225 ; SI-NEXT: s_mov_b32 s6, 3
2226 ; SI-NEXT: s_mov_b32 s7, 0
2227 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2228 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2229 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10
2230 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10
2231 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
2232 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11
2233 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
2234 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2235 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
2236 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
2237 ; SI-NEXT: v_trunc_f32_e32 v8, v8
2238 ; SI-NEXT: v_fma_f32 v3, -v8, v7, v3
2239 ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2240 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
2241 ; SI-NEXT: v_rcp_f32_e32 v9, v8
2242 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2243 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
2244 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9
2245 ; SI-NEXT: v_mul_f32_e32 v10, v7, v9
2246 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v7
2247 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10
2248 ; SI-NEXT: v_fma_f32 v7, -v8, v10, v7
2249 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2250 ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
2251 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2252 ; SI-NEXT: v_trunc_f32_e32 v7, v7
2253 ; SI-NEXT: v_fma_f32 v2, -v7, v6, v2
2254 ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2255 ; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
2256 ; SI-NEXT: v_rcp_f32_e32 v8, v7
2257 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2258 ; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
2259 ; SI-NEXT: v_fma_f32 v8, v9, v8, v8
2260 ; SI-NEXT: v_mul_f32_e32 v9, v6, v8
2261 ; SI-NEXT: v_fma_f32 v10, -v7, v9, v6
2262 ; SI-NEXT: v_fma_f32 v9, v10, v8, v9
2263 ; SI-NEXT: v_fma_f32 v6, -v7, v9, v6
2264 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2265 ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
2266 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2267 ; SI-NEXT: v_trunc_f32_e32 v6, v6
2268 ; SI-NEXT: v_fma_f32 v1, -v6, v5, v1
2269 ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2270 ; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
2271 ; SI-NEXT: v_rcp_f32_e32 v7, v6
2272 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2273 ; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2274 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7
2275 ; SI-NEXT: v_mul_f32_e32 v8, v5, v7
2276 ; SI-NEXT: v_fma_f32 v9, -v6, v8, v5
2277 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8
2278 ; SI-NEXT: v_fma_f32 v5, -v6, v8, v5
2279 ; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2280 ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2281 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2282 ; SI-NEXT: v_trunc_f32_e32 v5, v5
2283 ; SI-NEXT: v_fma_f32 v0, -v5, v4, v0
2284 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2287 ; CI-LABEL: frem_v4f32:
2289 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2290 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2291 ; CI-NEXT: s_mov_b32 s3, 0xf000
2292 ; CI-NEXT: s_mov_b32 s2, -1
2293 ; CI-NEXT: s_mov_b32 s10, s2
2294 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2295 ; CI-NEXT: s_mov_b32 s0, s4
2296 ; CI-NEXT: s_mov_b32 s1, s5
2297 ; CI-NEXT: s_mov_b32 s4, s6
2298 ; CI-NEXT: s_mov_b32 s5, s7
2299 ; CI-NEXT: s_mov_b32 s6, s2
2300 ; CI-NEXT: s_mov_b32 s7, s3
2301 ; CI-NEXT: s_mov_b32 s11, s3
2302 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2303 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2304 ; CI-NEXT: s_mov_b32 s6, 3
2305 ; CI-NEXT: s_mov_b32 s7, 0
2306 ; CI-NEXT: s_waitcnt vmcnt(0)
2307 ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
2308 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
2309 ; CI-NEXT: v_rcp_f32_e32 v10, v9
2310 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2311 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2312 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10
2313 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10
2314 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
2315 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11
2316 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
2317 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2318 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
2319 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
2320 ; CI-NEXT: v_trunc_f32_e32 v8, v8
2321 ; CI-NEXT: v_fma_f32 v3, -v8, v7, v3
2322 ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
2323 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2324 ; CI-NEXT: v_rcp_f32_e32 v9, v8
2325 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2326 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
2327 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9
2328 ; CI-NEXT: v_mul_f32_e32 v10, v7, v9
2329 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v7
2330 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10
2331 ; CI-NEXT: v_fma_f32 v7, -v8, v10, v7
2332 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2333 ; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
2334 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2335 ; CI-NEXT: v_trunc_f32_e32 v7, v7
2336 ; CI-NEXT: v_fma_f32 v2, -v7, v6, v2
2337 ; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
2338 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2339 ; CI-NEXT: v_rcp_f32_e32 v8, v7
2340 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2341 ; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
2342 ; CI-NEXT: v_fma_f32 v8, v9, v8, v8
2343 ; CI-NEXT: v_mul_f32_e32 v9, v6, v8
2344 ; CI-NEXT: v_fma_f32 v10, -v7, v9, v6
2345 ; CI-NEXT: v_fma_f32 v9, v10, v8, v9
2346 ; CI-NEXT: v_fma_f32 v6, -v7, v9, v6
2347 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2348 ; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
2349 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2350 ; CI-NEXT: v_trunc_f32_e32 v6, v6
2351 ; CI-NEXT: v_fma_f32 v1, -v6, v5, v1
2352 ; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
2353 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2354 ; CI-NEXT: v_rcp_f32_e32 v7, v6
2355 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2356 ; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
2357 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7
2358 ; CI-NEXT: v_mul_f32_e32 v8, v5, v7
2359 ; CI-NEXT: v_fma_f32 v9, -v6, v8, v5
2360 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8
2361 ; CI-NEXT: v_fma_f32 v5, -v6, v8, v5
2362 ; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2363 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
2364 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2365 ; CI-NEXT: v_trunc_f32_e32 v5, v5
2366 ; CI-NEXT: v_fma_f32 v0, -v5, v4, v0
2367 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2370 ; VI-LABEL: frem_v4f32:
2372 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2373 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2374 ; VI-NEXT: s_mov_b32 s2, 3
2375 ; VI-NEXT: s_mov_b32 s3, 0
2376 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2377 ; VI-NEXT: v_mov_b32_e32 v0, s6
2378 ; VI-NEXT: s_add_u32 s0, s0, 64
2379 ; VI-NEXT: s_addc_u32 s1, s1, 0
2380 ; VI-NEXT: v_mov_b32_e32 v5, s1
2381 ; VI-NEXT: v_mov_b32_e32 v1, s7
2382 ; VI-NEXT: v_mov_b32_e32 v4, s0
2383 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2384 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2385 ; VI-NEXT: v_mov_b32_e32 v8, s4
2386 ; VI-NEXT: v_mov_b32_e32 v9, s5
2387 ; VI-NEXT: s_waitcnt vmcnt(0)
2388 ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3
2389 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3
2390 ; VI-NEXT: v_rcp_f32_e32 v12, v11
2391 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2392 ; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0
2393 ; VI-NEXT: v_fma_f32 v12, v13, v12, v12
2394 ; VI-NEXT: v_mul_f32_e32 v13, v10, v12
2395 ; VI-NEXT: v_fma_f32 v14, -v11, v13, v10
2396 ; VI-NEXT: v_fma_f32 v13, v14, v12, v13
2397 ; VI-NEXT: v_fma_f32 v10, -v11, v13, v10
2398 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2399 ; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13
2400 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3
2401 ; VI-NEXT: v_trunc_f32_e32 v10, v10
2402 ; VI-NEXT: v_fma_f32 v3, -v10, v7, v3
2403 ; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2
2404 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2405 ; VI-NEXT: v_rcp_f32_e32 v11, v10
2406 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2407 ; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0
2408 ; VI-NEXT: v_fma_f32 v11, v12, v11, v11
2409 ; VI-NEXT: v_mul_f32_e32 v12, v7, v11
2410 ; VI-NEXT: v_fma_f32 v13, -v10, v12, v7
2411 ; VI-NEXT: v_fma_f32 v12, v13, v11, v12
2412 ; VI-NEXT: v_fma_f32 v7, -v10, v12, v7
2413 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2414 ; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12
2415 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2416 ; VI-NEXT: v_trunc_f32_e32 v7, v7
2417 ; VI-NEXT: v_fma_f32 v2, -v7, v6, v2
2418 ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
2419 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2420 ; VI-NEXT: v_rcp_f32_e32 v10, v7
2421 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2422 ; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0
2423 ; VI-NEXT: v_fma_f32 v10, v11, v10, v10
2424 ; VI-NEXT: v_mul_f32_e32 v11, v6, v10
2425 ; VI-NEXT: v_fma_f32 v12, -v7, v11, v6
2426 ; VI-NEXT: v_fma_f32 v11, v12, v10, v11
2427 ; VI-NEXT: v_fma_f32 v6, -v7, v11, v6
2428 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2429 ; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11
2430 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2431 ; VI-NEXT: v_trunc_f32_e32 v6, v6
2432 ; VI-NEXT: v_fma_f32 v1, -v6, v5, v1
2433 ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
2434 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2435 ; VI-NEXT: v_rcp_f32_e32 v7, v6
2436 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2437 ; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0
2438 ; VI-NEXT: v_fma_f32 v7, v10, v7, v7
2439 ; VI-NEXT: v_mul_f32_e32 v10, v5, v7
2440 ; VI-NEXT: v_fma_f32 v11, -v6, v10, v5
2441 ; VI-NEXT: v_fma_f32 v10, v11, v7, v10
2442 ; VI-NEXT: v_fma_f32 v5, -v6, v10, v5
2443 ; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2444 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10
2445 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2446 ; VI-NEXT: v_trunc_f32_e32 v5, v5
2447 ; VI-NEXT: v_fma_f32 v0, -v5, v4, v0
2448 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2451 ; GFX9-LABEL: frem_v4f32:
2453 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2454 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2455 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
2456 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2457 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
2458 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2459 ; GFX9-NEXT: s_mov_b32 s2, 3
2460 ; GFX9-NEXT: s_mov_b32 s3, 0
2461 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2462 ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3
2463 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3
2464 ; GFX9-NEXT: v_rcp_f32_e32 v11, v10
2465 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2466 ; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0
2467 ; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11
2468 ; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11
2469 ; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9
2470 ; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12
2471 ; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9
2472 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2473 ; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12
2474 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3
2475 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9
2476 ; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3
2477 ; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2
2478 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
2479 ; GFX9-NEXT: v_rcp_f32_e32 v10, v9
2480 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2481 ; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2482 ; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10
2483 ; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10
2484 ; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7
2485 ; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11
2486 ; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7
2487 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2488 ; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11
2489 ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2490 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7
2491 ; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2
2492 ; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
2493 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
2494 ; GFX9-NEXT: v_rcp_f32_e32 v9, v7
2495 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2496 ; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0
2497 ; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9
2498 ; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9
2499 ; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6
2500 ; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10
2501 ; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6
2502 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2503 ; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10
2504 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2505 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6
2506 ; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1
2507 ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
2508 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
2509 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6
2510 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2511 ; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0
2512 ; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7
2513 ; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7
2514 ; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5
2515 ; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9
2516 ; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5
2517 ; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2518 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9
2519 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2520 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5
2521 ; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0
2522 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
2523 ; GFX9-NEXT: s_endpgm
2525 ; GFX10-LABEL: frem_v4f32:
2527 ; GFX10-NEXT: s_clause 0x1
2528 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2529 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2530 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
2531 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2532 ; GFX10-NEXT: s_clause 0x1
2533 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
2534 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2535 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2536 ; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3
2537 ; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
2538 ; GFX10-NEXT: v_rcp_f32_e32 v11, v10
2539 ; GFX10-NEXT: s_denorm_mode 15
2540 ; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0
2541 ; GFX10-NEXT: v_fma_f32 v11, v12, v11, v11
2542 ; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11
2543 ; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9
2544 ; GFX10-NEXT: v_fma_f32 v12, v13, v11, v12
2545 ; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9
2546 ; GFX10-NEXT: s_denorm_mode 12
2547 ; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12
2548 ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3
2549 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9
2550 ; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3
2551 ; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2
2552 ; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
2553 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9
2554 ; GFX10-NEXT: s_denorm_mode 15
2555 ; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0
2556 ; GFX10-NEXT: v_fma_f32 v10, v11, v10, v10
2557 ; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10
2558 ; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7
2559 ; GFX10-NEXT: v_fma_f32 v11, v12, v10, v11
2560 ; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7
2561 ; GFX10-NEXT: s_denorm_mode 12
2562 ; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11
2563 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2
2564 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7
2565 ; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2
2566 ; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1
2567 ; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
2568 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7
2569 ; GFX10-NEXT: s_denorm_mode 15
2570 ; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0
2571 ; GFX10-NEXT: v_fma_f32 v9, v10, v9, v9
2572 ; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9
2573 ; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6
2574 ; GFX10-NEXT: v_fma_f32 v10, v11, v9, v10
2575 ; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6
2576 ; GFX10-NEXT: s_denorm_mode 12
2577 ; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10
2578 ; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1
2579 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6
2580 ; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1
2581 ; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0
2582 ; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
2583 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6
2584 ; GFX10-NEXT: s_denorm_mode 15
2585 ; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0
2586 ; GFX10-NEXT: v_fma_f32 v7, v9, v7, v7
2587 ; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7
2588 ; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5
2589 ; GFX10-NEXT: v_fma_f32 v9, v10, v7, v9
2590 ; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5
2591 ; GFX10-NEXT: s_denorm_mode 12
2592 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9
2593 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0
2594 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5
2595 ; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4
2596 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
2597 ; GFX10-NEXT: s_endpgm
2598 <4 x float> addrspace(1)* %in2) #0 {
2599 %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
2600 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
2601 %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
2602 %r2 = frem <4 x float> %r0, %r1
2603 store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
; frem_v2f64 kernel: loads one <2 x double> from %in1 and one from %in2
; advanced by four vector elements, takes the element-wise frem, and stores
; the result to %out.
;
; For each of the two lanes the check lines below expect a full f64 division
; (v_div_scale_f64, v_rcp_f64, v_fma_f64 refinement steps, v_div_fmas_f64,
; v_div_fixup_f64), a truncation of that quotient, and a closing v_fma_f64
; whose operands are (-truncated quotient, divisor, dividend).
; The high lane (v[2:3] / v[6:7]) is checked first, then the low lane
; (v[0:1] / v[4:5]).
2607 define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
; Expectations for the first RUN line (no -mcpu given).
; Two things differ from the other targets in this function:
;  * vcc for v_div_fmas_f64 is built from two v_cmp_eq_u32 plus s_xor_b64,
;    whereas the other targets take vcc straight from the second
;    v_div_scale_f64.
;  * where the other targets check v_trunc_f64_e32, this variant checks a
;    v_bfe_u32 / v_add_i32 / v_lshr_b64 / v_not / v_and / v_cndmask sequence.
;    NOTE(review): presumably an integer expansion of the f64 truncation for
;    a target without the trunc instruction - confirm against the backend.
2608 ; SI-LABEL: frem_v2f64:
2610 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
2611 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2612 ; SI-NEXT: s_mov_b32 s7, 0xf000
2613 ; SI-NEXT: s_mov_b32 s6, -1
2614 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2615 ; SI-NEXT: s_mov_b32 s4, s8
2616 ; SI-NEXT: s_mov_b32 s5, s9
2617 ; SI-NEXT: s_mov_b32 s8, s10
2618 ; SI-NEXT: s_mov_b32 s9, s11
2619 ; SI-NEXT: s_mov_b32 s10, s6
2620 ; SI-NEXT: s_mov_b32 s11, s7
2621 ; SI-NEXT: s_mov_b32 s2, s6
2622 ; SI-NEXT: s_mov_b32 s3, s7
2623 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2624 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2625 ; SI-NEXT: s_waitcnt vmcnt(0)
2626 ; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
2627 ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
2628 ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2629 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2630 ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2631 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2632 ; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
2633 ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
2634 ; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
2635 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9
2636 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13
2637 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
2639 ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
2640 ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2641 ; SI-NEXT: v_bfe_u32 v10, v9, 20, 11
2642 ; SI-NEXT: s_movk_i32 s8, 0xfc01
2643 ; SI-NEXT: v_add_i32_e32 v12, vcc, s8, v10
2644 ; SI-NEXT: s_mov_b32 s3, 0xfffff
2645 ; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12
2646 ; SI-NEXT: v_not_b32_e32 v10, v10
2647 ; SI-NEXT: v_and_b32_e32 v10, v8, v10
2648 ; SI-NEXT: v_not_b32_e32 v11, v11
2649 ; SI-NEXT: v_and_b32_e32 v11, v9, v11
2650 ; SI-NEXT: s_brev_b32 s9, 1
2651 ; SI-NEXT: v_and_b32_e32 v13, s9, v9
2652 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12
2653 ; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
2654 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12
2655 ; SI-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[0:1]
2656 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
2657 ; SI-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[0:1]
2658 ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2659 ; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
2660 ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
2661 ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2662 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2663 ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2664 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2665 ; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
2666 ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
2667 ; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
2668 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
2669 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v11
2670 ; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
2672 ; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
2673 ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2674 ; SI-NEXT: v_bfe_u32 v8, v7, 20, 11
2675 ; SI-NEXT: v_add_i32_e32 v10, vcc, s8, v8
2676 ; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10
2677 ; SI-NEXT: v_not_b32_e32 v8, v8
2678 ; SI-NEXT: v_and_b32_e32 v8, v6, v8
2679 ; SI-NEXT: v_not_b32_e32 v9, v9
2680 ; SI-NEXT: v_and_b32_e32 v9, v7, v9
2681 ; SI-NEXT: v_and_b32_e32 v11, s9, v7
2682 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10
2683 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
2684 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10
2685 ; SI-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[0:1]
2686 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
2687 ; SI-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1]
2688 ; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2689 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; Expectations for the -mcpu=bonaire RUN line: buffer loads and stores, the
; second load using an immediate offset:64, and v_trunc_f64_e32 applied to
; each quotient.
2692 ; CI-LABEL: frem_v2f64:
2694 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2695 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2696 ; CI-NEXT: s_mov_b32 s3, 0xf000
2697 ; CI-NEXT: s_mov_b32 s2, -1
2698 ; CI-NEXT: s_mov_b32 s10, s2
2699 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2700 ; CI-NEXT: s_mov_b32 s0, s4
2701 ; CI-NEXT: s_mov_b32 s1, s5
2702 ; CI-NEXT: s_mov_b32 s4, s6
2703 ; CI-NEXT: s_mov_b32 s5, s7
2704 ; CI-NEXT: s_mov_b32 s6, s2
2705 ; CI-NEXT: s_mov_b32 s7, s3
2706 ; CI-NEXT: s_mov_b32 s11, s3
2707 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2708 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2709 ; CI-NEXT: s_waitcnt vmcnt(0)
2710 ; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
2711 ; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
2712 ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2713 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2714 ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2715 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2716 ; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
2717 ; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
2718 ; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
2720 ; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
2721 ; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2722 ; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
2723 ; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2724 ; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
2725 ; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
2726 ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2727 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2728 ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2729 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2730 ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
2731 ; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
2732 ; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
2734 ; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
2735 ; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2736 ; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
2737 ; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2738 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; Expectations for the -mcpu=tonga RUN line: flat loads and stores. The
; address of the second operand is formed with s_add_u32 / s_addc_u32 (+64)
; before being moved into VGPRs, rather than with an immediate offset.
2741 ; VI-LABEL: frem_v2f64:
2743 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2744 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2745 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2746 ; VI-NEXT: v_mov_b32_e32 v0, s6
2747 ; VI-NEXT: s_add_u32 s0, s0, 64
2748 ; VI-NEXT: s_addc_u32 s1, s1, 0
2749 ; VI-NEXT: v_mov_b32_e32 v5, s1
2750 ; VI-NEXT: v_mov_b32_e32 v1, s7
2751 ; VI-NEXT: v_mov_b32_e32 v4, s0
2752 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2753 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2754 ; VI-NEXT: v_mov_b32_e32 v8, s4
2755 ; VI-NEXT: v_mov_b32_e32 v9, s5
2756 ; VI-NEXT: s_waitcnt vmcnt(0)
2757 ; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
2758 ; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11]
2759 ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
2760 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
2761 ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
2762 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
2763 ; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
2764 ; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13]
2765 ; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
2767 ; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
2768 ; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
2769 ; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11]
2770 ; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
2771 ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
2772 ; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7]
2773 ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
2774 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2775 ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
2776 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2777 ; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
2778 ; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
2779 ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
2781 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
2782 ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2783 ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
2784 ; VI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2785 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; Expectations for the -mcpu=gfx900 RUN line: global loads and stores
; addressed as zero VGPR (v16) + SGPR pair, with offset:64 on the second
; load, and an s_nop 1 checked immediately before each v_div_fmas_f64.
2788 ; GFX9-LABEL: frem_v2f64:
2790 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2791 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2792 ; GFX9-NEXT: v_mov_b32_e32 v16, 0
2793 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2794 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
2795 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
2796 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2797 ; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
2798 ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
2799 ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2800 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2801 ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2802 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2803 ; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
2804 ; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
2805 ; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
2806 ; GFX9-NEXT: s_nop 1
2807 ; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
2808 ; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2809 ; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
2810 ; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2811 ; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
2812 ; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
2813 ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2814 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2815 ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2816 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2817 ; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
2818 ; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
2819 ; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
2820 ; GFX9-NEXT: s_nop 1
2821 ; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
2822 ; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2823 ; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
2824 ; GFX9-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2825 ; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
2826 ; GFX9-NEXT: s_endpgm
; Expectations for the -mcpu=gfx1010 RUN line: the paired scalar loads and
; the paired global loads are each preceded by s_clause 0x1, and the
; v_div_scale_f64 condition outputs are written as single registers
; (s0, vcc_lo) instead of 64-bit pairs. No s_nop is checked before
; v_div_fmas_f64 here.
2828 ; GFX10-LABEL: frem_v2f64:
2830 ; GFX10-NEXT: s_clause 0x1
2831 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2832 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2833 ; GFX10-NEXT: v_mov_b32_e32 v16, 0
2834 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2835 ; GFX10-NEXT: s_clause 0x1
2836 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
2837 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
2838 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2839 ; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
2840 ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
2841 ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2842 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2843 ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2844 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2845 ; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
2846 ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11]
2847 ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
2848 ; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
2849 ; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2850 ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
2851 ; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2852 ; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
2853 ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
2854 ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2855 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2856 ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2857 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2858 ; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
2859 ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9]
2860 ; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
2861 ; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
2862 ; GFX10-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2863 ; GFX10-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
2864 ; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2865 ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5]
2866 ; GFX10-NEXT: s_endpgm
2867 <2 x double> addrspace(1)* %in2) #0 {
; The index 4 counts whole <2 x double> elements (16 bytes each), which is the
; 64-byte displacement seen as offset:64 / "+ 64" in the expectations above.
2868 %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
; Dividend vector comes from %in1, divisor vector from the displaced %in2.
2869 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
2870 %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
; The operation under test: element-wise floating-point remainder.
2871 %r2 = frem <2 x double> %r0, %r1
2872 store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
; Attribute group #0 is the one attached to the frem_v4f32 and frem_v2f64
; kernels: nounwind, "unsafe-fp-math" disabled, and the f32 denormal mode set
; to preserve-sign for both entries of the pair.
2876 attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; Attribute group #1 is identical to #0 except that "unsafe-fp-math" is enabled.
2877 attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }