1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX8 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11 %s
6 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
; Scalar f32 case: rounds the kernel argument %x with llvm.round.f32 and stores
; the result through %out.
; On the amdgcn runs the checked sequence is v_trunc, v_sub, v_cmp_ge of the
; absolute difference against 0.5, v_cndmask selecting 0 or 1.0, v_bfi against
; the input, and a final v_add onto the truncated value. The v_bfi mask is
; 0x7fffffff (written as a literal on gfx1100, built with s_brev_b32 of -2 on
; the older targets) -- presumably so the 0/1.0 term takes the sign of %x.
; The r600 run checks the equivalent TRUNC, ADD, SETGE, BFI_INT, ADD clause.
8 define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
9 ; GFX6-LABEL: round_f32:
11 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
12 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
13 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
14 ; GFX6-NEXT: s_mov_b32 s2, -1
15 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
16 ; GFX6-NEXT: v_trunc_f32_e32 v0, s6
17 ; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0
18 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
19 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
20 ; GFX6-NEXT: s_brev_b32 s4, -2
21 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
22 ; GFX6-NEXT: v_bfi_b32 v1, s4, v1, v2
23 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v1
24 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
27 ; GFX8-LABEL: round_f32:
29 ; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
30 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
31 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
32 ; GFX8-NEXT: s_mov_b32 s2, -1
33 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
34 ; GFX8-NEXT: v_trunc_f32_e32 v0, s6
35 ; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
36 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
37 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
38 ; GFX8-NEXT: s_brev_b32 s4, -2
39 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
40 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
41 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
42 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
45 ; GFX9-LABEL: round_f32:
47 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
48 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
49 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
50 ; GFX9-NEXT: s_mov_b32 s2, -1
51 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX9-NEXT: v_trunc_f32_e32 v0, s6
53 ; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
54 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
55 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
56 ; GFX9-NEXT: s_brev_b32 s4, -2
57 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
58 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
59 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
60 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
63 ; GFX11-LABEL: round_f32:
65 ; GFX11-NEXT: s_clause 0x1
66 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
67 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
68 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-NEXT: v_trunc_f32_e32 v0, s2
70 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
71 ; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0
72 ; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5
73 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
74 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
75 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
76 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2
77 ; GFX11-NEXT: s_mov_b32 s2, -1
78 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
79 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
80 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
81 ; GFX11-NEXT: s_endpgm
83 ; R600-LABEL: round_f32:
85 ; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
86 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
89 ; R600-NEXT: ALU clause starting at 4:
90 ; R600-NEXT: TRUNC * T0.W, KC0[2].Z,
91 ; R600-NEXT: ADD * T1.W, KC0[2].Z, -PV.W,
92 ; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5,
93 ; R600-NEXT: BFI_INT * T1.W, literal.x, PV.W, KC0[2].Z,
94 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
95 ; R600-NEXT: ADD T0.X, T0.W, PV.W,
96 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
97 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
98 %result = call float @llvm.round.f32(float %x) #1
99 store float %result, ptr addrspace(1) %out
103 ; The vector tests are difficult to verify by hand, since it can be hard to
104 ; predict how the scheduler will order the instructions. Their checks are
105 ; autogenerated (see the NOTE at the top of this file), so regenerate them
106 ; with utils/update_llc_test_checks.py instead of editing them manually.
; <2 x float> case: applies llvm.round.v2f32 to the kernel argument %in and
; stores the result through %out.
; The checks expect the trunc / sub / compare-with-0.5 / select / bfi / add
; sequence once per element, followed by a single two-dword store
; (buffer_store_dwordx2, or buffer_store_b64 on gfx1100).
107 define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 {
108 ; GFX6-LABEL: round_v2f32:
110 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
111 ; GFX6-NEXT: s_brev_b32 s8, -2
112 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
113 ; GFX6-NEXT: s_mov_b32 s6, -1
114 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX6-NEXT: v_trunc_f32_e32 v0, s3
116 ; GFX6-NEXT: v_sub_f32_e32 v1, s3, v0
117 ; GFX6-NEXT: s_mov_b32 s4, s0
118 ; GFX6-NEXT: s_mov_b32 s5, s1
119 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
120 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
121 ; GFX6-NEXT: v_mov_b32_e32 v2, s3
122 ; GFX6-NEXT: v_bfi_b32 v1, s8, v1, v2
123 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
124 ; GFX6-NEXT: v_trunc_f32_e32 v0, s2
125 ; GFX6-NEXT: v_sub_f32_e32 v2, s2, v0
126 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
127 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
128 ; GFX6-NEXT: v_mov_b32_e32 v3, s2
129 ; GFX6-NEXT: v_bfi_b32 v2, s8, v2, v3
130 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v2
131 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
132 ; GFX6-NEXT: s_endpgm
134 ; GFX8-LABEL: round_v2f32:
136 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
137 ; GFX8-NEXT: s_brev_b32 s8, -2
138 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
139 ; GFX8-NEXT: s_mov_b32 s6, -1
140 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX8-NEXT: v_trunc_f32_e32 v0, s3
142 ; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0
143 ; GFX8-NEXT: s_mov_b32 s4, s0
144 ; GFX8-NEXT: s_mov_b32 s5, s1
145 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
146 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
147 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
148 ; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
149 ; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
150 ; GFX8-NEXT: v_trunc_f32_e32 v0, s2
151 ; GFX8-NEXT: v_sub_f32_e32 v2, s2, v0
152 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
153 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
154 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
155 ; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
156 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
157 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
158 ; GFX8-NEXT: s_endpgm
160 ; GFX9-LABEL: round_v2f32:
162 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
163 ; GFX9-NEXT: s_brev_b32 s8, -2
164 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
165 ; GFX9-NEXT: s_mov_b32 s6, -1
166 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX9-NEXT: v_trunc_f32_e32 v0, s3
168 ; GFX9-NEXT: v_sub_f32_e32 v1, s3, v0
169 ; GFX9-NEXT: s_mov_b32 s4, s0
170 ; GFX9-NEXT: s_mov_b32 s5, s1
171 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
172 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
173 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
174 ; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
175 ; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
176 ; GFX9-NEXT: v_trunc_f32_e32 v0, s2
177 ; GFX9-NEXT: v_sub_f32_e32 v2, s2, v0
178 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
179 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
180 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
181 ; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
182 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
183 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
184 ; GFX9-NEXT: s_endpgm
186 ; GFX11-LABEL: round_v2f32:
188 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
189 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX11-NEXT: v_trunc_f32_e32 v0, s3
191 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2
192 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
193 ; GFX11-NEXT: v_sub_f32_e32 v1, s3, v0
194 ; GFX11-NEXT: v_sub_f32_e32 v3, s2, v2
195 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
196 ; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v1|, 0.5
197 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s4
198 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
199 ; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v3|, 0.5
200 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s3
201 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
202 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
203 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
204 ; GFX11-NEXT: v_add_f32_e32 v1, v0, v1
205 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
206 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s2
207 ; GFX11-NEXT: s_mov_b32 s2, -1
208 ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3
209 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
210 ; GFX11-NEXT: s_endpgm
212 ; R600-LABEL: round_v2f32:
214 ; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
215 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
218 ; R600-NEXT: ALU clause starting at 4:
219 ; R600-NEXT: TRUNC * T0.W, KC0[3].X,
220 ; R600-NEXT: ADD T1.W, KC0[3].X, -PV.W,
221 ; R600-NEXT: TRUNC * T2.W, KC0[2].W,
222 ; R600-NEXT: ADD T3.W, KC0[2].W, -PS,
223 ; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5,
224 ; R600-NEXT: BFI_INT T1.W, literal.x, PS, KC0[3].X,
225 ; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5,
226 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
227 ; R600-NEXT: ADD T0.Y, T0.W, PV.W,
228 ; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[2].W,
229 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
230 ; R600-NEXT: ADD T0.X, T2.W, PV.W,
231 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
232 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
233 %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
234 store <2 x float> %result, ptr addrspace(1) %out
; <4 x float> case: applies llvm.round.v4f32 to the kernel argument %in and
; stores the result through %out.
; The checks expect the trunc / sub / compare-with-0.5 / select / bfi / add
; sequence for each of the four elements and one four-dword store
; (buffer_store_dwordx4, or buffer_store_b128 on gfx1100). The gfx1100 checks
; additionally pair some of the subtracts and adds into v_dual_* instructions.
238 define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 {
239 ; GFX6-LABEL: round_v4f32:
241 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
242 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
243 ; GFX6-NEXT: s_brev_b32 s10, -2
244 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
245 ; GFX6-NEXT: s_mov_b32 s6, -1
246 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
247 ; GFX6-NEXT: v_trunc_f32_e32 v0, s3
248 ; GFX6-NEXT: v_sub_f32_e32 v1, s3, v0
249 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
250 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
251 ; GFX6-NEXT: v_mov_b32_e32 v2, s3
252 ; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2
253 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1
254 ; GFX6-NEXT: v_trunc_f32_e32 v0, s2
255 ; GFX6-NEXT: v_sub_f32_e32 v1, s2, v0
256 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
257 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
258 ; GFX6-NEXT: v_mov_b32_e32 v2, s2
259 ; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2
260 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1
261 ; GFX6-NEXT: v_trunc_f32_e32 v0, s1
262 ; GFX6-NEXT: v_sub_f32_e32 v1, s1, v0
263 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5
264 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3]
265 ; GFX6-NEXT: v_mov_b32_e32 v4, s1
266 ; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v4
267 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
268 ; GFX6-NEXT: v_trunc_f32_e32 v0, s0
269 ; GFX6-NEXT: v_sub_f32_e32 v4, s0, v0
270 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5
271 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3]
272 ; GFX6-NEXT: v_mov_b32_e32 v5, s0
273 ; GFX6-NEXT: v_bfi_b32 v4, s10, v4, v5
274 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
275 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
276 ; GFX6-NEXT: s_endpgm
278 ; GFX8-LABEL: round_v4f32:
280 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
281 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
282 ; GFX8-NEXT: s_brev_b32 s10, -2
283 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
284 ; GFX8-NEXT: s_mov_b32 s6, -1
285 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
286 ; GFX8-NEXT: v_trunc_f32_e32 v0, s3
287 ; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0
288 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
289 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
290 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
291 ; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
292 ; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
293 ; GFX8-NEXT: v_trunc_f32_e32 v0, s2
294 ; GFX8-NEXT: v_sub_f32_e32 v1, s2, v0
295 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
296 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
297 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
298 ; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
299 ; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
300 ; GFX8-NEXT: v_trunc_f32_e32 v0, s1
301 ; GFX8-NEXT: v_sub_f32_e32 v1, s1, v0
302 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5
303 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3]
304 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
305 ; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4
306 ; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
307 ; GFX8-NEXT: v_trunc_f32_e32 v0, s0
308 ; GFX8-NEXT: v_sub_f32_e32 v4, s0, v0
309 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5
310 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3]
311 ; GFX8-NEXT: v_mov_b32_e32 v5, s0
312 ; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5
313 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
314 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
315 ; GFX8-NEXT: s_endpgm
317 ; GFX9-LABEL: round_v4f32:
319 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
320 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
321 ; GFX9-NEXT: s_brev_b32 s6, -2
322 ; GFX9-NEXT: s_mov_b32 s11, 0xf000
323 ; GFX9-NEXT: s_mov_b32 s10, -1
324 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
325 ; GFX9-NEXT: v_trunc_f32_e32 v0, s3
326 ; GFX9-NEXT: v_sub_f32_e32 v1, s3, v0
327 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
328 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
329 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
330 ; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2
331 ; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
332 ; GFX9-NEXT: v_trunc_f32_e32 v0, s2
333 ; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0
334 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
335 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
336 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
337 ; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2
338 ; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
339 ; GFX9-NEXT: v_trunc_f32_e32 v0, s1
340 ; GFX9-NEXT: v_sub_f32_e32 v1, s1, v0
341 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5
342 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3]
343 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
344 ; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v4
345 ; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
346 ; GFX9-NEXT: v_trunc_f32_e32 v0, s0
347 ; GFX9-NEXT: v_sub_f32_e32 v4, s0, v0
348 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5
349 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3]
350 ; GFX9-NEXT: v_mov_b32_e32 v5, s0
351 ; GFX9-NEXT: v_bfi_b32 v4, s6, v4, v5
352 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
353 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
354 ; GFX9-NEXT: s_endpgm
356 ; GFX11-LABEL: round_v4f32:
358 ; GFX11-NEXT: s_clause 0x1
359 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
360 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
361 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
362 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX11-NEXT: v_trunc_f32_e32 v0, s3
364 ; GFX11-NEXT: v_trunc_f32_e32 v1, s2
365 ; GFX11-NEXT: v_trunc_f32_e32 v4, s1
366 ; GFX11-NEXT: v_trunc_f32_e32 v5, s0
367 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
368 ; GFX11-NEXT: v_dual_sub_f32 v2, s3, v0 :: v_dual_sub_f32 v3, s2, v1
369 ; GFX11-NEXT: v_dual_sub_f32 v6, s1, v4 :: v_dual_sub_f32 v7, s0, v5
370 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
371 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, 0.5
372 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s6
373 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
374 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, 0.5
375 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s3
376 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
377 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s6
378 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v6|, 0.5
379 ; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s2
380 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
381 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s6
382 ; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v7|, 0.5
383 ; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8
384 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
385 ; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s1
386 ; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s6
387 ; GFX11-NEXT: s_mov_b32 s6, -1
388 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
389 ; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s0
390 ; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
391 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
392 ; GFX11-NEXT: s_endpgm
394 ; R600-LABEL: round_v4f32:
396 ; R600-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[]
397 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
400 ; R600-NEXT: ALU clause starting at 4:
401 ; R600-NEXT: TRUNC * T0.W, KC0[4].X,
402 ; R600-NEXT: ADD T1.W, KC0[4].X, -PV.W,
403 ; R600-NEXT: TRUNC * T2.W, KC0[3].W,
404 ; R600-NEXT: TRUNC T0.Z, KC0[3].Z,
405 ; R600-NEXT: ADD T3.W, KC0[3].W, -PS,
406 ; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5,
407 ; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[4].X,
408 ; R600-NEXT: SETGE T1.Z, |PV.W|, 0.5,
409 ; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.Z,
410 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
411 ; R600-NEXT: TRUNC * T3.W, KC0[3].Y,
412 ; R600-NEXT: ADD T1.Y, KC0[3].Y, -PV.W,
413 ; R600-NEXT: SETGE T2.Z, |T1.W|, 0.5,
414 ; R600-NEXT: BFI_INT T1.W, literal.x, T1.Z, KC0[3].W,
415 ; R600-NEXT: ADD * T4.W, T0.W, T0.Y,
416 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
417 ; R600-NEXT: ADD T4.Z, T2.W, PV.W,
418 ; R600-NEXT: BFI_INT T0.W, literal.x, PV.Z, KC0[3].Z,
419 ; R600-NEXT: SETGE * T1.W, |PV.Y|, 0.5,
420 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
421 ; R600-NEXT: ADD T4.Y, T0.Z, PV.W,
422 ; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[3].Y,
423 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
424 ; R600-NEXT: ADD T4.X, T3.W, PV.W,
425 ; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
426 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
427 %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
428 store <4 x float> %result, ptr addrspace(1) %out
; <8 x float> case: applies llvm.round.v8f32 to the kernel argument %in and
; stores the result through %out.
; The checks expect the trunc / sub / compare-with-0.5 / select / bfi / add
; sequence for each of the eight elements. The result is written with two
; four-dword stores on the amdgcn runs (one of them at offset:16) and with two
; MEM_RAT_CACHELESS STORE_RAW instructions on the r600 run.
432 define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 {
433 ; GFX6-LABEL: round_v8f32:
435 ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
436 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
437 ; GFX6-NEXT: s_brev_b32 s6, -2
438 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
439 ; GFX6-NEXT: s_mov_b32 s2, -1
440 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
441 ; GFX6-NEXT: v_trunc_f32_e32 v0, s11
442 ; GFX6-NEXT: v_sub_f32_e32 v1, s11, v0
443 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
444 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
445 ; GFX6-NEXT: v_mov_b32_e32 v2, s11
446 ; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v2
447 ; GFX6-NEXT: v_add_f32_e32 v3, v0, v1
448 ; GFX6-NEXT: v_trunc_f32_e32 v0, s10
449 ; GFX6-NEXT: v_sub_f32_e32 v1, s10, v0
450 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
451 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
452 ; GFX6-NEXT: v_mov_b32_e32 v2, s10
453 ; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v2
454 ; GFX6-NEXT: v_add_f32_e32 v2, v0, v1
455 ; GFX6-NEXT: v_trunc_f32_e32 v0, s9
456 ; GFX6-NEXT: v_sub_f32_e32 v1, s9, v0
457 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
458 ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
459 ; GFX6-NEXT: v_mov_b32_e32 v4, s9
460 ; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v4
461 ; GFX6-NEXT: v_add_f32_e32 v1, v0, v1
462 ; GFX6-NEXT: v_trunc_f32_e32 v0, s8
463 ; GFX6-NEXT: v_sub_f32_e32 v4, s8, v0
464 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5
465 ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5]
466 ; GFX6-NEXT: v_mov_b32_e32 v5, s8
467 ; GFX6-NEXT: v_bfi_b32 v4, s6, v4, v5
468 ; GFX6-NEXT: v_add_f32_e32 v0, v0, v4
469 ; GFX6-NEXT: v_trunc_f32_e32 v4, s15
470 ; GFX6-NEXT: v_sub_f32_e32 v5, s15, v4
471 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
472 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
473 ; GFX6-NEXT: v_mov_b32_e32 v6, s15
474 ; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v6
475 ; GFX6-NEXT: v_add_f32_e32 v7, v4, v5
476 ; GFX6-NEXT: v_trunc_f32_e32 v4, s14
477 ; GFX6-NEXT: v_sub_f32_e32 v5, s14, v4
478 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
479 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
480 ; GFX6-NEXT: v_mov_b32_e32 v6, s14
481 ; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v6
482 ; GFX6-NEXT: v_add_f32_e32 v6, v4, v5
483 ; GFX6-NEXT: v_trunc_f32_e32 v4, s13
484 ; GFX6-NEXT: v_sub_f32_e32 v5, s13, v4
485 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
486 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
487 ; GFX6-NEXT: v_mov_b32_e32 v8, s13
488 ; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v8
489 ; GFX6-NEXT: v_add_f32_e32 v5, v4, v5
490 ; GFX6-NEXT: v_trunc_f32_e32 v4, s12
491 ; GFX6-NEXT: v_sub_f32_e32 v8, s12, v4
492 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
493 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
494 ; GFX6-NEXT: v_mov_b32_e32 v9, s12
495 ; GFX6-NEXT: v_bfi_b32 v8, s6, v8, v9
496 ; GFX6-NEXT: v_add_f32_e32 v4, v4, v8
497 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
498 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
499 ; GFX6-NEXT: s_endpgm
501 ; GFX8-LABEL: round_v8f32:
503 ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
504 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
505 ; GFX8-NEXT: s_brev_b32 s6, -2
506 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
507 ; GFX8-NEXT: s_mov_b32 s2, -1
508 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
509 ; GFX8-NEXT: v_trunc_f32_e32 v0, s11
510 ; GFX8-NEXT: v_sub_f32_e32 v1, s11, v0
511 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
512 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
513 ; GFX8-NEXT: v_mov_b32_e32 v2, s11
514 ; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v2
515 ; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
516 ; GFX8-NEXT: v_trunc_f32_e32 v0, s10
517 ; GFX8-NEXT: v_sub_f32_e32 v1, s10, v0
518 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
519 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
520 ; GFX8-NEXT: v_mov_b32_e32 v2, s10
521 ; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v2
522 ; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
523 ; GFX8-NEXT: v_trunc_f32_e32 v0, s9
524 ; GFX8-NEXT: v_sub_f32_e32 v1, s9, v0
525 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
526 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
527 ; GFX8-NEXT: v_mov_b32_e32 v4, s9
528 ; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v4
529 ; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
530 ; GFX8-NEXT: v_trunc_f32_e32 v0, s8
531 ; GFX8-NEXT: v_sub_f32_e32 v4, s8, v0
532 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5
533 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5]
534 ; GFX8-NEXT: v_mov_b32_e32 v5, s8
535 ; GFX8-NEXT: v_bfi_b32 v4, s6, v4, v5
536 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
537 ; GFX8-NEXT: v_trunc_f32_e32 v4, s15
538 ; GFX8-NEXT: v_sub_f32_e32 v5, s15, v4
539 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
540 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
541 ; GFX8-NEXT: v_mov_b32_e32 v6, s15
542 ; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v6
543 ; GFX8-NEXT: v_add_f32_e32 v7, v4, v5
544 ; GFX8-NEXT: v_trunc_f32_e32 v4, s14
545 ; GFX8-NEXT: v_sub_f32_e32 v5, s14, v4
546 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
547 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
548 ; GFX8-NEXT: v_mov_b32_e32 v6, s14
549 ; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v6
550 ; GFX8-NEXT: v_add_f32_e32 v6, v4, v5
551 ; GFX8-NEXT: v_trunc_f32_e32 v4, s13
552 ; GFX8-NEXT: v_sub_f32_e32 v5, s13, v4
553 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
554 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
555 ; GFX8-NEXT: v_mov_b32_e32 v8, s13
556 ; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v8
557 ; GFX8-NEXT: v_add_f32_e32 v5, v4, v5
558 ; GFX8-NEXT: v_trunc_f32_e32 v4, s12
559 ; GFX8-NEXT: v_sub_f32_e32 v8, s12, v4
560 ; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
561 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
562 ; GFX8-NEXT: v_mov_b32_e32 v9, s12
563 ; GFX8-NEXT: v_bfi_b32 v8, s6, v8, v9
564 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
565 ; GFX8-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
566 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
567 ; GFX8-NEXT: s_endpgm
569 ; GFX9-LABEL: round_v8f32:
571 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
572 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
573 ; GFX9-NEXT: s_brev_b32 s6, -2
574 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
575 ; GFX9-NEXT: s_mov_b32 s2, -1
576 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX9-NEXT: v_trunc_f32_e32 v0, s11
578 ; GFX9-NEXT: v_sub_f32_e32 v1, s11, v0
579 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
580 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
581 ; GFX9-NEXT: v_mov_b32_e32 v2, s11
582 ; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2
583 ; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
584 ; GFX9-NEXT: v_trunc_f32_e32 v0, s10
585 ; GFX9-NEXT: v_sub_f32_e32 v1, s10, v0
586 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
587 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
588 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
589 ; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2
590 ; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
591 ; GFX9-NEXT: v_trunc_f32_e32 v0, s9
592 ; GFX9-NEXT: v_sub_f32_e32 v1, s9, v0
593 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
594 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
595 ; GFX9-NEXT: v_mov_b32_e32 v4, s9
596 ; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v4
597 ; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
598 ; GFX9-NEXT: v_trunc_f32_e32 v0, s8
599 ; GFX9-NEXT: v_sub_f32_e32 v4, s8, v0
600 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5
601 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5]
602 ; GFX9-NEXT: v_mov_b32_e32 v5, s8
603 ; GFX9-NEXT: v_bfi_b32 v4, s6, v4, v5
604 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
605 ; GFX9-NEXT: v_trunc_f32_e32 v4, s15
606 ; GFX9-NEXT: v_sub_f32_e32 v5, s15, v4
607 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
608 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
609 ; GFX9-NEXT: v_mov_b32_e32 v6, s15
610 ; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v6
611 ; GFX9-NEXT: v_add_f32_e32 v7, v4, v5
612 ; GFX9-NEXT: v_trunc_f32_e32 v4, s14
613 ; GFX9-NEXT: v_sub_f32_e32 v5, s14, v4
614 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
615 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
616 ; GFX9-NEXT: v_mov_b32_e32 v6, s14
617 ; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v6
618 ; GFX9-NEXT: v_add_f32_e32 v6, v4, v5
619 ; GFX9-NEXT: v_trunc_f32_e32 v4, s13
620 ; GFX9-NEXT: v_sub_f32_e32 v5, s13, v4
621 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
622 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
623 ; GFX9-NEXT: v_mov_b32_e32 v8, s13
624 ; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v8
625 ; GFX9-NEXT: v_add_f32_e32 v5, v4, v5
626 ; GFX9-NEXT: v_trunc_f32_e32 v4, s12
627 ; GFX9-NEXT: v_sub_f32_e32 v8, s12, v4
628 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
629 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
630 ; GFX9-NEXT: v_mov_b32_e32 v9, s12
631 ; GFX9-NEXT: v_bfi_b32 v8, s6, v8, v9
632 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v8
633 ; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
634 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
635 ; GFX9-NEXT: s_endpgm
637 ; GFX11-LABEL: round_v8f32:
639 ; GFX11-NEXT: s_clause 0x1
640 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
641 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
642 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
643 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
644 ; GFX11-NEXT: v_trunc_f32_e32 v0, s11
645 ; GFX11-NEXT: v_trunc_f32_e32 v1, s10
646 ; GFX11-NEXT: v_trunc_f32_e32 v4, s9
647 ; GFX11-NEXT: v_trunc_f32_e32 v8, s8
648 ; GFX11-NEXT: v_trunc_f32_e32 v5, s15
649 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
650 ; GFX11-NEXT: v_dual_sub_f32 v2, s11, v0 :: v_dual_sub_f32 v3, s10, v1
651 ; GFX11-NEXT: v_sub_f32_e32 v7, s9, v4
652 ; GFX11-NEXT: v_trunc_f32_e32 v9, s13
653 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
654 ; GFX11-NEXT: v_sub_f32_e32 v12, s15, v5
655 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
656 ; GFX11-NEXT: v_sub_f32_e32 v11, s8, v8
657 ; GFX11-NEXT: v_trunc_f32_e32 v6, s14
658 ; GFX11-NEXT: v_sub_f32_e32 v14, s13, v9
659 ; GFX11-NEXT: v_trunc_f32_e32 v10, s12
660 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
661 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
662 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
663 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s11
664 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
665 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
666 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
667 ; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s10
668 ; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
669 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5
670 ; GFX11-NEXT: v_sub_f32_e32 v13, s14, v6
671 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
672 ; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16
673 ; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s9
674 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
675 ; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2
676 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5
677 ; GFX11-NEXT: v_add_f32_e32 v1, v4, v7
678 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
679 ; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s8
680 ; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2
681 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5
682 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
683 ; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s15
684 ; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2
685 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5
686 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
687 ; GFX11-NEXT: v_add_f32_e32 v7, v5, v12
688 ; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s14
689 ; GFX11-NEXT: v_sub_f32_e32 v15, s12, v10
690 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
691 ; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2
692 ; GFX11-NEXT: v_add_f32_e32 v6, v6, v13
693 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
694 ; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5
695 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s13
696 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
697 ; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2
698 ; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11
699 ; GFX11-NEXT: s_mov_b32 s2, -1
700 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
701 ; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s12
702 ; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
703 ; GFX11-NEXT: s_clause 0x1
704 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
705 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
706 ; GFX11-NEXT: s_endpgm
708 ; R600-LABEL: round_v8f32:
710 ; R600-NEXT: ALU 50, @4, KC0[CB0:0-32], KC1[]
711 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
712 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
714 ; R600-NEXT: ALU clause starting at 4:
715 ; R600-NEXT: TRUNC * T0.W, KC0[6].X,
716 ; R600-NEXT: ADD T0.Z, KC0[6].X, -PV.W,
717 ; R600-NEXT: TRUNC * T1.W, KC0[5].X,
718 ; R600-NEXT: TRUNC * T2.W, KC0[4].W,
719 ; R600-NEXT: ADD T1.Z, KC0[4].W, -PV.W,
720 ; R600-NEXT: ADD T3.W, KC0[5].X, -T1.W,
721 ; R600-NEXT: SETGE * T4.W, |T0.Z|, 0.5,
722 ; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[6].X,
723 ; R600-NEXT: SETGE T0.Z, |PV.W|, 0.5,
724 ; R600-NEXT: SETGE T3.W, |PV.Z|, 0.5,
725 ; R600-NEXT: TRUNC * T4.W, KC0[5].Y,
726 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
727 ; R600-NEXT: ADD T1.Y, KC0[5].Y, -PS,
728 ; R600-NEXT: BFI_INT T1.Z, literal.x, PV.W, KC0[4].W,
729 ; R600-NEXT: BFI_INT T3.W, literal.x, PV.Z, KC0[5].X,
730 ; R600-NEXT: TRUNC * T5.W, KC0[4].Z,
731 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
732 ; R600-NEXT: TRUNC T0.Z, KC0[4].Y,
733 ; R600-NEXT: TRUNC * T6.W, KC0[5].W,
734 ; R600-NEXT: ADD * T7.W, KC0[4].Z, -T5.W,
735 ; R600-NEXT: TRUNC T0.X, KC0[5].Z,
736 ; R600-NEXT: SETGE T2.Y, |PV.W|, 0.5,
737 ; R600-NEXT: ADD T2.Z, KC0[5].W, -T6.W, BS:VEC_102/SCL_221
738 ; R600-NEXT: ADD T7.W, KC0[4].Y, -T0.Z,
739 ; R600-NEXT: ADD * T3.W, T1.W, T3.W,
740 ; R600-NEXT: SETGE T1.X, |PV.W|, 0.5,
741 ; R600-NEXT: SETGE T4.Y, |PV.Z|, 0.5,
742 ; R600-NEXT: ADD T3.Z, T2.W, T1.Z,
743 ; R600-NEXT: BFI_INT T1.W, literal.x, PV.Y, KC0[4].Z,
744 ; R600-NEXT: ADD * T2.W, KC0[5].Z, -PV.X,
745 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
746 ; R600-NEXT: SETGE T2.X, |PS|, 0.5,
747 ; R600-NEXT: ADD T3.Y, T5.W, PV.W,
748 ; R600-NEXT: BFI_INT T1.Z, literal.x, PV.Y, KC0[5].W,
749 ; R600-NEXT: BFI_INT T1.W, literal.x, PV.X, KC0[4].Y,
750 ; R600-NEXT: ADD * T0.W, T0.W, T0.Y,
751 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
752 ; R600-NEXT: ADD T3.X, T0.Z, PV.W,
753 ; R600-NEXT: ADD T0.Z, T6.W, PV.Z,
754 ; R600-NEXT: BFI_INT T1.W, literal.x, PV.X, KC0[5].Z,
755 ; R600-NEXT: SETGE * T2.W, |T1.Y|, 0.5,
756 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
757 ; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
758 ; R600-NEXT: ADD T0.Y, T0.X, PV.W,
759 ; R600-NEXT: BFI_INT * T1.W, literal.y, PS, KC0[5].Y,
760 ; R600-NEXT: 2(2.802597e-45), 2147483647(nan)
761 ; R600-NEXT: ADD T0.X, T4.W, PV.W,
762 ; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
763 ; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
764 ; R600-NEXT: LSHR * T2.X, PV.W, literal.x,
765 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
766 %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
767 store <8 x float> %result, ptr addrspace(1) %out
; Scalar half-precision llvm.round test. The half operand arrives in the low
; 16 bits of the i32 kernel argument (trunc to i16, then bitcast to half), is
; rounded with @llvm.round.f16 and stored to %out.
;
; Every target's checks show the same expansion: truncate x, subtract the
; truncated value from x, compare the absolute difference against 0.5, select
; 0 or 1.0, copy the sign bit of x onto that value with a bit-field insert
; (mask 0x7fff for f16, 0x7fffffff for f32), and add it to the truncated value.
; GFX6 and R600 convert the half to f32 first and convert the result back
; (v_cvt_f32_f16 / FLT16_TO_FLT32 and the reverse); GFX8, GFX9 and GFX11 use
; f16 instructions directly, with 0x3c00 as the half encoding of 1.0.
;
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
771 define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
772 ; GFX6-LABEL: round_f16:
774 ; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
775 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
776 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
777 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
778 ; GFX6-NEXT: v_trunc_f32_e32 v1, v0
779 ; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1
780 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5
781 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
782 ; GFX6-NEXT: s_brev_b32 s2, -2
783 ; GFX6-NEXT: v_bfi_b32 v0, s2, v2, v0
784 ; GFX6-NEXT: v_add_f32_e32 v0, v1, v0
785 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
786 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
787 ; GFX6-NEXT: s_mov_b32 s2, -1
788 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
789 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
790 ; GFX6-NEXT: s_endpgm
792 ; GFX8-LABEL: round_f16:
794 ; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
795 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
796 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
797 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
798 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
799 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
800 ; GFX8-NEXT: v_trunc_f16_e32 v1, s6
801 ; GFX8-NEXT: v_sub_f16_e32 v2, s6, v1
802 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
803 ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
804 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
805 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2
806 ; GFX8-NEXT: s_mov_b32 s2, -1
807 ; GFX8-NEXT: v_add_f16_e32 v0, v1, v0
808 ; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
809 ; GFX8-NEXT: s_endpgm
811 ; GFX9-LABEL: round_f16:
813 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
814 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
815 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
816 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
817 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
818 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
819 ; GFX9-NEXT: v_trunc_f16_e32 v1, s6
820 ; GFX9-NEXT: v_sub_f16_e32 v2, s6, v1
821 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
822 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
823 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
824 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
825 ; GFX9-NEXT: s_mov_b32 s2, -1
826 ; GFX9-NEXT: v_add_f16_e32 v0, v1, v0
827 ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
828 ; GFX9-NEXT: s_endpgm
830 ; GFX11-LABEL: round_f16:
832 ; GFX11-NEXT: s_clause 0x1
833 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
834 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
835 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
836 ; GFX11-NEXT: v_trunc_f16_e32 v0, s2
837 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
838 ; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0
839 ; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5
840 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
841 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3
842 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
843 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2
844 ; GFX11-NEXT: s_mov_b32 s2, -1
845 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
846 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
847 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
848 ; GFX11-NEXT: s_endpgm
850 ; R600-LABEL: round_f16:
852 ; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
853 ; R600-NEXT: MEM_RAT MSKOR T0.XW, T1.X
856 ; R600-NEXT: ALU clause starting at 4:
857 ; R600-NEXT: FLT16_TO_FLT32 * T0.W, KC0[2].Z,
858 ; R600-NEXT: TRUNC * T1.W, PV.W,
859 ; R600-NEXT: ADD * T2.W, T0.W, -PV.W,
860 ; R600-NEXT: SETGE * T2.W, |PV.W|, 0.5,
861 ; R600-NEXT: BFI_INT T0.W, literal.x, PV.W, T0.W,
862 ; R600-NEXT: AND_INT * T2.W, KC0[2].Y, literal.y,
863 ; R600-NEXT: 2147483647(nan), 3(4.203895e-45)
864 ; R600-NEXT: ADD * T0.W, T1.W, PV.W,
865 ; R600-NEXT: FLT32_TO_FLT16 T0.W, PV.W,
866 ; R600-NEXT: LSHL * T1.W, T2.W, literal.x,
867 ; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00)
868 ; R600-NEXT: LSHL T0.X, PV.W, PS,
869 ; R600-NEXT: LSHL * T0.W, literal.x, PS,
870 ; R600-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
871 ; R600-NEXT: MOV T0.Y, 0.0,
872 ; R600-NEXT: MOV * T0.Z, 0.0,
873 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
874 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
875 %x.arg.trunc = trunc i32 %x.arg to i16
876 %x = bitcast i16 %x.arg.trunc to half
877 %result = call half @llvm.round.f16(half %x) #1
878 store half %result, ptr addrspace(1) %out
882 ; Should be scalarized
; <2 x half> llvm.round test. The vector is passed as an i32 kernel argument
; and bitcast to <2 x half>, rounded with @llvm.round.v2f16 and stored to %out.
;
; The checks show the operation handled one element at a time: the high half
; is extracted with a 16-bit right shift, and each element goes through its
; own trunc / sub / compare-with-0.5 / select / bit-field-insert / add
; sequence. GFX6 and R600 do the per-element work in f32 after converting
; from half; GFX8, GFX9 and GFX11 use f16 instructions. The two results are
; recombined with a shift and OR on GFX6 and R600, an SDWA write to WORD_1
; followed by v_or_b32 on GFX8, and v_pack_b32_f16 on GFX9 and GFX11.
;
; NOTE(review): the call below has no #1 call-site attribute group, unlike the
; call in round_f16; the declaration of @llvm.round.v2f16 does carry #1.
;
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
883 define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
884 ; GFX6-LABEL: round_v2f16:
886 ; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
887 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
888 ; GFX6-NEXT: s_lshr_b32 s1, s0, 16
889 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s1
890 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
891 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
892 ; GFX6-NEXT: s_brev_b32 s4, -2
893 ; GFX6-NEXT: v_trunc_f32_e32 v3, v1
894 ; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3
895 ; GFX6-NEXT: v_trunc_f32_e32 v2, v0
896 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, 0.5
897 ; GFX6-NEXT: v_sub_f32_e32 v4, v0, v2
898 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[2:3]
899 ; GFX6-NEXT: v_bfi_b32 v1, s4, v5, v1
900 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5
901 ; GFX6-NEXT: v_add_f32_e32 v1, v3, v1
902 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[2:3]
903 ; GFX6-NEXT: v_bfi_b32 v0, s4, v3, v0
904 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
905 ; GFX6-NEXT: v_add_f32_e32 v0, v2, v0
906 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
907 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
908 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
909 ; GFX6-NEXT: s_mov_b32 s2, -1
910 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
911 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
912 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
913 ; GFX6-NEXT: s_endpgm
915 ; GFX8-LABEL: round_v2f16:
917 ; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
918 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
919 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
920 ; GFX8-NEXT: s_movk_i32 s5, 0x7fff
921 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
922 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
923 ; GFX8-NEXT: s_lshr_b32 s4, s6, 16
924 ; GFX8-NEXT: v_trunc_f16_e32 v1, s4
925 ; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1
926 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
927 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
928 ; GFX8-NEXT: v_mov_b32_e32 v3, s4
929 ; GFX8-NEXT: v_bfi_b32 v2, s5, v2, v3
930 ; GFX8-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
931 ; GFX8-NEXT: v_trunc_f16_e32 v2, s6
932 ; GFX8-NEXT: v_sub_f16_e32 v3, s6, v2
933 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
934 ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
935 ; GFX8-NEXT: v_mov_b32_e32 v3, s6
936 ; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v3
937 ; GFX8-NEXT: v_add_f16_e32 v0, v2, v0
938 ; GFX8-NEXT: s_mov_b32 s2, -1
939 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
940 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
941 ; GFX8-NEXT: s_endpgm
943 ; GFX9-LABEL: round_v2f16:
945 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
946 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
947 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
948 ; GFX9-NEXT: s_movk_i32 s5, 0x7fff
949 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
950 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
951 ; GFX9-NEXT: s_lshr_b32 s4, s6, 16
952 ; GFX9-NEXT: v_trunc_f16_e32 v1, s4
953 ; GFX9-NEXT: v_sub_f16_e32 v2, s4, v1
954 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
955 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
956 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
957 ; GFX9-NEXT: v_bfi_b32 v2, s5, v2, v3
958 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v2
959 ; GFX9-NEXT: v_trunc_f16_e32 v2, s6
960 ; GFX9-NEXT: v_sub_f16_e32 v3, s6, v2
961 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
962 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
963 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
964 ; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v3
965 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0
966 ; GFX9-NEXT: s_mov_b32 s2, -1
967 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
968 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
969 ; GFX9-NEXT: s_endpgm
971 ; GFX11-LABEL: round_v2f16:
973 ; GFX11-NEXT: s_clause 0x1
974 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
975 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
976 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
977 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
978 ; GFX11-NEXT: v_trunc_f16_e32 v1, s2
979 ; GFX11-NEXT: v_trunc_f16_e32 v0, s3
980 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
981 ; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1
982 ; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0
983 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
984 ; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5
985 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4
986 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
987 ; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5
988 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3
989 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
990 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4
991 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
992 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
993 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
994 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2
995 ; GFX11-NEXT: s_mov_b32 s2, -1
996 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3
997 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
998 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
999 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
1000 ; GFX11-NEXT: s_endpgm
1002 ; R600-LABEL: round_v2f16:
1004 ; R600-NEXT: ALU 22, @4, KC0[CB0:0-32], KC1[]
1005 ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1008 ; R600-NEXT: ALU clause starting at 4:
1009 ; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
1010 ; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1011 ; R600-NEXT: FLT16_TO_FLT32 * T0.W, PV.W,
1012 ; R600-NEXT: FLT16_TO_FLT32 T1.W, KC0[2].Z,
1013 ; R600-NEXT: TRUNC * T2.W, PV.W,
1014 ; R600-NEXT: ADD T3.W, T0.W, -PS,
1015 ; R600-NEXT: TRUNC * T4.W, PV.W,
1016 ; R600-NEXT: ADD T5.W, T1.W, -PS,
1017 ; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5,
1018 ; R600-NEXT: BFI_INT T0.W, literal.x, PS, T0.W,
1019 ; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5,
1020 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
1021 ; R600-NEXT: BFI_INT T1.W, literal.x, PS, T1.W, BS:VEC_021/SCL_122
1022 ; R600-NEXT: ADD * T0.W, T2.W, PV.W,
1023 ; R600-NEXT: 2147483647(nan), 0(0.000000e+00)
1024 ; R600-NEXT: FLT32_TO_FLT16 T0.W, PS,
1025 ; R600-NEXT: ADD * T1.W, T4.W, PV.W,
1026 ; R600-NEXT: FLT32_TO_FLT16 T1.W, PS,
1027 ; R600-NEXT: LSHL * T0.W, PV.W, literal.x,
1028 ; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1029 ; R600-NEXT: OR_INT T0.X, PV.W, PS,
1030 ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1031 ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1032 %in = bitcast i32 %in.arg to <2 x half>
1033 %result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)
1034 store <2 x half> %result, ptr addrspace(1) %out
; Declarations of the llvm.round intrinsic overloads for float scalars and
; vectors. All of them carry attribute group #1.
1038 declare float @llvm.round.f32(float) #1
1039 declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
1040 declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
1041 declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
; Half-precision overloads of the same intrinsic.
1043 declare half @llvm.round.f16(half) #1
1044 declare <2 x half> @llvm.round.v2f16(<2 x half>) #1
1045 declare <4 x half> @llvm.round.v4f16(<4 x half>) #1
1046 declare <8 x half> @llvm.round.v8f16(<8 x half>) #1
; #0 is the attribute group placed on the kernel definitions; #1 is the group
; placed on the intrinsic declarations above.
1048 attributes #0 = { nounwind }
1049 attributes #1 = { nounwind readnone }