1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s
; round_f64: kernel that passes its scalar double argument %x to
; @llvm.round.f64 and stores the result through the global pointer %out.
;
; The expectations recorded for the tahiti run show the truncation step as an
; integer sequence (s_bfe_u32 on the high dword, s_lshr_b64 of a mask,
; s_andn2_b64, then selects on the extracted value compared against 0 and 51),
; while the expectations for the hawaii run show a single v_trunc_f64.
; In both runs the tail is the same shape: |x - trunc(x)| is compared with 0.5,
; and a value built by v_bfi_b32 from 0x3ff00000 and the high dword of x
; (presumably 1.0 carrying the sign of x -- TODO confirm) is conditionally
; added to the truncated value.
5 define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
8 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
9 ; SI-NEXT: s_mov_b32 s6, -1
10 ; SI-NEXT: s_mov_b32 s3, 0xfffff
11 ; SI-NEXT: s_mov_b32 s2, s6
12 ; SI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
15 ; SI-NEXT: s_add_i32 s5, s0, 0xfffffc01
16 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s5
17 ; SI-NEXT: s_andn2_b64 s[2:3], s[10:11], s[0:1]
18 ; SI-NEXT: s_and_b32 s0, s11, 0x80000000
19 ; SI-NEXT: v_mov_b32_e32 v1, s0
20 ; SI-NEXT: v_mov_b32_e32 v0, s3
21 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s5, 0
22 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
23 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s5, 51
24 ; SI-NEXT: v_mov_b32_e32 v1, s11
25 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
26 ; SI-NEXT: v_mov_b32_e32 v0, s2
27 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
28 ; SI-NEXT: v_mov_b32_e32 v2, s10
29 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
30 ; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
31 ; SI-NEXT: s_brev_b32 s0, -2
32 ; SI-NEXT: v_mov_b32_e32 v5, s11
33 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
34 ; SI-NEXT: v_bfi_b32 v4, s0, v4, v5
35 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
36 ; SI-NEXT: v_mov_b32_e32 v2, 0
37 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
38 ; SI-NEXT: s_mov_b32 s7, 0xf000
39 ; SI-NEXT: s_mov_b32 s4, s8
40 ; SI-NEXT: s_mov_b32 s5, s9
41 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
44 ; CI-LABEL: round_f64:
46 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
47 ; CI-NEXT: s_brev_b32 s8, -2
48 ; CI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
49 ; CI-NEXT: s_mov_b32 s3, 0xf000
50 ; CI-NEXT: s_mov_b32 s2, -1
51 ; CI-NEXT: s_waitcnt lgkmcnt(0)
52 ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7]
53 ; CI-NEXT: v_mov_b32_e32 v5, s7
54 ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
55 ; CI-NEXT: v_bfi_b32 v4, s8, v4, v5
56 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
57 ; CI-NEXT: v_mov_b32_e32 v2, 0
58 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
59 ; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
60 ; CI-NEXT: s_mov_b32 s0, s4
61 ; CI-NEXT: s_mov_b32 s1, s5
62 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
64 %result = call double @llvm.round.f64(double %x) #1
65 store double %result, double addrspace(1)* %out
; v_round_f64: per-work-item variant. Each work item uses
; @llvm.amdgcn.workitem.id.x as an index into %in, loads one double, passes it
; to @llvm.round.f64, and stores the result at the same index of %out.
;
; Because the input comes from a buffer load rather than a kernel argument, the
; expected tahiti sequence uses vector instructions for the truncation step
; (v_bfe_u32 v, 20, 11; v_lshr_b64; v_not_b32 / v_and_b32; two compare+select
; pairs against 0 and 51). The expected hawaii sequence uses v_trunc_f64.
; Both runs expect addr64 buffer accesses using the shifted work-item id
; (v_lshlrev_b32 by 3) as the offset.
69 define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
70 ; SI-LABEL: v_round_f64:
72 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
73 ; SI-NEXT: s_mov_b32 s3, 0xf000
74 ; SI-NEXT: s_mov_b32 s2, 0
75 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
76 ; SI-NEXT: v_mov_b32_e32 v1, 0
77 ; SI-NEXT: s_waitcnt lgkmcnt(0)
78 ; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
79 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
80 ; SI-NEXT: s_mov_b32 s0, -1
81 ; SI-NEXT: s_movk_i32 s7, 0xfc01
82 ; SI-NEXT: s_mov_b32 s1, 0xfffff
83 ; SI-NEXT: s_brev_b32 s6, -2
84 ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
85 ; SI-NEXT: s_waitcnt vmcnt(0)
86 ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
87 ; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v4
88 ; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6
89 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
90 ; SI-NEXT: v_not_b32_e32 v4, v4
91 ; SI-NEXT: v_not_b32_e32 v5, v5
92 ; SI-NEXT: v_and_b32_e32 v5, v3, v5
93 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
94 ; SI-NEXT: v_and_b32_e32 v4, v2, v4
95 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
96 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
97 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 51, v6
98 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
99 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
100 ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
101 ; SI-NEXT: v_bfi_b32 v2, s6, v8, v3
102 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
103 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
104 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
105 ; SI-NEXT: v_mov_b32_e32 v2, 0
106 ; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
107 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
110 ; CI-LABEL: v_round_f64:
112 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
113 ; CI-NEXT: s_mov_b32 s3, 0xf000
114 ; CI-NEXT: s_mov_b32 s2, 0
115 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
116 ; CI-NEXT: v_mov_b32_e32 v1, 0
117 ; CI-NEXT: s_waitcnt lgkmcnt(0)
118 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
119 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
120 ; CI-NEXT: s_brev_b32 s0, -2
121 ; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
122 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
123 ; CI-NEXT: s_waitcnt vmcnt(0)
124 ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3]
125 ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
126 ; CI-NEXT: v_bfi_b32 v2, s0, v8, v3
127 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
128 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
129 ; CI-NEXT: v_mov_b32_e32 v2, 0
130 ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
131 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
133 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
134 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
135 %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
136 %x = load double, double addrspace(1)* %gep
137 %result = call double @llvm.round.f64(double %x) #1
138 store double %result, double addrspace(1)* %out.gep
; round_v2f64: kernel that passes its <2 x double> argument %in to
; @llvm.round.v2f64 and stores the vector result through %out.
;
; The expected output handles the two elements separately: the tahiti run
; repeats the scalar bit-field/shift/and-not sequence once per element, and the
; hawaii run expects one v_trunc_f64 per element. Both runs expect the two
; results to be written with a single buffer_store_dwordx4.
142 define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
143 ; SI-LABEL: round_v2f64:
145 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
146 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
147 ; SI-NEXT: s_mov_b32 s6, -1
148 ; SI-NEXT: s_movk_i32 s7, 0xfc01
149 ; SI-NEXT: s_mov_b32 s3, 0xfffff
150 ; SI-NEXT: s_mov_b32 s2, s6
151 ; SI-NEXT: s_waitcnt lgkmcnt(0)
152 ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
153 ; SI-NEXT: s_add_i32 s14, s0, s7
154 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14
155 ; SI-NEXT: s_brev_b32 s15, 1
156 ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1]
157 ; SI-NEXT: s_and_b32 s0, s11, s15
158 ; SI-NEXT: v_mov_b32_e32 v1, s0
159 ; SI-NEXT: v_mov_b32_e32 v0, s13
160 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s14, 0
161 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
162 ; SI-NEXT: v_mov_b32_e32 v1, s11
163 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s14, 51
164 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
165 ; SI-NEXT: v_mov_b32_e32 v0, s12
166 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
167 ; SI-NEXT: v_mov_b32_e32 v2, s10
168 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
169 ; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
170 ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
171 ; SI-NEXT: s_add_i32 s7, s0, s7
172 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
173 ; SI-NEXT: s_brev_b32 s10, -2
174 ; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000
175 ; SI-NEXT: v_mov_b32_e32 v4, s11
176 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7
177 ; SI-NEXT: v_bfi_b32 v4, s10, v6, v4
178 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
179 ; SI-NEXT: v_mov_b32_e32 v2, 0
180 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
181 ; SI-NEXT: s_and_b32 s0, s9, s15
182 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
183 ; SI-NEXT: v_mov_b32_e32 v1, s0
184 ; SI-NEXT: v_mov_b32_e32 v0, s3
185 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0
186 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
187 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s7, 51
188 ; SI-NEXT: v_mov_b32_e32 v1, s9
189 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
190 ; SI-NEXT: v_mov_b32_e32 v0, s2
191 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
192 ; SI-NEXT: v_mov_b32_e32 v4, s8
193 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
194 ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1]
195 ; SI-NEXT: v_mov_b32_e32 v7, s9
196 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
197 ; SI-NEXT: v_bfi_b32 v6, s10, v6, v7
198 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
199 ; SI-NEXT: v_mov_b32_e32 v4, 0
200 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
201 ; SI-NEXT: s_mov_b32 s7, 0xf000
202 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
205 ; CI-LABEL: round_v2f64:
207 ; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
208 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
209 ; CI-NEXT: s_brev_b32 s6, -2
210 ; CI-NEXT: v_mov_b32_e32 v6, 0x3ff00000
211 ; CI-NEXT: s_mov_b32 s7, 0xf000
212 ; CI-NEXT: s_waitcnt lgkmcnt(0)
213 ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3]
214 ; CI-NEXT: v_mov_b32_e32 v4, s3
215 ; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1]
216 ; CI-NEXT: v_bfi_b32 v4, s6, v6, v4
217 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
218 ; CI-NEXT: v_mov_b32_e32 v2, 0
219 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
220 ; CI-NEXT: v_trunc_f64_e32 v[4:5], s[0:1]
221 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
222 ; CI-NEXT: v_add_f64 v[0:1], s[0:1], -v[4:5]
223 ; CI-NEXT: v_mov_b32_e32 v7, s1
224 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
225 ; CI-NEXT: v_bfi_b32 v6, s6, v6, v7
226 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
227 ; CI-NEXT: v_mov_b32_e32 v0, 0
228 ; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
229 ; CI-NEXT: s_mov_b32 s6, -1
230 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
232 %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
233 store <2 x double> %result, <2 x double> addrspace(1)* %out
; round_v4f64: kernel that passes its <4 x double> argument %in to
; @llvm.round.v4f64 and stores the vector result through %out.
;
; The expected output handles the four elements separately (four
; bit-field/shift/and-not sequences in the tahiti run, four v_trunc_f64 in the
; hawaii run). Both runs expect the result to be written with two
; buffer_store_dwordx4 instructions, at offset 16 and at offset 0.
237 define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
238 ; SI-LABEL: round_v4f64:
240 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
241 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
242 ; SI-NEXT: s_mov_b32 s14, -1
243 ; SI-NEXT: s_movk_i32 s18, 0xfc01
244 ; SI-NEXT: s_mov_b32 s3, 0xfffff
245 ; SI-NEXT: s_waitcnt lgkmcnt(0)
246 ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
247 ; SI-NEXT: s_add_i32 s19, s0, s18
248 ; SI-NEXT: s_mov_b32 s2, s14
249 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19
250 ; SI-NEXT: s_brev_b32 s20, 1
251 ; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1]
252 ; SI-NEXT: s_and_b32 s0, s7, s20
253 ; SI-NEXT: v_mov_b32_e32 v1, s0
254 ; SI-NEXT: v_mov_b32_e32 v0, s17
255 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0
256 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
257 ; SI-NEXT: v_mov_b32_e32 v1, s7
258 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51
259 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
260 ; SI-NEXT: v_mov_b32_e32 v0, s16
261 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
262 ; SI-NEXT: v_mov_b32_e32 v2, s6
263 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
264 ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
265 ; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014
266 ; SI-NEXT: s_add_i32 s17, s0, s18
267 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
268 ; SI-NEXT: s_brev_b32 s16, -2
269 ; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000
270 ; SI-NEXT: v_mov_b32_e32 v4, s7
271 ; SI-NEXT: v_bfi_b32 v4, s16, v12, v4
272 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17
273 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
274 ; SI-NEXT: v_mov_b32_e32 v2, 0
275 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1]
276 ; SI-NEXT: s_and_b32 s0, s5, s20
277 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
278 ; SI-NEXT: v_mov_b32_e32 v1, s0
279 ; SI-NEXT: v_mov_b32_e32 v0, s7
280 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0
281 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
282 ; SI-NEXT: v_mov_b32_e32 v1, s5
283 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51
284 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
285 ; SI-NEXT: v_mov_b32_e32 v0, s6
286 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
287 ; SI-NEXT: v_mov_b32_e32 v4, s4
288 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
289 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
290 ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
291 ; SI-NEXT: s_add_i32 s6, s0, s18
292 ; SI-NEXT: v_mov_b32_e32 v6, s5
293 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6
294 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
295 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1]
296 ; SI-NEXT: v_bfi_b32 v6, s16, v12, v6
297 ; SI-NEXT: s_and_b32 s0, s11, s20
298 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc
299 ; SI-NEXT: v_mov_b32_e32 v5, s0
300 ; SI-NEXT: v_mov_b32_e32 v4, s5
301 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
302 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
303 ; SI-NEXT: v_mov_b32_e32 v5, s11
304 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s6, 51
305 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1]
306 ; SI-NEXT: v_mov_b32_e32 v4, s4
307 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
308 ; SI-NEXT: v_mov_b32_e32 v6, s10
309 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
310 ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
311 ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
312 ; SI-NEXT: v_mov_b32_e32 v10, s11
313 ; SI-NEXT: s_add_i32 s4, s0, s18
314 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
315 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4
316 ; SI-NEXT: v_bfi_b32 v10, s16, v12, v10
317 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc
318 ; SI-NEXT: v_mov_b32_e32 v6, 0
319 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
320 ; SI-NEXT: s_and_b32 s0, s9, s20
321 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
322 ; SI-NEXT: v_mov_b32_e32 v5, s0
323 ; SI-NEXT: v_mov_b32_e32 v4, s3
324 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s4, 0
325 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
326 ; SI-NEXT: v_mov_b32_e32 v5, s9
327 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, 51
328 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1]
329 ; SI-NEXT: v_mov_b32_e32 v4, s2
330 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
331 ; SI-NEXT: v_mov_b32_e32 v10, s8
332 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
333 ; SI-NEXT: v_add_f64 v[10:11], s[8:9], -v[4:5]
334 ; SI-NEXT: v_mov_b32_e32 v13, s9
335 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
336 ; SI-NEXT: v_bfi_b32 v12, s16, v12, v13
337 ; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc
338 ; SI-NEXT: v_mov_b32_e32 v10, 0
339 ; SI-NEXT: v_mov_b32_e32 v8, 0
340 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11]
341 ; SI-NEXT: s_mov_b32 s15, 0xf000
342 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9]
343 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
344 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
347 ; CI-LABEL: round_v4f64:
349 ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
350 ; CI-NEXT: s_brev_b32 s12, -2
351 ; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000
352 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
353 ; CI-NEXT: s_mov_b32 s3, 0xf000
354 ; CI-NEXT: s_waitcnt lgkmcnt(0)
355 ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7]
356 ; CI-NEXT: v_mov_b32_e32 v4, s7
357 ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
358 ; CI-NEXT: v_bfi_b32 v4, s12, v12, v4
359 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
360 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5]
361 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
362 ; CI-NEXT: v_mov_b32_e32 v2, 0
363 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
364 ; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9]
365 ; CI-NEXT: v_mov_b32_e32 v4, s5
366 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
367 ; CI-NEXT: v_bfi_b32 v4, s12, v12, v4
368 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
369 ; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11]
370 ; CI-NEXT: v_mov_b32_e32 v10, s11
371 ; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
372 ; CI-NEXT: v_bfi_b32 v10, s12, v12, v10
373 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
374 ; CI-NEXT: v_mov_b32_e32 v6, 0
375 ; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc
376 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9]
377 ; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
378 ; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11]
379 ; CI-NEXT: v_mov_b32_e32 v13, s9
380 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
381 ; CI-NEXT: v_bfi_b32 v12, s12, v12, v13
382 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc
383 ; CI-NEXT: v_mov_b32_e32 v4, 0
384 ; CI-NEXT: v_mov_b32_e32 v0, 0
385 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5]
386 ; CI-NEXT: s_mov_b32 s2, -1
387 ; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1]
388 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
389 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
391 %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
392 store <4 x double> %result, <4 x double> addrspace(1)* %out
; round_v8f64: kernel that passes its <8 x double> argument %in to
; @llvm.round.v8f64 and stores the vector result through %out.
;
; The expected output handles the eight elements separately (eight
; bit-field/shift/and-not sequences in the tahiti run, eight v_trunc_f64 in the
; hawaii run). Both runs expect the vector argument to arrive via one
; s_load_dwordx16 and the result to be written with four buffer_store_dwordx4
; instructions at offsets 48, 32, 16 and 0. In the tahiti expectations the
; load of the %out pointer (offset 0x9) is scheduled late, just before the
; stores, with its own s_waitcnt.
396 define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
397 ; SI-LABEL: round_v8f64:
399 ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
400 ; SI-NEXT: s_mov_b32 s22, -1
401 ; SI-NEXT: s_movk_i32 s23, 0xfc01
402 ; SI-NEXT: s_mov_b32 s21, 0xfffff
403 ; SI-NEXT: s_mov_b32 s20, s22
404 ; SI-NEXT: s_waitcnt lgkmcnt(0)
405 ; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014
406 ; SI-NEXT: s_add_i32 s26, s2, s23
407 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26
408 ; SI-NEXT: s_brev_b32 s27, 1
409 ; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3]
410 ; SI-NEXT: s_and_b32 s2, s7, s27
411 ; SI-NEXT: v_mov_b32_e32 v1, s2
412 ; SI-NEXT: v_mov_b32_e32 v0, s25
413 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s26, 0
414 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
415 ; SI-NEXT: v_mov_b32_e32 v1, s7
416 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s26, 51
417 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
418 ; SI-NEXT: v_mov_b32_e32 v0, s24
419 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
420 ; SI-NEXT: v_mov_b32_e32 v2, s6
421 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
422 ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
423 ; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014
424 ; SI-NEXT: s_add_i32 s25, s2, s23
425 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
426 ; SI-NEXT: s_brev_b32 s24, -2
427 ; SI-NEXT: v_mov_b32_e32 v18, 0x3ff00000
428 ; SI-NEXT: v_mov_b32_e32 v4, s7
429 ; SI-NEXT: v_bfi_b32 v4, s24, v18, v4
430 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s25
431 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
432 ; SI-NEXT: v_mov_b32_e32 v2, 0
433 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3]
434 ; SI-NEXT: s_and_b32 s2, s5, s27
435 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
436 ; SI-NEXT: v_mov_b32_e32 v1, s2
437 ; SI-NEXT: v_mov_b32_e32 v0, s7
438 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s25, 0
439 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
440 ; SI-NEXT: v_mov_b32_e32 v1, s5
441 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s25, 51
442 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
443 ; SI-NEXT: v_mov_b32_e32 v0, s6
444 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
445 ; SI-NEXT: v_mov_b32_e32 v4, s4
446 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3]
447 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
448 ; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014
449 ; SI-NEXT: v_mov_b32_e32 v6, s5
450 ; SI-NEXT: s_add_i32 s6, s2, s23
451 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
452 ; SI-NEXT: v_bfi_b32 v6, s24, v18, v6
453 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
454 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
455 ; SI-NEXT: v_mov_b32_e32 v4, 0
456 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3]
457 ; SI-NEXT: s_and_b32 s2, s11, s27
458 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
459 ; SI-NEXT: v_mov_b32_e32 v5, s2
460 ; SI-NEXT: v_mov_b32_e32 v4, s5
461 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
462 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
463 ; SI-NEXT: v_mov_b32_e32 v5, s11
464 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51
465 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3]
466 ; SI-NEXT: v_mov_b32_e32 v4, s4
467 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
468 ; SI-NEXT: v_mov_b32_e32 v6, s10
469 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3]
470 ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
471 ; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014
472 ; SI-NEXT: v_mov_b32_e32 v8, s11
473 ; SI-NEXT: s_add_i32 s6, s2, s23
474 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
475 ; SI-NEXT: v_bfi_b32 v8, s24, v18, v8
476 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
477 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v8, vcc
478 ; SI-NEXT: v_mov_b32_e32 v6, 0
479 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3]
480 ; SI-NEXT: s_and_b32 s2, s9, s27
481 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
482 ; SI-NEXT: v_mov_b32_e32 v5, s2
483 ; SI-NEXT: v_mov_b32_e32 v4, s5
484 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
485 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
486 ; SI-NEXT: v_mov_b32_e32 v5, s9
487 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51
488 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3]
489 ; SI-NEXT: v_mov_b32_e32 v4, s4
490 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
491 ; SI-NEXT: v_mov_b32_e32 v8, s8
492 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[2:3]
493 ; SI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5]
494 ; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014
495 ; SI-NEXT: v_mov_b32_e32 v10, s9
496 ; SI-NEXT: s_add_i32 s6, s2, s23
497 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
498 ; SI-NEXT: v_bfi_b32 v10, s24, v18, v10
499 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
500 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc
501 ; SI-NEXT: v_mov_b32_e32 v8, 0
502 ; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[2:3]
503 ; SI-NEXT: s_and_b32 s2, s15, s27
504 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[8:9]
505 ; SI-NEXT: v_mov_b32_e32 v9, s2
506 ; SI-NEXT: v_mov_b32_e32 v8, s5
507 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0
508 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
509 ; SI-NEXT: v_mov_b32_e32 v9, s15
510 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51
511 ; SI-NEXT: v_cndmask_b32_e64 v13, v8, v9, s[2:3]
512 ; SI-NEXT: v_mov_b32_e32 v8, s4
513 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
514 ; SI-NEXT: v_mov_b32_e32 v9, s14
515 ; SI-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[2:3]
516 ; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014
517 ; SI-NEXT: s_add_i32 s8, s2, s23
518 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s8
519 ; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[2:3]
520 ; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014
521 ; SI-NEXT: s_add_i32 s10, s2, s23
522 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10
523 ; SI-NEXT: v_mov_b32_e32 v8, s15
524 ; SI-NEXT: s_andn2_b64 s[6:7], s[18:19], s[2:3]
525 ; SI-NEXT: s_and_b32 s2, s19, s27
526 ; SI-NEXT: v_bfi_b32 v19, s24, v18, v8
527 ; SI-NEXT: v_mov_b32_e32 v9, s2
528 ; SI-NEXT: v_mov_b32_e32 v8, s7
529 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0
530 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
531 ; SI-NEXT: v_mov_b32_e32 v9, s19
532 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51
533 ; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[2:3]
534 ; SI-NEXT: v_mov_b32_e32 v8, s6
535 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
536 ; SI-NEXT: v_mov_b32_e32 v10, s18
537 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[2:3]
538 ; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014
539 ; SI-NEXT: s_add_i32 s10, s2, s23
540 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10
541 ; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[2:3]
542 ; SI-NEXT: s_and_b32 s2, s17, s27
543 ; SI-NEXT: v_mov_b32_e32 v11, s2
544 ; SI-NEXT: v_mov_b32_e32 v10, s7
545 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0
546 ; SI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
547 ; SI-NEXT: v_mov_b32_e32 v11, s17
548 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51
549 ; SI-NEXT: v_cndmask_b32_e64 v15, v10, v11, s[2:3]
550 ; SI-NEXT: v_mov_b32_e32 v10, s6
551 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
552 ; SI-NEXT: v_mov_b32_e32 v11, s16
553 ; SI-NEXT: v_cndmask_b32_e64 v14, v10, v11, s[2:3]
554 ; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15]
555 ; SI-NEXT: v_mov_b32_e32 v17, s19
556 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
557 ; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9]
558 ; SI-NEXT: v_mov_b32_e32 v16, s17
559 ; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5
560 ; SI-NEXT: v_bfi_b32 v17, s24, v18, v17
561 ; SI-NEXT: v_cndmask_b32_e64 v11, 0, v17, s[2:3]
562 ; SI-NEXT: v_mov_b32_e32 v10, 0
563 ; SI-NEXT: v_bfi_b32 v16, s24, v18, v16
564 ; SI-NEXT: v_add_f64 v[10:11], v[8:9], v[10:11]
565 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v16, vcc
566 ; SI-NEXT: v_mov_b32_e32 v8, 0
567 ; SI-NEXT: s_and_b32 s9, s13, s27
568 ; SI-NEXT: v_add_f64 v[8:9], v[14:15], v[8:9]
569 ; SI-NEXT: v_mov_b32_e32 v14, s5
570 ; SI-NEXT: v_mov_b32_e32 v15, s9
571 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0
572 ; SI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc
573 ; SI-NEXT: v_mov_b32_e32 v15, s13
574 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s8, 51
575 ; SI-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[2:3]
576 ; SI-NEXT: v_mov_b32_e32 v14, s4
577 ; SI-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc
578 ; SI-NEXT: v_mov_b32_e32 v15, s12
579 ; SI-NEXT: v_cndmask_b32_e64 v16, v14, v15, s[2:3]
580 ; SI-NEXT: v_mov_b32_e32 v14, s13
581 ; SI-NEXT: v_bfi_b32 v18, s24, v18, v14
582 ; SI-NEXT: v_add_f64 v[14:15], s[12:13], -v[16:17]
583 ; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9
584 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
585 ; SI-NEXT: v_add_f64 v[14:15], s[14:15], -v[12:13]
586 ; SI-NEXT: s_mov_b32 s23, 0xf000
587 ; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
588 ; SI-NEXT: v_mov_b32_e32 v14, 0
589 ; SI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1]
590 ; SI-NEXT: v_add_f64 v[14:15], v[12:13], v[14:15]
591 ; SI-NEXT: v_cndmask_b32_e32 v13, 0, v18, vcc
592 ; SI-NEXT: v_mov_b32_e32 v12, 0
593 ; SI-NEXT: v_add_f64 v[12:13], v[16:17], v[12:13]
594 ; SI-NEXT: s_waitcnt lgkmcnt(0)
595 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48
596 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32
597 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
598 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
601 ; CI-LABEL: round_v8f64:
603 ; CI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9
604 ; CI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x19
605 ; CI-NEXT: s_brev_b32 s18, -2
606 ; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000
607 ; CI-NEXT: s_mov_b32 s19, 0xf000
608 ; CI-NEXT: s_waitcnt lgkmcnt(0)
609 ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3]
610 ; CI-NEXT: v_mov_b32_e32 v4, s3
611 ; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1]
612 ; CI-NEXT: v_bfi_b32 v4, s18, v16, v4
613 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
614 ; CI-NEXT: v_mov_b32_e32 v2, 0
615 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
616 ; CI-NEXT: v_trunc_f64_e32 v[4:5], s[0:1]
617 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
618 ; CI-NEXT: v_add_f64 v[0:1], s[0:1], -v[4:5]
619 ; CI-NEXT: v_mov_b32_e32 v6, s1
620 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
621 ; CI-NEXT: v_bfi_b32 v6, s18, v16, v6
622 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
623 ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[6:7]
624 ; CI-NEXT: v_mov_b32_e32 v0, 0
625 ; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
626 ; CI-NEXT: v_add_f64 v[4:5], s[6:7], -v[6:7]
627 ; CI-NEXT: v_mov_b32_e32 v8, s7
628 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
629 ; CI-NEXT: v_bfi_b32 v8, s18, v16, v8
630 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc
631 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5]
632 ; CI-NEXT: v_mov_b32_e32 v4, 0
633 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5]
634 ; CI-NEXT: v_add_f64 v[4:5], s[4:5], -v[8:9]
635 ; CI-NEXT: v_mov_b32_e32 v10, s5
636 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
637 ; CI-NEXT: v_bfi_b32 v10, s18, v16, v10
638 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc
639 ; CI-NEXT: v_mov_b32_e32 v4, 0
640 ; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5]
641 ; CI-NEXT: v_mov_b32_e32 v8, s11
642 ; CI-NEXT: v_bfi_b32 v18, s18, v16, v8
643 ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13]
644 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15]
645 ; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[8:9]
646 ; CI-NEXT: v_mov_b32_e32 v19, s15
647 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
648 ; CI-NEXT: v_add_f64 v[14:15], s[14:15], -v[10:11]
649 ; CI-NEXT: v_mov_b32_e32 v17, s13
650 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
651 ; CI-NEXT: v_bfi_b32 v19, s18, v16, v19
652 ; CI-NEXT: v_trunc_f64_e32 v[12:13], s[8:9]
653 ; CI-NEXT: v_bfi_b32 v17, s18, v16, v17
654 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1]
655 ; CI-NEXT: v_mov_b32_e32 v14, 0
656 ; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15]
657 ; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc
658 ; CI-NEXT: v_mov_b32_e32 v14, 0
659 ; CI-NEXT: v_mov_b32_e32 v17, s9
660 ; CI-NEXT: v_bfi_b32 v19, s18, v16, v17
661 ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15]
662 ; CI-NEXT: v_add_f64 v[14:15], s[8:9], -v[12:13]
663 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[10:11]
664 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
665 ; CI-NEXT: v_add_f64 v[14:15], s[10:11], -v[16:17]
666 ; CI-NEXT: s_mov_b32 s18, -1
667 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
668 ; CI-NEXT: v_mov_b32_e32 v14, 0
669 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1]
670 ; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15]
671 ; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc
672 ; CI-NEXT: v_mov_b32_e32 v16, 0
673 ; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17]
674 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:48
675 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:32
676 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
677 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
679 %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
680 store <8 x double> %result, <8 x double> addrspace(1)* %out
; Intrinsics referenced by the kernels in this file: the work-item id query
; used for per-lane addressing, and llvm.round at each tested width
; (scalar double, then 2-, 4- and 8-element double vectors).
684 declare i32 @llvm.amdgcn.workitem.id.x() #1
686 declare double @llvm.round.f64(double) #1
687 declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
688 declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
689 declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
; Attribute groups: #0 is attached to the kernel definitions, #1 to the
; intrinsic declarations and their call sites.
691 attributes #0 = { nounwind }
692 attributes #1 = { nounwind readnone }