1 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
7 ; CHECK-LABEL: {{^}}valu_dep_1:
9 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
10 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
11 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
12 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
13 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
20 ; CHECK-LABEL: {{^}}valu_dep_2:
22 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
23 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
24 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
25 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
26 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
27 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
28 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
35 ; CHECK-LABEL: {{^}}valu_dep_3:
37 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
38 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
39 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
40 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
41 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
42 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
43 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
44 $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
45 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
52 ; CHECK-LABEL: {{^}}valu_dep_4:
54 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
55 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
56 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
57 ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
58 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
59 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
60 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
61 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
62 $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
63 $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
64 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
67 # There's no encoding for VALU_DEP_5. A normal VALU instruction will have
73 ; CHECK-LABEL: {{^}}valu_dep_5:
75 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
76 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
77 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
78 ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
79 ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
80 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
81 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
82 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
83 $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
84 $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
85 $vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec
86 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
93 ; CHECK-LABEL: {{^}}trans32_dep_1:
95 ; CHECK-NEXT: v_exp_f32_e32 v0, v0
96 ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
97 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
98 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
99 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
106 ; CHECK-LABEL: {{^}}trans32_dep_2:
108 ; CHECK-NEXT: v_exp_f32_e32 v0, v0
109 ; CHECK-NEXT: v_exp_f32_e32 v1, v1
110 ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
111 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
112 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
113 $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
114 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
121 ; CHECK-LABEL: {{^}}trans32_dep_3:
123 ; CHECK-NEXT: v_exp_f32_e32 v0, v0
124 ; CHECK-NEXT: v_exp_f32_e32 v1, v1
125 ; CHECK-NEXT: v_exp_f32_e32 v2, v2
126 ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
127 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
128 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
129 $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
130 $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
131 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
134 # There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have
140 ; CHECK-LABEL: {{^}}trans32_dep_4:
142 ; CHECK-NEXT: v_exp_f32_e32 v0, v0
143 ; CHECK-NEXT: v_exp_f32_e32 v1, v1
144 ; CHECK-NEXT: v_exp_f32_e32 v2, v2
145 ; CHECK-NEXT: v_exp_f32_e32 v3, v3
146 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
147 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
148 $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
149 $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
150 $vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode
151 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
158 ; CHECK-LABEL: {{^}}salu_cycle_1:
160 ; CHECK-NEXT: s_mov_b32 s0, 0
161 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
162 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
164 $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
167 # There's no need for SALU_CYCLE_2 here because the s_mov will have completed
173 ; CHECK-LABEL: {{^}}salu_cycle_2:
175 ; CHECK-NEXT: s_mov_b32 s0, 0
176 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
177 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
179 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
180 $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
184 name: valu_dep_1_same_trans32_dep_1
187 ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
189 ; CHECK-NEXT: v_exp_f32_e32 v0, v0
190 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
191 ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
192 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
193 $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
194 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
195 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
198 # There's no need to encode the VALU dependency because it will complete before
201 name: trans32_dep_1_only
204 ; CHECK-LABEL: {{^}}trans32_dep_1_only:
206 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
207 ; CHECK-NEXT: v_exp_f32_e32 v1, v1
208 ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
209 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
210 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
211 $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
212 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
216 name: valu_dep_1_same_salu_cycle_1
219 ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
221 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
222 ; CHECK-NEXT: s_mov_b32 s0, 0
223 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
224 ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
225 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
227 $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
231 name: valu_dep_1_next_valu_dep_1
234 ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
236 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
237 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
238 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
239 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
240 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
241 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
242 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
246 name: valu_dep_2_next_valu_dep_2
249 ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
251 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
252 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
253 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
254 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
255 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
256 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
257 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
258 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
259 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
262 # There's no need to encode a dependency for the second mul, because the
263 # dependency for the first mul has already guaranteed that the add has
266 name: valu_dep_1_no_next_1
269 ; CHECK-LABEL: {{^}}valu_dep_1_no_next_1:
271 ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
272 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
273 ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
274 ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
275 $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
276 $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
277 $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
280 # There's no need to encode a dependency for the second add, because the
281 # dependency for the second mul has already guaranteed that a later VALU has
284 name: valu_dep_1_no_next_2
287 ; CHECK-LABEL: {{^}}valu_dep_1_no_next_2:
289 ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
290 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
291 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
292 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
293 ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
294 $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
295 $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
296 $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
297 $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
300 # There are no wait states between an add/sub/cmp generating carry and an
301 # add/sub/cndmask that consumes it, so no need to encode a dependency.
304 name: implicit_cmp_cndmask
307 ; CHECK-LABEL: {{^}}implicit_cmp_cndmask:
309 ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
310 ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
311 implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
312 $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
315 # TODO: There should be no s_delay_alu here.
317 name: explicit_cmp_cndmask
320 ; CHECK-LABEL: {{^}}explicit_cmp_cndmask:
322 ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
323 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
324 ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
325 $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
326 $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
330 name: implicit_addc_addc
333 ; CHECK-LABEL: {{^}}implicit_addc_addc:
335 ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
336 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
337 $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
338 $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
342 name: explicit_addc_addc
345 ; CHECK-LABEL: {{^}}explicit_addc_addc:
347 ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
348 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
349 $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
350 $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
354 name: valu_dep_3_bundle
357 ; CHECK-LABEL: {{^}}valu_dep_3_bundle:
359 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
360 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
361 ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
362 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
363 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
364 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
366 $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
367 $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
369 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
376 ; CHECK-LABEL: {{^}}if:
378 ; CHECK-NEXT: s_cbranch_vccz .LBB23_2
380 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
381 ; CHECK-NEXT: .LBB23_2:
382 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
383 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
384 S_CBRANCH_VCCZ %bb.2, implicit $vcc
386 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
388 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
395 ; CHECK-LABEL: {{^}}else:
397 ; CHECK-NEXT: s_cbranch_vccz .LBB24_2
399 ; CHECK-NEXT: s_branch .LBB24_3
400 ; CHECK-NEXT: .LBB24_2:
401 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
402 ; CHECK-NEXT: .LBB24_3:
403 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
404 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
405 S_CBRANCH_VCCZ %bb.2, implicit $vcc
409 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
411 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
418 ; CHECK-LABEL: {{^}}if_else:
420 ; CHECK-NEXT: s_cbranch_vccz .LBB25_2
422 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
423 ; CHECK-NEXT: s_branch .LBB25_3
424 ; CHECK-NEXT: .LBB25_2:
425 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
426 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
427 ; CHECK-NEXT: .LBB25_3:
428 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
429 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
430 S_CBRANCH_VCCZ %bb.2, implicit $vcc
432 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
435 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
436 $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
438 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
441 # Dependency from outside the loop.
446 ; CHECK-LABEL: {{^}}loop_1:
448 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
449 ; CHECK-NEXT: .LBB26_1:
450 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
451 ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
452 ; CHECK-NEXT: s_cbranch_vccz .LBB26_1
453 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
455 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
456 S_CBRANCH_VCCZ %bb.1, implicit $vcc
460 # Dependency from inside the loop.
465 ; CHECK-LABEL: {{^}}loop_2:
467 ; CHECK-NEXT: .LBB27_1:
468 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
469 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
470 ; CHECK-NEXT: s_cbranch_vccz .LBB27_1
472 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
473 S_CBRANCH_VCCZ %bb.1, implicit $vcc
477 # No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU
483 ; CHECK-LABEL: {{^}}sendmsg_rtn:
485 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
486 ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
487 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
488 ; CHECK-NEXT: s_add_u32 s0, s0, s0
489 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
490 $vgpr0 = V_MOV_B32_e32 0, implicit $exec
491 $sgpr0 = S_SENDMSG_RTN_B32 128
492 $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
493 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
496 # No VALU delay before or across FLAT because it waits for all outstanding VALU
502 ; CHECK-LABEL: {{^}}flat_load:
504 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
505 ; CHECK-NEXT: v_mov_b32_e32 v1, 0
506 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
507 ; CHECK-NEXT: flat_load_b32 v0, v[0:1]
508 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
509 $vgpr0 = V_MOV_B32_e32 0, implicit $exec
510 $vgpr1 = V_MOV_B32_e32 0, implicit $exec
511 $vgpr2 = V_MOV_B32_e32 0, implicit $exec
512 $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
513 $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
516 # No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU
522 ; CHECK-LABEL: {{^}}waitcnt_depctr:
524 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
525 ; CHECK-NEXT: s_waitcnt_depctr 0xfff
526 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
527 $vgpr0 = V_MOV_B32_e32 0, implicit $exec
528 S_WAITCNT_DEPCTR 4095
529 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
532 # Check that no delays are emitted for writelane instructions.
537 ; CHECK-LABEL: {{^}}writelane1:
539 ; CHECK-NEXT: v_writelane_b32 v0, s0, 0
540 ; CHECK-NEXT: v_writelane_b32 v0, s0, 1
541 ; CHECK-NEXT: v_writelane_b32 v0, s0, 2
542 ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
543 $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
544 $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
545 $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
546 $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
549 # Check that a VALU delay is added after writelane.
554 ; CHECK-LABEL: {{^}}writelane2:
556 ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
557 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
558 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
559 $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
560 $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec