1 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
2 # RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
6 # The loop contains a store and a use of a value loaded outside of the loop.
7 # We expect the waitcnt for the use to be hoisted on GFX9, but not on GFX10+
8 # because we have the vscnt counter.
10 # GFX9-LABEL: waitcnt_vm_loop
14 # GFX9-NOT: S_WAITCNT 39
17 # GFX10-LABEL: waitcnt_vm_loop
19 # GFX10-NOT: S_WAITCNT 16
28 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
32 successors: %bb.1, %bb.2
34 BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
35 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
36 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
37 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
46 # Same as before, but the loop preheader has no terminator.
48 # GFX9-LABEL: waitcnt_vm_loop_noterm
52 # GFX9-NOT: S_WAITCNT 39
55 # GFX10-LABEL: waitcnt_vm_loop_noterm
57 # GFX10-NOT: S_WAITCNT 16
61 name: waitcnt_vm_loop_noterm
66 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
69 successors: %bb.1, %bb.2
71 BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
72 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
73 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
74 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
83 # Same as before but there is a preexisting waitcnt in the preheader.
85 # GFX9-LABEL: waitcnt_vm_loop_noterm_wait
88 # GFX9-NOT: S_WAITCNT 39
90 # GFX9-NOT: S_WAITCNT 39
92 name: waitcnt_vm_loop_noterm_wait
97 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
101 successors: %bb.1, %bb.2
103 BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
104 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
105 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
106 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
115 # The loop contains a store, a load, and uses values loaded both inside and
117 # We do not expect the waitcnt to be hoisted out of the loop.
119 # GFX9-LABEL: waitcnt_vm_loop_load
121 # GFX9-NOT: S_WAITCNT 39
126 # GFX10-LABEL: waitcnt_vm_loop_load
128 # GFX10-NOT: S_WAITCNT 16
130 # GFX10: S_WAITCNT 16
132 name: waitcnt_vm_loop_load
137 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
141 successors: %bb.1, %bb.2
143 BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
144 $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
145 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
146 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
147 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
156 # The loop contains a use of a value loaded outside of the loop, and no store
158 # We do not expect the waitcnt to be hoisted out of the loop.
160 # GFX9-LABEL: waitcnt_vm_loop_no_store
162 # GFX9-NOT: S_WAITCNT 39
167 # GFX10-LABEL: waitcnt_vm_loop_no_store
169 # GFX10-NOT: S_WAITCNT 16
171 # GFX10: S_WAITCNT 16
173 name: waitcnt_vm_loop_no_store
178 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
182 successors: %bb.1, %bb.2
184 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
185 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
186 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
195 # The loop contains a store, no load, and doesn't use any value loaded inside
196 # or outside of the loop. There is only one use of the loaded value in the
198 # We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect
199 # one in the exit block.
202 # GFX9-LABEL: waitcnt_vm_loop_no_use
204 # GFX9-NOT: S_WAITCNT 39
206 # GFX9-NOT: S_WAITCNT 39
209 # GFX10-LABEL: waitcnt_vm_loop_no_use
211 # GFX10-NOT: S_WAITCNT 16
213 # GFX10-NOT: S_WAITCNT 16
215 name: waitcnt_vm_loop_no_use
220 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
224 successors: %bb.1, %bb.2
226 BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
227 $vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
228 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
229 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
233 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
239 # The loop loads a value that is not used in the loop, and uses a value loaded
240 # outside of the loop.
241 # We expect the waitcnt to be hoisted out of the loop to wait a single time before
242 # the loop is executed and avoid waiting for the load to complete on each
245 # GFX9-LABEL: waitcnt_vm_loop2
249 # GFX9-NOT: S_WAITCNT 39
252 # GFX10-LABEL: waitcnt_vm_loop2
254 # GFX10: S_WAITCNT 16
256 # GFX10-NOT: S_WAITCNT 16
258 name: waitcnt_vm_loop2
263 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
267 successors: %bb.1, %bb.2
269 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
270 $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
271 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
272 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
281 # Same as before with an additional store in the loop. We still expect the
282 # waitcnt instructions to be hoisted.
284 # GFX9-LABEL: waitcnt_vm_loop2_store
288 # GFX9-NOT: S_WAITCNT 39
291 # GFX10-LABEL: waitcnt_vm_loop2_store
293 # GFX10: S_WAITCNT 16
295 # GFX10-NOT: S_WAITCNT 16
297 name: waitcnt_vm_loop2_store
302 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
306 successors: %bb.1, %bb.2
308 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
309 $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
310 BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
311 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
312 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
321 # Same as loop2 but the value loaded inside the loop is also used in the loop.
322 # We do not expect the waitcnt to be hoisted out of the loop.
324 # GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
326 # GFX9-NOT: S_WAITCNT 39
331 # GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
333 # GFX10-NOT: S_WAITCNT 16
335 # GFX10: S_WAITCNT 16
337 name: waitcnt_vm_loop2_use_in_loop
342 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
346 successors: %bb.1, %bb.2
348 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
349 $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
350 $vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec
351 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
352 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
361 # The loop contains a use of a value loaded outside of the loop, but we already
362 # waited for that load to complete. The loop also loads a value that is not used
363 # in the loop. We do not expect any waitcnt in the loop.
365 # GFX9-LABEL: waitcnt_vm_loop2_nowait
369 # GFX9-NOT: S_WAITCNT 39
371 # GFX9-NOT: S_WAITCNT 39
374 # GFX10-LABEL: waitcnt_vm_loop2_nowait
376 # GFX10: S_WAITCNT 16
378 # GFX10-NOT: S_WAITCNT 16
380 # GFX10-NOT: S_WAITCNT 16
382 name: waitcnt_vm_loop2_nowait
387 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
388 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
394 $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
395 $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
396 $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
401 successors: %bb.2, %bb.3
403 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
404 $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
405 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
406 S_CBRANCH_SCC1 %bb.2, implicit killed $scc
415 # Similar test case but for register intervals.
417 # GFX9-LABEL: waitcnt_vm_loop2_reginterval
421 # GFX9-NOT: S_WAITCNT 39
424 # GFX10-LABEL: waitcnt_vm_loop2_reginterval
426 # GFX10: S_WAITCNT 16
428 # GFX10-NOT: S_WAITCNT 16
430 name: waitcnt_vm_loop2_reginterval
435 $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
440 successors: %bb.1, %bb.2
442 $vgpr10 = COPY $vgpr0
444 $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
445 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
446 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
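455 # Similar test case for register intervals, but part of the value loaded inside the loop is also used in the loop, so we do not expect the waitcnt to be hoisted.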
457 # GFX9-LABEL: waitcnt_vm_loop2_reginterval2
459 # GFX9-NOT: S_WAITCNT 39
464 # GFX10-LABEL: waitcnt_vm_loop2_reginterval2
466 # GFX10-NOT: S_WAITCNT 16
468 # GFX10: S_WAITCNT 16
470 name: waitcnt_vm_loop2_reginterval2
475 $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
480 successors: %bb.1, %bb.2
482 $vgpr10 = COPY $vgpr0
484 $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
485 $vgpr11 = COPY $vgpr7
486 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
487 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
496 # The loop loads a value that is not used in the loop, but uses a value loaded
497 # outside of it. We expect the s_waitcnt instruction to be hoisted.
498 # An s_waitcnt vmcnt(0) is generated in the preheader to flush the counter, but for this
499 # specific test case, it would be better to use vmcnt(1) instead. This is
500 # currently not implemented.
502 # GFX9-LABEL: waitcnt_vm_zero
504 # GFX9: S_WAITCNT 3952
506 # GFX9-NOT: S_WAITCNT 39
509 # GFX10-LABEL: waitcnt_vm_zero
511 # GFX10: S_WAITCNT 16240
513 # GFX10-NOT: S_WAITCNT 16240
516 name: waitcnt_vm_zero
521 $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
522 $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
526 successors: %bb.1, %bb.2
528 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec
529 $vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
530 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
531 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
540 # This test case checks that we flush the vmcnt counter only if necessary
541 # (i.e. if a waitcnt is needed for the vgpr use we find in the loop)
543 # GFX10-LABEL: waitcnt_vm_necessary
545 # GFX10: S_WAITCNT 16240
547 # GFX10-NOT: S_WAITCNT
549 # GFX10-NOT: S_WAITCNT
551 # GFX9-LABEL: waitcnt_vm_necessary
553 # GFX9: S_WAITCNT 3952
555 # GFX9-NOT: S_WAITCNT
557 # GFX9-NOT: S_WAITCNT
559 name: waitcnt_vm_necessary
562 successors: %bb.1(0x80000000)
564 $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 killed $vgpr0_vgpr1, 0, 0, implicit $exec
565 $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
568 successors: %bb.1(0x40000000)
570 $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec
571 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
577 # The loop contains a global store, and uses a value loaded (from global memory) outside of the loop.
579 # GFX9-LABEL: waitcnt_vm_loop_global_mem
583 # GFX9-NOT: S_WAITCNT 39
586 # GFX10-LABEL: waitcnt_vm_loop_global_mem
588 # GFX10-NOT: S_WAITCNT 16
590 # GFX10: S_WAITCNT 16
593 name: waitcnt_vm_loop_global_mem
597 $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
601 successors: %bb.1, %bb.2
603 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
604 GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
605 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
606 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
618 # Same as the above case, but using scratch memory instructions instead.
620 # GFX9-LABEL: waitcnt_vm_loop_scratch_mem
624 # GFX9-NOT: S_WAITCNT 39
627 # GFX10-LABEL: waitcnt_vm_loop_scratch_mem
629 # GFX10-NOT: S_WAITCNT 16
631 # GFX10: S_WAITCNT 16
634 name: waitcnt_vm_loop_scratch_mem
638 $vgpr0 = SCRATCH_LOAD_DWORD $vgpr1, 0, 0, implicit $exec, implicit $flat_scr
642 successors: %bb.1, %bb.2
644 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
645 SCRATCH_STORE_DWORD $vgpr4, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
646 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
647 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
659 # Same as the above case, but using flat memory instructions instead.
661 # GFX9-LABEL: waitcnt_vm_loop_flat_mem
665 # GFX9-NOT: S_WAITCNT 39
668 # GFX10-LABEL: waitcnt_vm_loop_flat_mem
670 # GFX10-NOT: S_WAITCNT 11
672 # GFX10: S_WAITCNT 11
674 name: waitcnt_vm_loop_flat_mem
678 $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec, implicit $flat_scr
682 successors: %bb.1, %bb.2
684 $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
685 FLAT_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec, implicit $flat_scr
686 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
687 S_CBRANCH_SCC1 %bb.1, implicit killed $scc
699 # The loop contains a store, a load, and uses values loaded both inside and
701 # We do not expect the waitcnt to be hoisted out of the loop.
703 # GFX9-LABEL: waitcnt_vm_loop_flat_load
705 # GFX9-NOT: S_WAITCNT 39
710 # GFX10-LABEL: waitcnt_vm_loop_flat_load
712 # GFX10-NOT: S_WAITCNT 16
714 # GFX10: S_WAITCNT 16
716 name: waitcnt_vm_loop_flat_load
721 $vgpr0 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
725 successors: %bb.1, %bb.2
727 GLOBAL_STORE_DWORD $vgpr4_vgpr5, $vgpr6, 0, 0, implicit $exec
728 $vgpr7 = GLOBAL_LOAD_DWORD $vgpr1_vgpr2, 0, 0, implicit $exec
729 $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
730 S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
731 S_CBRANCH_SCC1 %bb.1, implicit killed $scc