1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: opt -S -passes=always-inline -o %t.bc %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK
5 ; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
6 ; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,
7 ; which (incorrectly) used to look to the scheduler like an occupancy reduction.
9 ; 6 kB of LDS (384 x 16-byte <4 x i32> elements), allows 10 workgroups
10 @lds = internal addrspace(3) global [384 x <4 x i32>] undef
; Helper: copies one <4 x i32> element, selected by %ofs, from %src into @lds.
;   %src - base pointer in address space 1
;   %ofs - element index (in units of <4 x i32>) applied to both the source
;          pointer and the @lds array
; The function is marked alwaysinline and the first RUN line runs the
; always-inline pass, so each call site in @test becomes one load/store pair.
12 define internal amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 %ofs) alwaysinline {
; Address of source element %ofs.
13 %src.gep = getelementptr <4 x i32>, ptr addrspace(1) %src, i32 %ofs
; Load the whole 4 x i32 vector from address space 1.
14 %ld = load <4 x i32>, ptr addrspace(1) %src.gep
; Address of the destination element at the same index inside @lds.
15 %dst.gep = getelementptr <4 x i32>, ptr addrspace(3) @lds, i32 %ofs
; Store the loaded vector to address space 3.
16 store <4 x i32> %ld, ptr addrspace(3) %dst.gep
20 define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"="32,32" {
23 ; CHECK-NEXT: s_clause 0x1f
24 ; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off
25 ; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
26 ; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32
27 ; CHECK-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48
28 ; CHECK-NEXT: global_load_b128 v[18:21], v[0:1], off offset:64
29 ; CHECK-NEXT: global_load_b128 v[22:25], v[0:1], off offset:80
30 ; CHECK-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96
31 ; CHECK-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112
32 ; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128
33 ; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:144
34 ; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:160
35 ; CHECK-NEXT: global_load_b128 v[46:49], v[0:1], off offset:176
36 ; CHECK-NEXT: global_load_b128 v[50:53], v[0:1], off offset:192
37 ; CHECK-NEXT: global_load_b128 v[54:57], v[0:1], off offset:208
38 ; CHECK-NEXT: global_load_b128 v[58:61], v[0:1], off offset:224
39 ; CHECK-NEXT: global_load_b128 v[62:65], v[0:1], off offset:240
40 ; CHECK-NEXT: global_load_b128 v[66:69], v[0:1], off offset:256
41 ; CHECK-NEXT: global_load_b128 v[70:73], v[0:1], off offset:272
42 ; CHECK-NEXT: global_load_b128 v[74:77], v[0:1], off offset:288
43 ; CHECK-NEXT: global_load_b128 v[78:81], v[0:1], off offset:304
44 ; CHECK-NEXT: global_load_b128 v[82:85], v[0:1], off offset:320
45 ; CHECK-NEXT: global_load_b128 v[86:89], v[0:1], off offset:336
46 ; CHECK-NEXT: global_load_b128 v[90:93], v[0:1], off offset:352
47 ; CHECK-NEXT: global_load_b128 v[94:97], v[0:1], off offset:368
48 ; CHECK-NEXT: global_load_b128 v[98:101], v[0:1], off offset:384
49 ; CHECK-NEXT: global_load_b128 v[102:105], v[0:1], off offset:400
50 ; CHECK-NEXT: global_load_b128 v[106:109], v[0:1], off offset:416
51 ; CHECK-NEXT: global_load_b128 v[110:113], v[0:1], off offset:432
52 ; CHECK-NEXT: global_load_b128 v[114:117], v[0:1], off offset:448
53 ; CHECK-NEXT: global_load_b128 v[118:121], v[0:1], off offset:464
54 ; CHECK-NEXT: global_load_b128 v[122:125], v[0:1], off offset:480
55 ; CHECK-NEXT: global_load_b128 v[126:129], v[0:1], off offset:496
56 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
57 ; CHECK-NEXT: s_waitcnt vmcnt(31)
58 ; CHECK-NEXT: ds_store_b128 v0, v[2:5]
59 ; CHECK-NEXT: s_waitcnt vmcnt(30)
60 ; CHECK-NEXT: ds_store_b128 v0, v[6:9] offset:16
61 ; CHECK-NEXT: s_waitcnt vmcnt(29)
62 ; CHECK-NEXT: ds_store_b128 v0, v[10:13] offset:32
63 ; CHECK-NEXT: s_waitcnt vmcnt(28)
64 ; CHECK-NEXT: ds_store_b128 v0, v[14:17] offset:48
65 ; CHECK-NEXT: s_waitcnt vmcnt(27)
66 ; CHECK-NEXT: ds_store_b128 v0, v[18:21] offset:64
67 ; CHECK-NEXT: s_waitcnt vmcnt(26)
68 ; CHECK-NEXT: ds_store_b128 v0, v[22:25] offset:80
69 ; CHECK-NEXT: s_waitcnt vmcnt(25)
70 ; CHECK-NEXT: ds_store_b128 v0, v[26:29] offset:96
71 ; CHECK-NEXT: s_waitcnt vmcnt(24)
72 ; CHECK-NEXT: ds_store_b128 v0, v[30:33] offset:112
73 ; CHECK-NEXT: s_waitcnt vmcnt(23)
74 ; CHECK-NEXT: ds_store_b128 v0, v[34:37] offset:128
75 ; CHECK-NEXT: s_waitcnt vmcnt(22)
76 ; CHECK-NEXT: ds_store_b128 v0, v[38:41] offset:144
77 ; CHECK-NEXT: s_waitcnt vmcnt(21)
78 ; CHECK-NEXT: ds_store_b128 v0, v[42:45] offset:160
79 ; CHECK-NEXT: s_waitcnt vmcnt(20)
80 ; CHECK-NEXT: ds_store_b128 v0, v[46:49] offset:176
81 ; CHECK-NEXT: s_waitcnt vmcnt(19)
82 ; CHECK-NEXT: ds_store_b128 v0, v[50:53] offset:192
83 ; CHECK-NEXT: s_waitcnt vmcnt(18)
84 ; CHECK-NEXT: ds_store_b128 v0, v[54:57] offset:208
85 ; CHECK-NEXT: s_waitcnt vmcnt(17)
86 ; CHECK-NEXT: ds_store_b128 v0, v[58:61] offset:224
87 ; CHECK-NEXT: s_waitcnt vmcnt(16)
88 ; CHECK-NEXT: ds_store_b128 v0, v[62:65] offset:240
89 ; CHECK-NEXT: s_waitcnt vmcnt(15)
90 ; CHECK-NEXT: ds_store_b128 v0, v[66:69] offset:256
91 ; CHECK-NEXT: s_waitcnt vmcnt(14)
92 ; CHECK-NEXT: ds_store_b128 v0, v[70:73] offset:272
93 ; CHECK-NEXT: s_waitcnt vmcnt(13)
94 ; CHECK-NEXT: ds_store_b128 v0, v[74:77] offset:288
95 ; CHECK-NEXT: s_waitcnt vmcnt(12)
96 ; CHECK-NEXT: ds_store_b128 v0, v[78:81] offset:304
97 ; CHECK-NEXT: s_waitcnt vmcnt(11)
98 ; CHECK-NEXT: ds_store_b128 v0, v[82:85] offset:320
99 ; CHECK-NEXT: s_waitcnt vmcnt(10)
100 ; CHECK-NEXT: ds_store_b128 v0, v[86:89] offset:336
101 ; CHECK-NEXT: s_waitcnt vmcnt(9)
102 ; CHECK-NEXT: ds_store_b128 v0, v[90:93] offset:352
103 ; CHECK-NEXT: s_waitcnt vmcnt(8)
104 ; CHECK-NEXT: ds_store_b128 v0, v[94:97] offset:368
105 ; CHECK-NEXT: s_waitcnt vmcnt(7)
106 ; CHECK-NEXT: ds_store_b128 v0, v[98:101] offset:384
107 ; CHECK-NEXT: s_waitcnt vmcnt(6)
108 ; CHECK-NEXT: ds_store_b128 v0, v[102:105] offset:400
109 ; CHECK-NEXT: s_waitcnt vmcnt(5)
110 ; CHECK-NEXT: ds_store_b128 v0, v[106:109] offset:416
111 ; CHECK-NEXT: s_waitcnt vmcnt(4)
112 ; CHECK-NEXT: ds_store_b128 v0, v[110:113] offset:432
113 ; CHECK-NEXT: s_waitcnt vmcnt(3)
114 ; CHECK-NEXT: ds_store_b128 v0, v[114:117] offset:448
115 ; CHECK-NEXT: s_waitcnt vmcnt(2)
116 ; CHECK-NEXT: ds_store_b128 v0, v[118:121] offset:464
117 ; CHECK-NEXT: s_waitcnt vmcnt(1)
118 ; CHECK-NEXT: ds_store_b128 v0, v[122:125] offset:480
119 ; CHECK-NEXT: s_waitcnt vmcnt(0)
120 ; CHECK-NEXT: ds_store_b128 v0, v[126:129] offset:496
121 ; CHECK-NEXT: s_endpgm
122 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 0)
123 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 1)
124 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 2)
125 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 3)
126 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 4)
127 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 5)
128 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 6)
129 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 7)
130 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 8)
131 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 9)
132 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 10)
133 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 11)
134 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 12)
135 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 13)
136 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 14)
137 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 15)
138 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 16)
139 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 17)
140 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 18)
141 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 19)
142 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 20)
143 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 21)
144 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 22)
145 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 23)
146 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 24)
147 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 25)
148 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 26)
149 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 27)
150 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 28)
151 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 29)
152 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 30)
153 call amdgpu_gfx void @copy(ptr addrspace(1) %src, i32 31)