1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s
7 ; This test checks the allocated offset of @used_by_both. It is at zero when
8 ; allocated by itself, but at 8 when allocated together with the double
9 ; @used_by_function. Redundantly also checks LDSByteSize.
10 @used_by_both = addrspace(3) global i32 undef
11 @used_by_kernel = addrspace(3) global i32 undef
12 @used_by_function = addrspace(3) global double undef
14 ; A kernel that calls no functions and uses one LDS variable allocates only that
15 ; variable, so the access is at offset 0 and LDSByteSize is 4.
16 define amdgpu_kernel void @nocall_ideal() {
17 ; CHECK-LABEL: nocall_ideal:
19 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
20 ; CHECK-NEXT: ds_write_b32 v0, v0
21 ; CHECK-NEXT: s_endpgm
22 store i32 0, ptr addrspace(3) @used_by_kernel
25 ; CHECK: ; LDSByteSize: 4 bytes
27 ; Calls @nonkernel, so it needs to allocate both @used_by_both and @used_by_function; the store to @used_by_both is at sizeof(double), i.e. offset 8.
28 define amdgpu_kernel void @withcall() {
29 ; GFX9-LABEL: withcall:
31 ; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
32 ; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
33 ; GFX9-NEXT: s_mov_b32 s10, -1
34 ; GFX9-NEXT: s_mov_b32 s11, 0xe00000
35 ; GFX9-NEXT: s_add_u32 s8, s8, s3
36 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
37 ; GFX9-NEXT: s_getpc_b64 s[2:3]
38 ; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
39 ; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
40 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
41 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
42 ; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
43 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
44 ; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
45 ; GFX9-NEXT: s_mov_b32 s32, 0
46 ; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
47 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
48 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
51 ; GFX10-LABEL: withcall:
53 ; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
54 ; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
55 ; GFX10-NEXT: s_mov_b32 s10, -1
56 ; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
57 ; GFX10-NEXT: s_add_u32 s8, s8, s3
58 ; GFX10-NEXT: s_addc_u32 s9, s9, 0
59 ; GFX10-NEXT: s_getpc_b64 s[2:3]
60 ; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
61 ; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
62 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
63 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
64 ; GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
65 ; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
66 ; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
67 ; GFX10-NEXT: s_mov_b32 s32, 0
68 ; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
69 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
71 ; GFX10-NEXT: s_endpgm
73 ; G_GFX9-LABEL: withcall:
75 ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
76 ; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
77 ; G_GFX9-NEXT: s_mov_b32 s10, -1
78 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
79 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3
80 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
81 ; G_GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
82 ; G_GFX9-NEXT: s_getpc_b64 s[0:1]
83 ; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
84 ; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
85 ; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
86 ; G_GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
87 ; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
88 ; G_GFX9-NEXT: v_mov_b32_e32 v1, 8
89 ; G_GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
90 ; G_GFX9-NEXT: s_mov_b32 s32, 0
91 ; G_GFX9-NEXT: ds_write_b32 v1, v0
92 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
93 ; G_GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
94 ; G_GFX9-NEXT: s_endpgm
96 ; G_GFX10-LABEL: withcall:
98 ; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
99 ; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
100 ; G_GFX10-NEXT: s_mov_b32 s10, -1
101 ; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
102 ; G_GFX10-NEXT: s_add_u32 s8, s8, s3
103 ; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
104 ; G_GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
105 ; G_GFX10-NEXT: s_getpc_b64 s[0:1]
106 ; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
107 ; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
108 ; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
109 ; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
110 ; G_GFX10-NEXT: v_mov_b32_e32 v1, 8
111 ; G_GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
112 ; G_GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
113 ; G_GFX10-NEXT: s_mov_b32 s32, 0
114 ; G_GFX10-NEXT: ds_write_b32 v1, v0
115 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
116 ; G_GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
117 ; G_GFX10-NEXT: s_endpgm
118 store i32 0, ptr addrspace(3) @used_by_both
119 call void @nonkernel()
122 ; CHECK: ; LDSByteSize: 16 bytes
124 ; The previous lowering was less efficient here than necessary because the i32
125 ; used by the kernel is also used by an unrelated non-kernel function. Codegen
126 ; is now the same as for nocall_ideal.
127 define amdgpu_kernel void @nocall_false_sharing() {
128 ; CHECK-LABEL: nocall_false_sharing:
130 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
131 ; CHECK-NEXT: ds_write_b32 v0, v0
132 ; CHECK-NEXT: s_endpgm
133 store i32 0, ptr addrspace(3) @used_by_both
136 ; CHECK: ; LDSByteSize: 4 bytes
139 define void @nonkernel() {
140 ; GFX9-LABEL: nonkernel:
142 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
144 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
145 ; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
146 ; GFX9-NEXT: ds_write_b64 v0, v[0:1]
147 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
148 ; GFX9-NEXT: s_setpc_b64 s[30:31]
150 ; GFX10-LABEL: nonkernel:
152 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
154 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
155 ; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
156 ; GFX10-NEXT: ds_write_b64 v0, v[0:1]
157 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
158 ; GFX10-NEXT: s_setpc_b64 s[30:31]
160 ; G_GFX9-LABEL: nonkernel:
162 ; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163 ; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
164 ; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
165 ; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
166 ; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
167 ; G_GFX9-NEXT: ds_write_b32 v3, v2
168 ; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
169 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
170 ; G_GFX9-NEXT: s_setpc_b64 s[30:31]
172 ; G_GFX10-LABEL: nonkernel:
174 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
175 ; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
176 ; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
177 ; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
178 ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
179 ; G_GFX10-NEXT: ds_write_b32 v3, v2
180 ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
181 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
182 ; G_GFX10-NEXT: s_setpc_b64 s[30:31]
183 store i32 0, ptr addrspace(3) @used_by_both
184 store double 0.0, ptr addrspace(3) @used_by_function