1 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=si-pre-allocate-wwm-regs -o %t.mir %s
2 ; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - | FileCheck %s
4 ; Test that SIMachineFunctionInfo can be round-trip serialized through
; Array of 512 floats in address space 3, left uninitialized (undef) and
; 4-byte aligned. @kernel stores into it; the expectations for @kernel
; record ldsSize: 2048, i.e. 512 elements x 4 bytes.
7 @lds = addrspace(3) global [512 x float] undef, align 4
9 ; CHECK-LABEL: {{^}}name: kernel
10 ; CHECK: machineFunctionInfo:
11 ; CHECK-NEXT: explicitKernArgSize: 128
12 ; CHECK-NEXT: maxKernArgAlign: 64
13 ; CHECK-NEXT: ldsSize: 2048
14 ; CHECK-NEXT: gdsSize: 0
15 ; CHECK-NEXT: dynLDSAlign: 1
16 ; CHECK-NEXT: isEntryFunction: true
17 ; CHECK-NEXT: isChainFunction: false
18 ; CHECK-NEXT: noSignedZerosFPMath: false
19 ; CHECK-NEXT: memoryBound: false
20 ; CHECK-NEXT: waveLimiter: false
21 ; CHECK-NEXT: hasSpilledSGPRs: false
22 ; CHECK-NEXT: hasSpilledVGPRs: false
23 ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
24 ; CHECK-NEXT: frameOffsetReg: '$fp_reg'
25 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
26 ; CHECK-NEXT: bytesInStackArgArea: 0
27 ; CHECK-NEXT: returnsVoid: true
28 ; CHECK-NEXT: argumentInfo:
29 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
30 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
31 ; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' }
32 ; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
33 ; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
34 ; CHECK-NEXT: psInputAddr: 0
35 ; CHECK-NEXT: psInputEnable: 0
37 ; CHECK-NEXT: ieee: true
38 ; CHECK-NEXT: dx10-clamp: true
39 ; CHECK-NEXT: fp32-input-denormals: true
40 ; CHECK-NEXT: fp32-output-denormals: true
41 ; CHECK-NEXT: fp64-fp16-input-denormals: true
42 ; CHECK-NEXT: fp64-fp16-output-denormals: true
43 ; CHECK-NEXT: highBitsOf32BitAddress: 0
44 ; CHECK-NEXT: occupancy: 8
45 ; CHECK-NEXT: vgprForAGPRCopy: ''
46 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
47 ; CHECK-NEXT: longBranchReservedReg: ''
; amdgpu_kernel entry point used to exercise the kernel-specific fields of the
; serialized machineFunctionInfo. The expectations above record
; explicitKernArgSize: 128 and maxKernArgAlign: 64 for this
; (i32, i64, <16 x i32>) signature.
; The visible body indexes @lds by %arg0 and stores 0.0 there, which makes the
; function reference the address-space-3 global. %arg1 and %arg2 are not used
; in the lines shown here.
49 define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
50 %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
51 store float 0.0, ptr addrspace(3) %gep, align 4
; Array of 128 i32 values in address space 2, left uninitialized (undef) and
; 4-byte aligned. The pixel-shader tests below update it atomically; the
; expectations for @ps_shader record gdsSize: 512, i.e. 128 elements x 4 bytes.
55 @gds = addrspace(2) global [128 x i32] undef, align 4
57 ; CHECK-LABEL: {{^}}name: ps_shader
58 ; CHECK: machineFunctionInfo:
59 ; CHECK-NEXT: explicitKernArgSize: 0
60 ; CHECK-NEXT: maxKernArgAlign: 4
61 ; CHECK-NEXT: ldsSize: 0
62 ; CHECK-NEXT: gdsSize: 512
63 ; CHECK-NEXT: dynLDSAlign: 1
64 ; CHECK-NEXT: isEntryFunction: true
65 ; CHECK-NEXT: isChainFunction: false
66 ; CHECK-NEXT: noSignedZerosFPMath: false
67 ; CHECK-NEXT: memoryBound: false
68 ; CHECK-NEXT: waveLimiter: false
69 ; CHECK-NEXT: hasSpilledSGPRs: false
70 ; CHECK-NEXT: hasSpilledVGPRs: false
71 ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
72 ; CHECK-NEXT: frameOffsetReg: '$fp_reg'
73 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
74 ; CHECK-NEXT: bytesInStackArgArea: 0
75 ; CHECK-NEXT: returnsVoid: true
76 ; CHECK-NEXT: argumentInfo:
77 ; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr3' }
78 ; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
79 ; CHECK-NEXT: psInputAddr: 1
80 ; CHECK-NEXT: psInputEnable: 1
82 ; CHECK-NEXT: ieee: false
83 ; CHECK-NEXT: dx10-clamp: true
84 ; CHECK-NEXT: fp32-input-denormals: true
85 ; CHECK-NEXT: fp32-output-denormals: true
86 ; CHECK-NEXT: fp64-fp16-input-denormals: true
87 ; CHECK-NEXT: fp64-fp16-output-denormals: true
88 ; CHECK-NEXT: highBitsOf32BitAddress: 0
89 ; CHECK-NEXT: occupancy: 10
90 ; CHECK-NEXT: vgprForAGPRCopy: ''
91 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
92 ; CHECK-NEXT: longBranchReservedReg: ''
; amdgpu_ps entry point taking one ordinary i32 argument and one inreg i32
; argument. The visible body indexes @gds by %arg0 and performs a seq_cst
; atomic add of 8 on that element; the atomic's result is discarded.
; %arg1 is not used in the lines shown here.
; The expectations above require gdsSize: 512, ieee: false, and
; psInputAddr / psInputEnable both equal to 1 for this function.
94 define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
95 %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
96 atomicrmw add ptr addrspace(2) %gep, i32 8 seq_cst
100 ; CHECK-LABEL: {{^}}name: ps_shader_ps_input_enable
101 ; CHECK: machineFunctionInfo:
102 ; CHECK: psInputAddr: 36983
103 ; CHECK-NEXT: psInputEnable: 1{{$}}
; Same signature and visible body as @ps_shader, but with attribute group #7
; ("InitialPSInputAddr"="36983") attached. The expectations above require
; psInputAddr: 36983 while psInputEnable stays exactly 1.
104 define amdgpu_ps void @ps_shader_ps_input_enable(i32 %arg0, i32 inreg %arg1) #7 {
105 %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
106 atomicrmw add ptr addrspace(2) %gep, i32 8 seq_cst
110 ; CHECK-LABEL: {{^}}name: gds_size_shader
111 ; CHECK: gdsSize: 4096
; amdgpu_ps entry point carrying attribute group #5 ("amdgpu-gds-size"="4096").
; The expectation above requires gdsSize: 4096 for this function. Its body is
; not part of the lines shown here.
112 define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
116 ; CHECK-LABEL: {{^}}name: function
117 ; CHECK: machineFunctionInfo:
118 ; CHECK-NEXT: explicitKernArgSize: 0
119 ; CHECK-NEXT: maxKernArgAlign: 1
120 ; CHECK-NEXT: ldsSize: 0
121 ; CHECK-NEXT: gdsSize: 0
122 ; CHECK-NEXT: dynLDSAlign: 1
123 ; CHECK-NEXT: isEntryFunction: false
124 ; CHECK-NEXT: isChainFunction: false
125 ; CHECK-NEXT: noSignedZerosFPMath: false
126 ; CHECK-NEXT: memoryBound: false
127 ; CHECK-NEXT: waveLimiter: false
128 ; CHECK-NEXT: hasSpilledSGPRs: false
129 ; CHECK-NEXT: hasSpilledVGPRs: false
130 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
131 ; CHECK-NEXT: frameOffsetReg: '$sgpr33'
132 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
133 ; CHECK-NEXT: bytesInStackArgArea: 0
134 ; CHECK-NEXT: returnsVoid: true
135 ; CHECK-NEXT: argumentInfo:
136 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
137 ; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
138 ; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
139 ; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
140 ; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' }
141 ; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' }
142 ; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' }
143 ; CHECK-NEXT: LDSKernelId: { reg: '$sgpr15' }
144 ; CHECK-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
145 ; CHECK-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
146 ; CHECK-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
147 ; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
148 ; CHECK-NEXT: psInputAddr: 0
149 ; CHECK-NEXT: psInputEnable: 0
151 ; CHECK-NEXT: ieee: true
152 ; CHECK-NEXT: dx10-clamp: true
153 ; CHECK-NEXT: fp32-input-denormals: true
154 ; CHECK-NEXT: fp32-output-denormals: true
155 ; CHECK-NEXT: fp64-fp16-input-denormals: true
156 ; CHECK-NEXT: fp64-fp16-output-denormals: true
157 ; CHECK-NEXT: highBitsOf32BitAddress: 0
158 ; CHECK-NEXT: occupancy: 8
159 ; CHECK-NEXT: vgprForAGPRCopy: ''
160 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
161 ; CHECK-NEXT: longBranchReservedReg: ''
; Plain function: no explicit calling convention, no arguments, no attribute
; group. It serves as the baseline for non-entry functions; the expectations
; above require isEntryFunction: false, noSignedZerosFPMath: false,
; ieee: true and dx10-clamp: true.
163 define void @function() {
167 ; CHECK-LABEL: {{^}}name: function_nsz
168 ; CHECK: machineFunctionInfo:
169 ; CHECK-NEXT: explicitKernArgSize: 0
170 ; CHECK-NEXT: maxKernArgAlign: 1
171 ; CHECK-NEXT: ldsSize: 0
172 ; CHECK-NEXT: gdsSize: 0
173 ; CHECK-NEXT: dynLDSAlign: 1
174 ; CHECK-NEXT: isEntryFunction: false
175 ; CHECK-NEXT: isChainFunction: false
176 ; CHECK-NEXT: noSignedZerosFPMath: true
177 ; CHECK-NEXT: memoryBound: false
178 ; CHECK-NEXT: waveLimiter: false
179 ; CHECK-NEXT: hasSpilledSGPRs: false
180 ; CHECK-NEXT: hasSpilledVGPRs: false
181 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
182 ; CHECK-NEXT: frameOffsetReg: '$sgpr33'
183 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
184 ; CHECK-NEXT: bytesInStackArgArea: 0
185 ; CHECK-NEXT: returnsVoid: true
186 ; CHECK-NEXT: argumentInfo:
187 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
188 ; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
189 ; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
190 ; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
191 ; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' }
192 ; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' }
193 ; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' }
194 ; CHECK-NEXT: LDSKernelId: { reg: '$sgpr15' }
195 ; CHECK-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
196 ; CHECK-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
197 ; CHECK-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
198 ; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
199 ; CHECK-NEXT: psInputAddr: 0
200 ; CHECK-NEXT: psInputEnable: 0
202 ; CHECK-NEXT: ieee: true
203 ; CHECK-NEXT: dx10-clamp: true
204 ; CHECK-NEXT: fp32-input-denormals: true
205 ; CHECK-NEXT: fp32-output-denormals: true
206 ; CHECK-NEXT: fp64-fp16-input-denormals: true
207 ; CHECK-NEXT: fp64-fp16-output-denormals: true
208 ; CHECK-NEXT: highBitsOf32BitAddress: 0
209 ; CHECK-NEXT: occupancy: 8
210 ; CHECK-NEXT: vgprForAGPRCopy: ''
211 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
212 ; CHECK-NEXT: longBranchReservedReg: ''
; Same shape as @function but with attribute group #0
; ("no-signed-zeros-fp-math" = "true"). The expectations above are identical
; to those for @function except that noSignedZerosFPMath is true.
214 define void @function_nsz() #0 {
218 ; CHECK-LABEL: {{^}}name: function_dx10_clamp_off
220 ; CHECK-NEXT: ieee: true
221 ; CHECK-NEXT: dx10-clamp: false
222 ; CHECK-NEXT: fp32-input-denormals: true
223 ; CHECK-NEXT: fp32-output-denormals: true
224 ; CHECK-NEXT: fp64-fp16-input-denormals: true
225 ; CHECK-NEXT: fp64-fp16-output-denormals: true
; Plain function with attribute group #1 ("amdgpu-dx10-clamp" = "false").
; The expectations above require dx10-clamp: false while ieee stays true.
226 define void @function_dx10_clamp_off() #1 {
230 ; CHECK-LABEL: {{^}}name: function_ieee_off
232 ; CHECK-NEXT: ieee: false
233 ; CHECK-NEXT: dx10-clamp: true
234 ; CHECK-NEXT: fp32-input-denormals: true
235 ; CHECK-NEXT: fp32-output-denormals: true
236 ; CHECK-NEXT: fp64-fp16-input-denormals: true
237 ; CHECK-NEXT: fp64-fp16-output-denormals: true
; Plain function with attribute group #2 ("amdgpu-ieee" = "false").
; The expectations above require ieee: false while dx10-clamp stays true.
238 define void @function_ieee_off() #2 {
242 ; CHECK-LABEL: {{^}}name: function_ieee_off_dx10_clamp_off
244 ; CHECK-NEXT: ieee: false
245 ; CHECK-NEXT: dx10-clamp: false
246 ; CHECK-NEXT: fp32-input-denormals: true
247 ; CHECK-NEXT: fp32-output-denormals: true
248 ; CHECK-NEXT: fp64-fp16-input-denormals: true
249 ; CHECK-NEXT: fp64-fp16-output-denormals: true
; Plain function with attribute group #3, which sets both "amdgpu-dx10-clamp"
; and "amdgpu-ieee" to "false". The expectations above require both ieee and
; dx10-clamp to be false.
250 define void @function_ieee_off_dx10_clamp_off() #3 {
254 ; CHECK-LABEL: {{^}}name: high_address_bits
255 ; CHECK: machineFunctionInfo:
256 ; CHECK: highBitsOf32BitAddress: 4294934528
; amdgpu_ps entry point with attribute group #4
; ("amdgpu-32bit-address-high-bits"="0xffff8000"). The expectation above
; requires highBitsOf32BitAddress: 4294934528, which is 0xffff8000 in decimal.
257 define amdgpu_ps void @high_address_bits() #4 {
261 ; CHECK-LABEL: {{^}}name: wwm_reserved_regs
262 ; CHECK: wwmReservedRegs:
263 ; CHECK-NEXT: - '$vgpr2'
264 ; CHECK-NEXT: - '$vgpr3'
; amdgpu_cs entry point used to populate the wwmReservedRegs list; the
; expectations above require exactly '$vgpr2' followed by '$vgpr3'.
; The visible body performs two volatile loads from %ptr, passes each loaded
; value through llvm.amdgcn.set.inactive.i32 with 0 as the second operand, and
; stores both results back to %ptr with volatile stores.
; Note the crossed pairing: %inactive0 is built from %ld1 and %inactive1 from
; %ld0. The inreg <4 x i32> argument %tmp14 is not used in the lines shown.
265 define amdgpu_cs void @wwm_reserved_regs(ptr addrspace(1) %ptr, <4 x i32> inreg %tmp14) {
266 %ld0 = load volatile i32, ptr addrspace(1) %ptr
267 %ld1 = load volatile i32, ptr addrspace(1) %ptr
268 %inactive0 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld1, i32 0)
269 %inactive1 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld0, i32 0)
270 store volatile i32 %inactive0, ptr addrspace(1) %ptr
271 store volatile i32 %inactive1, ptr addrspace(1) %ptr
; Declaration of the i32 set.inactive intrinsic called twice by
; @wwm_reserved_regs. It takes two i32 operands, returns i32, and uses
; attribute group #6 (convergent nounwind readnone willreturn).
275 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #6
; Attribute groups referenced by the test functions in this file.
; #0: used by @function_nsz; expected to yield noSignedZerosFPMath: true.
277 attributes #0 = { "no-signed-zeros-fp-math" = "true" }
; #1: used by @function_dx10_clamp_off; expected to yield dx10-clamp: false.
278 attributes #1 = { "amdgpu-dx10-clamp" = "false" }
; #2: used by @function_ieee_off; expected to yield ieee: false.
279 attributes #2 = { "amdgpu-ieee" = "false" }
; #3: used by @function_ieee_off_dx10_clamp_off; both mode bits expected false.
280 attributes #3 = { "amdgpu-dx10-clamp" = "false" "amdgpu-ieee" = "false" }
; #4: used by @high_address_bits; 0xffff8000 is expected to be serialized as
; the decimal value 4294934528.
281 attributes #4 = { "amdgpu-32bit-address-high-bits"="0xffff8000" }
; #5: used by @gds_size_shader; expected to yield gdsSize: 4096.
282 attributes #5 = { "amdgpu-gds-size"="4096" }
; #6: attached to the llvm.amdgcn.set.inactive.i32 declaration.
283 attributes #6 = { convergent nounwind readnone willreturn }
; #7: used by @ps_shader_ps_input_enable; expected to yield psInputAddr: 36983.
284 attributes #7 = { "InitialPSInputAddr"="36983" }