/*
	Copyright (c) 2002, Thomas Kurschel

	Part of Radeon accelerant

	Command Processor handling

	Something about synchronization in general:

	The DDK says that only some register accesses are stored in the
	Command FIFO, i.e. in almost all cases you don't have to wait until
	there is enough space in this FIFO. Unfortunately, ATI doesn't speak
	clearly here and doesn't tell you which registers are buffered and
	which are not (the r300 DDK provides some examples only, other DDKs
	refer to some include file where no such info could be found).

	Looking at pre-Radeon specs, we have the following register ranges:
		0		configuration/display/multi-media registers
		0xf00	read-only PCI configuration space
		0x1400	FIFOed GUI-registers

	So, if this list is still correct, the affected registers are only
	those used for 2D/3D drawing.

	This is important because if the register you want to write is
	buffered, you have to busy-wait until there is enough FIFO space.
	As concurrent threads may do the same, such register accesses must
	only be done with a lock held. We never write GUI-registers directly,
	so we never have to wait for the FIFO and thus don't need this lock.
*/
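
/*
	Illustration only - not used by this accelerant: if a FIFOed GUI
	register had to be written directly, the write would have to be
	preceded by a busy-wait on the command FIFO, roughly like the sketch
	below. The status register name and the entry-count mask are
	assumptions and would have to be checked against the register headers.

		// wait until the command FIFO can take at least num_entries writes
		static void waitForFifo( accelerator_info *ai, uint num_entries )
		{
			while( (INREG( ai->regs, RADEON_RBBM_STATUS ) & 0x7f) < num_entries )
				;
		}

	Because we only feed the engine via the CP ring buffer and indirect
	buffers, no such wait (and no extra register lock) is needed here.
*/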

#include "radeon_accelerant.h"

#include "buscntrl_regs.h"

#include <sys/ioctl.h>

// get number of free entries in the CP's ring buffer
static uint getAvailRingBuffer( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int space;

	space =
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.head_mem_offset)
		- cp->ring.tail;
	//space = INREG( ai->regs, RADEON_CP_RB_RPTR ) - cp->ring.tail;

	if( space <= 0 )
		space += cp->ring.size;

	// don't fill up the entire buffer as we cannot
	// distinguish between a full and an empty ring
	--space;

	SHOW_FLOW( 3, "head=%ld, tail=%ld, space=%ld",
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.head_mem_offset),
		cp->ring.tail, space );

	LOG1( ai->si->log, _GetAvailRingBufferQueue, space );

	cp->ring.space = space;

	return space;
}
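
/*
	Worked example of the calculation above (values are made up):
	with ring.size = 4096 dwords, a head (read pointer) of 10 and
	ring.tail (write pointer) of 1000, space = 10 - 1000 = -990,
	which wraps to -990 + 4096 = 3106; minus the one reserved entry
	this leaves 3105 dwords that may be written without overtaking
	the read pointer. Keeping one entry unused is what makes
	head == tail unambiguously mean "ring is empty".
*/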

// mark all indirect buffers that have been processed as being free
void Radeon_FreeIndirectBuffers( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int32 cur_processed_tag =
		((uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.scratch_mem_offset))[1];
		//ai->si->cp.scratch.ptr[1];
		//INREG( ai->regs, RADEON_SCRATCH_REG1 );

	SHOW_FLOW( 3, "processed_tag=%d", cur_processed_tag );

	// mark all sent indirect buffers as free
	while( cp->buffers.oldest != -1 ) {
		indirect_buffer *oldest_buffer =
			&cp->buffers.buffers[cp->buffers.oldest];
		int tmp_oldest_buffer;

		SHOW_FLOW( 3, "oldest buffer's tag: %d", oldest_buffer->send_tag );

		// this is a tricky calculation to handle wrap-arounds correctly,
		// so don't change it unless you really understand the signedness problem
		if( (int32)(cur_processed_tag - oldest_buffer->send_tag) < 0 )
			break;

		SHOW_FLOW( 3, "mark %d as being free", oldest_buffer->send_tag );

		// remove buffer from "used" list
		tmp_oldest_buffer = oldest_buffer->next;

		if( tmp_oldest_buffer == -1 )
			cp->buffers.newest = -1;

		// put it on the free list
		oldest_buffer->next = cp->buffers.free_list;
		cp->buffers.free_list = cp->buffers.oldest;

		cp->buffers.oldest = tmp_oldest_buffer;
	}
}
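
/*
	Worked example of the wrap-around test above (made-up tag values):
	once tags pass 0x7fffffff they wrap into negative int32 territory.
	With cur_processed_tag = (int32)0x80000001 and send_tag = 0x7ffffffe,
	a naive "cur_processed_tag < send_tag" compare would claim the buffer
	is still pending, but the difference (int32)(0x80000001 - 0x7ffffffe)
	is 3, i.e. positive, so the buffer is correctly treated as processed.
	The test only breaks down if a buffer stays pending for about 2^31
	tags, which cannot happen with the handful of indirect buffers we use.
*/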

// wait until an indirect buffer becomes available
static void Radeon_WaitForFreeIndirectBuffers( accelerator_info *ai )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	start_time = system_time();

	while( true ) {
		bigtime_t sample_time;

		Radeon_FreeIndirectBuffers( ai );

		if( cp->buffers.free_list >= 0 )
			return;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
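		// for example (made-up timings): 6 ms after the first poll this
		// sleeps for 0.6 ms, after 50 ms it sleeps for 5 ms, so polling
		// gets coarser the longer the engine needs, bounding both latency
		// and PCI load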

		ACQUIRE_BEN( cp->lock );
	}

	SHOW_ERROR0( 0, "All buffers are in use and engine doesn't finish any of them" );

	// lock must be released during reset (reset acquires it automatically)
	RELEASE_BEN( cp->lock );
	Radeon_ResetEngine( ai );
	ACQUIRE_BEN( cp->lock );
}

// allocate an indirect buffer
int Radeon_AllocIndirectBuffer( accelerator_info *ai, bool keep_lock )
{
	CP_info *cp = &ai->si->cp;
	int buffer_idx;

	ACQUIRE_BEN( cp->lock );

	if( cp->buffers.free_list == -1 )
		Radeon_WaitForFreeIndirectBuffers( ai );

	buffer_idx = cp->buffers.free_list;
	cp->buffers.free_list = cp->buffers.buffers[buffer_idx].next;

	if( !keep_lock )
		RELEASE_BEN( cp->lock );

	SHOW_FLOW( 3, "got %d", buffer_idx );

	return buffer_idx;
}
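
/*
	Typical life cycle of an indirect buffer (sketch only; the register
	written and the value are placeholders, and the packet layout simply
	mirrors the CP_PACKET0 usage further down in this file):

		int buffer_idx = Radeon_AllocIndirectBuffer( ai, false );
		uint32 *buffer = Radeon_GetIndirectBufferPtr( ai, buffer_idx );

		buffer[0] = CP_PACKET0( SOME_GUI_REGISTER, 1 );	// placeholder register
		buffer[1] = some_value;							// placeholder value

		// submitting hands ownership to the CP; the buffer is recycled
		// automatically by Radeon_FreeIndirectBuffers() once its tag
		// shows up in the feedback scratch register
		Radeon_SendIndirectBuffer( ai, buffer_idx, 2, 0, 0, false );

	Only buffers that are never submitted this way (e.g. pure state
	buffers) need an explicit Radeon_FreeIndirectBuffer().
*/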

// explicitly free an indirect buffer;
// this is not needed if the buffer was sent via SendIndirectBuffer()
// never_used - set to true if the buffer wasn't even sent indirectly
// if never_used is false, execution may take very long as all pending
// buffers must be executed first
void Radeon_FreeIndirectBuffer( accelerator_info *ai, int buffer_idx, bool never_used )
{
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW( 3, "buffer_idx=%d, never_used=%d", buffer_idx, never_used );

	// if the buffer was used as a state buffer, we don't record its usage,
	// so we don't know if the buffer was/is/will be used;
	// the only way to be sure is to let the CP run dry
	if( !never_used )
		Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	cp->buffers.buffers[buffer_idx].next = cp->buffers.free_list;
	cp->buffers.free_list = buffer_idx;

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}

// this function is defined at the end of this file to avoid inlining
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords );


// start writing to the ring buffer
// num_dwords - number of dwords to write (must be precise!)
// note: during the wait for free space, the CP's benaphore is released
#define WRITE_RB_START( num_dwords ) \
	{ \
		uint32 *ring_start; \
		uint32 ring_tail, ring_tail_mask; \
		uint32 ring_tail_increment = (num_dwords); \
		if( cp->ring.space < ring_tail_increment ) \
			Radeon_WaitForRingBufferSpace( ai, ring_tail_increment ); \
		ring_start = \
			(uint32 *)(ai->mapped_memory[cp->ring.mem_type].data + cp->ring.mem_offset); \
			/*cp->ring.start;*/ \
		ring_tail = cp->ring.tail; \
		ring_tail_mask = cp->ring.tail_mask;

// write a single dword to the ring buffer
#define WRITE_RB( value ) \
	{ \
		uint32 val = (value); \
		SHOW_FLOW( 3, "@%d: %x", ring_tail, val ); \
		ring_start[ring_tail++] = val; \
		ring_tail &= ring_tail_mask; \
	}

// finish writing to the ring buffer
#define WRITE_RB_FINISH \
	cp->ring.tail = ring_tail; \
	cp->ring.space -= ring_tail_increment; \
	}
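
/*
	Usage sketch for the macros above (the register and value are
	placeholders): both "ai" and a local "cp" pointing to &ai->si->cp must
	be in scope, the CP benaphore must be held because shared ring state
	is read and written, and the count passed to WRITE_RB_START must match
	the number of WRITE_RB invocations exactly, or the space accounting
	in WRITE_RB_FINISH goes wrong.

		WRITE_RB_START( 2 );
		WRITE_RB( CP_PACKET0( SOME_REGISTER, 1 ));	// placeholder register
		WRITE_RB( some_value );						// placeholder value
		WRITE_RB_FINISH;
*/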

// submit an indirect buffer for execution;
// the indirect buffer must not be used afterwards!
// buffer_idx - index of indirect buffer to submit
// buffer_size - size of indirect buffer in 32 bits
// state_buffer_idx - index of indirect buffer to restore required state
// state_buffer_size - size of indirect buffer to restore required state
// the buffer is assigned a tag (send_tag) so you can wait for its execution
// if no special state is required, set state_buffer_size to zero
void Radeon_SendIndirectBuffer( accelerator_info *ai,
	int buffer_idx, int buffer_size,
	int state_buffer_idx, int state_buffer_size, bool has_lock )
{
	CP_info *cp = &ai->si->cp;
	bool need_stateupdate;

	SHOW_FLOW( 3, "buffer_idx=%d, buffer_size=%d, state_buffer_idx=%d, state_buffer_size=%d",
		buffer_idx, buffer_size, state_buffer_idx, state_buffer_size );

	if( (buffer_size & 1) != 0 ) {
		SHOW_FLOW( 3, "buffer has uneven size (%d)", buffer_size );
		// the size of indirect buffers _must_ be a multiple of 64 bits, so
		// add a nop to fulfil the alignment
		Radeon_GetIndirectBufferPtr( ai, buffer_idx )[buffer_size] = RADEON_CP_PACKET2;
		++buffer_size;
	}

	if( !has_lock )
		ACQUIRE_BEN( cp->lock );

	need_stateupdate =
		state_buffer_size > 0 && state_buffer_idx != cp->buffers.active_state;

	WRITE_RB_START( 5 + (need_stateupdate ? 3 : 0) );

	// if the indirect buffer to submit requires a special state and the
	// hardware is in the wrong state, execute the state buffer first
	if( need_stateupdate ) {
		SHOW_FLOW0( 3, "update state" );

		WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
		WRITE_RB( cp->buffers.vm_start
			+ state_buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
		WRITE_RB( state_buffer_size );

		cp->buffers.active_state = state_buffer_idx;
	}

	// execute indirect buffer
	WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
	WRITE_RB( cp->buffers.vm_start + buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
	WRITE_RB( buffer_size );

	// give the buffer a tag so it can be freed after execution
	WRITE_RB( CP_PACKET0( RADEON_SCRATCH_REG1, 1 ));
	WRITE_RB( cp->buffers.buffers[buffer_idx].send_tag = (int32)++cp->buffers.cur_tag );

	SHOW_FLOW( 3, "Assigned tag %d", cp->buffers.buffers[buffer_idx].send_tag );

	WRITE_RB_FINISH;

	// append buffer to list of submitted buffers
	if( cp->buffers.newest > 0 )
		cp->buffers.buffers[cp->buffers.newest].next = buffer_idx;
	else
		cp->buffers.oldest = buffer_idx;

	cp->buffers.newest = buffer_idx;
	cp->buffers.buffers[buffer_idx].next = -1;

	// flush writes to the CP buffers
	// (this code is a bit of an overkill - currently, only some WinChip/Cyrix
	// CPUs support out-of-order writes, but we are prepared)
	// TODO : Other Architectures? PowerPC?
	__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
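
	// note: on gcc, __sync_synchronize() would emit a suitable full memory
	// barrier for other architectures too (assumption - not verified against
	// the compilers this driver is built with)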

	// make sure the motherboard chipset has flushed its write buffer by
	// reading some uncached memory
	//(void)*(volatile int *)si->framebuffer;
	INREG( ai->regs, RADEON_CP_RB_RPTR );

	//SHOW_FLOW( 3, "new tail: %d", cp->ring.tail );

	// now, the command list should really be written to memory,
	// so it's safe to instruct the graphics card to read it
	OUTREG( ai->regs, RADEON_CP_RB_WPTR, cp->ring.tail );

	// read from PCI bus to ensure correct posting
	//INREG( ai->regs, RADEON_CP_RB_RPTR );

	if( !has_lock )
		RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}

// mark a state buffer as being invalid;
// this must be done _before_ modifying the state buffer, as the
// state buffer may currently be in use
void Radeon_InvalidateStateBuffer( accelerator_info *ai, int state_buffer_idx )
{
	CP_info *cp = &ai->si->cp;

	// make sure the state buffer is not used anymore
	Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	// mark the state as being invalid
	if( cp->buffers.active_state == state_buffer_idx )
		cp->buffers.active_state = -1;

	RELEASE_BEN( cp->lock );
}
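
/*
	Sketch of the intended call order when a state buffer gets rewritten
	(buffer index, register and contents are placeholders):

		// 1. make sure the CP no longer references the old contents
		Radeon_InvalidateStateBuffer( ai, state_buffer_idx );

		// 2. now it is safe to overwrite the buffer
		uint32 *state = Radeon_GetIndirectBufferPtr( ai, state_buffer_idx );
		state[0] = CP_PACKET0( SOME_STATE_REGISTER, 1 );	// placeholder
		state[1] = some_state_value;						// placeholder

	The next Radeon_SendIndirectBuffer() that names this state buffer will
	re-execute it, because active_state was reset to -1.
*/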

// wait until there is enough space in the ring buffer
// num_dwords - number of dwords needed in the ring buffer
// must be called with the CP's benaphore held
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	start_time = system_time();

	while( getAvailRingBuffer( ai ) < num_dwords ) {
		bigtime_t sample_time;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );

		ACQUIRE_BEN( cp->lock );