/*
	Copyright (c) 2002, Thomas Kurschel

	Part of Radeon accelerant

	Command Processor handling

	Something about synchronization in general:

	The DDK says that only some register accesses are stored in the
	Command FIFO, i.e. in almost all cases you don't have to wait until
	there is enough space in this FIFO. Unfortunately, ATI doesn't speak
	clearly here and doesn't tell you which registers are buffered and
	which are not (the r300 DDK provides some examples only, other DDKs
	refer to some include file where no such info could be found).

	Looking at pre-Radeon specs, we have the following register ranges:
		0		configuration/display/multi-media registers
		0xf00	read-only PCI configuration space
		0x1400	FIFOed GUI-registers

	So, if this list is still correct, the affected registers are only
	those used for 2D/3D drawing.

	This is important because if the register you want to write is
	buffered, you have to busy-wait until there is enough FIFO space.
	As concurrent threads may do the same, such register accesses must
	only be done with a lock held. We never write GUI-registers directly,
	so we never have to wait for the FIFO and thus don't need this lock.
*/
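
/*
	Illustration only - not used by this accelerant: if a FIFOed GUI
	register had to be written directly, the write would have to be
	preceded by a busy-wait on the command FIFO, roughly like the sketch
	below. The status register name and the entry-count mask are
	assumptions and would have to be checked against the register headers.

		// wait until the command FIFO can take at least num_entries writes
		static void waitForFifo( accelerator_info *ai, uint num_entries )
		{
			while( (INREG( ai->regs, RADEON_RBBM_STATUS ) & 0x7f) < num_entries )
				;
		}

	Because we only feed the engine via the CP ring buffer and indirect
	buffers, no such wait (and no extra register lock) is needed here.
*/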

#include "radeon_accelerant.h"

#include "buscntrl_regs.h"

#include <sys/ioctl.h>

// get number of free entries in the CP's ring buffer
static uint getAvailRingBuffer( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int space;

	space =
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.head_mem_offset)
		- cp->ring.tail;
	//space = INREG( ai->regs, RADEON_CP_RB_RPTR ) - cp->ring.tail;

	if( space <= 0 )
		space += cp->ring.size;

	// don't fill up the entire buffer as we cannot
	// distinguish between a full and an empty ring
	--space;

	SHOW_FLOW( 3, "head=%ld, tail=%ld, space=%ld",
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.head_mem_offset),
		cp->ring.tail, space );

	LOG1( ai->si->log, _GetAvailRingBufferQueue, space );

	cp->ring.space = space;

	return space;
}
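
/*
	Worked example of the calculation above (values are made up):
	with ring.size = 4096 dwords, a head (read pointer) of 10 and
	ring.tail (write pointer) of 1000, space = 10 - 1000 = -990,
	which wraps to -990 + 4096 = 3106; minus the one reserved entry
	this leaves 3105 dwords that may be written without overtaking
	the read pointer. Keeping one entry unused is what makes
	head == tail unambiguously mean "ring is empty".
*/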

// mark all indirect buffers that have been processed as being free
void Radeon_FreeIndirectBuffers( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int32 cur_processed_tag =
		((uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
			+ cp->feedback.scratch_mem_offset))[1];
		//ai->si->cp.scratch.ptr[1];
		//INREG( ai->regs, RADEON_SCRATCH_REG1 );

	SHOW_FLOW( 3, "processed_tag=%d", cur_processed_tag );

	// mark all sent indirect buffers as free
	while( cp->buffers.oldest != -1 ) {
		indirect_buffer *oldest_buffer =
			&cp->buffers.buffers[cp->buffers.oldest];
		int tmp_oldest_buffer;

		SHOW_FLOW( 3, "oldest buffer's tag: %d", oldest_buffer->send_tag );

		// this is a tricky calculation to handle wrap-arounds correctly,
		// so don't change it unless you really understand the signedness problem
		if( (int32)(cur_processed_tag - oldest_buffer->send_tag) < 0 )
			break;

		SHOW_FLOW( 3, "mark %d as being free", oldest_buffer->send_tag );

		// remove buffer from "used" list
		tmp_oldest_buffer = oldest_buffer->next;

		if( tmp_oldest_buffer == -1 )
			cp->buffers.newest = -1;

		// put it on the free list
		oldest_buffer->next = cp->buffers.free_list;
		cp->buffers.free_list = cp->buffers.oldest;

		cp->buffers.oldest = tmp_oldest_buffer;
	}
}
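
/*
	Worked example of the wrap-around test above (made-up tag values):
	once tags pass 0x7fffffff they wrap into negative int32 territory.
	With cur_processed_tag = (int32)0x80000001 and send_tag = 0x7ffffffe,
	a naive "cur_processed_tag < send_tag" compare would claim the buffer
	is still pending, but the difference (int32)(0x80000001 - 0x7ffffffe)
	is 3, i.e. positive, so the buffer is correctly treated as processed.
	The test only breaks down if a buffer stays pending for about 2^31
	tags, which cannot happen with the handful of indirect buffers we use.
*/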

// wait until an indirect buffer becomes available
static void Radeon_WaitForFreeIndirectBuffers( accelerator_info *ai )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	start_time = system_time();

	while( true ) {
		bigtime_t sample_time;

		Radeon_FreeIndirectBuffers( ai );

		if( cp->buffers.free_list >= 0 )
			return;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
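		// for example (made-up timings): 6 ms after the first poll this
		// sleeps for 0.6 ms, after 50 ms it sleeps for 5 ms, so polling
		// gets coarser the longer the engine needs, bounding both latency
		// and PCI load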

		ACQUIRE_BEN( cp->lock );
	}

	SHOW_ERROR0( 0, "All buffers are in use and engine doesn't finish any of them" );

	// lock must be released during reset (reset acquires it automatically)
	RELEASE_BEN( cp->lock );
	Radeon_ResetEngine( ai );
	ACQUIRE_BEN( cp->lock );
}

// allocate an indirect buffer
int Radeon_AllocIndirectBuffer( accelerator_info *ai, bool keep_lock )
{
	CP_info *cp = &ai->si->cp;
	int buffer_idx;

	ACQUIRE_BEN( cp->lock );

	if( cp->buffers.free_list == -1 )
		Radeon_WaitForFreeIndirectBuffers( ai );

	buffer_idx = cp->buffers.free_list;
	cp->buffers.free_list = cp->buffers.buffers[buffer_idx].next;

	if( !keep_lock )
		RELEASE_BEN( cp->lock );

	SHOW_FLOW( 3, "got %d", buffer_idx );

	return buffer_idx;
}
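
/*
	Typical life cycle of an indirect buffer (sketch only; the register
	written and the value are placeholders, and the packet layout simply
	mirrors the CP_PACKET0 usage further down in this file):

		int buffer_idx = Radeon_AllocIndirectBuffer( ai, false );
		uint32 *buffer = Radeon_GetIndirectBufferPtr( ai, buffer_idx );

		buffer[0] = CP_PACKET0( SOME_GUI_REGISTER, 1 );	// placeholder register
		buffer[1] = some_value;							// placeholder value

		// submitting hands ownership to the CP; the buffer is recycled
		// automatically by Radeon_FreeIndirectBuffers() once its tag
		// shows up in the feedback scratch register
		Radeon_SendIndirectBuffer( ai, buffer_idx, 2, 0, 0, false );

	Only buffers that are never submitted this way (e.g. pure state
	buffers) need an explicit Radeon_FreeIndirectBuffer().
*/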

// explicitly free an indirect buffer;
// this is not needed if the buffer was sent via SendIndirectBuffer()
// never_used - set to true if the buffer wasn't even sent indirectly
// if never_used is false, execution may take very long as all pending
// buffers must be executed first
void Radeon_FreeIndirectBuffer( accelerator_info *ai, int buffer_idx, bool never_used )
{
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW( 3, "buffer_idx=%d, never_used=%d", buffer_idx, never_used );

	// if the buffer was used as a state buffer, we don't record its usage,
	// so we don't know if the buffer was/is/will be used;
	// the only way to be sure is to let the CP run dry
	if( !never_used )
		Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	cp->buffers.buffers[buffer_idx].next = cp->buffers.free_list;
	cp->buffers.free_list = buffer_idx;

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}

// this function is defined at the end of this file to avoid inlining
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords );


// start writing to the ring buffer
// num_dwords - number of dwords to write (must be precise!)
// note: during the wait for free space, the CP's benaphore is released
#define WRITE_RB_START( num_dwords ) \
	{ \
		uint32 *ring_start; \
		uint32 ring_tail, ring_tail_mask; \
		uint32 ring_tail_increment = (num_dwords); \
		if( cp->ring.space < ring_tail_increment ) \
			Radeon_WaitForRingBufferSpace( ai, ring_tail_increment ); \
		ring_start = \
			(uint32 *)(ai->mapped_memory[cp->ring.mem_type].data + cp->ring.mem_offset); \
			/*cp->ring.start;*/ \
		ring_tail = cp->ring.tail; \
		ring_tail_mask = cp->ring.tail_mask;

// write a single dword to the ring buffer
#define WRITE_RB( value ) \
	{ \
		uint32 val = (value); \
		SHOW_FLOW( 3, "@%d: %x", ring_tail, val ); \
		ring_start[ring_tail++] = val; \
		ring_tail &= ring_tail_mask; \
	}

// finish writing to the ring buffer
#define WRITE_RB_FINISH \
	cp->ring.tail = ring_tail; \
	cp->ring.space -= ring_tail_increment; \
	}
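
/*
	Usage sketch for the macros above (the register and value are
	placeholders): both "ai" and a local "cp" pointing to &ai->si->cp must
	be in scope, the CP benaphore must be held because shared ring state
	is read and written, and the count passed to WRITE_RB_START must match
	the number of WRITE_RB invocations exactly, or the space accounting
	in WRITE_RB_FINISH goes wrong.

		WRITE_RB_START( 2 );
		WRITE_RB( CP_PACKET0( SOME_REGISTER, 1 ));	// placeholder register
		WRITE_RB( some_value );						// placeholder value
		WRITE_RB_FINISH;
*/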

// submit an indirect buffer for execution;
// the indirect buffer must not be used afterwards!
// buffer_idx - index of indirect buffer to submit
// buffer_size - size of indirect buffer in 32 bits
// state_buffer_idx - index of indirect buffer to restore required state
// state_buffer_size - size of indirect buffer to restore required state
// the buffer is assigned a tag (send_tag) so you can wait for its execution
// if no special state is required, set state_buffer_size to zero
void Radeon_SendIndirectBuffer( accelerator_info *ai,
	int buffer_idx, int buffer_size,
	int state_buffer_idx, int state_buffer_size, bool has_lock )
{
	CP_info *cp = &ai->si->cp;
	bool need_stateupdate;

	SHOW_FLOW( 3, "buffer_idx=%d, buffer_size=%d, state_buffer_idx=%d, state_buffer_size=%d",
		buffer_idx, buffer_size, state_buffer_idx, state_buffer_size );

	if( (buffer_size & 1) != 0 ) {
		SHOW_FLOW( 3, "buffer has uneven size (%d)", buffer_size );
		// the size of indirect buffers _must_ be a multiple of 64 bits, so
		// add a nop to fulfil the alignment
		Radeon_GetIndirectBufferPtr( ai, buffer_idx )[buffer_size] = RADEON_CP_PACKET2;
		++buffer_size;
	}

	if( !has_lock )
		ACQUIRE_BEN( cp->lock );

	need_stateupdate =
		state_buffer_size > 0 && state_buffer_idx != cp->buffers.active_state;

	WRITE_RB_START( 5 + (need_stateupdate ? 3 : 0) );

	// if the indirect buffer to submit requires a special state and the
	// hardware is in the wrong state, execute the state buffer first
	if( need_stateupdate ) {
		SHOW_FLOW0( 3, "update state" );

		WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
		WRITE_RB( cp->buffers.vm_start
			+ state_buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
		WRITE_RB( state_buffer_size );

		cp->buffers.active_state = state_buffer_idx;
	}

	// execute indirect buffer
	WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
	WRITE_RB( cp->buffers.vm_start + buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
	WRITE_RB( buffer_size );

	// give the buffer a tag so it can be freed after execution
	WRITE_RB( CP_PACKET0( RADEON_SCRATCH_REG1, 1 ));
	WRITE_RB( cp->buffers.buffers[buffer_idx].send_tag = (int32)++cp->buffers.cur_tag );

	SHOW_FLOW( 3, "Assigned tag %d", cp->buffers.buffers[buffer_idx].send_tag );

	WRITE_RB_FINISH;

	// append buffer to list of submitted buffers
	if( cp->buffers.newest > 0 )
		cp->buffers.buffers[cp->buffers.newest].next = buffer_idx;
	else
		cp->buffers.oldest = buffer_idx;

	cp->buffers.newest = buffer_idx;
	cp->buffers.buffers[buffer_idx].next = -1;

	// flush writes to the CP buffers
	// (this code is a bit of an overkill - currently, only some WinChip/Cyrix
	// CPUs support out-of-order writes, but we are prepared)
	// TODO : Other Architectures? PowerPC?
	__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
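
	// note: on gcc, __sync_synchronize() would emit a suitable full memory
	// barrier for other architectures too (assumption - not verified against
	// the compilers this driver is built with)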

	// make sure the motherboard chipset has flushed its write buffer by
	// reading some uncached memory
	//(void)*(volatile int *)si->framebuffer;
	INREG( ai->regs, RADEON_CP_RB_RPTR );

	//SHOW_FLOW( 3, "new tail: %d", cp->ring.tail );

	// now, the command list should really be written to memory,
	// so it's safe to instruct the graphics card to read it
	OUTREG( ai->regs, RADEON_CP_RB_WPTR, cp->ring.tail );

	// read from PCI bus to ensure correct posting
	//INREG( ai->regs, RADEON_CP_RB_RPTR );

	if( !has_lock )
		RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}

// mark a state buffer as being invalid;
// this must be done _before_ modifying the state buffer, as the
// state buffer may currently be in use
void Radeon_InvalidateStateBuffer( accelerator_info *ai, int state_buffer_idx )
{
	CP_info *cp = &ai->si->cp;

	// make sure the state buffer is not used anymore
	Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	// mark the state as being invalid
	if( cp->buffers.active_state == state_buffer_idx )
		cp->buffers.active_state = -1;

	RELEASE_BEN( cp->lock );
}
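
/*
	Sketch of the intended call order when a state buffer gets rewritten
	(buffer index, register and contents are placeholders):

		// 1. make sure the CP no longer references the old contents
		Radeon_InvalidateStateBuffer( ai, state_buffer_idx );

		// 2. now it is safe to overwrite the buffer
		uint32 *state = Radeon_GetIndirectBufferPtr( ai, state_buffer_idx );
		state[0] = CP_PACKET0( SOME_STATE_REGISTER, 1 );	// placeholder
		state[1] = some_state_value;						// placeholder

	The next Radeon_SendIndirectBuffer() that names this state buffer will
	re-execute it, because active_state was reset to -1.
*/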

// wait until there is enough space in the ring buffer
// num_dwords - number of dwords needed in the ring buffer
// must be called with the CP's benaphore held
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	start_time = system_time();

	while( getAvailRingBuffer( ai ) < num_dwords ) {
		bigtime_t sample_time;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );

		ACQUIRE_BEN( cp->lock );