Objects/obmalloc.c
1 #include "Python.h"
3 #ifdef WITH_PYMALLOC
5 /* An object allocator for Python.
7 Here is an introduction to the layers of the Python memory architecture,
8 showing where the object allocator is actually used (layer +2). It is
9 called for every object allocation and deallocation (PyObject_New/Del),
10 unless the object-specific allocators implement a proprietary allocation
11 scheme (ex.: ints use a simple free list). This is also the place where
12 the cyclic garbage collector operates selectively on container objects.
15 Object-specific allocators
16 _____ ______ ______ ________
17 [ int ] [ dict ] [ list ] ... [ string ] Python core |
18 +3 | <----- Object-specific memory -----> | <-- Non-object memory --> |
19 _______________________________ | |
20 [ Python's object allocator ] | |
21 +2 | ####### Object memory ####### | <------ Internal buffers ------> |
22 ______________________________________________________________ |
23 [ Python's raw memory allocator (PyMem_ API) ] |
24 +1 | <----- Python memory (under PyMem manager's control) ------> | |
25 __________________________________________________________________
26 [ Underlying general-purpose allocator (ex: C library malloc) ]
27 0 | <------ Virtual memory allocated for the python process -------> |
29 =========================================================================
30 _______________________________________________________________________
31 [ OS-specific Virtual Memory Manager (VMM) ]
32 -1 | <--- Kernel dynamic storage allocation & management (page-based) ---> |
33 __________________________________ __________________________________
34 [ ] [ ]
35 -2 | <-- Physical memory: ROM/RAM --> | | <-- Secondary storage (swap) --> |
38 /*==========================================================================*/
40 /* A fast, special-purpose memory allocator for small blocks, to be used
41 on top of a general-purpose malloc -- heavily based on previous art. */
43 /* Vladimir Marangozov -- August 2000 */
46 * "Memory management is where the rubber meets the road -- if we do the wrong
47 * thing at any level, the results will not be good. And if we don't make the
48 * levels work well together, we are in serious trouble." (1)
50 * (1) Paul R. Wilson, Mark S. Johnstone, Michael Neely, and David Boles,
51 * "Dynamic Storage Allocation: A Survey and Critical Review",
52 * in Proc. 1995 Int'l. Workshop on Memory Management, September 1995.
55 /* #undef WITH_MEMORY_LIMITS */ /* disable mem limit checks */
57 /*==========================================================================*/
60 * Allocation strategy abstract:
62 * For small requests, the allocator sub-allocates <Big> blocks of memory.
63 * Requests greater than 256 bytes are routed to the system's allocator.
65 * Small requests are grouped in size classes spaced 8 bytes apart, due
66 * to the required valid alignment of the returned address. Requests of
67 * a particular size are serviced from memory pools of 4K (one VMM page).
68 * Pools are fragmented on demand and contain free lists of blocks of one
69 * particular size class. In other words, there is a fixed-size allocator
70 * for each size class. Free pools are shared by the different allocators
71 * thus minimizing the space reserved for a particular size class.
73 * This allocation strategy is a variant of what is known as "simple
74 * segregated storage based on array of free lists". The main drawback of
75 * simple segregated storage is that we might end up with a lot of reserved
76 * memory for the different free lists, which can degenerate over time. To avoid
77 * this, we partition each free list in pools and we share dynamically the
78 * reserved space between all free lists. This technique is quite efficient
79 * for memory intensive programs which allocate mainly small-sized blocks.
81 * For small requests we have the following table:
83 * Request in bytes Size of allocated block Size class idx
84 * ----------------------------------------------------------------
85 * 1-8 8 0
86 * 9-16 16 1
87 * 17-24 24 2
88 * 25-32 32 3
89 * 33-40 40 4
90 * 41-48 48 5
91 * 49-56 56 6
92 * 57-64 64 7
93 * 65-72 72 8
94 * ... ... ...
95 * 241-248 248 30
96 * 249-256 256 31
98 * 0, 257 and up: routed to the underlying allocator.
101 /*==========================================================================*/
104 * -- Main tunable settings section --
108 * Alignment of addresses returned to the user. 8-bytes alignment works
109 * on most current architectures (with 32-bit or 64-bit address busses).
110 * The alignment value is also used for grouping small requests in size
111 * classes spaced ALIGNMENT bytes apart.
113 * You shouldn't change this unless you know what you are doing.
115 #define ALIGNMENT 8 /* must be 2^N */
116 #define ALIGNMENT_SHIFT 3
117 #define ALIGNMENT_MASK (ALIGNMENT - 1)
119 /* Return the number of bytes in size class I, as a uint. */
120 #define INDEX2SIZE(I) (((uint)(I) + 1) << ALIGNMENT_SHIFT)
123 * Max size threshold below which malloc requests are considered to be
124 * small enough in order to use preallocated memory pools. You can tune
125 * this value according to your application behaviour and memory needs.
127 * The following invariants must hold:
128 * 1) ALIGNMENT <= SMALL_REQUEST_THRESHOLD <= 256
129 * 2) SMALL_REQUEST_THRESHOLD is evenly divisible by ALIGNMENT
131 * Although not required, for better performance and space efficiency,
132 * it is recommended that SMALL_REQUEST_THRESHOLD is set to a power of 2.
134 #define SMALL_REQUEST_THRESHOLD 256
135 #define NB_SMALL_SIZE_CLASSES (SMALL_REQUEST_THRESHOLD / ALIGNMENT)
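/* Illustrative sketch, not part of the allocator: the size-class mapping
 * from the table in the strategy comment above, written out with the macros
 * just defined. Only meaningful for 1 <= nbytes <= SMALL_REQUEST_THRESHOLD.
 */
static unsigned int
example_size_class_index(size_t nbytes)
{
    unsigned int index = (unsigned int)(nbytes - 1) >> ALIGNMENT_SHIFT;
    /* e.g. 1-8 -> 0, 25-32 -> 3, 249-256 -> 31 */
    assert(INDEX2SIZE(index) >= nbytes);    /* idx 3 serves 32-byte blocks */
    return index;
}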
138 * The system's VMM page size can be obtained on most unices with a
139 * getpagesize() call or deduced from various header files. To make
140 * things simpler, we assume that it is 4K, which is OK for most systems.
141 * It is probably better if this is the native page size, but it doesn't
142 * have to be. In theory, if SYSTEM_PAGE_SIZE is larger than the native page
143 * size, then `POOL_ADDR(p)->arenaindex' could rarely cause a segmentation
144 * violation fault. 4K is apparently OK for all the platforms that python
145 * currently targets.
147 #define SYSTEM_PAGE_SIZE (4 * 1024)
148 #define SYSTEM_PAGE_SIZE_MASK (SYSTEM_PAGE_SIZE - 1)
151 * Maximum amount of memory managed by the allocator for small requests.
153 #ifdef WITH_MEMORY_LIMITS
154 #ifndef SMALL_MEMORY_LIMIT
155 #define SMALL_MEMORY_LIMIT (64 * 1024 * 1024) /* 64 MB -- more? */
156 #endif
157 #endif
160 * The allocator sub-allocates <Big> blocks of memory (called arenas) aligned
161 * on a page boundary. This is a reserved virtual address space for the
162 * current process (obtained through a malloc call). In no way does this mean
163 * that the memory arenas will be used entirely. A malloc(<Big>) is usually
164 * an address range reservation for <Big> bytes, unless all pages within this
165 * space are referenced subsequently. So malloc'ing big blocks and not using
166 * them does not mean "wasting memory". It's address range wastage...
168 * Therefore, allocating arenas with malloc is not optimal, because there is
169 * some address space wastage, but this is the most portable way to request
170 * memory from the system across various platforms.
172 #define ARENA_SIZE (256 << 10) /* 256KB */
174 #ifdef WITH_MEMORY_LIMITS
175 #define MAX_ARENAS (SMALL_MEMORY_LIMIT / ARENA_SIZE)
176 #endif
179 * Size of the pools used for small blocks. Should be a power of 2,
180 * between 1K and SYSTEM_PAGE_SIZE, that is: 1k, 2k, 4k.
182 #define POOL_SIZE SYSTEM_PAGE_SIZE /* must be 2^N */
183 #define POOL_SIZE_MASK SYSTEM_PAGE_SIZE_MASK
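/* Sketch only (hypothetical helper, not used below): how many whole pools a
 * fresh arena can hold once its base is rounded up to a pool boundary --
 * the same arithmetic new_arena() performs later in this file.
 */
static unsigned int
example_pools_in_arena(void *arena_base)
{
    unsigned int nfree = ARENA_SIZE / POOL_SIZE; /* 64 with the 256KB / 4K defaults */
    if ((Py_uintptr_t)arena_base & POOL_SIZE_MASK)
        --nfree;    /* the leading partial pool is unusable */
    return nfree;
}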
186 * -- End of tunable settings section --
189 /*==========================================================================*/
192 * Locking
194 * To reduce lock contention, it would probably be better to refine the
195 * crude function locking with per size class locking. I'm not positive,
196 * however, that it's worth switching to such a locking policy because
197 * of the performance penalty it might introduce.
199 * The following macros describe the simplest (should also be the fastest)
200 * lock object on a particular platform and the init/fini/lock/unlock
201 * operations on it. The locks defined here are not expected to be recursive
202 * because it is assumed that they will always be called in the order:
203 * INIT, [LOCK, UNLOCK]*, FINI.
207 * Python's threads are serialized, so object malloc locking is disabled.
209 #define SIMPLELOCK_DECL(lock) /* simple lock declaration */
210 #define SIMPLELOCK_INIT(lock) /* allocate (if needed) and initialize */
211 #define SIMPLELOCK_FINI(lock) /* free/destroy an existing lock */
212 #define SIMPLELOCK_LOCK(lock) /* acquire released lock */
213 #define SIMPLELOCK_UNLOCK(lock) /* release acquired lock */
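/* Sketch only: if real locking were ever enabled here, the empty macros
 * above could be mapped onto Python's pythread API along these lines
 * (hypothetical -- the build leaves them empty because the GIL already
 * serializes object allocation):
 *
 *    #include "pythread.h"
 *    #define SIMPLELOCK_DECL(lock)   static PyThread_type_lock lock;
 *    #define SIMPLELOCK_INIT(lock)   ((lock) = PyThread_allocate_lock())
 *    #define SIMPLELOCK_FINI(lock)   PyThread_free_lock(lock)
 *    #define SIMPLELOCK_LOCK(lock)   PyThread_acquire_lock((lock), WAIT_LOCK)
 *    #define SIMPLELOCK_UNLOCK(lock) PyThread_release_lock(lock)
 */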
216 * Basic types
217 * I don't care if these are defined in <sys/types.h> or elsewhere. Axiom.
219 #undef uchar
220 #define uchar unsigned char /* assuming == 8 bits */
222 #undef uint
223 #define uint unsigned int /* assuming >= 16 bits */
225 #undef ulong
226 #define ulong unsigned long /* assuming >= 32 bits */
228 #undef uptr
229 #define uptr Py_uintptr_t
231 /* When you say memory, my mind reasons in terms of (pointers to) blocks */
232 typedef uchar block;
234 /* Pool for small blocks. */
235 struct pool_header {
236 union { block *_padding;
237 uint count; } ref; /* number of allocated blocks */
238 block *freeblock; /* pool's free list head */
239 struct pool_header *nextpool; /* next pool of this size class */
240 struct pool_header *prevpool; /* previous pool "" */
241 uint arenaindex; /* index into arenas of base adr */
242 uint szidx; /* block size class index */
243 uint nextoffset; /* bytes to virgin block */
244 uint maxnextoffset; /* largest valid nextoffset */
247 typedef struct pool_header *poolp;
249 #undef ROUNDUP
250 #define ROUNDUP(x) (((x) + ALIGNMENT_MASK) & ~ALIGNMENT_MASK)
251 #define POOL_OVERHEAD ROUNDUP(sizeof(struct pool_header))
253 #define DUMMY_SIZE_IDX 0xffff /* size class of newly cached pools */
255 /* Round pointer P down to the closest pool-aligned address <= P, as a poolp */
256 #define POOL_ADDR(P) ((poolp)((uptr)(P) & ~(uptr)POOL_SIZE_MASK))
258 /* Return total number of blocks in pool of size index I, as a uint. */
259 #define NUMBLOCKS(I) ((uint)(POOL_SIZE - POOL_OVERHEAD) / INDEX2SIZE(I))
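/* Sketch only (hypothetical helper): given a block address p that pymalloc
 * handed out, the two macros above recover its pool header and the pool's
 * block capacity for that size class.
 */
static uint
example_pool_capacity(void *p)
{
    poolp pool = POOL_ADDR(p);     /* round down to the pool's 4K base */
    return NUMBLOCKS(pool->szidx); /* blocks of this size class per pool */
}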
261 /*==========================================================================*/
264 * This malloc lock
266 SIMPLELOCK_DECL(_malloc_lock)
267 #define LOCK() SIMPLELOCK_LOCK(_malloc_lock)
268 #define UNLOCK() SIMPLELOCK_UNLOCK(_malloc_lock)
269 #define LOCK_INIT() SIMPLELOCK_INIT(_malloc_lock)
270 #define LOCK_FINI() SIMPLELOCK_FINI(_malloc_lock)
273 * Pool table -- headed, circular, doubly-linked lists of partially used pools.
275 This is involved. For an index i, usedpools[i+i] is the header for a list of
276 all partially used pools holding small blocks with "size class idx" i. So
277 usedpools[0] corresponds to blocks of size 8, usedpools[2] to blocks of size
278 16, and so on: index 2*i <-> blocks of size (i+1)<<ALIGNMENT_SHIFT.
280 Pools are carved off the current arena highwater mark (file static arenabase)
281 as needed. Once carved off, a pool is in one of three states forever after:
283 used == partially used, neither empty nor full
284 At least one block in the pool is currently allocated, and at least one
285 block in the pool is not currently allocated (note this implies a pool
286 has room for at least two blocks).
287 This is a pool's initial state, as a pool is created only when malloc
288 needs space.
289 The pool holds blocks of a fixed size, and is in the circular list headed
290 at usedpools[i] (see above). It's linked to the other used pools of the
291 same size class via the pool_header's nextpool and prevpool members.
292 If all but one block is currently allocated, a malloc can cause a
293 transition to the full state. If all but one block is not currently
294 allocated, a free can cause a transition to the empty state.
296 full == all the pool's blocks are currently allocated
297 On transition to full, a pool is unlinked from its usedpools[] list.
298 It's not linked to from anything anymore, and its nextpool and
299 prevpool members are meaningless until it transitions back to used.
300 A free of a block in a full pool puts the pool back in the used state.
301 Then it's linked in at the front of the appropriate usedpools[] list, so
302 that the next allocation for its size class will reuse the freed block.
304 empty == all the pool's blocks are currently available for allocation
305 On transition to empty, a pool is unlinked from its usedpools[] list,
306 and linked to the front of the (file static) singly-linked freepools list,
307 via its nextpool member. The prevpool member has no meaning in this case.
308 Empty pools have no inherent size class: the next time a malloc finds
309 an empty list in usedpools[], it takes the first pool off of freepools.
310 If the size class needed happens to be the same as the size class the pool
311 last had, some pool initialization can be skipped.
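As a quick illustration (not code the allocator runs), given a poolp pool in
one of these states, the state can be read back from its header; the empty
test is the same one the debug stats code uses:

    const char *state;
    if (pool->ref.count == 0)
        state = "empty";              /* cached on the freepools list */
    else if (pool->freeblock == NULL)
        state = "full";               /* unlinked from usedpools[] */
    else
        state = "used";               /* still on its usedpools[] list */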
314 Block Management
316 Blocks within pools are again carved out as needed. pool->freeblock points to
317 the start of a singly-linked list of free blocks within the pool. When a
318 block is freed, it's inserted at the front of its pool's freeblock list. Note
319 that the available blocks in a pool are *not* linked all together when a pool
320 is initialized. Instead only "the first two" (lowest addresses) blocks are
321 set up, returning the first such block, and setting pool->freeblock to a
322 one-block list holding the second such block. This is consistent with
323 pymalloc's policy of never touching a piece of memory, at any level (arena,
324 pool, or block), until it's actually needed.
326 So long as a pool is in the used state, we're certain there *is* a block
327 available for allocating, and pool->freeblock is not NULL. If pool->freeblock
328 points to the end of the free list before we've carved the entire pool into
329 blocks, that means we simply haven't yet gotten to one of the higher-address
330 blocks. The offset from the pool_header to the start of "the next" virgin
331 block is stored in the pool_header nextoffset member, and the largest value
332 of nextoffset that makes sense is stored in the maxnextoffset member when a
333 pool is initialized. All the blocks in a pool have been passed out at least
334 once when and only when nextoffset > maxnextoffset.
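For illustration only: counting the blocks currently linked on a pool's free
list is just a walk of that singly-linked list. Virgin blocks not yet carved
out (those still covered by nextoffset <= maxnextoffset) never appear on it:

    uint n = 0;
    block *b;
    for (b = pool->freeblock; b != NULL; b = *(block **)b)
        ++n;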
337 Major obscurity: While the usedpools vector is declared to have poolp
338 entries, it doesn't really. It really contains two pointers per (conceptual)
339 poolp entry, the nextpool and prevpool members of a pool_header. The
340 excruciating initialization code below fools C so that
342 usedpool[i+i]
344 "acts like" a genuine poolp, but only so long as you only reference its
345 nextpool and prevpool members. The "- 2*sizeof(block *)" gibberish is
346 compensating for the fact that a pool_header's nextpool and prevpool members
347 immediately follow a pool_header's first two members:
349 union { block *_padding;
350 uint count; } ref;
351 block *freeblock;
353 each of which consumes sizeof(block *) bytes. So what usedpools[i+i] really
354 contains is a fudged-up pointer p such that *if* C believes it's a poolp
355 pointer, then p->nextpool and p->prevpool are both p (meaning that the headed
356 circular list is empty).
358 It's unclear why the usedpools setup is so convoluted. It could be to
359 minimize the amount of cache required to hold this heavily-referenced table
360 (which only *needs* the two interpool pointer members of a pool_header). OTOH,
361 referencing code has to remember to "double the index" and doing so isn't
362 free, usedpools[0] isn't a strictly legal pointer, and we're crucially relying
363 on C not inserting any padding anywhere in a pool_header at or before
364 the prevpool member.
365 **************************************************************************** */
367 #define PTA(x) ((poolp )((uchar *)&(usedpools[2*(x)]) - 2*sizeof(block *)))
368 #define PT(x) PTA(x), PTA(x)
370 static poolp usedpools[2 * ((NB_SMALL_SIZE_CLASSES + 7) / 8) * 8] = {
371 PT(0), PT(1), PT(2), PT(3), PT(4), PT(5), PT(6), PT(7)
372 #if NB_SMALL_SIZE_CLASSES > 8
373 , PT(8), PT(9), PT(10), PT(11), PT(12), PT(13), PT(14), PT(15)
374 #if NB_SMALL_SIZE_CLASSES > 16
375 , PT(16), PT(17), PT(18), PT(19), PT(20), PT(21), PT(22), PT(23)
376 #if NB_SMALL_SIZE_CLASSES > 24
377 , PT(24), PT(25), PT(26), PT(27), PT(28), PT(29), PT(30), PT(31)
378 #if NB_SMALL_SIZE_CLASSES > 32
379 , PT(32), PT(33), PT(34), PT(35), PT(36), PT(37), PT(38), PT(39)
380 #if NB_SMALL_SIZE_CLASSES > 40
381 , PT(40), PT(41), PT(42), PT(43), PT(44), PT(45), PT(46), PT(47)
382 #if NB_SMALL_SIZE_CLASSES > 48
383 , PT(48), PT(49), PT(50), PT(51), PT(52), PT(53), PT(54), PT(55)
384 #if NB_SMALL_SIZE_CLASSES > 56
385 , PT(56), PT(57), PT(58), PT(59), PT(60), PT(61), PT(62), PT(63)
386 #endif /* NB_SMALL_SIZE_CLASSES > 56 */
387 #endif /* NB_SMALL_SIZE_CLASSES > 48 */
388 #endif /* NB_SMALL_SIZE_CLASSES > 40 */
389 #endif /* NB_SMALL_SIZE_CLASSES > 32 */
390 #endif /* NB_SMALL_SIZE_CLASSES > 24 */
391 #endif /* NB_SMALL_SIZE_CLASSES > 16 */
392 #endif /* NB_SMALL_SIZE_CLASSES > 8 */
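/* Sketch only (hypothetical helper): the invariant the PTA() trick relies
 * on, plus the list-emptiness test PyObject_Malloc() uses below -- nextpool
 * must sit exactly two pointer-sized members into a pool_header, with no
 * padding before it.
 */
static int
example_usedpools_empty(uint sizeclass)
{
    poolp p = usedpools[sizeclass + sizeclass]; /* fudged-up list header */
    assert((uchar *)&p->nextpool == (uchar *)p + 2*sizeof(block *));
    return p == p->nextpool;    /* empty iff the header points at itself */
}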
396 * Free (cached) pools
398 static poolp freepools = NULL; /* free list for cached pools */
400 /*==========================================================================*/
401 /* Arena management. */
403 /* arenas is a vector of arena base addresses, in order of allocation time.
404 * arenas currently contains narenas entries, and has space allocated
405 * for at most maxarenas entries.
407 * CAUTION: See the long comment block about thread safety in new_arena():
408 * the code currently relies in deep ways on the fact that this vector only grows,
409 * and only grows by appending at the end. For now we never return an arena
410 * to the OS.
412 static uptr *volatile arenas = NULL; /* the pointer itself is volatile */
413 static volatile uint narenas = 0;
414 static uint maxarenas = 0;
416 /* Number of pools still available to be allocated in the current arena. */
417 static uint nfreepools = 0;
419 /* Free space start address in current arena. This is pool-aligned. */
420 static block *arenabase = NULL;
422 /* Allocate a new arena and return its base address. If we run out of
423 * memory, return NULL.
425 static block *
426 new_arena(void)
428 uint excess; /* number of bytes above pool alignment */
429 block *bp = (block *)malloc(ARENA_SIZE);
430 if (bp == NULL)
431 return NULL;
433 #ifdef PYMALLOC_DEBUG
434 if (Py_GETENV("PYTHONMALLOCSTATS"))
435 _PyObject_DebugMallocStats();
436 #endif
438 /* arenabase <- first pool-aligned address in the arena
439 nfreepools <- number of whole pools that fit after alignment */
440 arenabase = bp;
441 nfreepools = ARENA_SIZE / POOL_SIZE;
442 assert(POOL_SIZE * nfreepools == ARENA_SIZE);
443 excess = (uint) ((Py_uintptr_t)bp & POOL_SIZE_MASK);
444 if (excess != 0) {
445 --nfreepools;
446 arenabase += POOL_SIZE - excess;
449 /* Make room for a new entry in the arenas vector. */
450 if (arenas == NULL) {
451 assert(narenas == 0 && maxarenas == 0);
452 arenas = (uptr *)malloc(16 * sizeof(*arenas));
453 if (arenas == NULL)
454 goto error;
455 maxarenas = 16;
457 else if (narenas == maxarenas) {
458 /* Grow arenas.
460 * Exceedingly subtle: Someone may be calling the pymalloc
461 * free via PyMem_{DEL, Del, FREE, Free} without holding the
462 * GIL. Someone else may simultaneously be calling the
463 * pymalloc malloc while holding the GIL via, e.g.,
464 * PyObject_New. Now the pymalloc free may index into arenas
465 * for an address check, while the pymalloc malloc calls
466 * new_arena and we end up here to grow a new arena *and*
467 * grow the arenas vector. If the value for arenas pymalloc
468 * free picks up "vanishes" during this resize, anything may
469 * happen, and it would be an incredibly rare bug. Therefore
470 * the code here takes great pains to make sure that, at every
471 * moment, arenas always points to an intact vector of
472 * addresses. It doesn't matter whether arenas points to a
473 * wholly up-to-date vector when pymalloc free checks it in
474 * this case, because the only legal (and that even this is
475 * legal is debatable) way to call PyMem_{Del, etc} while not
476 * holding the GIL is if the memory being released is not
477 * object memory, i.e. if the address check in pymalloc free
478 * is supposed to fail. Having an incomplete vector can't
479 * make a supposed-to-fail case succeed by mistake (it could
480 * only make a supposed-to-succeed case fail by mistake).
482 * In addition, without a lock we can't know for sure when
483 * an old vector is no longer referenced, so we simply let
484 * old vectors leak.
486 * And on top of that, since narenas and arenas can't be
487 * changed as-a-pair atomically without a lock, we're also
488 * careful to declare them volatile and ensure that we change
489 * arenas first. This prevents another thread from picking
490 * up an narenas value too large for the arenas value it
491 * reads up (arenas never shrinks).
493 * Read the above 50 times before changing anything in this
494 * block.
496 uptr *p;
497 uint newmax = maxarenas << 1;
498 if (newmax <= maxarenas) /* overflow */
499 goto error;
500 p = (uptr *)malloc(newmax * sizeof(*arenas));
501 if (p == NULL)
502 goto error;
503 memcpy(p, arenas, narenas * sizeof(*arenas));
504 arenas = p; /* old arenas deliberately leaked */
505 maxarenas = newmax;
508 /* Append the new arena address to arenas. */
509 assert(narenas < maxarenas);
510 arenas[narenas] = (uptr)bp;
511 ++narenas; /* can't overflow, since narenas < maxarenas before */
512 return bp;
514 error:
515 free(bp);
516 nfreepools = 0;
517 return NULL;
520 /* Return true if and only if P is an address that was allocated by
521 * pymalloc. I must be the index into arenas that the address claims
522 * to come from.
524 * Tricky: Letting B be the arena base address in arenas[I], P belongs to the
525 * arena if and only if
526 * B <= P < B + ARENA_SIZE
527 * Subtracting B throughout, this is true iff
528 * 0 <= P-B < ARENA_SIZE
529 * By using unsigned arithmetic, the "0 <=" half of the test can be skipped.
531 * Obscure: A PyMem "free memory" function can call the pymalloc free or
532 * realloc before the first arena has been allocated. arenas is still
533 * NULL in that case. We're relying on narenas also being 0 in that case,
534 * so the (I) < narenas check must be false, saving us from trying to index into
535 * a NULL arenas.
537 #define ADDRESS_IN_RANGE(P, I) \
538 ((I) < narenas && (uptr)(P) - arenas[I] < (uptr)ARENA_SIZE)
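/* Sketch only (hypothetical helper): the unsigned-arithmetic trick above in
 * isolation. When P lies below the arena base, the subtraction wraps to a
 * huge unsigned value, so the single comparison also rejects addresses on
 * that side.
 */
static int
example_in_arena(uptr p, uptr base)
{
    return p - base < (uptr)ARENA_SIZE; /* iff base <= p < base + ARENA_SIZE */
}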
540 /*==========================================================================*/
542 /* malloc. Note that nbytes==0 tries to return a non-NULL pointer, distinct
543 * from all other currently live pointers. This may not be possible.
547 * The basic blocks are ordered by decreasing execution frequency,
548 * which minimizes the number of jumps in the most common cases,
549 * improves branching prediction and instruction scheduling (small
550 * block allocations typically result in a couple of instructions).
551 * Unless the optimizer reorders everything, being too smart...
554 #undef PyObject_Malloc
555 void *
556 PyObject_Malloc(size_t nbytes)
558 block *bp;
559 poolp pool;
560 poolp next;
561 uint size;
564 * This implicitly redirects malloc(0).
566 if ((nbytes - 1) < SMALL_REQUEST_THRESHOLD) {
567 LOCK();
569 * Most frequent paths first
571 size = (uint )(nbytes - 1) >> ALIGNMENT_SHIFT;
572 pool = usedpools[size + size];
573 if (pool != pool->nextpool) {
575 * There is a used pool for this size class.
576 * Pick up the head block of its free list.
578 ++pool->ref.count;
579 bp = pool->freeblock;
580 assert(bp != NULL);
581 if ((pool->freeblock = *(block **)bp) != NULL) {
582 UNLOCK();
583 return (void *)bp;
586 * Reached the end of the free list, try to extend it
588 if (pool->nextoffset <= pool->maxnextoffset) {
590 * There is room for another block
592 pool->freeblock = (block *)pool +
593 pool->nextoffset;
594 pool->nextoffset += INDEX2SIZE(size);
595 *(block **)(pool->freeblock) = NULL;
596 UNLOCK();
597 return (void *)bp;
600 * Pool is full, unlink from used pools
602 next = pool->nextpool;
603 pool = pool->prevpool;
604 next->prevpool = pool;
605 pool->nextpool = next;
606 UNLOCK();
607 return (void *)bp;
610 * Try to get a cached free pool
612 pool = freepools;
613 if (pool != NULL) {
615 * Unlink from cached pools
617 freepools = pool->nextpool;
618 init_pool:
620 * Frontlink to used pools
622 next = usedpools[size + size]; /* == prev */
623 pool->nextpool = next;
624 pool->prevpool = next;
625 next->nextpool = pool;
626 next->prevpool = pool;
627 pool->ref.count = 1;
628 if (pool->szidx == size) {
630 * Luckily, this pool last contained blocks
631 * of the same size class, so its header
632 * and free list are already initialized.
634 bp = pool->freeblock;
635 pool->freeblock = *(block **)bp;
636 UNLOCK();
637 return (void *)bp;
640 * Initialize the pool header, set up the free list to
641 * contain just the second block, and return the first
642 * block.
644 pool->szidx = size;
645 size = INDEX2SIZE(size);
646 bp = (block *)pool + POOL_OVERHEAD;
647 pool->nextoffset = POOL_OVERHEAD + (size << 1);
648 pool->maxnextoffset = POOL_SIZE - size;
649 pool->freeblock = bp + size;
650 *(block **)(pool->freeblock) = NULL;
651 UNLOCK();
652 return (void *)bp;
655 * Allocate new pool
657 if (nfreepools) {
658 commit_pool:
659 --nfreepools;
660 pool = (poolp)arenabase;
661 arenabase += POOL_SIZE;
662 pool->arenaindex = narenas - 1;
663 pool->szidx = DUMMY_SIZE_IDX;
664 goto init_pool;
667 * Allocate new arena
669 #ifdef WITH_MEMORY_LIMITS
670 if (!(narenas < MAX_ARENAS)) {
671 UNLOCK();
672 goto redirect;
674 #endif
675 bp = new_arena();
676 if (bp != NULL)
677 goto commit_pool;
678 UNLOCK();
679 goto redirect;
682 /* The small block allocator ends here. */
684 redirect:
686 * Redirect the original request to the underlying (libc) allocator.
687 * We jump here on bigger requests, on error in the code above (as a
688 * last chance to serve the request) or when the max memory limit
689 * has been reached.
691 if (nbytes == 0)
692 nbytes = 1;
693 return (void *)malloc(nbytes);
696 /* free */
698 #undef PyObject_Free
699 void
700 PyObject_Free(void *p)
702 poolp pool;
703 block *lastfree;
704 poolp next, prev;
705 uint size;
707 if (p == NULL) /* free(NULL) has no effect */
708 return;
710 pool = POOL_ADDR(p);
711 if (ADDRESS_IN_RANGE(p, pool->arenaindex)) {
712 /* We allocated this address. */
713 LOCK();
715 * Link p to the start of the pool's freeblock list. Since
716 * the pool had at least the p block outstanding, the pool
717 * wasn't empty (so it's already in a usedpools[] list, or
718 * was full and is in no list -- it's not in the freeblocks
719 * list in any case).
721 assert(pool->ref.count > 0); /* else it was empty */
722 *(block **)p = lastfree = pool->freeblock;
723 pool->freeblock = (block *)p;
724 if (lastfree) {
726 * freeblock wasn't NULL, so the pool wasn't full,
727 * and the pool is in a usedpools[] list.
729 if (--pool->ref.count != 0) {
730 /* pool isn't empty: leave it in usedpools */
731 UNLOCK();
732 return;
735 * Pool is now empty: unlink from usedpools, and
736 * link to the front of freepools. This ensures that
737 * previously freed pools will be allocated later
738 * (not being referenced, they may have been paged out).
740 next = pool->nextpool;
741 prev = pool->prevpool;
742 next->prevpool = prev;
743 prev->nextpool = next;
744 /* Link to freepools. This is a singly-linked list,
745 * and pool->prevpool isn't used there.
747 pool->nextpool = freepools;
748 freepools = pool;
749 UNLOCK();
750 return;
753 * Pool was full, so doesn't currently live in any list:
754 * link it to the front of the appropriate usedpools[] list.
755 * This mimics LRU pool usage for new allocations and
756 * targets optimal filling when several pools contain
757 * blocks of the same size class.
759 --pool->ref.count;
760 assert(pool->ref.count > 0); /* else the pool is empty */
761 size = pool->szidx;
762 next = usedpools[size + size];
763 prev = next->prevpool;
764 /* insert pool before next: prev <-> pool <-> next */
765 pool->nextpool = next;
766 pool->prevpool = prev;
767 next->prevpool = pool;
768 prev->nextpool = pool;
769 UNLOCK();
770 return;
773 /* We didn't allocate this address. */
774 free(p);
777 /* realloc. If p is NULL, this acts like malloc(nbytes). Else if nbytes==0,
778 * then as the Python docs promise, we do not treat this like free(p), and
779 * return a non-NULL result.
782 #undef PyObject_Realloc
783 void *
784 PyObject_Realloc(void *p, size_t nbytes)
786 void *bp;
787 poolp pool;
788 uint size;
790 if (p == NULL)
791 return PyObject_Malloc(nbytes);
793 pool = POOL_ADDR(p);
794 if (ADDRESS_IN_RANGE(p, pool->arenaindex)) {
795 /* We're in charge of this block */
796 size = INDEX2SIZE(pool->szidx);
797 if (nbytes <= size) {
798 /* The block is staying the same or shrinking. If
799 * it's shrinking, there's a tradeoff: it costs
800 * cycles to copy the block to a smaller size class,
801 * but it wastes memory not to copy it. The
802 * compromise here is to copy on shrink only if at
803 * least 25% of size can be shaved off.
805 if (4 * nbytes > 3 * size) {
806 /* It's the same,
807 * or shrinking and new/old > 3/4.
809 return p;
811 size = nbytes;
813 bp = PyObject_Malloc(nbytes);
814 if (bp != NULL) {
815 memcpy(bp, p, size);
816 PyObject_Free(p);
818 return bp;
820 /* We're not managing this block. */
821 if (nbytes <= SMALL_REQUEST_THRESHOLD) {
822 /* Take over this block -- ask for at least one byte so
823 * we really do take it over (PyObject_Malloc(0) goes to
824 * the system malloc).
826 bp = PyObject_Malloc(nbytes ? nbytes : 1);
827 if (bp != NULL) {
828 memcpy(bp, p, nbytes);
829 free(p);
831 else if (nbytes == 0) {
832 /* Meet the doc's promise that nbytes==0 will
833 * never return a NULL pointer when p isn't NULL.
835 bp = p;
839 else {
840 assert(nbytes != 0);
841 bp = realloc(p, nbytes);
843 return bp;
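/* Usage sketch (hypothetical, error handling omitted): small requests are
 * served from pools, oversized or zero-byte requests fall through to the
 * system allocator, and realloc migrates blocks between the two regimes.
 */
static void
example_usage(void)
{
    void *a = PyObject_Malloc(24);   /* <= SMALL_REQUEST_THRESHOLD: pool-backed */
    void *b = PyObject_Malloc(1000); /* > SMALL_REQUEST_THRESHOLD: libc malloc */
    a = PyObject_Realloc(a, 512);    /* grown past the threshold: copied out */
    PyObject_Free(a);
    PyObject_Free(b);
}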
846 #else /* ! WITH_PYMALLOC */
848 /*==========================================================================*/
849 /* pymalloc not enabled: Redirect the entry points to malloc. These will
850 * only be used by extensions that are compiled with pymalloc enabled. */
852 void *
853 PyObject_Malloc(size_t n)
855 return PyMem_MALLOC(n);
858 void *
859 PyObject_Realloc(void *p, size_t n)
861 return PyMem_REALLOC(p, n);
864 void
865 PyObject_Free(void *p)
867 PyMem_FREE(p);
869 #endif /* WITH_PYMALLOC */
871 #ifdef PYMALLOC_DEBUG
872 /*==========================================================================*/
873 /* A cross-platform debugging allocator. This doesn't manage memory directly,
874 * it wraps a real allocator, adding extra debugging info to the memory blocks.
877 /* Special bytes broadcast into debug memory blocks at appropriate times.
878 * Strings of these are unlikely to be valid addresses, floats, ints or
879 * 7-bit ASCII.
881 #undef CLEANBYTE
882 #undef DEADBYTE
883 #undef FORBIDDENBYTE
884 #define CLEANBYTE 0xCB /* clean (newly allocated) memory */
885 #define DEADBYTE 0xDB /* dead (newly freed) memory */
886 #define FORBIDDENBYTE 0xFB /* untouchable bytes at each end of a block */
888 static ulong serialno = 0; /* incremented on each debug {m,re}alloc */
890 /* serialno is always incremented via calling this routine. The point is
891 to supply a single place to set a breakpoint.
893 static void
894 bumpserialno(void)
896 ++serialno;
900 /* Read 4 bytes at p as a big-endian ulong. */
901 static ulong
902 read4(const void *p)
904 const uchar *q = (const uchar *)p;
905 return ((ulong)q[0] << 24) |
906 ((ulong)q[1] << 16) |
907 ((ulong)q[2] << 8) |
908 (ulong)q[3];
911 /* Write the 4 least-significant bytes of n as a big-endian unsigned int,
912 MSB at address p, LSB at p+3. */
913 static void
914 write4(void *p, ulong n)
916 uchar *q = (uchar *)p;
917 q[0] = (uchar)((n >> 24) & 0xff);
918 q[1] = (uchar)((n >> 16) & 0xff);
919 q[2] = (uchar)((n >> 8) & 0xff);
920 q[3] = (uchar)( n & 0xff);
923 #ifdef Py_DEBUG
924 /* Is target in the list? The list is traversed via the nextpool pointers.
925 * The list may be NULL-terminated, or circular. Return 1 if target is in
926 * list, else 0.
928 static int
929 pool_is_in_list(const poolp target, poolp list)
931 poolp origlist = list;
932 assert(target != NULL);
933 if (list == NULL)
934 return 0;
935 do {
936 if (target == list)
937 return 1;
938 list = list->nextpool;
939 } while (list != NULL && list != origlist);
940 return 0;
943 #else
944 #define pool_is_in_list(X, Y) 1
946 #endif /* Py_DEBUG */
948 /* The debug malloc asks for 16 extra bytes and fills them with useful stuff,
949 here calling the underlying malloc's result p:
951 p[0:4]
952 Number of bytes originally asked for. 4-byte unsigned integer,
953 big-endian (easier to read in a memory dump).
954 p[4:8]
955 Copies of FORBIDDENBYTE. Used to catch under-writes and under-reads.
956 p[8:8+n]
957 The requested memory, filled with copies of CLEANBYTE.
958 Used to catch references to uninitialized memory.
959 &p[8] is returned. Note that this is 8-byte aligned if pymalloc
960 handled the request itself.
961 p[8+n:8+n+4]
962 Copies of FORBIDDENBYTE. Used to catch over-writes and over-reads.
963 p[8+n+4:8+n+8]
964 A serial number, incremented by 1 on each call to _PyObject_DebugMalloc
965 and _PyObject_DebugRealloc.
966 4-byte unsigned integer, big-endian.
967 If "bad memory" is detected later, the serial number gives an
968 excellent way to set a breakpoint on the next run, to capture the
969 instant at which this block was passed out.
972 void *
973 _PyObject_DebugMalloc(size_t nbytes)
975 uchar *p; /* base address of malloc'ed block */
976 uchar *tail; /* p + 8 + nbytes == pointer to tail pad bytes */
977 size_t total; /* nbytes + 16 */
979 bumpserialno();
980 total = nbytes + 16;
981 if (total < nbytes || (total >> 31) > 1) {
982 /* overflow, or we can't represent it in 4 bytes */
983 /* Obscure: can't do (total >> 32) != 0 instead, because
984 C doesn't define what happens for a right-shift of 32
985 when size_t is a 32-bit type. At least C guarantees
986 size_t is an unsigned type. */
987 return NULL;
990 p = (uchar *)PyObject_Malloc(total);
991 if (p == NULL)
992 return NULL;
994 write4(p, nbytes);
995 p[4] = p[5] = p[6] = p[7] = FORBIDDENBYTE;
997 if (nbytes > 0)
998 memset(p+8, CLEANBYTE, nbytes);
1000 tail = p + 8 + nbytes;
1001 tail[0] = tail[1] = tail[2] = tail[3] = FORBIDDENBYTE;
1002 write4(tail + 4, serialno);
1004 return p+8;
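/* Sketch only (hypothetical helper): locating the debug fields around a
 * pointer q returned by the debug malloc above, following the p[0:4] ...
 * p[8+n+4:8+n+8] layout documented before it.
 */
static ulong
example_debug_serialno(const void *q_user)
{
    const uchar *q = (const uchar *)q_user;
    ulong nbytes = read4(q - 8);        /* bytes originally requested */
    assert(q[-4] == FORBIDDENBYTE);     /* leading pad still intact */
    assert(q[nbytes] == FORBIDDENBYTE); /* trailing pad still intact */
    return read4(q + nbytes + 4);       /* serial number stamped at the tail */
}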
1007 /* The debug free first checks the 8 bytes on each end for sanity (in
1008 particular, that the FORBIDDENBYTEs are still intact).
1009 Then fills the original bytes with DEADBYTE.
1010 Then calls the underlying free.
1012 void
1013 _PyObject_DebugFree(void *p)
1015 uchar *q = (uchar *)p;
1016 size_t nbytes;
1018 if (p == NULL)
1019 return;
1020 _PyObject_DebugCheckAddress(p);
1021 nbytes = read4(q-8);
1022 if (nbytes > 0)
1023 memset(q, DEADBYTE, nbytes);
1024 PyObject_Free(q-8);
1027 void *
1028 _PyObject_DebugRealloc(void *p, size_t nbytes)
1030 uchar *q = (uchar *)p;
1031 uchar *tail;
1032 size_t total; /* nbytes + 16 */
1033 size_t original_nbytes;
1035 if (p == NULL)
1036 return _PyObject_DebugMalloc(nbytes);
1038 _PyObject_DebugCheckAddress(p);
1039 bumpserialno();
1040 original_nbytes = read4(q-8);
1041 total = nbytes + 16;
1042 if (total < nbytes || (total >> 31) > 1) {
1043 /* overflow, or we can't represent it in 4 bytes */
1044 return NULL;
1047 if (nbytes < original_nbytes) {
1048 /* shrinking: mark old extra memory dead */
1049 memset(q + nbytes, DEADBYTE, original_nbytes - nbytes);
1052 /* Resize and add decorations. */
1053 q = (uchar *)PyObject_Realloc(q-8, total);
1054 if (q == NULL)
1055 return NULL;
1057 write4(q, nbytes);
1058 assert(q[4] == FORBIDDENBYTE &&
1059 q[5] == FORBIDDENBYTE &&
1060 q[6] == FORBIDDENBYTE &&
1061 q[7] == FORBIDDENBYTE);
1062 q += 8;
1063 tail = q + nbytes;
1064 tail[0] = tail[1] = tail[2] = tail[3] = FORBIDDENBYTE;
1065 write4(tail + 4, serialno);
1067 if (nbytes > original_nbytes) {
1068 /* growing: mark new extra memory clean */
1069 memset(q + original_nbytes, CLEANBYTE,
1070 nbytes - original_nbytes);
1073 return q;
1076 /* Check the forbidden bytes on both ends of the memory allocated for p.
1077 * If anything is wrong, print info to stderr via _PyObject_DebugDumpAddress,
1078 * and call Py_FatalError to kill the program.
1080 void
1081 _PyObject_DebugCheckAddress(const void *p)
1083 const uchar *q = (const uchar *)p;
1084 char *msg;
1085 ulong nbytes;
1086 const uchar *tail;
1087 int i;
1089 if (p == NULL) {
1090 msg = "didn't expect a NULL pointer";
1091 goto error;
1094 /* Check the stuff at the start of p first: if there's underwrite
1095 * corruption, the number-of-bytes field may be nuts, and checking
1096 * the tail could lead to a segfault then.
1098 for (i = 4; i >= 1; --i) {
1099 if (*(q-i) != FORBIDDENBYTE) {
1100 msg = "bad leading pad byte";
1101 goto error;
1105 nbytes = read4(q-8);
1106 tail = q + nbytes;
1107 for (i = 0; i < 4; ++i) {
1108 if (tail[i] != FORBIDDENBYTE) {
1109 msg = "bad trailing pad byte";
1110 goto error;
1114 return;
1116 error:
1117 _PyObject_DebugDumpAddress(p);
1118 Py_FatalError(msg);
1121 /* Display info to stderr about the memory block at p. */
1122 void
1123 _PyObject_DebugDumpAddress(const void *p)
1125 const uchar *q = (const uchar *)p;
1126 const uchar *tail;
1127 ulong nbytes, serial;
1128 int i;
1130 fprintf(stderr, "Debug memory block at address p=%p:\n", p);
1131 if (p == NULL)
1132 return;
1134 nbytes = read4(q-8);
1135 fprintf(stderr, " %lu bytes originally requested\n", nbytes);
1137 /* In case this is nuts, check the leading pad bytes first. */
1138 fputs(" The 4 pad bytes at p-4 are ", stderr);
1139 if (*(q-4) == FORBIDDENBYTE &&
1140 *(q-3) == FORBIDDENBYTE &&
1141 *(q-2) == FORBIDDENBYTE &&
1142 *(q-1) == FORBIDDENBYTE) {
1143 fputs("FORBIDDENBYTE, as expected.\n", stderr);
1145 else {
1146 fprintf(stderr, "not all FORBIDDENBYTE (0x%02x):\n",
1147 FORBIDDENBYTE);
1148 for (i = 4; i >= 1; --i) {
1149 const uchar byte = *(q-i);
1150 fprintf(stderr, " at p-%d: 0x%02x", i, byte);
1151 if (byte != FORBIDDENBYTE)
1152 fputs(" *** OUCH", stderr);
1153 fputc('\n', stderr);
1156 fputs(" Because memory is corrupted at the start, the "
1157 "count of bytes requested\n"
1158 " may be bogus, and checking the trailing pad "
1159 "bytes may segfault.\n", stderr);
1162 tail = q + nbytes;
1163 fprintf(stderr, " The 4 pad bytes at tail=%p are ", tail);
1164 if (tail[0] == FORBIDDENBYTE &&
1165 tail[1] == FORBIDDENBYTE &&
1166 tail[2] == FORBIDDENBYTE &&
1167 tail[3] == FORBIDDENBYTE) {
1168 fputs("FORBIDDENBYTE, as expected.\n", stderr);
1170 else {
1171 fprintf(stderr, "not all FORBIDDENBYTE (0x%02x):\n",
1172 FORBIDDENBYTE);
1173 for (i = 0; i < 4; ++i) {
1174 const uchar byte = tail[i];
1175 fprintf(stderr, " at tail+%d: 0x%02x",
1176 i, byte);
1177 if (byte != FORBIDDENBYTE)
1178 fputs(" *** OUCH", stderr);
1179 fputc('\n', stderr);
1183 serial = read4(tail+4);
1184 fprintf(stderr, " The block was made by call #%lu to "
1185 "debug malloc/realloc.\n", serial);
1187 if (nbytes > 0) {
1188 int i = 0;
1189 fputs(" Data at p:", stderr);
1190 /* print up to 8 bytes at the start */
1191 while (q < tail && i < 8) {
1192 fprintf(stderr, " %02x", *q);
1193 ++i;
1194 ++q;
1196 /* and up to 8 at the end */
1197 if (q < tail) {
1198 if (tail - q > 8) {
1199 fputs(" ...", stderr);
1200 q = tail - 8;
1202 while (q < tail) {
1203 fprintf(stderr, " %02x", *q);
1204 ++q;
1207 fputc('\n', stderr);
1211 static ulong
1212 printone(const char* msg, ulong value)
1214 int i, k;
1215 char buf[100];
1216 ulong origvalue = value;
1218 fputs(msg, stderr);
1219 for (i = (int)strlen(msg); i < 35; ++i)
1220 fputc(' ', stderr);
1221 fputc('=', stderr);
1223 /* Write the value with commas. */
1224 i = 22;
1225 buf[i--] = '\0';
1226 buf[i--] = '\n';
1227 k = 3;
1228 do {
1229 ulong nextvalue = value / 10UL;
1230 uint digit = value - nextvalue * 10UL;
1231 value = nextvalue;
1232 buf[i--] = (char)(digit + '0');
1233 --k;
1234 if (k == 0 && value && i >= 0) {
1235 k = 3;
1236 buf[i--] = ',';
1238 } while (value && i >= 0);
1240 while (i >= 0)
1241 buf[i--] = ' ';
1242 fputs(buf, stderr);
1244 return origvalue;
1247 /* Print summary info to stderr about the state of pymalloc's structures.
1248 * In Py_DEBUG mode, also perform some expensive internal consistency
1249 * checks.
1251 void
1252 _PyObject_DebugMallocStats(void)
1254 uint i;
1255 const uint numclasses = SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT;
1256 /* # of pools, allocated blocks, and free blocks per class index */
1257 ulong numpools[SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT];
1258 ulong numblocks[SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT];
1259 ulong numfreeblocks[SMALL_REQUEST_THRESHOLD >> ALIGNMENT_SHIFT];
1260 /* total # of allocated bytes in used and full pools */
1261 ulong allocated_bytes = 0;
1262 /* total # of available bytes in used pools */
1263 ulong available_bytes = 0;
1264 /* # of free pools + pools not yet carved out of current arena */
1265 uint numfreepools = 0;
1266 /* # of bytes for arena alignment padding */
1267 ulong arena_alignment = 0;
1268 /* # of bytes in used and full pools used for pool_headers */
1269 ulong pool_header_bytes = 0;
1270 /* # of bytes in used and full pools wasted due to quantization,
1271 * i.e. the necessarily leftover space at the ends of used and
1272 * full pools.
1274 ulong quantization = 0;
1275 /* running total -- should equal narenas * ARENA_SIZE */
1276 ulong total;
1277 char buf[128];
1279 fprintf(stderr, "Small block threshold = %d, in %u size classes.\n",
1280 SMALL_REQUEST_THRESHOLD, numclasses);
1282 for (i = 0; i < numclasses; ++i)
1283 numpools[i] = numblocks[i] = numfreeblocks[i] = 0;
1285 /* Because full pools aren't linked to from anything, it's easiest
1286 * to march over all the arenas. If we're lucky, most of the memory
1287 * will be living in full pools -- would be a shame to miss them.
1289 for (i = 0; i < narenas; ++i) {
1290 uint poolsinarena;
1291 uint j;
1292 uptr base = arenas[i];
1294 /* round up to pool alignment */
1295 poolsinarena = ARENA_SIZE / POOL_SIZE;
1296 if (base & (uptr)POOL_SIZE_MASK) {
1297 --poolsinarena;
1298 arena_alignment += POOL_SIZE;
1299 base &= ~(uptr)POOL_SIZE_MASK;
1300 base += POOL_SIZE;
1303 if (i == narenas - 1) {
1304 /* current arena may have raw memory at the end */
1305 numfreepools += nfreepools;
1306 poolsinarena -= nfreepools;
1309 /* visit every pool in the arena */
1310 for (j = 0; j < poolsinarena; ++j, base += POOL_SIZE) {
1311 poolp p = (poolp)base;
1312 const uint sz = p->szidx;
1313 uint freeblocks;
1315 if (p->ref.count == 0) {
1316 /* currently unused */
1317 ++numfreepools;
1318 assert(pool_is_in_list(p, freepools));
1319 continue;
1321 ++numpools[sz];
1322 numblocks[sz] += p->ref.count;
1323 freeblocks = NUMBLOCKS(sz) - p->ref.count;
1324 numfreeblocks[sz] += freeblocks;
1325 #ifdef Py_DEBUG
1326 if (freeblocks > 0)
1327 assert(pool_is_in_list(p, usedpools[sz + sz]));
1328 #endif
1332 fputc('\n', stderr);
1333 fputs("class size num pools blocks in use avail blocks\n"
1334 "----- ---- --------- ------------- ------------\n",
1335 stderr);
1337 for (i = 0; i < numclasses; ++i) {
1338 ulong p = numpools[i];
1339 ulong b = numblocks[i];
1340 ulong f = numfreeblocks[i];
1341 uint size = INDEX2SIZE(i);
1342 if (p == 0) {
1343 assert(b == 0 && f == 0);
1344 continue;
1346 fprintf(stderr, "%5u %6u %11lu %15lu %13lu\n",
1347 i, size, p, b, f);
1348 allocated_bytes += b * size;
1349 available_bytes += f * size;
1350 pool_header_bytes += p * POOL_OVERHEAD;
1351 quantization += p * ((POOL_SIZE - POOL_OVERHEAD) % size);
1353 fputc('\n', stderr);
1354 (void)printone("# times object malloc called", serialno);
1356 PyOS_snprintf(buf, sizeof(buf),
1357 "%u arenas * %d bytes/arena", narenas, ARENA_SIZE);
1358 (void)printone(buf, (ulong)narenas * ARENA_SIZE);
1360 fputc('\n', stderr);
1362 total = printone("# bytes in allocated blocks", allocated_bytes);
1363 total += printone("# bytes in available blocks", available_bytes);
1365 PyOS_snprintf(buf, sizeof(buf),
1366 "%u unused pools * %d bytes", numfreepools, POOL_SIZE);
1367 total += printone(buf, (ulong)numfreepools * POOL_SIZE);
1369 total += printone("# bytes lost to pool headers", pool_header_bytes);
1370 total += printone("# bytes lost to quantization", quantization);
1371 total += printone("# bytes lost to arena alignment", arena_alignment);
1372 (void)printone("Total", total);
1375 #endif /* PYMALLOC_DEBUG */