src/vdbesort.c

   1 /*
   2 ** 2011 July 9
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 *************************************************************************
  12 ** This file contains code for the VdbeSorter object, used in concert with
  13 ** a VdbeCursor to sort large numbers of keys (as may be required, for
  14 ** example, by CREATE INDEX statements on tables too large to fit in main
  15 ** memory).
  16 */
  17
  18 #include "sqliteInt.h"
  19 #include "vdbeInt.h"
  20
  21 #ifndef SQLITE_OMIT_MERGE_SORT
  22
  23 typedef struct VdbeSorterIter VdbeSorterIter;
  24 typedef struct SorterRecord SorterRecord;
  25
  26 /*
  27 ** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
  28 **
  29 ** As keys are added to the sorter, they are written to disk in a series
  30 ** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
  31 ** the same as the cache-size allowed for temporary databases. In order
  32 ** to allow the caller to extract keys from the sorter in sorted order,
  33 ** all PMAs currently stored on disk must be merged together. This comment
  34 ** describes the data structure used to do so. The structure supports
  35 ** merging any number of arrays in a single pass with no redundant comparison
  36 ** operations.
  37 **
  38 ** The aIter[] array contains an iterator for each of the PMAs being merged.
  39 ** An aIter[] iterator either points to a valid key or else is at EOF. For
  40 ** the purposes of the paragraphs below, we assume that the array is actually
  41 ** N elements in size, where N is the smallest power of 2 greater to or equal
  42 ** to the number of iterators being merged. The extra aIter[] elements are
  43 ** treated as if they are empty (always at EOF).
  44 **
  45 ** The aTree[] array is also N elements in size. The value of N is stored in
  46 ** the VdbeSorter.nTree variable.
  47 **
  48 ** The final (N/2) elements of aTree[] contain the results of comparing
  49 ** pairs of iterator keys together. Element i contains the result of
  50 ** comparing aIter[2*i-N] and aIter[2*i-N+1]. Whichever key is smaller, the
  51 ** aTree element is set to the index of it.
  52 **
  53 ** For the purposes of this comparison, EOF is considered greater than any
  54 ** other key value. If the keys are equal (only possible with two EOF
  55 ** values), it doesn't matter which index is stored.
  56 **
  57 ** The (N/4) elements of aTree[] that preceed the final (N/2) described
  58 ** above contains the index of the smallest of each block of 4 iterators.
  59 ** And so on. So that aTree[1] contains the index of the iterator that
  60 ** currently points to the smallest key value. aTree[0] is unused.
  61 **
  62 ** Example:
  63 **
  64 **     aIter[0] -> Banana
  65 **     aIter[1] -> Feijoa
  66 **     aIter[2] -> Elderberry
  67 **     aIter[3] -> Currant
  68 **     aIter[4] -> Grapefruit
  69 **     aIter[5] -> Apple
  70 **     aIter[6] -> Durian
  71 **     aIter[7] -> EOF
  72 **
  73 **     aTree[] = { X, 5   0, 5    0, 3, 5, 6 }
  74 **
  75 ** The current element is "Apple" (the value of the key indicated by
  76 ** iterator 5). When the Next() operation is invoked, iterator 5 will
  77 ** be advanced to the next key in its segment. Say the next key is
  78 ** "Eggplant":
  79 **
  80 **     aIter[5] -> Eggplant
  81 **
  82 ** The contents of aTree[] are updated first by comparing the new iterator
  83 ** 5 key to the current key of iterator 4 (still "Grapefruit"). The iterator
  84 ** 5 value is still smaller, so aTree[6] is set to 5. And so on up the tree.
  85 ** The value of iterator 6 - "Durian" - is now smaller than that of iterator
  86 ** 5, so aTree[3] is set to 6. Key 0 is smaller than key 6 (Banana<Durian),
  87 ** so the value written into element 1 of the array is 0. As follows:
  88 **
  89 **     aTree[] = { X, 0   0, 6    0, 3, 5, 6 }
  90 **
  91 ** In other words, each time we advance to the next sorter element, log2(N)
  92 ** key comparison operations are required, where N is the number of segments
  93 ** being merged (rounded up to the next power of 2).
  94 */
  95 struct VdbeSorter {
  96   i64 iWriteOff;                  /* Current write offset within file pTemp1 */
  97   i64 iReadOff;                   /* Current read offset within file pTemp1 */
  98   int nInMemory;                  /* Current size of pRecord list as PMA */
  99   int nTree;                      /* Used size of aTree/aIter (power of 2) */
 100   int nPMA;                       /* Number of PMAs stored in pTemp1 */
 101   int mnPmaSize;                  /* Minimum PMA size, in bytes */
 102   int mxPmaSize;                  /* Maximum PMA size, in bytes.  0==no limit */
 103   VdbeSorterIter *aIter;          /* Array of iterators to merge */
 104   int *aTree;                     /* Current state of incremental merge */
 105   sqlite3_file *pTemp1;           /* PMA file 1 */
 106   SorterRecord *pRecord;          /* Head of in-memory record list */
 107   UnpackedRecord *pUnpacked;      /* Used to unpack keys */
 108 };
 109
 110 /*
 111 ** The following type is an iterator for a PMA. It caches the current key in
 112 ** variables nKey/aKey. If the iterator is at EOF, pFile==0.
 113 */
 114 struct VdbeSorterIter {
 115   i64 iReadOff;                   /* Current read offset */
 116   i64 iEof;                       /* 1 byte past EOF for this iterator */
 117   int nAlloc;                     /* Bytes of space at aAlloc */
 118   int nKey;                       /* Number of bytes in key */
 119   sqlite3_file *pFile;            /* File iterator is reading from */
 120   u8 *aAlloc;                     /* Allocated space */
 121   u8 *aKey;                       /* Pointer to current key */
 122 };
 123
 124 /*
 125 ** A structure to store a single record. All in-memory records are connected
 126 ** together into a linked list headed at VdbeSorter.pRecord using the
 127 ** SorterRecord.pNext pointer.
 128 */
 129 struct SorterRecord {
 130   void *pVal;
 131   int nVal;
 132   SorterRecord *pNext;
 133 };
 134
 135 /* Minimum allowable value for the VdbeSorter.nWorking variable */
 136 #define SORTER_MIN_WORKING 10
 137
 138 /* Maximum number of segments to merge in a single pass. */
 139 #define SORTER_MAX_MERGE_COUNT 16
 140
 141 /*
 142 ** Free all memory belonging to the VdbeSorterIter object passed as the second
 143 ** argument. All structure fields are set to zero before returning.
 144 */
 145 static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
 146   sqlite3DbFree(db, pIter->aAlloc);
 147   memset(pIter, 0, sizeof(VdbeSorterIter));
 148 }
 149
 150 /*
 151 ** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
 152 ** no error occurs, or an SQLite error code if one does.
 153 */
 154 static int vdbeSorterIterNext(
 155   sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
 156   VdbeSorterIter *pIter           /* Iterator to advance */
 157 ){
 158   int rc;                         /* Return Code */
 159   int nRead;                      /* Number of bytes read */
 160   int nRec = 0;                   /* Size of record in bytes */
 161   int iOff = 0;                   /* Size of serialized size varint in bytes */
 162
 163   assert( pIter->iEof>=pIter->iReadOff );
 164   if( pIter->iEof-pIter->iReadOff>5 ){
 165     nRead = 5;
 166   }else{
 167     nRead = (int)(pIter->iEof - pIter->iReadOff);
 168   }
 169   if( nRead<=0 ){
 170     /* This is an EOF condition */
 171     vdbeSorterIterZero(db, pIter);
 172     return SQLITE_OK;
 173   }
 174
 175   rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
 176   if( rc==SQLITE_OK ){
 177     iOff = getVarint32(pIter->aAlloc, nRec);
 178     if( (iOff+nRec)>nRead ){
 179       int nRead2;                   /* Number of extra bytes to read */
 180       if( (iOff+nRec)>pIter->nAlloc ){
 181         int nNew = pIter->nAlloc*2;
 182         while( (iOff+nRec)>nNew ) nNew = nNew*2;
 183         pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
 184         if( !pIter->aAlloc ) return SQLITE_NOMEM;
 185         pIter->nAlloc = nNew;
 186       }
 187
 188       nRead2 = iOff + nRec - nRead;
 189       rc = sqlite3OsRead(
 190           pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
 191       );
 192     }
 193   }
 194
 195   assert( rc!=SQLITE_OK || nRec>0 );
 196   pIter->iReadOff += iOff+nRec;
 197   pIter->nKey = nRec;
 198   pIter->aKey = &pIter->aAlloc[iOff];
 199   return rc;
 200 }
 201
 202 /*
 203 ** Write a single varint, value iVal, to file-descriptor pFile. Return
 204 ** SQLITE_OK if successful, or an SQLite error code if some error occurs.
 205 **
 206 ** The value of *piOffset when this function is called is used as the byte
 207 ** offset in file pFile to write to. Before returning, *piOffset is
 208 ** incremented by the number of bytes written.
 209 */
 210 static int vdbeSorterWriteVarint(
 211   sqlite3_file *pFile,            /* File to write to */
 212   i64 iVal,                       /* Value to write as a varint */
 213   i64 *piOffset                   /* IN/OUT: Write offset in file pFile */
 214 ){
 215   u8 aVarint[9];                  /* Buffer large enough for a varint */
 216   int nVarint;                    /* Number of used bytes in varint */
 217   int rc;                         /* Result of write() call */
 218
 219   nVarint = sqlite3PutVarint(aVarint, iVal);
 220   rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
 221   *piOffset += nVarint;
 222
 223   return rc;
 224 }
 225
 226 /*
 227 ** Read a single varint from file-descriptor pFile. Return SQLITE_OK if
 228 ** successful, or an SQLite error code if some error occurs.
 229 **
 230 ** The value of *piOffset when this function is called is used as the
 231 ** byte offset in file pFile from whence to read the varint. If successful
 232 ** (i.e. if no IO error occurs), then *piOffset is set to the offset of
 233 ** the first byte past the end of the varint before returning. *piVal is
 234 ** set to the integer value read. If an error occurs, the final values of
 235 ** both *piOffset and *piVal are undefined.
 236 */
 237 static int vdbeSorterReadVarint(
 238   sqlite3_file *pFile,            /* File to read from */
 239   i64 *piOffset,                  /* IN/OUT: Read offset in pFile */
 240   i64 *piVal                      /* OUT: Value read from file */
 241 ){
 242   u8 aVarint[9];                  /* Buffer large enough for a varint */
 243   i64 iOff = *piOffset;           /* Offset in file to read from */
 244   int rc;                         /* Return code */
 245
 246   rc = sqlite3OsRead(pFile, aVarint, 9, iOff);
 247   if( rc==SQLITE_OK ){
 248     *piOffset += getVarint(aVarint, (u64 *)piVal);
 249   }
 250
 251   return rc;
 252 }
 253
 254 /*
 255 ** Initialize iterator pIter to scan through the PMA stored in file pFile
 256 ** starting at offset iStart and ending at offset iEof-1. This function
 257 ** leaves the iterator pointing to the first key in the PMA (or EOF if the
 258 ** PMA is empty).
 259 */
 260 static int vdbeSorterIterInit(
 261   sqlite3 *db,                    /* Database handle */
 262   VdbeSorter *pSorter,            /* Sorter object */
 263   i64 iStart,                     /* Start offset in pFile */
 264   VdbeSorterIter *pIter,          /* Iterator to populate */
 265   i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
 266 ){
 267   int rc;
 268
 269   assert( pSorter->iWriteOff>iStart );
 270   assert( pIter->aAlloc==0 );
 271   pIter->pFile = pSorter->pTemp1;
 272   pIter->iReadOff = iStart;
 273   pIter->nAlloc = 128;
 274   pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
 275   if( !pIter->aAlloc ){
 276     rc = SQLITE_NOMEM;
 277   }else{
 278     i64 nByte;                         /* Total size of PMA in bytes */
 279     rc = vdbeSorterReadVarint(pSorter->pTemp1, &pIter->iReadOff, &nByte);
 280     *pnByte += nByte;
 281     pIter->iEof = pIter->iReadOff + nByte;
 282   }
 283   if( rc==SQLITE_OK ){
 284     rc = vdbeSorterIterNext(db, pIter);
 285   }
 286   return rc;
 287 }
 288
 289
 290 /*
 291 ** Compare key1 (buffer pKey1, size nKey1 bytes) with key2 (buffer pKey2,
 292 ** size nKey2 bytes).  Argument pKeyInfo supplies the collation functions
 293 ** used by the comparison. If an error occurs, return an SQLite error code.
 294 ** Otherwise, return SQLITE_OK and set *pRes to a negative, zero or positive
 295 ** value, depending on whether key1 is smaller, equal to or larger than key2.
 296 **
 297 ** If the bOmitRowid argument is non-zero, assume both keys end in a rowid
 298 ** field. For the purposes of the comparison, ignore it. Also, if bOmitRowid
 299 ** is true and key1 contains even a single NULL value, it is considered to
 300 ** be less than key2. Even if key2 also contains NULL values.
 301 **
 302 ** If pKey2 is passed a NULL pointer, then it is assumed that the pCsr->aSpace
 303 ** has been allocated and contains an unpacked record that is used as key2.
 304 */
 305 static void vdbeSorterCompare(
 306   VdbeCursor *pCsr,               /* Cursor object (for pKeyInfo) */
 307   int bOmitRowid,                 /* Ignore rowid field at end of keys */
 308   void *pKey1, int nKey1,         /* Left side of comparison */
 309   void *pKey2, int nKey2,         /* Right side of comparison */
 310   int *pRes                       /* OUT: Result of comparison */
 311 ){
 312   KeyInfo *pKeyInfo = pCsr->pKeyInfo;
 313   VdbeSorter *pSorter = pCsr->pSorter;
 314   UnpackedRecord *r2 = pSorter->pUnpacked;
 315   int i;
 316
 317   if( pKey2 ){
 318     sqlite3VdbeRecordUnpack(pKeyInfo, nKey2, pKey2, r2);
 319   }
 320
 321   if( bOmitRowid ){
 322     r2->nField = pKeyInfo->nField;
 323     assert( r2->nField>0 );
 324     for(i=0; i<r2->nField; i++){
 325       if( r2->aMem[i].flags & MEM_Null ){
 326         *pRes = -1;
 327         return;
 328       }
 329     }
 330     r2->flags |= UNPACKED_PREFIX_MATCH;
 331   }
 332
 333   *pRes = sqlite3VdbeRecordCompare(nKey1, pKey1, r2);
 334 }
 335
 336 /*
 337 ** This function is called to compare two iterator keys when merging
 338 ** multiple b-tree segments. Parameter iOut is the index of the aTree[]
 339 ** value to recalculate.
 340 */
 341 static int vdbeSorterDoCompare(VdbeCursor *pCsr, int iOut){
 342   VdbeSorter *pSorter = pCsr->pSorter;
 343   int i1;
 344   int i2;
 345   int iRes;
 346   VdbeSorterIter *p1;
 347   VdbeSorterIter *p2;
 348
 349   assert( iOut<pSorter->nTree && iOut>0 );
 350
 351   if( iOut>=(pSorter->nTree/2) ){
 352     i1 = (iOut - pSorter->nTree/2) * 2;
 353     i2 = i1 + 1;
 354   }else{
 355     i1 = pSorter->aTree[iOut*2];
 356     i2 = pSorter->aTree[iOut*2+1];
 357   }
 358
 359   p1 = &pSorter->aIter[i1];
 360   p2 = &pSorter->aIter[i2];
 361
 362   if( p1->pFile==0 ){
 363     iRes = i2;
 364   }else if( p2->pFile==0 ){
 365     iRes = i1;
 366   }else{
 367     int res;
 368     assert( pCsr->pSorter->pUnpacked!=0 );  /* allocated in vdbeSorterMerge() */
 369     vdbeSorterCompare(
 370         pCsr, 0, p1->aKey, p1->nKey, p2->aKey, p2->nKey, &res
 371     );
 372     if( res<=0 ){
 373       iRes = i1;
 374     }else{
 375       iRes = i2;
 376     }
 377   }
 378
 379   pSorter->aTree[iOut] = iRes;
 380   return SQLITE_OK;
 381 }
 382
 383 /*
 384 ** Initialize the temporary index cursor just opened as a sorter cursor.
 385 */
 386 int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
 387   int pgsz;                       /* Page size of main database */
 388   int mxCache;                    /* Cache size */
 389   VdbeSorter *pSorter;            /* The new sorter */
 390   char *d;                        /* Dummy */
 391
 392   assert( pCsr->pKeyInfo && pCsr->pBt==0 );
 393   pCsr->pSorter = pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
 394   if( pSorter==0 ){
 395     return SQLITE_NOMEM;
 396   }
 397
 398   pSorter->pUnpacked = sqlite3VdbeAllocUnpackedRecord(pCsr->pKeyInfo, 0, 0, &d);
 399   if( pSorter->pUnpacked==0 ) return SQLITE_NOMEM;
 400   assert( pSorter->pUnpacked==(UnpackedRecord *)d );
 401
 402   if( !sqlite3TempInMemory(db) ){
 403     pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
 404     pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
 405     mxCache = db->aDb[0].pSchema->cache_size;
 406     if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
 407     pSorter->mxPmaSize = mxCache * pgsz;
 408   }
 409
 410   return SQLITE_OK;
 411 }
 412
 413 /*
 414 ** Free the list of sorted records starting at pRecord.
 415 */
 416 static void vdbeSorterRecordFree(sqlite3 *db, SorterRecord *pRecord){
 417   SorterRecord *p;
 418   SorterRecord *pNext;
 419   for(p=pRecord; p; p=pNext){
 420     pNext = p->pNext;
 421     sqlite3DbFree(db, p);
 422   }
 423 }
 424
 425 /*
 426 ** Free any cursor components allocated by sqlite3VdbeSorterXXX routines.
 427 */
 428 void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){
 429   VdbeSorter *pSorter = pCsr->pSorter;
 430   if( pSorter ){
 431     if( pSorter->aIter ){
 432       int i;
 433       for(i=0; i<pSorter->nTree; i++){
 434         vdbeSorterIterZero(db, &pSorter->aIter[i]);
 435       }
 436       sqlite3DbFree(db, pSorter->aIter);
 437     }
 438     if( pSorter->pTemp1 ){
 439       sqlite3OsCloseFree(pSorter->pTemp1);
 440     }
 441     vdbeSorterRecordFree(db, pSorter->pRecord);
 442     sqlite3DbFree(db, pSorter->pUnpacked);
 443     sqlite3DbFree(db, pSorter);
 444     pCsr->pSorter = 0;
 445   }
 446 }
 447
 448 /*
 449 ** Allocate space for a file-handle and open a temporary file. If successful,
 450 ** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
 451 ** Otherwise, set *ppFile to 0 and return an SQLite error code.
 452 */
 453 static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
 454   int dummy;
 455   return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
 456       SQLITE_OPEN_TEMP_JOURNAL |
 457       SQLITE_OPEN_READWRITE    | SQLITE_OPEN_CREATE |
 458       SQLITE_OPEN_EXCLUSIVE    | SQLITE_OPEN_DELETEONCLOSE, &dummy
 459   );
 460 }
 461
 462 /*
 463 ** Merge the two sorted lists p1 and p2 into a single list.
 464 ** Set *ppOut to the head of the new list.
 465 */
 466 static void vdbeSorterMerge(
 467   VdbeCursor *pCsr,               /* For pKeyInfo */
 468   SorterRecord *p1,               /* First list to merge */
 469   SorterRecord *p2,               /* Second list to merge */
 470   SorterRecord **ppOut            /* OUT: Head of merged list */
 471 ){
 472   SorterRecord *pFinal = 0;
 473   SorterRecord **pp = &pFinal;
 474   void *pVal2 = p2 ? p2->pVal : 0;
 475
 476   while( p1 && p2 ){
 477     int res;
 478     vdbeSorterCompare(pCsr, 0, p1->pVal, p1->nVal, pVal2, p2->nVal, &res);
 479     if( res<=0 ){
 480       *pp = p1;
 481       pp = &p1->pNext;
 482       p1 = p1->pNext;
 483       pVal2 = 0;
 484     }else{
 485       *pp = p2;
 486        pp = &p2->pNext;
 487       p2 = p2->pNext;
 488       if( p2==0 ) break;
 489       pVal2 = p2->pVal;
 490     }
 491   }
 492   *pp = p1 ? p1 : p2;
 493   *ppOut = pFinal;
 494 }
 495
 496 /*
 497 ** Sort the linked list of records headed at pCsr->pRecord. Return SQLITE_OK
 498 ** if successful, or an SQLite error code (i.e. SQLITE_NOMEM) if an error
 499 ** occurs.
 500 */
 501 static int vdbeSorterSort(VdbeCursor *pCsr){
 502   int i;
 503   SorterRecord **aSlot;
 504   SorterRecord *p;
 505   VdbeSorter *pSorter = pCsr->pSorter;
 506
 507   aSlot = (SorterRecord **)sqlite3MallocZero(64 * sizeof(SorterRecord *));
 508   if( !aSlot ){
 509     return SQLITE_NOMEM;
 510   }
 511
 512   p = pSorter->pRecord;
 513   while( p ){
 514     SorterRecord *pNext = p->pNext;
 515     p->pNext = 0;
 516     for(i=0; aSlot[i]; i++){
 517       vdbeSorterMerge(pCsr, p, aSlot[i], &p);
 518       aSlot[i] = 0;
 519     }
 520     aSlot[i] = p;
 521     p = pNext;
 522   }
 523
 524   p = 0;
 525   for(i=0; i<64; i++){
 526     vdbeSorterMerge(pCsr, p, aSlot[i], &p);
 527   }
 528   pSorter->pRecord = p;
 529
 530   sqlite3_free(aSlot);
 531   return SQLITE_OK;
 532 }
 533
 534
 535 /*
 536 ** Write the current contents of the in-memory linked-list to a PMA. Return
 537 ** SQLITE_OK if successful, or an SQLite error code otherwise.
 538 **
 539 ** The format of a PMA is:
 540 **
 541 **     * A varint. This varint contains the total number of bytes of content
 542 **       in the PMA (not including the varint itself).
 543 **
 544 **     * One or more records packed end-to-end in order of ascending keys.
 545 **       Each record consists of a varint followed by a blob of data (the
 546 **       key). The varint is the number of bytes in the blob of data.
 547 */
 548 static int vdbeSorterListToPMA(sqlite3 *db, VdbeCursor *pCsr){
 549   int rc = SQLITE_OK;             /* Return code */
 550   VdbeSorter *pSorter = pCsr->pSorter;
 551
 552   if( pSorter->nInMemory==0 ){
 553     assert( pSorter->pRecord==0 );
 554     return rc;
 555   }
 556
 557   rc = vdbeSorterSort(pCsr);
 558
 559   /* If the first temporary PMA file has not been opened, open it now. */
 560   if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
 561     rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
 562     assert( rc!=SQLITE_OK || pSorter->pTemp1 );
 563     assert( pSorter->iWriteOff==0 );
 564     assert( pSorter->nPMA==0 );
 565   }
 566
 567   if( rc==SQLITE_OK ){
 568     i64 iOff = pSorter->iWriteOff;
 569     SorterRecord *p;
 570     SorterRecord *pNext = 0;
 571     static const char eightZeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 572
 573     pSorter->nPMA++;
 574     rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nInMemory, &iOff);
 575     for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
 576       pNext = p->pNext;
 577       rc = vdbeSorterWriteVarint(pSorter->pTemp1, p->nVal, &iOff);
 578
 579       if( rc==SQLITE_OK ){
 580         rc = sqlite3OsWrite(pSorter->pTemp1, p->pVal, p->nVal, iOff);
 581         iOff += p->nVal;
 582       }
 583
 584       sqlite3DbFree(db, p);
 585     }
 586
 587     /* This assert verifies that unless an error has occurred, the size of
 588     ** the PMA on disk is the same as the expected size stored in
 589     ** pSorter->nInMemory. */
 590     assert( rc!=SQLITE_OK || pSorter->nInMemory==(
 591           iOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nInMemory)
 592     ));
 593
 594     pSorter->iWriteOff = iOff;
 595     if( rc==SQLITE_OK ){
 596       /* Terminate each file with 8 extra bytes so that from any offset
 597       ** in the file we can always read 9 bytes without a SHORT_READ error */
 598       rc = sqlite3OsWrite(pSorter->pTemp1, eightZeros, 8, iOff);
 599     }
 600     pSorter->pRecord = p;
 601   }
 602
 603   return rc;
 604 }
 605
 606 /*
 607 ** Add a record to the sorter.
 608 */
 609 int sqlite3VdbeSorterWrite(
 610   sqlite3 *db,                    /* Database handle */
 611   VdbeCursor *pCsr,               /* Sorter cursor */
 612   Mem *pVal                       /* Memory cell containing record */
 613 ){
 614   VdbeSorter *pSorter = pCsr->pSorter;
 615   int rc = SQLITE_OK;             /* Return Code */
 616   SorterRecord *pNew;             /* New list element */
 617
 618   assert( pSorter );
 619   pSorter->nInMemory += sqlite3VarintLen(pVal->n) + pVal->n;
 620
 621   pNew = (SorterRecord *)sqlite3DbMallocRaw(db, pVal->n + sizeof(SorterRecord));
 622   if( pNew==0 ){
 623     rc = SQLITE_NOMEM;
 624   }else{
 625     pNew->pVal = (void *)&pNew[1];
 626     memcpy(pNew->pVal, pVal->z, pVal->n);
 627     pNew->nVal = pVal->n;
 628     pNew->pNext = pSorter->pRecord;
 629     pSorter->pRecord = pNew;
 630   }
 631
 632   /* See if the contents of the sorter should now be written out. They
 633   ** are written out when either of the following are true:
 634   **
 635   **   * The total memory allocated for the in-memory list is greater
 636   **     than (page-size * cache-size), or
 637   **
 638   **   * The total memory allocated for the in-memory list is greater
 639   **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
 640   */
 641   if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
 642         (pSorter->nInMemory>pSorter->mxPmaSize)
 643      || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
 644   )){
 645     rc = vdbeSorterListToPMA(db, pCsr);
 646     pSorter->nInMemory = 0;
 647   }
 648
 649   return rc;
 650 }
 651
 652 /*
 653 ** Helper function for sqlite3VdbeSorterRewind().
 654 */
 655 static int vdbeSorterInitMerge(
 656   sqlite3 *db,                    /* Database handle */
 657   VdbeCursor *pCsr,               /* Cursor handle for this sorter */
 658   i64 *pnByte                     /* Sum of bytes in all opened PMAs */
 659 ){
 660   VdbeSorter *pSorter = pCsr->pSorter;
 661   int rc = SQLITE_OK;             /* Return code */
 662   int i;                          /* Used to iterator through aIter[] */
 663   i64 nByte = 0;                  /* Total bytes in all opened PMAs */
 664
 665   /* Initialize the iterators. */
 666   for(i=0; i<SORTER_MAX_MERGE_COUNT; i++){
 667     VdbeSorterIter *pIter = &pSorter->aIter[i];
 668     rc = vdbeSorterIterInit(db, pSorter, pSorter->iReadOff, pIter, &nByte);
 669     pSorter->iReadOff = pIter->iEof;
 670     assert( rc!=SQLITE_OK || pSorter->iReadOff<=pSorter->iWriteOff );
 671     if( rc!=SQLITE_OK || pSorter->iReadOff>=pSorter->iWriteOff ) break;
 672   }
 673
 674   /* Initialize the aTree[] array. */
 675   for(i=pSorter->nTree-1; rc==SQLITE_OK && i>0; i--){
 676     rc = vdbeSorterDoCompare(pCsr, i);
 677   }
 678
 679   *pnByte = nByte;
 680   return rc;
 681 }
 682
 683 /*
 684 ** Once the sorter has been populated, this function is called to prepare
 685 ** for iterating through its contents in sorted order.
 686 */
 687 int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
 688   VdbeSorter *pSorter = pCsr->pSorter;
 689   int rc;                         /* Return code */
 690   sqlite3_file *pTemp2 = 0;       /* Second temp file to use */
 691   i64 iWrite2 = 0;                /* Write offset for pTemp2 */
 692   int nIter;                      /* Number of iterators used */
 693   int nByte;                      /* Bytes of space required for aIter/aTree */
 694   int N = 2;                      /* Power of 2 >= nIter */
 695
 696   assert( pSorter );
 697
 698   /* If no data has been written to disk, then do not do so now. Instead,
 699   ** sort the VdbeSorter.pRecord list. The vdbe layer will read data directly
 700   ** from the in-memory list.  */
 701   if( pSorter->nPMA==0 ){
 702     *pbEof = !pSorter->pRecord;
 703     assert( pSorter->aTree==0 );
 704     return vdbeSorterSort(pCsr);
 705   }
 706
 707   /* Write the current b-tree to a PMA. Close the b-tree cursor. */
 708   rc = vdbeSorterListToPMA(db, pCsr);
 709   if( rc!=SQLITE_OK ) return rc;
 710
 711   /* Allocate space for aIter[] and aTree[]. */
 712   nIter = pSorter->nPMA;
 713   if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
 714   assert( nIter>0 );
 715   while( N<nIter ) N += N;
 716   nByte = N * (sizeof(int) + sizeof(VdbeSorterIter));
 717   pSorter->aIter = (VdbeSorterIter *)sqlite3DbMallocZero(db, nByte);
 718   if( !pSorter->aIter ) return SQLITE_NOMEM;
 719   pSorter->aTree = (int *)&pSorter->aIter[N];
 720   pSorter->nTree = N;
 721
 722   do {
 723     int iNew;                     /* Index of new, merged, PMA */
 724
 725     for(iNew=0;
 726         rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA;
 727         iNew++
 728     ){
 729       i64 nWrite;                 /* Number of bytes in new PMA */
 730
 731       /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
 732       ** initialize an iterator for each of them and break out of the loop.
 733       ** These iterators will be incrementally merged as the VDBE layer calls
 734       ** sqlite3VdbeSorterNext().
 735       **
 736       ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
 737       ** initialize interators for SORTER_MAX_MERGE_COUNT of them. These PMAs
 738       ** are merged into a single PMA that is written to file pTemp2.
 739       */
 740       rc = vdbeSorterInitMerge(db, pCsr, &nWrite);
 741       assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );
 742       if( rc!=SQLITE_OK || pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
 743         break;
 744       }
 745
 746       /* Open the second temp file, if it is not already open. */
 747       if( pTemp2==0 ){
 748         assert( iWrite2==0 );
 749         rc = vdbeSorterOpenTempFile(db, &pTemp2);
 750       }
 751
 752       if( rc==SQLITE_OK ){
 753         rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
 754       }
 755
 756       if( rc==SQLITE_OK ){
 757         int bEof = 0;
 758         while( rc==SQLITE_OK && bEof==0 ){
 759           int nToWrite;
 760           VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
 761           assert( pIter->pFile );
 762           nToWrite = pIter->nKey + sqlite3VarintLen(pIter->nKey);
 763           rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nToWrite, iWrite2);
 764           iWrite2 += nToWrite;
 765           if( rc==SQLITE_OK ){
 766             rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
 767           }
 768         }
 769       }
 770     }
 771
 772     if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
 773       break;
 774     }else{
 775       sqlite3_file *pTmp = pSorter->pTemp1;
 776       pSorter->nPMA = iNew;
 777       pSorter->pTemp1 = pTemp2;
 778       pTemp2 = pTmp;
 779       pSorter->iWriteOff = iWrite2;
 780       pSorter->iReadOff = 0;
 781       iWrite2 = 0;
 782     }
 783   }while( rc==SQLITE_OK );
 784
 785   if( pTemp2 ){
 786     sqlite3OsCloseFree(pTemp2);
 787   }
 788   *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
 789   return rc;
 790 }
 791
 792 /*
 793 ** Advance to the next element in the sorter.
 794 */
 795 int sqlite3VdbeSorterNext(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
 796   VdbeSorter *pSorter = pCsr->pSorter;
 797   int rc;                         /* Return code */
 798
 799   if( pSorter->aTree ){
 800     int iPrev = pSorter->aTree[1];/* Index of iterator to advance */
 801     int i;                        /* Index of aTree[] to recalculate */
 802
 803     rc = vdbeSorterIterNext(db, &pSorter->aIter[iPrev]);
 804     for(i=(pSorter->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){
 805       rc = vdbeSorterDoCompare(pCsr, i);
 806     }
 807
 808     *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
 809   }else{
 810     SorterRecord *pFree = pSorter->pRecord;
 811     pSorter->pRecord = pFree->pNext;
 812     pFree->pNext = 0;
 813     vdbeSorterRecordFree(db, pFree);
 814     *pbEof = !pSorter->pRecord;
 815     rc = SQLITE_OK;
 816   }
 817   return rc;
 818 }
 819
 820 /*
 821 ** Return a pointer to a buffer owned by the sorter that contains the
 822 ** current key.
 823 */
 824 static void *vdbeSorterRowkey(
 825   VdbeSorter *pSorter,            /* Sorter object */
 826   int *pnKey                      /* OUT: Size of current key in bytes */
 827 ){
 828   void *pKey;
 829   if( pSorter->aTree ){
 830     VdbeSorterIter *pIter;
 831     pIter = &pSorter->aIter[ pSorter->aTree[1] ];
 832     *pnKey = pIter->nKey;
 833     pKey = pIter->aKey;
 834   }else{
 835     *pnKey = pSorter->pRecord->nVal;
 836     pKey = pSorter->pRecord->pVal;
 837   }
 838   return pKey;
 839 }
 840
 841 /*
 842 ** Copy the current sorter key into the memory cell pOut.
 843 */
 844 int sqlite3VdbeSorterRowkey(VdbeCursor *pCsr, Mem *pOut){
 845   VdbeSorter *pSorter = pCsr->pSorter;
 846   void *pKey; int nKey;           /* Sorter key to copy into pOut */
 847
 848   pKey = vdbeSorterRowkey(pSorter, &nKey);
 849   if( sqlite3VdbeMemGrow(pOut, nKey, 0) ){
 850     return SQLITE_NOMEM;
 851   }
 852   pOut->n = nKey;
 853   MemSetTypeFlag(pOut, MEM_Blob);
 854   memcpy(pOut->z, pKey, nKey);
 855
 856   return SQLITE_OK;
 857 }
 858
 859 /*
 860 ** Compare the key in memory cell pVal with the key that the sorter cursor
 861 ** passed as the first argument currently points to. For the purposes of
 862 ** the comparison, ignore the rowid field at the end of each record.
 863 **
 864 ** If an error occurs, return an SQLite error code (i.e. SQLITE_NOMEM).
 865 ** Otherwise, set *pRes to a negative, zero or positive value if the
 866 ** key in pVal is smaller than, equal to or larger than the current sorter
 867 ** key.
 868 */
 869 int sqlite3VdbeSorterCompare(
 870   VdbeCursor *pCsr,               /* Sorter cursor */
 871   Mem *pVal,                      /* Value to compare to current sorter key */
 872   int *pRes                       /* OUT: Result of comparison */
 873 ){
 874   VdbeSorter *pSorter = pCsr->pSorter;
 875   void *pKey; int nKey;           /* Sorter key to compare pVal with */
 876
 877   pKey = vdbeSorterRowkey(pSorter, &nKey);
 878   vdbeSorterCompare(pCsr, 1, pVal->z, pVal->n, pKey, nKey, pRes);
 879   return SQLITE_OK;
 880 }
 881
 882 #endif /* #ifndef SQLITE_OMIT_MERGE_SORT */