Adds PRAGMA cipher_log_subsystem to restrict output of log messages
[sqlcipher.git] / src / btree.c
blob62b89897967cdba22595d9ae02a26d7bd00a8fa9
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
** Including a description of file format and an overview of operation.
*/
16 #include "btreeInt.h"
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
/*
** Set this global variable to 1 to enable tracing using the TRACE
** macro.
**
** Tracing is compiled out by default: change the "#if 0" below to
** "#if 1" to get the printf-based TRACE().  In the default build
** TRACE(X) expands to nothing, so its argument is never evaluated.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
/*
** Extract a 2-byte big-endian integer from an array of unsigned bytes.
** But if the value is zero, make it 65536.
**
** This routine is used to extract the "offset to cell content area" value
** from the header of a btree page.  If the page size is 65536 and the page
** is empty, the offset should be 65536, but the 2-byte value stores zero.
** This routine makes the necessary adjustment to 65536.
**
** The result is always in the range 1..65536: subtracting one, masking
** to 16 bits and adding one back maps 0 to 65536 and leaves every other
** value unchanged.
*/
#define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
/*
** Values passed as the 5th argument to allocateBtreePage()
*/
#define BTALLOC_ANY   0           /* Allocate any page */
#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
#define BTALLOC_LE    2           /* Allocate any page <= the parameter */
/*
** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
** defined, or 0 if it is. For example:
**
**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
*/
#ifndef SQLITE_OMIT_AUTOVACUUM
#define IfNotOmitAV(expr) (expr)
#else
#define IfNotOmitAV(expr) 0
#endif
65 #ifndef SQLITE_OMIT_SHARED_CACHE
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache. This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN.
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
81 #ifndef SQLITE_OMIT_SHARED_CACHE
83 ** Enable or disable the shared pager and schema features.
85 ** This routine has no effect on existing database connections.
86 ** The shared cache setting effects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
89 int sqlite3_enable_shared_cache(int enable){
90 sqlite3GlobalConfig.sharedCacheEnabled = enable;
91 return SQLITE_OK;
93 #endif
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** and clearAllSharedCacheTableLocks()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary.
  ** So define the lock related functions as no-ops.
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
#endif
#ifdef SQLITE_DEBUG
/*
** Return and reset the seek counter for a Btree object.
**
** The current value of pBt->nSeek is returned and the counter is set
** back to zero, so each call reports the count accumulated since the
** previous call.
*/
sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){
  u64 n =  pBt->nSeek;
  pBt->nSeek = 0;
  return n;
}
#endif
/*
** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single
** (MemPage*) as an argument. The (MemPage*) must not be NULL.
**
** If SQLITE_DEBUG is not defined, then this macro is equivalent to
** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message
** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented
** with the page number and filename associated with the (MemPage*).
*/
#ifdef SQLITE_DEBUG
int corruptPageError(int lineno, MemPage *p){
  char *zMsg;
  /* Failure to build the message is harmless: the extra report is
  ** simply skipped and the corruption code is still returned. */
  sqlite3BeginBenignMalloc();
  zMsg = sqlite3_mprintf("database corruption page %u of %s",
             p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
  );
  sqlite3EndBenignMalloc();
  if( zMsg ){
    sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg);
  }
  sqlite3_free(zMsg);
  return SQLITE_CORRUPT_BKPT;
}
# define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, (pMemPage))
#else
# define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO((pMemPage)->pgno)
#endif
/* Default value for SHARED_LOCK_TRACE macro if shared-cache is disabled
** or if the lock tracking is disabled.  This is always the value for
** release builds.
*/
#define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE)  /*no-op*/
160 #ifndef SQLITE_OMIT_SHARED_CACHE
#if 0
/* ^----  Change to 1 and recompile to enable shared-lock tracing
**        for debugging purposes.
**
** Print all shared-cache locks on a BtShared.  Debugging use only.
**
** Output goes to stdout.  The first field is zMsg and the BtShared
** address, followed (when iRoot>0) by the root page and requested lock
** type.  Each subsequent field is a Btree address and the tables it has
** locked; consecutive list entries for the same Btree are folded into
** one comma-separated group.
*/
static void sharedLockTrace(
  BtShared *pBt,
  const char *zMsg,
  int iRoot,
  int eLockType
){
  BtLock *pLock;
  /* %p requires a void* argument, hence the explicit casts below. */
  if( iRoot>0 ){
    printf("%s-%p %u%s:", zMsg, (void*)pBt, (unsigned)iRoot,
           eLockType==READ_LOCK?"R":"W");
  }else{
    printf("%s-%p:", zMsg, (void*)pBt);
  }
  for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
    printf(" %p/%u%s", (void*)pLock->pBtree, pLock->iTable,
           pLock->eLock==READ_LOCK ? "R" : "W");
    while( pLock->pNext && pLock->pBtree==pLock->pNext->pBtree ){
      pLock = pLock->pNext;
      printf(",%u%s", pLock->iTable, pLock->eLock==READ_LOCK ? "R" : "W");
    }
  }
  printf("\n");
  fflush(stdout);
}
#undef SHARED_LOCK_TRACE
#define SHARED_LOCK_TRACE(X,MSG,TAB,TYPE)  sharedLockTrace(X,MSG,TAB,TYPE)
#endif /* Shared-lock tracing */
#ifdef SQLITE_DEBUG
/*
**** This function is only used as part of an assert() statement. ***
**
** Check to see if pBtree holds the required locks to read or write to the
** table with root page iRoot.   Return 1 if it does and 0 if not.
**
** For example, when writing to a table with root-page iRoot via
** Btree connection pBtree:
**
**    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
**
** When writing to an index that resides in a sharable database, the
** caller should have first obtained a lock specifying the root page of
** the corresponding table. This makes things a bit more complicated,
** as this module treats each table as a separate structure. To determine
** the table corresponding to the index being written, this
** function has to search through the database schema.
**
** Instead of a lock on the table/index rooted at page iRoot, the caller may
** hold a write-lock on the schema table (root page 1). This is also
** acceptable.
*/
static int hasSharedCacheTableLock(
  Btree *pBtree,         /* Handle that must hold lock */
  Pgno iRoot,            /* Root page of b-tree */
  int isIndex,           /* True if iRoot is the root of an index b-tree */
  int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
){
  Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
  Pgno iTab = 0;
  BtLock *pLock;

  /* If this database is not shareable, or if the client is reading
  ** and has the read-uncommitted flag set, then no lock is required.
  ** Return true immediately.
  */
  if( (pBtree->sharable==0)
   || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
  ){
    return 1;
  }

  /* If the client is reading  or writing an index and the schema is
  ** not loaded, then it is too difficult to actually check to see if
  ** the correct locks are held.  So do not bother - just return true.
  ** This case does not come up very often anyhow.
  */
  if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
    return 1;
  }

  /* Figure out the root-page that the lock should be held on. For table
  ** b-trees, this is just the root page of the b-tree being read or
  ** written. For index b-trees, it is the root page of the associated
  ** table.  */
  if( isIndex ){
    HashElem *p;
    int bSeen = 0;
    for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
      Index *pIdx = (Index *)sqliteHashData(p);
      if( pIdx->tnum==iRoot ){
        if( bSeen ){
          /* Two or more indexes share the same root page.  There must
          ** be imposter tables.  So just return true.  The assert is not
          ** useful in that case. */
          return 1;
        }
        iTab = pIdx->pTable->tnum;
        bSeen = 1;
      }
    }
  }else{
    iTab = iRoot;
  }

  SHARED_LOCK_TRACE(pBtree->pBt,"hasLock",iRoot,eLockType);

  /* Search for the required lock. Either a write-lock on root-page iTab, a
  ** write-lock on the schema table, or (if the client is reading) a
  ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
  for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
    if( pLock->pBtree==pBtree
     && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
     && pLock->eLock>=eLockType
    ){
      return 1;
    }
  }

  /* Failed to find the required lock. */
  return 0;
}
#endif /* SQLITE_DEBUG */
#ifdef SQLITE_DEBUG
/*
**** This function may be used as part of assert() statements only. ****
**
** Return true if it would be illegal for pBtree to write into the
** table or index rooted at iRoot because other shared connections are
** simultaneously reading that same table or index.
**
** It is illegal for pBtree to write if some other Btree object that
** shares the same BtShared object is currently reading or writing
** the iRoot table.  Except, if the other Btree object has the
** read-uncommitted flag set, then it is OK for the other object to
** have a read cursor.
**
** For example, before writing to any part of the table or index
** rooted at page iRoot, one should call:
**
**    assert( !hasReadConflicts(pBtree, iRoot) );
*/
static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
  BtCursor *p;
  for(p=pBtree->pBt->pCursor; p; p=p->pNext){
    if( p->pgnoRoot==iRoot
     && p->pBtree!=pBtree
     && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
    ){
      return 1;
    }
  }
  return 0;
}
#endif    /* #ifdef SQLITE_DEBUG */
324 ** Query to see if Btree handle p may obtain a lock of type eLock
325 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
326 ** SQLITE_OK if the lock may be obtained (by calling
327 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
329 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
330 BtShared *pBt = p->pBt;
331 BtLock *pIter;
333 assert( sqlite3BtreeHoldsMutex(p) );
334 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
335 assert( p->db!=0 );
336 assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
338 /* If requesting a write-lock, then the Btree must have an open write
339 ** transaction on this file. And, obviously, for this to be so there
340 ** must be an open write transaction on the file itself.
342 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
343 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
345 /* This routine is a no-op if the shared-cache is not enabled */
346 if( !p->sharable ){
347 return SQLITE_OK;
350 /* If some other connection is holding an exclusive lock, the
351 ** requested lock may not be obtained.
353 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
354 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
355 return SQLITE_LOCKED_SHAREDCACHE;
358 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
359 /* The condition (pIter->eLock!=eLock) in the following if(...)
360 ** statement is a simplification of:
362 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
364 ** since we know that if eLock==WRITE_LOCK, then no other connection
365 ** may hold a WRITE_LOCK on any table in this file (since there can
366 ** only be a single writer).
368 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
369 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
370 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
371 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
372 if( eLock==WRITE_LOCK ){
373 assert( p==pBt->pWriter );
374 pBt->btsFlags |= BTS_PENDING;
376 return SQLITE_LOCKED_SHAREDCACHE;
379 return SQLITE_OK;
381 #endif /* !SQLITE_OMIT_SHARED_CACHE */
383 #ifndef SQLITE_OMIT_SHARED_CACHE
385 ** Add a lock on the table with root-page iTable to the shared-btree used
386 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
387 ** WRITE_LOCK.
389 ** This function assumes the following:
391 ** (a) The specified Btree object p is connected to a sharable
392 ** database (one with the BtShared.sharable flag set), and
394 ** (b) No other Btree objects hold a lock that conflicts
395 ** with the requested lock (i.e. querySharedCacheTableLock() has
396 ** already been called and returned SQLITE_OK).
398 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
399 ** is returned if a malloc attempt fails.
401 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
402 BtShared *pBt = p->pBt;
403 BtLock *pLock = 0;
404 BtLock *pIter;
406 SHARED_LOCK_TRACE(pBt,"setLock", iTable, eLock);
408 assert( sqlite3BtreeHoldsMutex(p) );
409 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
410 assert( p->db!=0 );
412 /* A connection with the read-uncommitted flag set will never try to
413 ** obtain a read-lock using this function. The only read-lock obtained
414 ** by a connection in read-uncommitted mode is on the sqlite_schema
415 ** table, and that lock is obtained in BtreeBeginTrans(). */
416 assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
418 /* This function should only be called on a sharable b-tree after it
419 ** has been determined that no other b-tree holds a conflicting lock. */
420 assert( p->sharable );
421 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
423 /* First search the list for an existing lock on this table. */
424 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
425 if( pIter->iTable==iTable && pIter->pBtree==p ){
426 pLock = pIter;
427 break;
431 /* If the above search did not find a BtLock struct associating Btree p
432 ** with table iTable, allocate one and link it into the list.
434 if( !pLock ){
435 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
436 if( !pLock ){
437 return SQLITE_NOMEM_BKPT;
439 pLock->iTable = iTable;
440 pLock->pBtree = p;
441 pLock->pNext = pBt->pLock;
442 pBt->pLock = pLock;
445 /* Set the BtLock.eLock variable to the maximum of the current lock
446 ** and the requested lock. This means if a write-lock was already held
447 ** and a read-lock requested, we don't incorrectly downgrade the lock.
449 assert( WRITE_LOCK>READ_LOCK );
450 if( eLock>pLock->eLock ){
451 pLock->eLock = eLock;
454 return SQLITE_OK;
456 #endif /* !SQLITE_OMIT_SHARED_CACHE */
458 #ifndef SQLITE_OMIT_SHARED_CACHE
460 ** Release all the table locks (locks obtained via calls to
461 ** the setSharedCacheTableLock() procedure) held by Btree object p.
463 ** This function assumes that Btree p has an open read or write
464 ** transaction. If it does not, then the BTS_PENDING flag
465 ** may be incorrectly cleared.
467 static void clearAllSharedCacheTableLocks(Btree *p){
468 BtShared *pBt = p->pBt;
469 BtLock **ppIter = &pBt->pLock;
471 assert( sqlite3BtreeHoldsMutex(p) );
472 assert( p->sharable || 0==*ppIter );
473 assert( p->inTrans>0 );
475 SHARED_LOCK_TRACE(pBt, "clearAllLocks", 0, 0);
477 while( *ppIter ){
478 BtLock *pLock = *ppIter;
479 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
480 assert( pLock->pBtree->inTrans>=pLock->eLock );
481 if( pLock->pBtree==p ){
482 *ppIter = pLock->pNext;
483 assert( pLock->iTable!=1 || pLock==&p->lock );
484 if( pLock->iTable!=1 ){
485 sqlite3_free(pLock);
487 }else{
488 ppIter = &pLock->pNext;
492 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
493 if( pBt->pWriter==p ){
494 pBt->pWriter = 0;
495 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
496 }else if( pBt->nTransaction==2 ){
497 /* This function is called when Btree p is concluding its
498 ** transaction. If there currently exists a writer, and p is not
499 ** that writer, then the number of locks held by connections other
500 ** than the writer must be about to drop to zero. In this case
501 ** set the BTS_PENDING flag to 0.
503 ** If there is not currently a writer, then BTS_PENDING must
504 ** be zero already. So this next line is harmless in that case.
506 pBt->btsFlags &= ~BTS_PENDING;
511 ** This function changes all write-locks held by Btree p into read-locks.
513 static void downgradeAllSharedCacheTableLocks(Btree *p){
514 BtShared *pBt = p->pBt;
516 SHARED_LOCK_TRACE(pBt, "downgradeLocks", 0, 0);
518 if( pBt->pWriter==p ){
519 BtLock *pLock;
520 pBt->pWriter = 0;
521 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
522 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
523 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
524 pLock->eLock = READ_LOCK;
529 #endif /* SQLITE_OMIT_SHARED_CACHE */
531 static void releasePage(MemPage *pPage); /* Forward reference */
532 static void releasePageOne(MemPage *pPage); /* Forward reference */
533 static void releasePageNotNull(MemPage *pPage); /* Forward reference */
/*
***** This routine is used inside of assert() only ****
**
** Verify that the cursor holds the mutex on its BtShared
*/
#ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  return sqlite3_mutex_held(p->pBt->mutex);
}

/* Verify that the cursor and the BtShared agree about what is the current
** database connection.  This is important in shared-cache mode.  If the
** database connection pointers get out-of-sync, it is possible for routines
** like btreeInitPage() to reference a stale connection pointer that
** references a connection that has already closed.  This routine is used
** inside assert() statements only and for the purpose of double-checking
** that the btree code does keep the database connection pointers up-to-date.
*/
static int cursorOwnsBtShared(BtCursor *p){
  assert( cursorHoldsMutex(p) );
  return (p->pBtree->db==p->pBt->db);
}
#endif
/*
** Invalidate the overflow cache of the cursor passed as the first argument.
** This clears the BTCF_ValidOvfl bit in the cursor's curFlags.  The
** argument is parenthesized so that any pointer expression may be passed.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
566 ** Invalidate the overflow page-list cache for all cursors opened
567 ** on the shared btree structure pBt.
569 static void invalidateAllOverflowCache(BtShared *pBt){
570 BtCursor *p;
571 assert( sqlite3_mutex_held(pBt->mutex) );
572 for(p=pBt->pCursor; p; p=p->pNext){
573 invalidateOverflowCache(p);
577 #ifndef SQLITE_OMIT_INCRBLOB
579 ** This function is called before modifying the contents of a table
580 ** to invalidate any incrblob cursors that are open on the
581 ** row or one of the rows being modified.
583 ** If argument isClearTable is true, then the entire contents of the
584 ** table is about to be deleted. In this case invalidate all incrblob
585 ** cursors open on any row within the table with root-page pgnoRoot.
587 ** Otherwise, if argument isClearTable is false, then the row with
588 ** rowid iRow is being replaced or deleted. In this case invalidate
589 ** only those incrblob cursors open on that specific row.
591 static void invalidateIncrblobCursors(
592 Btree *pBtree, /* The database file to check */
593 Pgno pgnoRoot, /* The table that might be changing */
594 i64 iRow, /* The rowid that might be changing */
595 int isClearTable /* True if all rows are being deleted */
597 BtCursor *p;
598 assert( pBtree->hasIncrblobCur );
599 assert( sqlite3BtreeHoldsMutex(pBtree) );
600 pBtree->hasIncrblobCur = 0;
601 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
602 if( (p->curFlags & BTCF_Incrblob)!=0 ){
603 pBtree->hasIncrblobCur = 1;
604 if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
605 p->eState = CURSOR_INVALID;
611 #else
612 /* Stub function when INCRBLOB is omitted */
613 #define invalidateIncrblobCursors(w,x,y,z)
614 #endif /* SQLITE_OMIT_INCRBLOB */
617 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
618 ** when a page that previously contained data becomes a free-list leaf
619 ** page.
621 ** The BtShared.pHasContent bitvec exists to work around an obscure
622 ** bug caused by the interaction of two useful IO optimizations surrounding
623 ** free-list leaf pages:
625 ** 1) When all data is deleted from a page and the page becomes
626 ** a free-list leaf page, the page is not written to the database
627 ** (as free-list leaf pages contain no meaningful data). Sometimes
628 ** such a page is not even journalled (as it will not be modified,
629 ** why bother journalling it?).
631 ** 2) When a free-list leaf page is reused, its content is not read
632 ** from the database or written to the journal file (why should it
633 ** be, if it is not at all meaningful?).
635 ** By themselves, these optimizations work fine and provide a handy
636 ** performance boost to bulk delete or insert operations. However, if
637 ** a page is moved to the free-list and then reused within the same
638 ** transaction, a problem comes up. If the page is not journalled when
639 ** it is moved to the free-list and it is also not journalled when it
640 ** is extracted from the free-list and reused, then the original data
641 ** may be lost. In the event of a rollback, it may not be possible
642 ** to restore the database to its original configuration.
644 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
645 ** moved to become a free-list leaf page, the corresponding bit is
646 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
647 ** optimization 2 above is omitted if the corresponding bit is already
648 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
649 ** at the end of every transaction.
651 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
652 int rc = SQLITE_OK;
653 if( !pBt->pHasContent ){
654 assert( pgno<=pBt->nPage );
655 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
656 if( !pBt->pHasContent ){
657 rc = SQLITE_NOMEM_BKPT;
660 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
661 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
663 return rc;
667 ** Query the BtShared.pHasContent vector.
669 ** This function is called when a free-list leaf page is removed from the
670 ** free-list for reuse. It returns false if it is safe to retrieve the
671 ** page from the pager layer with the 'no-content' flag set. True otherwise.
673 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
674 Bitvec *p = pBt->pHasContent;
675 return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno));
679 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
680 ** invoked at the conclusion of each write-transaction.
682 static void btreeClearHasContent(BtShared *pBt){
683 sqlite3BitvecDestroy(pBt->pHasContent);
684 pBt->pHasContent = 0;
688 ** Release all of the apPage[] pages for a cursor.
690 static void btreeReleaseAllCursorPages(BtCursor *pCur){
691 int i;
692 if( pCur->iPage>=0 ){
693 for(i=0; i<pCur->iPage; i++){
694 releasePageNotNull(pCur->apPage[i]);
696 releasePageNotNull(pCur->pPage);
697 pCur->iPage = -1;
702 ** The cursor passed as the only argument must point to a valid entry
703 ** when this function is called (i.e. have eState==CURSOR_VALID). This
704 ** function saves the current cursor key in variables pCur->nKey and
705 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
706 ** code otherwise.
708 ** If the cursor is open on an intkey table, then the integer key
709 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
710 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
711 ** set to point to a malloced buffer pCur->nKey bytes in size containing
712 ** the key.
714 static int saveCursorKey(BtCursor *pCur){
715 int rc = SQLITE_OK;
716 assert( CURSOR_VALID==pCur->eState );
717 assert( 0==pCur->pKey );
718 assert( cursorHoldsMutex(pCur) );
720 if( pCur->curIntKey ){
721 /* Only the rowid is required for a table btree */
722 pCur->nKey = sqlite3BtreeIntegerKey(pCur);
723 }else{
724 /* For an index btree, save the complete key content. It is possible
725 ** that the current key is corrupt. In that case, it is possible that
726 ** the sqlite3VdbeRecordUnpack() function may overread the buffer by
727 ** up to the size of 1 varint plus 1 8-byte value when the cursor
728 ** position is restored. Hence the 17 bytes of padding allocated
729 ** below. */
730 void *pKey;
731 pCur->nKey = sqlite3BtreePayloadSize(pCur);
732 pKey = sqlite3Malloc( pCur->nKey + 9 + 8 );
733 if( pKey ){
734 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
735 if( rc==SQLITE_OK ){
736 memset(((u8*)pKey)+pCur->nKey, 0, 9+8);
737 pCur->pKey = pKey;
738 }else{
739 sqlite3_free(pKey);
741 }else{
742 rc = SQLITE_NOMEM_BKPT;
745 assert( !pCur->curIntKey || !pCur->pKey );
746 return rc;
750 ** Save the current cursor position in the variables BtCursor.nKey
751 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
753 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
754 ** prior to calling this routine.
756 static int saveCursorPosition(BtCursor *pCur){
757 int rc;
759 assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
760 assert( 0==pCur->pKey );
761 assert( cursorHoldsMutex(pCur) );
763 if( pCur->curFlags & BTCF_Pinned ){
764 return SQLITE_CONSTRAINT_PINNED;
766 if( pCur->eState==CURSOR_SKIPNEXT ){
767 pCur->eState = CURSOR_VALID;
768 }else{
769 pCur->skipNext = 0;
772 rc = saveCursorKey(pCur);
773 if( rc==SQLITE_OK ){
774 btreeReleaseAllCursorPages(pCur);
775 pCur->eState = CURSOR_REQUIRESEEK;
778 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
779 return rc;
782 /* Forward reference */
783 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
786 ** Save the positions of all cursors (except pExcept) that are open on
787 ** the table with root-page iRoot. "Saving the cursor position" means that
788 ** the location in the btree is remembered in such a way that it can be
789 ** moved back to the same spot after the btree has been modified. This
790 ** routine is called just before cursor pExcept is used to modify the
791 ** table, for example in BtreeDelete() or BtreeInsert().
793 ** If there are two or more cursors on the same btree, then all such
794 ** cursors should have their BTCF_Multiple flag set. The btreeCursor()
795 ** routine enforces that rule. This routine only needs to be called in
796 ** the uncommon case when pExpect has the BTCF_Multiple flag set.
798 ** If pExpect!=NULL and if no other cursors are found on the same root-page,
799 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
800 ** pointless call to this routine.
802 ** Implementation note: This routine merely checks to see if any cursors
803 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual)
804 ** event that cursors are in need to being saved.
806 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
807 BtCursor *p;
808 assert( sqlite3_mutex_held(pBt->mutex) );
809 assert( pExcept==0 || pExcept->pBt==pBt );
810 for(p=pBt->pCursor; p; p=p->pNext){
811 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
813 if( p ) return saveCursorsOnList(p, iRoot, pExcept);
814 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
815 return SQLITE_OK;
818 /* This helper routine to saveAllCursors does the actual work of saving
819 ** the cursors if and when a cursor is found that actually requires saving.
820 ** The common case is that no cursors need to be saved, so this routine is
821 ** broken out from its caller to avoid unnecessary stack pointer movement.
823 static int SQLITE_NOINLINE saveCursorsOnList(
824 BtCursor *p, /* The first cursor that needs saving */
825 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */
826 BtCursor *pExcept /* Do not save this cursor */
829 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
830 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
831 int rc = saveCursorPosition(p);
832 if( SQLITE_OK!=rc ){
833 return rc;
835 }else{
836 testcase( p->iPage>=0 );
837 btreeReleaseAllCursorPages(p);
840 p = p->pNext;
841 }while( p );
842 return SQLITE_OK;
846 ** Clear the current cursor position.
848 void sqlite3BtreeClearCursor(BtCursor *pCur){
849 assert( cursorHoldsMutex(pCur) );
850 sqlite3_free(pCur->pKey);
851 pCur->pKey = 0;
852 pCur->eState = CURSOR_INVALID;
856 ** In this version of BtreeMoveto, pKey is a packed index record
857 ** such as is generated by the OP_MakeRecord opcode. Unpack the
858 ** record and then call sqlite3BtreeIndexMoveto() to do the work.
860 static int btreeMoveto(
861 BtCursor *pCur, /* Cursor open on the btree to be searched */
862 const void *pKey, /* Packed key if the btree is an index */
863 i64 nKey, /* Integer key for tables. Size of pKey for indices */
864 int bias, /* Bias search to the high end */
865 int *pRes /* Write search results here */
867 int rc; /* Status code */
868 UnpackedRecord *pIdxKey; /* Unpacked index key */
870 if( pKey ){
871 KeyInfo *pKeyInfo = pCur->pKeyInfo;
872 assert( nKey==(i64)(int)nKey );
873 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo);
874 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
875 sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey);
876 if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){
877 rc = SQLITE_CORRUPT_BKPT;
878 }else{
879 rc = sqlite3BtreeIndexMoveto(pCur, pIdxKey, pRes);
881 sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
882 }else{
883 pIdxKey = 0;
884 rc = sqlite3BtreeTableMoveto(pCur, nKey, bias, pRes);
886 return rc;
890 ** Restore the cursor to the position it was in (or as close to as possible)
891 ** when saveCursorPosition() was called. Note that this call deletes the
892 ** saved position info stored by saveCursorPosition(), so there can be
893 ** at most one effective restoreCursorPosition() call after each
894 ** saveCursorPosition().
896 static int btreeRestoreCursorPosition(BtCursor *pCur){
897 int rc;
898 int skipNext = 0;
899 assert( cursorOwnsBtShared(pCur) );
900 assert( pCur->eState>=CURSOR_REQUIRESEEK );
901 if( pCur->eState==CURSOR_FAULT ){
902 return pCur->skipNext;
904 pCur->eState = CURSOR_INVALID;
905 if( sqlite3FaultSim(410) ){
906 rc = SQLITE_IOERR;
907 }else{
908 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
910 if( rc==SQLITE_OK ){
911 sqlite3_free(pCur->pKey);
912 pCur->pKey = 0;
913 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
914 if( skipNext ) pCur->skipNext = skipNext;
915 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
916 pCur->eState = CURSOR_SKIPNEXT;
919 return rc;
/* Restore cursor p only if its state is CURSOR_REQUIRESEEK or higher;
** otherwise evaluate to SQLITE_OK without calling anything.  The
** argument is parenthesized so that any pointer expression may be
** passed; note that it is evaluated more than once. */
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
928 ** Determine whether or not a cursor has moved from the position where
929 ** it was last placed, or has been invalidated for any other reason.
930 ** Cursors can move when the row they are pointing at is deleted out
931 ** from under them, for example. Cursor might also move if a btree
932 ** is rebalanced.
934 ** Calling this routine with a NULL cursor pointer returns false.
936 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
937 ** back to where it ought to be if this routine returns true.
939 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
940 assert( EIGHT_BYTE_ALIGNMENT(pCur)
941 || pCur==sqlite3BtreeFakeValidCursor() );
942 assert( offsetof(BtCursor, eState)==0 );
943 assert( sizeof(pCur->eState)==1 );
944 return CURSOR_VALID != *(u8*)pCur;
948 ** Return a pointer to a fake BtCursor object that will always answer
949 ** false to the sqlite3BtreeCursorHasMoved() routine above. The fake
950 ** cursor returned must not be used with any other Btree interface.
952 BtCursor *sqlite3BtreeFakeValidCursor(void){
953 static u8 fakeCursor = CURSOR_VALID;
954 assert( offsetof(BtCursor, eState)==0 );
955 return (BtCursor*)&fakeCursor;
959 ** This routine restores a cursor back to its original position after it
960 ** has been moved by some outside activity (such as a btree rebalance or
961 ** a row having been deleted out from under the cursor).
963 ** On success, the *pDifferentRow parameter is false if the cursor is left
964 ** pointing at exactly the same row. *pDifferntRow is the row the cursor
965 ** was pointing to has been deleted, forcing the cursor to point to some
966 ** nearby row.
968 ** This routine should only be called for a cursor that just returned
969 ** TRUE from sqlite3BtreeCursorHasMoved().
971 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
972 int rc;
974 assert( pCur!=0 );
975 assert( pCur->eState!=CURSOR_VALID );
976 rc = restoreCursorPosition(pCur);
977 if( rc ){
978 *pDifferentRow = 1;
979 return rc;
981 if( pCur->eState!=CURSOR_VALID ){
982 *pDifferentRow = 1;
983 }else{
984 *pDifferentRow = 0;
986 return SQLITE_OK;
#ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Pass a hint to the cursor.  eHintType selects the hint, and also fixes
** the number and types of the variadic arguments that follow it.  See the
** definitions of the BTREE_HINT_* macros for details.
**
** This implementation does nothing with the hint.  In SQLITE_DEBUG builds
** it walks the range-hint expression with
** sqlite3CursorRangeHintExprCheck as the callback.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by system that substitute their own storage engine */
#ifdef SQLITE_DEBUG
  if( ALWAYS(eHintType==BTREE_HINT_RANGE) ){
    va_list args;
    Expr *pRange;
    Walker walker;

    memset(&walker, 0, sizeof(walker));
    walker.xExprCallback = sqlite3CursorRangeHintExprCheck;
    va_start(args, eHintType);
    pRange = va_arg(args, Expr*);
    walker.u.aMem = va_arg(args, Mem*);
    va_end(args);
    assert( pRange!=0 );
    assert( walker.u.aMem!=0 );
    sqlite3WalkExpr(&walker, pRange);
  }
#endif /* SQLITE_DEBUG */
}
#endif /* SQLITE_ENABLE_CURSOR_HINTS */
1018 ** Provide flag hints to the cursor.
1020 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
1021 assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
1022 pCur->hints = x;
1026 #ifndef SQLITE_OMIT_AUTOVACUUM
1028 ** Given a page number of a regular database page, return the page
1029 ** number for the pointer-map page that contains the entry for the
1030 ** input page number.
1032 ** Return 0 (not a valid page) for pgno==1 since there is
1033 ** no pointer map associated with page 1. The integrity_check logic
1034 ** requires that ptrmapPageno(*,1)!=1.
1036 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
1037 int nPagesPerMapPage;
1038 Pgno iPtrMap, ret;
1039 assert( sqlite3_mutex_held(pBt->mutex) );
1040 if( pgno<2 ) return 0;
1041 nPagesPerMapPage = (pBt->usableSize/5)+1;
1042 iPtrMap = (pgno-2)/nPagesPerMapPage;
1043 ret = (iPtrMap*nPagesPerMapPage) + 2;
1044 if( ret==PENDING_BYTE_PAGE(pBt) ){
1045 ret++;
1047 return ret;
1051 ** Write an entry into the pointer map.
1053 ** This routine updates the pointer map entry for page number 'key'
1054 ** so that it maps to type 'eType' and parent page number 'pgno'.
1056 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
1057 ** a no-op. If an error occurs, the appropriate error code is written
1058 ** into *pRC.
1060 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
1061 DbPage *pDbPage; /* The pointer map page */
1062 u8 *pPtrmap; /* The pointer map data */
1063 Pgno iPtrmap; /* The pointer map page number */
1064 int offset; /* Offset in pointer map page */
1065 int rc; /* Return code from subfunctions */
1067 if( *pRC ) return;
1069 assert( sqlite3_mutex_held(pBt->mutex) );
1070 /* The super-journal page number must never be used as a pointer map page */
1071 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
1073 assert( pBt->autoVacuum );
1074 if( key==0 ){
1075 *pRC = SQLITE_CORRUPT_BKPT;
1076 return;
1078 iPtrmap = PTRMAP_PAGENO(pBt, key);
1079 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1080 if( rc!=SQLITE_OK ){
1081 *pRC = rc;
1082 return;
1084 if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){
1085 /* The first byte of the extra data is the MemPage.isInit byte.
1086 ** If that byte is set, it means this page is also being used
1087 ** as a btree page. */
1088 *pRC = SQLITE_CORRUPT_BKPT;
1089 goto ptrmap_exit;
1091 offset = PTRMAP_PTROFFSET(iPtrmap, key);
1092 if( offset<0 ){
1093 *pRC = SQLITE_CORRUPT_BKPT;
1094 goto ptrmap_exit;
1096 assert( offset <= (int)pBt->usableSize-5 );
1097 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1099 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
1100 TRACE(("PTRMAP_UPDATE: %u->(%u,%u)\n", key, eType, parent));
1101 *pRC= rc = sqlite3PagerWrite(pDbPage);
1102 if( rc==SQLITE_OK ){
1103 pPtrmap[offset] = eType;
1104 put4byte(&pPtrmap[offset+1], parent);
1108 ptrmap_exit:
1109 sqlite3PagerUnref(pDbPage);
1113 ** Read an entry from the pointer map.
1115 ** This routine retrieves the pointer map entry for page 'key', writing
1116 ** the type and parent page number to *pEType and *pPgno respectively.
1117 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1119 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1120 DbPage *pDbPage; /* The pointer map page */
1121 int iPtrmap; /* Pointer map page index */
1122 u8 *pPtrmap; /* Pointer map page data */
1123 int offset; /* Offset of entry in pointer map */
1124 int rc;
1126 assert( sqlite3_mutex_held(pBt->mutex) );
1128 iPtrmap = PTRMAP_PAGENO(pBt, key);
1129 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1130 if( rc!=0 ){
1131 return rc;
1133 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1135 offset = PTRMAP_PTROFFSET(iPtrmap, key);
1136 if( offset<0 ){
1137 sqlite3PagerUnref(pDbPage);
1138 return SQLITE_CORRUPT_BKPT;
1140 assert( offset <= (int)pBt->usableSize-5 );
1141 assert( pEType!=0 );
1142 *pEType = pPtrmap[offset];
1143 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1145 sqlite3PagerUnref(pDbPage);
1146 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
1147 return SQLITE_OK;
1150 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
1151 #define ptrmapPut(w,x,y,z,rc)
1152 #define ptrmapGet(w,x,y,z) SQLITE_OK
1153 #define ptrmapPutOvflPtr(x, y, z, rc)
1154 #endif
/*
** findCell(P,I) yields a pointer to the content of cell number I on
** btree page P.  Cell 0 is the first cell on the page, cell 1 is the
** second, and so forth.  The cell offset is read from P->aCellIdx[],
** masked with P->maskPage, and added to P->aData.
**
** findCellPastPtr(P,I) does the same but uses P->aDataOfst as its base,
** so that the initial 4-byte child pointer found on interior pages, if
** there is one, is skipped.
**
** These macros work only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1173 ** This is common tail processing for btreeParseCellPtr() and
1174 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1175 ** on a single B-tree page. Make necessary adjustments to the CellInfo
1176 ** structure.
1178 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1179 MemPage *pPage, /* Page containing the cell */
1180 u8 *pCell, /* Pointer to the cell text. */
1181 CellInfo *pInfo /* Fill in this structure */
1183 /* If the payload will not fit completely on the local page, we have
1184 ** to decide how much to store locally and how much to spill onto
1185 ** overflow pages. The strategy is to minimize the amount of unused
1186 ** space on overflow pages while keeping the amount of local storage
1187 ** in between minLocal and maxLocal.
1189 ** Warning: changing the way overflow payload is distributed in any
1190 ** way will result in an incompatible file format.
1192 int minLocal; /* Minimum amount of payload held locally */
1193 int maxLocal; /* Maximum amount of payload held locally */
1194 int surplus; /* Overflow payload available for local storage */
1196 minLocal = pPage->minLocal;
1197 maxLocal = pPage->maxLocal;
1198 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1199 testcase( surplus==maxLocal );
1200 testcase( surplus==maxLocal+1 );
1201 if( surplus <= maxLocal ){
1202 pInfo->nLocal = (u16)surplus;
1203 }else{
1204 pInfo->nLocal = (u16)minLocal;
1206 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1210 ** Given a record with nPayload bytes of payload stored within btree
1211 ** page pPage, return the number of bytes of payload stored locally.
1213 static int btreePayloadToLocal(MemPage *pPage, i64 nPayload){
1214 int maxLocal; /* Maximum amount of payload held locally */
1215 maxLocal = pPage->maxLocal;
1216 if( nPayload<=maxLocal ){
1217 return nPayload;
1218 }else{
1219 int minLocal; /* Minimum amount of payload held locally */
1220 int surplus; /* Overflow payload available for local storage */
1221 minLocal = pPage->minLocal;
1222 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize-4);
1223 return ( surplus <= maxLocal ) ? surplus : minLocal;
1228 ** The following routines are implementations of the MemPage.xParseCell()
1229 ** method.
1231 ** Parse a cell content block and fill in the CellInfo structure.
1233 ** btreeParseCellPtr() => table btree leaf nodes
1234 ** btreeParseCellNoPayload() => table btree internal nodes
1235 ** btreeParseCellPtrIndex() => index btree nodes
1237 ** There is also a wrapper function btreeParseCell() that works for
1238 ** all MemPage types and that references the cell by index rather than
1239 ** by pointer.
1241 static void btreeParseCellPtrNoPayload(
1242 MemPage *pPage, /* Page containing the cell */
1243 u8 *pCell, /* Pointer to the cell text. */
1244 CellInfo *pInfo /* Fill in this structure */
1246 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1247 assert( pPage->leaf==0 );
1248 assert( pPage->childPtrSize==4 );
1249 #ifndef SQLITE_DEBUG
1250 UNUSED_PARAMETER(pPage);
1251 #endif
1252 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1253 pInfo->nPayload = 0;
1254 pInfo->nLocal = 0;
1255 pInfo->pPayload = 0;
1256 return;
1258 static void btreeParseCellPtr(
1259 MemPage *pPage, /* Page containing the cell */
1260 u8 *pCell, /* Pointer to the cell text. */
1261 CellInfo *pInfo /* Fill in this structure */
1263 u8 *pIter; /* For scanning through pCell */
1264 u32 nPayload; /* Number of bytes of cell payload */
1265 u64 iKey; /* Extracted Key value */
1267 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1268 assert( pPage->leaf==0 || pPage->leaf==1 );
1269 assert( pPage->intKeyLeaf );
1270 assert( pPage->childPtrSize==0 );
1271 pIter = pCell;
1273 /* The next block of code is equivalent to:
1275 ** pIter += getVarint32(pIter, nPayload);
1277 ** The code is inlined to avoid a function call.
1279 nPayload = *pIter;
1280 if( nPayload>=0x80 ){
1281 u8 *pEnd = &pIter[8];
1282 nPayload &= 0x7f;
1284 nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1285 }while( (*pIter)>=0x80 && pIter<pEnd );
1287 pIter++;
1289 /* The next block of code is equivalent to:
1291 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey);
1293 ** The code is inlined and the loop is unrolled for performance.
1294 ** This routine is a high-runner.
1296 iKey = *pIter;
1297 if( iKey>=0x80 ){
1298 u8 x;
1299 iKey = (iKey<<7) ^ (x = *++pIter);
1300 if( x>=0x80 ){
1301 iKey = (iKey<<7) ^ (x = *++pIter);
1302 if( x>=0x80 ){
1303 iKey = (iKey<<7) ^ 0x10204000 ^ (x = *++pIter);
1304 if( x>=0x80 ){
1305 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
1306 if( x>=0x80 ){
1307 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
1308 if( x>=0x80 ){
1309 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
1310 if( x>=0x80 ){
1311 iKey = (iKey<<7) ^ 0x4000 ^ (x = *++pIter);
1312 if( x>=0x80 ){
1313 iKey = (iKey<<8) ^ 0x8000 ^ (*++pIter);
1319 }else{
1320 iKey ^= 0x204000;
1322 }else{
1323 iKey ^= 0x4000;
1326 pIter++;
1328 pInfo->nKey = *(i64*)&iKey;
1329 pInfo->nPayload = nPayload;
1330 pInfo->pPayload = pIter;
1331 testcase( nPayload==pPage->maxLocal );
1332 testcase( nPayload==(u32)pPage->maxLocal+1 );
1333 if( nPayload<=pPage->maxLocal ){
1334 /* This is the (easy) common case where the entire payload fits
1335 ** on the local page. No overflow is required.
1337 pInfo->nSize = nPayload + (u16)(pIter - pCell);
1338 if( pInfo->nSize<4 ) pInfo->nSize = 4;
1339 pInfo->nLocal = (u16)nPayload;
1340 }else{
1341 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1344 static void btreeParseCellPtrIndex(
1345 MemPage *pPage, /* Page containing the cell */
1346 u8 *pCell, /* Pointer to the cell text. */
1347 CellInfo *pInfo /* Fill in this structure */
1349 u8 *pIter; /* For scanning through pCell */
1350 u32 nPayload; /* Number of bytes of cell payload */
1352 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1353 assert( pPage->leaf==0 || pPage->leaf==1 );
1354 assert( pPage->intKeyLeaf==0 );
1355 pIter = pCell + pPage->childPtrSize;
1356 nPayload = *pIter;
1357 if( nPayload>=0x80 ){
1358 u8 *pEnd = &pIter[8];
1359 nPayload &= 0x7f;
1361 nPayload = (nPayload<<7) | (*++pIter & 0x7f);
1362 }while( *(pIter)>=0x80 && pIter<pEnd );
1364 pIter++;
1365 pInfo->nKey = nPayload;
1366 pInfo->nPayload = nPayload;
1367 pInfo->pPayload = pIter;
1368 testcase( nPayload==pPage->maxLocal );
1369 testcase( nPayload==(u32)pPage->maxLocal+1 );
1370 if( nPayload<=pPage->maxLocal ){
1371 /* This is the (easy) common case where the entire payload fits
1372 ** on the local page. No overflow is required.
1374 pInfo->nSize = nPayload + (u16)(pIter - pCell);
1375 if( pInfo->nSize<4 ) pInfo->nSize = 4;
1376 pInfo->nLocal = (u16)nPayload;
1377 }else{
1378 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
1381 static void btreeParseCell(
1382 MemPage *pPage, /* Page containing the cell */
1383 int iCell, /* The cell index. First cell is 0 */
1384 CellInfo *pInfo /* Fill in this structure */
1386 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
1390 ** The following routines are implementations of the MemPage.xCellSize
1391 ** method.
1393 ** Compute the total number of bytes that a Cell needs in the cell
1394 ** data area of the btree-page. The return number includes the cell
1395 ** data header and the local payload, but not any overflow page or
1396 ** the space used by the cell pointer.
1398 ** cellSizePtrNoPayload() => table internal nodes
1399 ** cellSizePtrTableLeaf() => table leaf nodes
1400 ** cellSizePtr() => index internal nodes
1401 ** cellSizeIdxLeaf() => index leaf nodes
1403 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1404 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1405 u8 *pEnd; /* End mark for a varint */
1406 u32 nSize; /* Size value to return */
1408 #ifdef SQLITE_DEBUG
1409 /* The value returned by this function should always be the same as
1410 ** the (CellInfo.nSize) value found by doing a full parse of the
1411 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1412 ** this function verifies that this invariant is not violated. */
1413 CellInfo debuginfo;
1414 pPage->xParseCell(pPage, pCell, &debuginfo);
1415 #endif
1417 assert( pPage->childPtrSize==4 );
1418 nSize = *pIter;
1419 if( nSize>=0x80 ){
1420 pEnd = &pIter[8];
1421 nSize &= 0x7f;
1423 nSize = (nSize<<7) | (*++pIter & 0x7f);
1424 }while( *(pIter)>=0x80 && pIter<pEnd );
1426 pIter++;
1427 testcase( nSize==pPage->maxLocal );
1428 testcase( nSize==(u32)pPage->maxLocal+1 );
1429 if( nSize<=pPage->maxLocal ){
1430 nSize += (u32)(pIter - pCell);
1431 assert( nSize>4 );
1432 }else{
1433 int minLocal = pPage->minLocal;
1434 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1435 testcase( nSize==pPage->maxLocal );
1436 testcase( nSize==(u32)pPage->maxLocal+1 );
1437 if( nSize>pPage->maxLocal ){
1438 nSize = minLocal;
1440 nSize += 4 + (u16)(pIter - pCell);
1442 assert( nSize==debuginfo.nSize || CORRUPT_DB );
1443 return (u16)nSize;
1445 static u16 cellSizePtrIdxLeaf(MemPage *pPage, u8 *pCell){
1446 u8 *pIter = pCell; /* For looping over bytes of pCell */
1447 u8 *pEnd; /* End mark for a varint */
1448 u32 nSize; /* Size value to return */
1450 #ifdef SQLITE_DEBUG
1451 /* The value returned by this function should always be the same as
1452 ** the (CellInfo.nSize) value found by doing a full parse of the
1453 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1454 ** this function verifies that this invariant is not violated. */
1455 CellInfo debuginfo;
1456 pPage->xParseCell(pPage, pCell, &debuginfo);
1457 #endif
1459 assert( pPage->childPtrSize==0 );
1460 nSize = *pIter;
1461 if( nSize>=0x80 ){
1462 pEnd = &pIter[8];
1463 nSize &= 0x7f;
1465 nSize = (nSize<<7) | (*++pIter & 0x7f);
1466 }while( *(pIter)>=0x80 && pIter<pEnd );
1468 pIter++;
1469 testcase( nSize==pPage->maxLocal );
1470 testcase( nSize==(u32)pPage->maxLocal+1 );
1471 if( nSize<=pPage->maxLocal ){
1472 nSize += (u32)(pIter - pCell);
1473 if( nSize<4 ) nSize = 4;
1474 }else{
1475 int minLocal = pPage->minLocal;
1476 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1477 testcase( nSize==pPage->maxLocal );
1478 testcase( nSize==(u32)pPage->maxLocal+1 );
1479 if( nSize>pPage->maxLocal ){
1480 nSize = minLocal;
1482 nSize += 4 + (u16)(pIter - pCell);
1484 assert( nSize==debuginfo.nSize || CORRUPT_DB );
1485 return (u16)nSize;
1487 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1488 u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1489 u8 *pEnd; /* End mark for a varint */
1491 #ifdef SQLITE_DEBUG
1492 /* The value returned by this function should always be the same as
1493 ** the (CellInfo.nSize) value found by doing a full parse of the
1494 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1495 ** this function verifies that this invariant is not violated. */
1496 CellInfo debuginfo;
1497 pPage->xParseCell(pPage, pCell, &debuginfo);
1498 #else
1499 UNUSED_PARAMETER(pPage);
1500 #endif
1502 assert( pPage->childPtrSize==4 );
1503 pEnd = pIter + 9;
1504 while( (*pIter++)&0x80 && pIter<pEnd );
1505 assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1506 return (u16)(pIter - pCell);
1508 static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){
1509 u8 *pIter = pCell; /* For looping over bytes of pCell */
1510 u8 *pEnd; /* End mark for a varint */
1511 u32 nSize; /* Size value to return */
1513 #ifdef SQLITE_DEBUG
1514 /* The value returned by this function should always be the same as
1515 ** the (CellInfo.nSize) value found by doing a full parse of the
1516 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1517 ** this function verifies that this invariant is not violated. */
1518 CellInfo debuginfo;
1519 pPage->xParseCell(pPage, pCell, &debuginfo);
1520 #endif
1522 nSize = *pIter;
1523 if( nSize>=0x80 ){
1524 pEnd = &pIter[8];
1525 nSize &= 0x7f;
1527 nSize = (nSize<<7) | (*++pIter & 0x7f);
1528 }while( *(pIter)>=0x80 && pIter<pEnd );
1530 pIter++;
1531 /* pIter now points at the 64-bit integer key value, a variable length
1532 ** integer. The following block moves pIter to point at the first byte
1533 ** past the end of the key value. */
1534 if( (*pIter++)&0x80
1535 && (*pIter++)&0x80
1536 && (*pIter++)&0x80
1537 && (*pIter++)&0x80
1538 && (*pIter++)&0x80
1539 && (*pIter++)&0x80
1540 && (*pIter++)&0x80
1541 && (*pIter++)&0x80 ){ pIter++; }
1542 testcase( nSize==pPage->maxLocal );
1543 testcase( nSize==(u32)pPage->maxLocal+1 );
1544 if( nSize<=pPage->maxLocal ){
1545 nSize += (u32)(pIter - pCell);
1546 if( nSize<4 ) nSize = 4;
1547 }else{
1548 int minLocal = pPage->minLocal;
1549 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1550 testcase( nSize==pPage->maxLocal );
1551 testcase( nSize==(u32)pPage->maxLocal+1 );
1552 if( nSize>pPage->maxLocal ){
1553 nSize = minLocal;
1555 nSize += 4 + (u16)(pIter - pCell);
1557 assert( nSize==debuginfo.nSize || CORRUPT_DB );
1558 return (u16)nSize;
#ifdef SQLITE_DEBUG
/* Index-based variation on cellSizePtr(): locate cell iCell with
** findCell() and return the page's xCellSize result for it.  Used inside
** of assert() statements only. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return pPage->xCellSize(pPage, pCell);
}
#endif
1570 #ifndef SQLITE_OMIT_AUTOVACUUM
1572 ** The cell pCell is currently part of page pSrc but will ultimately be part
1573 ** of pPage. (pSrc and pPage are often the same.) If pCell contains a
1574 ** pointer to an overflow page, insert an entry into the pointer-map for
1575 ** the overflow page that will be valid after pCell has been moved to pPage.
1577 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){
1578 CellInfo info;
1579 if( *pRC ) return;
1580 assert( pCell!=0 );
1581 pPage->xParseCell(pPage, pCell, &info);
1582 if( info.nLocal<info.nPayload ){
1583 Pgno ovfl;
1584 if( SQLITE_OVERFLOW(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){
1585 testcase( pSrc!=pPage );
1586 *pRC = SQLITE_CORRUPT_BKPT;
1587 return;
1589 ovfl = get4byte(&pCell[info.nSize-4]);
1590 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1593 #endif
1597 ** Defragment the page given. This routine reorganizes cells within the
1598 ** page so that there are no free-blocks on the free-block list.
1600 ** Parameter nMaxFrag is the maximum amount of fragmented space that may be
1601 ** present in the page after this routine returns.
1603 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
1604 ** b-tree page so that there are no freeblocks or fragment bytes, all
1605 ** unused bytes are contained in the unallocated space region, and all
1606 ** cells are packed tightly at the end of the page.
1608 static int defragmentPage(MemPage *pPage, int nMaxFrag){
1609 int i; /* Loop counter */
1610 int pc; /* Address of the i-th cell */
1611 int hdr; /* Offset to the page header */
1612 int size; /* Size of a cell */
1613 int usableSize; /* Number of usable bytes on a page */
1614 int cellOffset; /* Offset to the cell pointer array */
1615 int cbrk; /* Offset to the cell content area */
1616 int nCell; /* Number of cells on the page */
1617 unsigned char *data; /* The page data */
1618 unsigned char *temp; /* Temp area for cell content */
1619 unsigned char *src; /* Source of content */
1620 int iCellFirst; /* First allowable cell index */
1621 int iCellLast; /* Last possible cell index */
1622 int iCellStart; /* First cell offset in input */
1624 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1625 assert( pPage->pBt!=0 );
1626 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1627 assert( pPage->nOverflow==0 );
1628 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1629 data = pPage->aData;
1630 hdr = pPage->hdrOffset;
1631 cellOffset = pPage->cellOffset;
1632 nCell = pPage->nCell;
1633 assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
1634 iCellFirst = cellOffset + 2*nCell;
1635 usableSize = pPage->pBt->usableSize;
1637 /* This block handles pages with two or fewer free blocks and nMaxFrag
1638 ** or fewer fragmented bytes. In this case it is faster to move the
1639 ** two (or one) blocks of cells using memmove() and add the required
1640 ** offsets to each pointer in the cell-pointer array than it is to
1641 ** reconstruct the entire page. */
1642 if( (int)data[hdr+7]<=nMaxFrag ){
1643 int iFree = get2byte(&data[hdr+1]);
1644 if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
1645 if( iFree ){
1646 int iFree2 = get2byte(&data[iFree]);
1647 if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
1648 if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
1649 u8 *pEnd = &data[cellOffset + nCell*2];
1650 u8 *pAddr;
1651 int sz2 = 0;
1652 int sz = get2byte(&data[iFree+2]);
1653 int top = get2byte(&data[hdr+5]);
1654 if( top>=iFree ){
1655 return SQLITE_CORRUPT_PAGE(pPage);
1657 if( iFree2 ){
1658 if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
1659 sz2 = get2byte(&data[iFree2+2]);
1660 if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
1661 memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
1662 sz += sz2;
1663 }else if( iFree+sz>usableSize ){
1664 return SQLITE_CORRUPT_PAGE(pPage);
1667 cbrk = top+sz;
1668 assert( cbrk+(iFree-top) <= usableSize );
1669 memmove(&data[cbrk], &data[top], iFree-top);
1670 for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
1671 pc = get2byte(pAddr);
1672 if( pc<iFree ){ put2byte(pAddr, pc+sz); }
1673 else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
1675 goto defragment_out;
1680 cbrk = usableSize;
1681 iCellLast = usableSize - 4;
1682 iCellStart = get2byte(&data[hdr+5]);
1683 if( nCell>0 ){
1684 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1685 memcpy(temp, data, usableSize);
1686 src = temp;
1687 for(i=0; i<nCell; i++){
1688 u8 *pAddr; /* The i-th cell pointer */
1689 pAddr = &data[cellOffset + i*2];
1690 pc = get2byte(pAddr);
1691 testcase( pc==iCellFirst );
1692 testcase( pc==iCellLast );
1693 /* These conditions have already been verified in btreeInitPage()
1694 ** if PRAGMA cell_size_check=ON.
1696 if( pc>iCellLast ){
1697 return SQLITE_CORRUPT_PAGE(pPage);
1699 assert( pc>=0 && pc<=iCellLast );
1700 size = pPage->xCellSize(pPage, &src[pc]);
1701 cbrk -= size;
1702 if( cbrk<iCellStart || pc+size>usableSize ){
1703 return SQLITE_CORRUPT_PAGE(pPage);
1705 assert( cbrk+size<=usableSize && cbrk>=iCellStart );
1706 testcase( cbrk+size==usableSize );
1707 testcase( pc+size==usableSize );
1708 put2byte(pAddr, cbrk);
1709 memcpy(&data[cbrk], &src[pc], size);
1712 data[hdr+7] = 0;
1714 defragment_out:
1715 assert( pPage->nFree>=0 );
1716 if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
1717 return SQLITE_CORRUPT_PAGE(pPage);
1719 assert( cbrk>=iCellFirst );
1720 put2byte(&data[hdr+5], cbrk);
1721 data[hdr+1] = 0;
1722 data[hdr+2] = 0;
1723 memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1724 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1725 return SQLITE_OK;
1729 ** Search the free-list on page pPg for space to store a cell nByte bytes in
1730 ** size. If one can be found, return a pointer to the space and remove it
1731 ** from the free-list.
1733 ** If no suitable space can be found on the free-list, return NULL.
1735 ** This function may detect corruption within pPg. If corruption is
1736 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
1738 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
1739 ** will be ignored if adding the extra space to the fragmentation count
1740 ** causes the fragmentation count to exceed 60.
1742 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
1743 const int hdr = pPg->hdrOffset; /* Offset to page header */
1744 u8 * const aData = pPg->aData; /* Page data */
1745 int iAddr = hdr + 1; /* Address of ptr to pc */
1746 u8 *pTmp = &aData[iAddr]; /* Temporary ptr into aData[] */
1747 int pc = get2byte(pTmp); /* Address of a free slot */
1748 int x; /* Excess size of the slot */
1749 int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */
1750 int size; /* Size of the free slot */
1752 assert( pc>0 );
1753 while( pc<=maxPC ){
1754 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
1755 ** freeblock form a big-endian integer which is the size of the freeblock
1756 ** in bytes, including the 4-byte header. */
1757 pTmp = &aData[pc+2];
1758 size = get2byte(pTmp);
1759 if( (x = size - nByte)>=0 ){
1760 testcase( x==4 );
1761 testcase( x==3 );
1762 if( x<4 ){
1763 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
1764 ** number of bytes in fragments may not exceed 60. */
1765 if( aData[hdr+7]>57 ) return 0;
1767 /* Remove the slot from the free-list. Update the number of
1768 ** fragmented bytes within the page. */
1769 memcpy(&aData[iAddr], &aData[pc], 2);
1770 aData[hdr+7] += (u8)x;
1771 return &aData[pc];
1772 }else if( x+pc > maxPC ){
1773 /* This slot extends off the end of the usable part of the page */
1774 *pRc = SQLITE_CORRUPT_PAGE(pPg);
1775 return 0;
1776 }else{
1777 /* The slot remains on the free-list. Reduce its size to account
1778 ** for the portion used by the new allocation. */
1779 put2byte(&aData[pc+2], x);
1781 return &aData[pc + x];
1783 iAddr = pc;
1784 pTmp = &aData[pc];
1785 pc = get2byte(pTmp);
1786 if( pc<=iAddr ){
1787 if( pc ){
1788 /* The next slot in the chain comes before the current slot */
1789 *pRc = SQLITE_CORRUPT_PAGE(pPg);
1791 return 0;
1794 if( pc>maxPC+nByte-4 ){
1795 /* The free slot chain extends off the end of the page */
1796 *pRc = SQLITE_CORRUPT_PAGE(pPg);
1798 return 0;
1802 ** Allocate nByte bytes of space from within the B-Tree page passed
1803 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1804 ** of the first byte of allocated space. Return either SQLITE_OK or
1805 ** an error code (usually SQLITE_CORRUPT).
1807 ** The caller guarantees that there is sufficient space to make the
1808 ** allocation. This routine might need to defragment in order to bring
1809 ** all the space together, however. This routine will avoid using
1810 ** the first two bytes past the cell pointer area since presumably this
1811 ** allocation is being made in order to insert a new cell, so we will
1812 ** also end up needing a new cell pointer.
1814 static SQLITE_INLINE int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1815 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
1816 u8 * const data = pPage->aData; /* Local cache of pPage->aData */
1817 int top; /* First byte of cell content area */
1818 int rc = SQLITE_OK; /* Integer return code */
1819 u8 *pTmp; /* Temp ptr into data[] */
1820 int gap; /* First byte of gap between cell pointers and cell content */
1822 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1823 assert( pPage->pBt );
1824 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1825 assert( nByte>=0 ); /* Minimum cell size is 4 */
1826 assert( pPage->nFree>=nByte );
1827 assert( pPage->nOverflow==0 );
1828 assert( nByte < (int)(pPage->pBt->usableSize-8) );
1830 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1831 gap = pPage->cellOffset + 2*pPage->nCell;
1832 assert( gap<=65536 );
1833 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1834 ** and the reserved space is zero (the usual value for reserved space)
1835 ** then the cell content offset of an empty page wants to be 65536.
1836 ** However, that integer is too large to be stored in a 2-byte unsigned
1837 ** integer, so a value of 0 is used in its place. */
1838 pTmp = &data[hdr+5];
1839 top = get2byte(pTmp);
1840 if( gap>top ){
1841 if( top==0 && pPage->pBt->usableSize==65536 ){
1842 top = 65536;
1843 }else{
1844 return SQLITE_CORRUPT_PAGE(pPage);
1846 }else if( top>(int)pPage->pBt->usableSize ){
1847 return SQLITE_CORRUPT_PAGE(pPage);
1850 /* If there is enough space between gap and top for one more cell pointer,
1851 ** and if the freelist is not empty, then search the
1852 ** freelist looking for a slot big enough to satisfy the request.
1854 testcase( gap+2==top );
1855 testcase( gap+1==top );
1856 testcase( gap==top );
1857 if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1858 u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1859 if( pSpace ){
1860 int g2;
1861 assert( pSpace+nByte<=data+pPage->pBt->usableSize );
1862 *pIdx = g2 = (int)(pSpace-data);
1863 if( g2<=gap ){
1864 return SQLITE_CORRUPT_PAGE(pPage);
1865 }else{
1866 return SQLITE_OK;
1868 }else if( rc ){
1869 return rc;
1873 /* The request could not be fulfilled using a freelist slot. Check
1874 ** to see if defragmentation is necessary.
1876 testcase( gap+2+nByte==top );
1877 if( gap+2+nByte>top ){
1878 assert( pPage->nCell>0 || CORRUPT_DB );
1879 assert( pPage->nFree>=0 );
1880 rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
1881 if( rc ) return rc;
1882 top = get2byteNotZero(&data[hdr+5]);
1883 assert( gap+2+nByte<=top );
1887 /* Allocate memory from the gap in between the cell pointer array
1888 ** and the cell content area. The btreeComputeFreeSpace() call has already
1889 ** validated the freelist. Given that the freelist is valid, there
1890 ** is no way that the allocation can extend off the end of the page.
1891 ** The assert() below verifies the previous sentence.
1893 top -= nByte;
1894 put2byte(&data[hdr+5], top);
1895 assert( top+nByte <= (int)pPage->pBt->usableSize );
1896 *pIdx = top;
1897 return SQLITE_OK;
1901 ** Return a section of the pPage->aData to the freelist.
1902 ** The first byte of the new free block is pPage->aData[iStart]
1903 ** and the size of the block is iSize bytes.
1905 ** Adjacent freeblocks are coalesced.
1907 ** Even though the freeblock list was checked by btreeComputeFreeSpace(),
1908 ** that routine will not detect overlap between cells or freeblocks. Nor
1909 ** does it detect cells or freeblocks that encroach into the reserved bytes
1910 ** at the end of the page. So do additional corruption checks inside this
1911 ** routine and return SQLITE_CORRUPT if any problems are found.
1913 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1914 u16 iPtr; /* Address of ptr to next freeblock */
1915 u16 iFreeBlk; /* Address of the next freeblock */
1916 u8 hdr; /* Page header size. 0 or 100 */
1917 u8 nFrag = 0; /* Reduction in fragmentation */
1918 u16 iOrigSize = iSize; /* Original value of iSize */
1919 u16 x; /* Offset to cell content area */
1920 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */
1921 unsigned char *data = pPage->aData; /* Page content */
1922 u8 *pTmp; /* Temporary ptr into data[] */
1924 assert( pPage->pBt!=0 );
1925 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1926 assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1927 assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1928 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1929 assert( iSize>=4 ); /* Minimum cell size is 4 */
1930 assert( CORRUPT_DB || iStart<=pPage->pBt->usableSize-4 );
1932 /* The list of freeblocks must be in ascending order. Find the
1933 ** spot on the list where iStart should be inserted.
1935 hdr = pPage->hdrOffset;
1936 iPtr = hdr + 1;
1937 if( data[iPtr+1]==0 && data[iPtr]==0 ){
1938 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */
1939 }else{
1940 while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1941 if( iFreeBlk<=iPtr ){
1942 if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */
1943 return SQLITE_CORRUPT_PAGE(pPage);
1945 iPtr = iFreeBlk;
1947 if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */
1948 return SQLITE_CORRUPT_PAGE(pPage);
1950 assert( iFreeBlk>iPtr || iFreeBlk==0 || CORRUPT_DB );
1952 /* At this point:
1953 ** iFreeBlk: First freeblock after iStart, or zero if none
1954 ** iPtr: The address of a pointer to iFreeBlk
1956 ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1958 if( iFreeBlk && iEnd+3>=iFreeBlk ){
1959 nFrag = iFreeBlk - iEnd;
1960 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
1961 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1962 if( iEnd > pPage->pBt->usableSize ){
1963 return SQLITE_CORRUPT_PAGE(pPage);
1965 iSize = iEnd - iStart;
1966 iFreeBlk = get2byte(&data[iFreeBlk]);
1969 /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1970 ** pointer in the page header) then check to see if iStart should be
1971 ** coalesced onto the end of iPtr.
1973 if( iPtr>hdr+1 ){
1974 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1975 if( iPtrEnd+3>=iStart ){
1976 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
1977 nFrag += iStart - iPtrEnd;
1978 iSize = iEnd - iPtr;
1979 iStart = iPtr;
1982 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
1983 data[hdr+7] -= nFrag;
1985 pTmp = &data[hdr+5];
1986 x = get2byte(pTmp);
1987 if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
1988 /* Overwrite deleted information with zeros when the secure_delete
1989 ** option is enabled */
1990 memset(&data[iStart], 0, iSize);
1992 if( iStart<=x ){
1993 /* The new freeblock is at the beginning of the cell content area,
1994 ** so just extend the cell content area rather than create another
1995 ** freelist entry */
1996 if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage);
1997 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
1998 put2byte(&data[hdr+1], iFreeBlk);
1999 put2byte(&data[hdr+5], iEnd);
2000 }else{
2001 /* Insert the new freeblock into the freelist */
2002 put2byte(&data[iPtr], iStart);
2003 put2byte(&data[iStart], iFreeBlk);
2004 put2byte(&data[iStart+2], iSize);
2006 pPage->nFree += iOrigSize;
2007 return SQLITE_OK;
2011 ** Decode the flags byte (the first byte of the header) for a page
2012 ** and initialize fields of the MemPage structure accordingly.
2014 ** Only the following combinations are supported. Anything different
2015 ** indicates a corrupt database files:
2017 ** PTF_ZERODATA (0x02, 2)
2018 ** PTF_LEAFDATA | PTF_INTKEY (0x05, 5)
2019 ** PTF_ZERODATA | PTF_LEAF (0x0a, 10)
2020 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF (0x0d, 13)
2022 static int decodeFlags(MemPage *pPage, int flagByte){
2023 BtShared *pBt; /* A copy of pPage->pBt */
2025 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
2026 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2027 pBt = pPage->pBt;
2028 pPage->max1bytePayload = pBt->max1bytePayload;
2029 if( flagByte>=(PTF_ZERODATA | PTF_LEAF) ){
2030 pPage->childPtrSize = 0;
2031 pPage->leaf = 1;
2032 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF) ){
2033 pPage->intKeyLeaf = 1;
2034 pPage->xCellSize = cellSizePtrTableLeaf;
2035 pPage->xParseCell = btreeParseCellPtr;
2036 pPage->intKey = 1;
2037 pPage->maxLocal = pBt->maxLeaf;
2038 pPage->minLocal = pBt->minLeaf;
2039 }else if( flagByte==(PTF_ZERODATA | PTF_LEAF) ){
2040 pPage->intKey = 0;
2041 pPage->intKeyLeaf = 0;
2042 pPage->xCellSize = cellSizePtrIdxLeaf;
2043 pPage->xParseCell = btreeParseCellPtrIndex;
2044 pPage->maxLocal = pBt->maxLocal;
2045 pPage->minLocal = pBt->minLocal;
2046 }else{
2047 pPage->intKey = 0;
2048 pPage->intKeyLeaf = 0;
2049 pPage->xCellSize = cellSizePtrIdxLeaf;
2050 pPage->xParseCell = btreeParseCellPtrIndex;
2051 return SQLITE_CORRUPT_PAGE(pPage);
2053 }else{
2054 pPage->childPtrSize = 4;
2055 pPage->leaf = 0;
2056 if( flagByte==(PTF_ZERODATA) ){
2057 pPage->intKey = 0;
2058 pPage->intKeyLeaf = 0;
2059 pPage->xCellSize = cellSizePtr;
2060 pPage->xParseCell = btreeParseCellPtrIndex;
2061 pPage->maxLocal = pBt->maxLocal;
2062 pPage->minLocal = pBt->minLocal;
2063 }else if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
2064 pPage->intKeyLeaf = 0;
2065 pPage->xCellSize = cellSizePtrNoPayload;
2066 pPage->xParseCell = btreeParseCellPtrNoPayload;
2067 pPage->intKey = 1;
2068 pPage->maxLocal = pBt->maxLeaf;
2069 pPage->minLocal = pBt->minLeaf;
2070 }else{
2071 pPage->intKey = 0;
2072 pPage->intKeyLeaf = 0;
2073 pPage->xCellSize = cellSizePtr;
2074 pPage->xParseCell = btreeParseCellPtrIndex;
2075 return SQLITE_CORRUPT_PAGE(pPage);
2078 return SQLITE_OK;
2082 ** Compute the amount of freespace on the page. In other words, fill
2083 ** in the pPage->nFree field.
2085 static int btreeComputeFreeSpace(MemPage *pPage){
2086 int pc; /* Address of a freeblock within pPage->aData[] */
2087 u8 hdr; /* Offset to beginning of page header */
2088 u8 *data; /* Equal to pPage->aData */
2089 int usableSize; /* Amount of usable space on each page */
2090 int nFree; /* Number of unused bytes on the page */
2091 int top; /* First byte of the cell content area */
2092 int iCellFirst; /* First allowable cell or freeblock offset */
2093 int iCellLast; /* Last possible cell or freeblock offset */
2095 assert( pPage->pBt!=0 );
2096 assert( pPage->pBt->db!=0 );
2097 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2098 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
2099 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
2100 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
2101 assert( pPage->isInit==1 );
2102 assert( pPage->nFree<0 );
2104 usableSize = pPage->pBt->usableSize;
2105 hdr = pPage->hdrOffset;
2106 data = pPage->aData;
2107 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
2108 ** the start of the cell content area. A zero value for this integer is
2109 ** interpreted as 65536. */
2110 top = get2byteNotZero(&data[hdr+5]);
2111 iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
2112 iCellLast = usableSize - 4;
2114 /* Compute the total free space on the page
2115 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
2116 ** start of the first freeblock on the page, or is zero if there are no
2117 ** freeblocks. */
2118 pc = get2byte(&data[hdr+1]);
2119 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */
2120 if( pc>0 ){
2121 u32 next, size;
2122 if( pc<top ){
2123 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
2124 ** always be at least one cell before the first freeblock.
2126 return SQLITE_CORRUPT_PAGE(pPage);
2128 while( 1 ){
2129 if( pc>iCellLast ){
2130 /* Freeblock off the end of the page */
2131 return SQLITE_CORRUPT_PAGE(pPage);
2133 next = get2byte(&data[pc]);
2134 size = get2byte(&data[pc+2]);
2135 nFree = nFree + size;
2136 if( next<=pc+size+3 ) break;
2137 pc = next;
2139 if( next>0 ){
2140 /* Freeblock not in ascending order */
2141 return SQLITE_CORRUPT_PAGE(pPage);
2143 if( pc+size>(unsigned int)usableSize ){
2144 /* Last freeblock extends past page end */
2145 return SQLITE_CORRUPT_PAGE(pPage);
2149 /* At this point, nFree contains the sum of the offset to the start
2150 ** of the cell-content area plus the number of free bytes within
2151 ** the cell-content area. If this is greater than the usable-size
2152 ** of the page, then the page must be corrupted. This check also
2153 ** serves to verify that the offset to the start of the cell-content
2154 ** area, according to the page header, lies within the page.
2156 if( nFree>usableSize || nFree<iCellFirst ){
2157 return SQLITE_CORRUPT_PAGE(pPage);
2159 pPage->nFree = (u16)(nFree - iCellFirst);
2160 return SQLITE_OK;
2164 ** Do additional sanity check after btreeInitPage() if
2165 ** PRAGMA cell_size_check=ON
2167 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
2168 int iCellFirst; /* First allowable cell or freeblock offset */
2169 int iCellLast; /* Last possible cell or freeblock offset */
2170 int i; /* Index into the cell pointer array */
2171 int sz; /* Size of a cell */
2172 int pc; /* Address of a freeblock within pPage->aData[] */
2173 u8 *data; /* Equal to pPage->aData */
2174 int usableSize; /* Maximum usable space on the page */
2175 int cellOffset; /* Start of cell content area */
2177 iCellFirst = pPage->cellOffset + 2*pPage->nCell;
2178 usableSize = pPage->pBt->usableSize;
2179 iCellLast = usableSize - 4;
2180 data = pPage->aData;
2181 cellOffset = pPage->cellOffset;
2182 if( !pPage->leaf ) iCellLast--;
2183 for(i=0; i<pPage->nCell; i++){
2184 pc = get2byteAligned(&data[cellOffset+i*2]);
2185 testcase( pc==iCellFirst );
2186 testcase( pc==iCellLast );
2187 if( pc<iCellFirst || pc>iCellLast ){
2188 return SQLITE_CORRUPT_PAGE(pPage);
2190 sz = pPage->xCellSize(pPage, &data[pc]);
2191 testcase( pc+sz==usableSize );
2192 if( pc+sz>usableSize ){
2193 return SQLITE_CORRUPT_PAGE(pPage);
2196 return SQLITE_OK;
2200 ** Initialize the auxiliary information for a disk block.
2202 ** Return SQLITE_OK on success. If we see that the page does
2203 ** not contain a well-formed database page, then return
2204 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
2205 ** guarantee that the page is well-formed. It only shows that
2206 ** we failed to detect any corruption.
2208 static int btreeInitPage(MemPage *pPage){
2209 u8 *data; /* Equal to pPage->aData */
2210 BtShared *pBt; /* The main btree structure */
2212 assert( pPage->pBt!=0 );
2213 assert( pPage->pBt->db!=0 );
2214 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2215 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
2216 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
2217 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
2218 assert( pPage->isInit==0 );
2220 pBt = pPage->pBt;
2221 data = pPage->aData + pPage->hdrOffset;
2222 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
2223 ** the b-tree page type. */
2224 if( decodeFlags(pPage, data[0]) ){
2225 return SQLITE_CORRUPT_PAGE(pPage);
2227 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2228 pPage->maskPage = (u16)(pBt->pageSize - 1);
2229 pPage->nOverflow = 0;
2230 pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize;
2231 pPage->aCellIdx = data + pPage->childPtrSize + 8;
2232 pPage->aDataEnd = pPage->aData + pBt->pageSize;
2233 pPage->aDataOfst = pPage->aData + pPage->childPtrSize;
2234 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
2235 ** number of cells on the page. */
2236 pPage->nCell = get2byte(&data[3]);
2237 if( pPage->nCell>MX_CELL(pBt) ){
2238 /* To many cells for a single page. The page must be corrupt */
2239 return SQLITE_CORRUPT_PAGE(pPage);
2241 testcase( pPage->nCell==MX_CELL(pBt) );
2242 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
2243 ** possible for a root page of a table that contains no rows) then the
2244 ** offset to the cell content area will equal the page size minus the
2245 ** bytes of reserved space. */
2246 assert( pPage->nCell>0
2247 || get2byteNotZero(&data[5])==(int)pBt->usableSize
2248 || CORRUPT_DB );
2249 pPage->nFree = -1; /* Indicate that this value is yet uncomputed */
2250 pPage->isInit = 1;
2251 if( pBt->db->flags & SQLITE_CellSizeCk ){
2252 return btreeCellSizeCheck(pPage);
2254 return SQLITE_OK;
2258 ** Set up a raw page so that it looks like a database page holding
2259 ** no entries.
2261 static void zeroPage(MemPage *pPage, int flags){
2262 unsigned char *data = pPage->aData;
2263 BtShared *pBt = pPage->pBt;
2264 u8 hdr = pPage->hdrOffset;
2265 u16 first;
2267 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno || CORRUPT_DB );
2268 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2269 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
2270 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2271 assert( sqlite3_mutex_held(pBt->mutex) );
2272 if( pBt->btsFlags & BTS_FAST_SECURE ){
2273 memset(&data[hdr], 0, pBt->usableSize - hdr);
2275 data[hdr] = (char)flags;
2276 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
2277 memset(&data[hdr+1], 0, 4);
2278 data[hdr+7] = 0;
2279 put2byte(&data[hdr+5], pBt->usableSize);
2280 pPage->nFree = (u16)(pBt->usableSize - first);
2281 decodeFlags(pPage, flags);
2282 pPage->cellOffset = first;
2283 pPage->aDataEnd = &data[pBt->pageSize];
2284 pPage->aCellIdx = &data[first];
2285 pPage->aDataOfst = &data[pPage->childPtrSize];
2286 pPage->nOverflow = 0;
2287 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2288 pPage->maskPage = (u16)(pBt->pageSize - 1);
2289 pPage->nCell = 0;
2290 pPage->isInit = 1;
2295 ** Convert a DbPage obtained from the pager into a MemPage used by
2296 ** the btree layer.
2298 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
2299 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2300 if( pgno!=pPage->pgno ){
2301 pPage->aData = sqlite3PagerGetData(pDbPage);
2302 pPage->pDbPage = pDbPage;
2303 pPage->pBt = pBt;
2304 pPage->pgno = pgno;
2305 pPage->hdrOffset = pgno==1 ? 100 : 0;
2307 assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
2308 return pPage;
2312 ** Get a page from the pager. Initialize the MemPage.pBt and
2313 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage().
2315 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
2316 ** about the content of the page at this time. So do not go to the disk
2317 ** to fetch the content. Just fill in the content with zeros for now.
2318 ** If in the future we call sqlite3PagerWrite() on this page, that
2319 ** means we have started to be concerned about content and the disk
2320 ** read should occur at that point.
2322 static int btreeGetPage(
2323 BtShared *pBt, /* The btree */
2324 Pgno pgno, /* Number of the page to fetch */
2325 MemPage **ppPage, /* Return the page in this parameter */
2326 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2328 int rc;
2329 DbPage *pDbPage;
2331 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
2332 assert( sqlite3_mutex_held(pBt->mutex) );
2333 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
2334 if( rc ) return rc;
2335 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
2336 return SQLITE_OK;
2340 ** Retrieve a page from the pager cache. If the requested page is not
2341 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
2342 ** MemPage.aData elements if needed.
2344 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
2345 DbPage *pDbPage;
2346 assert( sqlite3_mutex_held(pBt->mutex) );
2347 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
2348 if( pDbPage ){
2349 return btreePageFromDbPage(pDbPage, pgno, pBt);
2351 return 0;
2355 ** Return the size of the database file in pages. If there is any kind of
2356 ** error, return ((unsigned int)-1).
2358 static Pgno btreePagecount(BtShared *pBt){
2359 return pBt->nPage;
2361 Pgno sqlite3BtreeLastPage(Btree *p){
2362 assert( sqlite3BtreeHoldsMutex(p) );
2363 return btreePagecount(p->pBt);
2367 ** Get a page from the pager and initialize it.
2369 static int getAndInitPage(
2370 BtShared *pBt, /* The database file */
2371 Pgno pgno, /* Number of the page to get */
2372 MemPage **ppPage, /* Write the page pointer here */
2373 int bReadOnly /* True for a read-only page */
2375 int rc;
2376 DbPage *pDbPage;
2377 MemPage *pPage;
2378 assert( sqlite3_mutex_held(pBt->mutex) );
2380 if( pgno>btreePagecount(pBt) ){
2381 *ppPage = 0;
2382 return SQLITE_CORRUPT_BKPT;
2384 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2385 if( rc ){
2386 *ppPage = 0;
2387 return rc;
2389 pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2390 if( pPage->isInit==0 ){
2391 btreePageFromDbPage(pDbPage, pgno, pBt);
2392 rc = btreeInitPage(pPage);
2393 if( rc!=SQLITE_OK ){
2394 releasePage(pPage);
2395 *ppPage = 0;
2396 return rc;
2399 assert( pPage->pgno==pgno || CORRUPT_DB );
2400 assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
2401 *ppPage = pPage;
2402 return SQLITE_OK;
2406 ** Release a MemPage. This should be called once for each prior
2407 ** call to btreeGetPage.
2409 ** Page1 is a special case and must be released using releasePageOne().
2411 static void releasePageNotNull(MemPage *pPage){
2412 assert( pPage->aData );
2413 assert( pPage->pBt );
2414 assert( pPage->pDbPage!=0 );
2415 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2416 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2417 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2418 sqlite3PagerUnrefNotNull(pPage->pDbPage);
2420 static void releasePage(MemPage *pPage){
2421 if( pPage ) releasePageNotNull(pPage);
2423 static void releasePageOne(MemPage *pPage){
2424 assert( pPage!=0 );
2425 assert( pPage->aData );
2426 assert( pPage->pBt );
2427 assert( pPage->pDbPage!=0 );
2428 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2429 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2430 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2431 sqlite3PagerUnrefPageOne(pPage->pDbPage);
2435 ** Get an unused page.
2437 ** This works just like btreeGetPage() with the addition:
2439 ** * If the page is already in use for some other purpose, immediately
2440 ** release it and return an SQLITE_CURRUPT error.
2441 ** * Make sure the isInit flag is clear
2443 static int btreeGetUnusedPage(
2444 BtShared *pBt, /* The btree */
2445 Pgno pgno, /* Number of the page to fetch */
2446 MemPage **ppPage, /* Return the page in this parameter */
2447 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2449 int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2450 if( rc==SQLITE_OK ){
2451 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2452 releasePage(*ppPage);
2453 *ppPage = 0;
2454 return SQLITE_CORRUPT_BKPT;
2456 (*ppPage)->isInit = 0;
2457 }else{
2458 *ppPage = 0;
2460 return rc;
2465 ** During a rollback, when the pager reloads information into the cache
2466 ** so that the cache is restored to its original state at the start of
2467 ** the transaction, for each page restored this routine is called.
2469 ** This routine needs to reset the extra data section at the end of the
2470 ** page to agree with the restored data.
2472 static void pageReinit(DbPage *pData){
2473 MemPage *pPage;
2474 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2475 assert( sqlite3PagerPageRefcount(pData)>0 );
2476 if( pPage->isInit ){
2477 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2478 pPage->isInit = 0;
2479 if( sqlite3PagerPageRefcount(pData)>1 ){
2480 /* pPage might not be a btree page; it might be an overflow page
2481 ** or ptrmap page or a free page. In those cases, the following
2482 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2483 ** But no harm is done by this. And it is very important that
2484 ** btreeInitPage() be called on every btree page so we make
2485 ** the call for every page that comes in for re-initializing. */
2486 btreeInitPage(pPage);
2492 ** Invoke the busy handler for a btree.
2494 static int btreeInvokeBusyHandler(void *pArg){
2495 BtShared *pBt = (BtShared*)pArg;
2496 assert( pBt->db );
2497 assert( sqlite3_mutex_held(pBt->db->mutex) );
2498 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2502 ** Open a database file.
2504 ** zFilename is the name of the database file. If zFilename is NULL
2505 ** then an ephemeral database is created. The ephemeral database might
2506 ** be exclusively in memory, or it might use a disk-based memory cache.
2507 ** Either way, the ephemeral database will be automatically deleted
2508 ** when sqlite3BtreeClose() is called.
2510 ** If zFilename is ":memory:" then an in-memory database is created
2511 ** that is automatically destroyed when it is closed.
2513 ** The "flags" parameter is a bitmask that might contain bits like
2514 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
2516 ** If the database is already opened in the same database connection
2517 ** and we are in shared cache mode, then the open will fail with an
2518 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared
2519 ** objects in the same database connection since doing so will lead
2520 ** to problems with locking.
2522 int sqlite3BtreeOpen(
2523 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */
2524 const char *zFilename, /* Name of the file containing the BTree database */
2525 sqlite3 *db, /* Associated database handle */
2526 Btree **ppBtree, /* Pointer to new Btree object written here */
2527 int flags, /* Options */
2528 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
2530 BtShared *pBt = 0; /* Shared part of btree structure */
2531 Btree *p; /* Handle to return */
2532 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */
2533 int rc = SQLITE_OK; /* Result code from this function */
2534 u8 nReserve; /* Byte of unused space on each page */
2535 unsigned char zDbHeader[100]; /* Database header content */
2537 /* True if opening an ephemeral, temporary database */
2538 const int isTempDb = zFilename==0 || zFilename[0]==0;
2540 /* Set the variable isMemdb to true for an in-memory database, or
2541 ** false for a file-based database.
2543 #ifdef SQLITE_OMIT_MEMORYDB
2544 const int isMemdb = 0;
2545 #else
2546 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
2547 || (isTempDb && sqlite3TempInMemory(db))
2548 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
2549 #endif
2551 assert( db!=0 );
2552 assert( pVfs!=0 );
2553 assert( sqlite3_mutex_held(db->mutex) );
2554 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */
2556 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
2557 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
2559 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
2560 assert( (flags & BTREE_SINGLE)==0 || isTempDb );
2562 if( isMemdb ){
2563 flags |= BTREE_MEMORY;
2565 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
2566 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
2568 p = sqlite3MallocZero(sizeof(Btree));
2569 if( !p ){
2570 return SQLITE_NOMEM_BKPT;
2572 p->inTrans = TRANS_NONE;
2573 p->db = db;
2574 #ifndef SQLITE_OMIT_SHARED_CACHE
2575 p->lock.pBtree = p;
2576 p->lock.iTable = 1;
2577 #endif
2579 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2581 ** If this Btree is a candidate for shared cache, try to find an
2582 ** existing BtShared object that we can share with
2584 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
2585 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
2586 int nFilename = sqlite3Strlen30(zFilename)+1;
2587 int nFullPathname = pVfs->mxPathname+1;
2588 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
2589 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2591 p->sharable = 1;
2592 if( !zFullPathname ){
2593 sqlite3_free(p);
2594 return SQLITE_NOMEM_BKPT;
2596 if( isMemdb ){
2597 memcpy(zFullPathname, zFilename, nFilename);
2598 }else{
2599 rc = sqlite3OsFullPathname(pVfs, zFilename,
2600 nFullPathname, zFullPathname);
2601 if( rc ){
2602 if( rc==SQLITE_OK_SYMLINK ){
2603 rc = SQLITE_OK;
2604 }else{
2605 sqlite3_free(zFullPathname);
2606 sqlite3_free(p);
2607 return rc;
2611 #if SQLITE_THREADSAFE
2612 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
2613 sqlite3_mutex_enter(mutexOpen);
2614 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);
2615 sqlite3_mutex_enter(mutexShared);
2616 #endif
2617 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
2618 assert( pBt->nRef>0 );
2619 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
2620 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
2621 int iDb;
2622 for(iDb=db->nDb-1; iDb>=0; iDb--){
2623 Btree *pExisting = db->aDb[iDb].pBt;
2624 if( pExisting && pExisting->pBt==pBt ){
2625 sqlite3_mutex_leave(mutexShared);
2626 sqlite3_mutex_leave(mutexOpen);
2627 sqlite3_free(zFullPathname);
2628 sqlite3_free(p);
2629 return SQLITE_CONSTRAINT;
2632 p->pBt = pBt;
2633 pBt->nRef++;
2634 break;
2637 sqlite3_mutex_leave(mutexShared);
2638 sqlite3_free(zFullPathname);
2640 #ifdef SQLITE_DEBUG
2641 else{
2642 /* In debug mode, we mark all persistent databases as sharable
2643 ** even when they are not. This exercises the locking code and
2644 ** gives more opportunity for asserts(sqlite3_mutex_held())
2645 ** statements to find locking problems.
2647 p->sharable = 1;
2649 #endif
2651 #endif
2652 if( pBt==0 ){
2654 ** The following asserts make sure that structures used by the btree are
2655 ** the right size. This is to guard against size changes that result
2656 ** when compiling on a different architecture.
2658 assert( sizeof(i64)==8 );
2659 assert( sizeof(u64)==8 );
2660 assert( sizeof(u32)==4 );
2661 assert( sizeof(u16)==2 );
2662 assert( sizeof(Pgno)==4 );
2664 /* Suppress false-positive compiler warning from PVS-Studio */
2665 memset(&zDbHeader[16], 0, 8);
2667 pBt = sqlite3MallocZero( sizeof(*pBt) );
2668 if( pBt==0 ){
2669 rc = SQLITE_NOMEM_BKPT;
2670 goto btree_open_out;
2672 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
2673 sizeof(MemPage), flags, vfsFlags, pageReinit);
2674 if( rc==SQLITE_OK ){
2675 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
2676 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
2678 if( rc!=SQLITE_OK ){
2679 goto btree_open_out;
2681 pBt->openFlags = (u8)flags;
2682 pBt->db = db;
2683 sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
2684 p->pBt = pBt;
2686 pBt->pCursor = 0;
2687 pBt->pPage1 = 0;
2688 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
2689 #if defined(SQLITE_SECURE_DELETE)
2690 pBt->btsFlags |= BTS_SECURE_DELETE;
2691 #elif defined(SQLITE_FAST_SECURE_DELETE)
2692 pBt->btsFlags |= BTS_OVERWRITE;
2693 #endif
2694 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
2695 ** determined by the 2-byte integer located at an offset of 16 bytes from
2696 ** the beginning of the database file. */
2697 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
2698 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
2699 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
2700 pBt->pageSize = 0;
2701 #ifndef SQLITE_OMIT_AUTOVACUUM
2702 /* If the magic name ":memory:" will create an in-memory database, then
2703 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
2704 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
2705 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
2706 ** regular file-name. In this case the auto-vacuum applies as per normal.
2708 if( zFilename && !isMemdb ){
2709 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
2710 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
2712 #endif
2713 nReserve = 0;
2714 }else{
2715 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
2716 ** determined by the one-byte unsigned integer found at an offset of 20
2717 ** into the database file header. */
2718 nReserve = zDbHeader[20];
2719 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2720 #ifndef SQLITE_OMIT_AUTOVACUUM
2721 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
2722 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
2723 #endif
2725 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2726 if( rc ) goto btree_open_out;
2727 pBt->usableSize = pBt->pageSize - nReserve;
2728 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
2730 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2731 /* Add the new BtShared object to the linked list sharable BtShareds.
2733 pBt->nRef = 1;
2734 if( p->sharable ){
2735 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
2736 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);)
2737 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
2738 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
2739 if( pBt->mutex==0 ){
2740 rc = SQLITE_NOMEM_BKPT;
2741 goto btree_open_out;
2744 sqlite3_mutex_enter(mutexShared);
2745 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
2746 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
2747 sqlite3_mutex_leave(mutexShared);
2749 #endif
2752 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
2753 /* If the new Btree uses a sharable pBtShared, then link the new
2754 ** Btree into the list of all sharable Btrees for the same connection.
2755 ** The list is kept in ascending order by pBt address.
2757 if( p->sharable ){
2758 int i;
2759 Btree *pSib;
2760 for(i=0; i<db->nDb; i++){
2761 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
2762 while( pSib->pPrev ){ pSib = pSib->pPrev; }
2763 if( (uptr)p->pBt<(uptr)pSib->pBt ){
2764 p->pNext = pSib;
2765 p->pPrev = 0;
2766 pSib->pPrev = p;
2767 }else{
2768 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
2769 pSib = pSib->pNext;
2771 p->pNext = pSib->pNext;
2772 p->pPrev = pSib;
2773 if( p->pNext ){
2774 p->pNext->pPrev = p;
2776 pSib->pNext = p;
2778 break;
2782 #endif
2783 *ppBtree = p;
2785 btree_open_out:
2786 if( rc!=SQLITE_OK ){
2787 if( pBt && pBt->pPager ){
2788 sqlite3PagerClose(pBt->pPager, 0);
2790 sqlite3_free(pBt);
2791 sqlite3_free(p);
2792 *ppBtree = 0;
2793 }else{
2794 sqlite3_file *pFile;
2796 /* If the B-Tree was successfully opened, set the pager-cache size to the
2797 ** default value. Except, when opening on an existing shared pager-cache,
2798 ** do not change the pager-cache size.
2800 if( sqlite3BtreeSchema(p, 0, 0)==0 ){
2801 sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE);
2804 pFile = sqlite3PagerFile(pBt->pPager);
2805 if( pFile->pMethods ){
2806 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
2809 if( mutexOpen ){
2810 assert( sqlite3_mutex_held(mutexOpen) );
2811 sqlite3_mutex_leave(mutexOpen);
2813 assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
2814 return rc;
2818 ** Decrement the BtShared.nRef counter. When it reaches zero,
2819 ** remove the BtShared structure from the sharing list. Return
2820 ** true if the BtShared.nRef counter reaches zero and return
2821 ** false if it is still positive.
2823 static int removeFromSharingList(BtShared *pBt){
2824 #ifndef SQLITE_OMIT_SHARED_CACHE
2825 MUTEX_LOGIC( sqlite3_mutex *pMainMtx; )
2826 BtShared *pList;
2827 int removed = 0;
2829 assert( sqlite3_mutex_notheld(pBt->mutex) );
2830 MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); )
2831 sqlite3_mutex_enter(pMainMtx);
2832 pBt->nRef--;
2833 if( pBt->nRef<=0 ){
2834 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2835 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2836 }else{
2837 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2838 while( ALWAYS(pList) && pList->pNext!=pBt ){
2839 pList=pList->pNext;
2841 if( ALWAYS(pList) ){
2842 pList->pNext = pBt->pNext;
2845 if( SQLITE_THREADSAFE ){
2846 sqlite3_mutex_free(pBt->mutex);
2848 removed = 1;
2850 sqlite3_mutex_leave(pMainMtx);
2851 return removed;
2852 #else
2853 return 1;
2854 #endif
2858 ** Make sure pBt->pTmpSpace points to an allocation of
2859 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2860 ** pointer.
2862 static SQLITE_NOINLINE int allocateTempSpace(BtShared *pBt){
2863 assert( pBt!=0 );
2864 assert( pBt->pTmpSpace==0 );
2865 /* This routine is called only by btreeCursor() when allocating the
2866 ** first write cursor for the BtShared object */
2867 assert( pBt->pCursor!=0 && (pBt->pCursor->curFlags & BTCF_WriteFlag)!=0 );
2868 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2869 if( pBt->pTmpSpace==0 ){
2870 BtCursor *pCur = pBt->pCursor;
2871 pBt->pCursor = pCur->pNext; /* Unlink the cursor */
2872 memset(pCur, 0, sizeof(*pCur));
2873 return SQLITE_NOMEM_BKPT;
2876 /* One of the uses of pBt->pTmpSpace is to format cells before
2877 ** inserting them into a leaf page (function fillInCell()). If
2878 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2879 ** by the various routines that manipulate binary cells. Which
2880 ** can mean that fillInCell() only initializes the first 2 or 3
2881 ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2882 ** it into a database page. This is not actually a problem, but it
2883 ** does cause a valgrind error when the 1 or 2 bytes of uninitialized
2884 ** data is passed to system call write(). So to avoid this error,
2885 ** zero the first 4 bytes of temp space here.
2887 ** Also: Provide four bytes of initialized space before the
2888 ** beginning of pTmpSpace as an area available to prepend the
2889 ** left-child pointer to the beginning of a cell.
2891 memset(pBt->pTmpSpace, 0, 8);
2892 pBt->pTmpSpace += 4;
2893 return SQLITE_OK;
2897 ** Free the pBt->pTmpSpace allocation
2899 static void freeTempSpace(BtShared *pBt){
2900 if( pBt->pTmpSpace ){
2901 pBt->pTmpSpace -= 4;
2902 sqlite3PageFree(pBt->pTmpSpace);
2903 pBt->pTmpSpace = 0;
2908 ** Close an open database and invalidate all cursors.
2910 int sqlite3BtreeClose(Btree *p){
2911 BtShared *pBt = p->pBt;
2913 /* Close all cursors opened via this handle. */
2914 assert( sqlite3_mutex_held(p->db->mutex) );
2915 sqlite3BtreeEnter(p);
2917 /* Verify that no other cursors have this Btree open */
2918 #ifdef SQLITE_DEBUG
2920 BtCursor *pCur = pBt->pCursor;
2921 while( pCur ){
2922 BtCursor *pTmp = pCur;
2923 pCur = pCur->pNext;
2924 assert( pTmp->pBtree!=p );
2928 #endif
2930 /* Rollback any active transaction and free the handle structure.
2931 ** The call to sqlite3BtreeRollback() drops any table-locks held by
2932 ** this handle.
2934 sqlite3BtreeRollback(p, SQLITE_OK, 0);
2935 sqlite3BtreeLeave(p);
2937 /* If there are still other outstanding references to the shared-btree
2938 ** structure, return now. The remainder of this procedure cleans
2939 ** up the shared-btree.
2941 assert( p->wantToLock==0 && p->locked==0 );
2942 if( !p->sharable || removeFromSharingList(pBt) ){
2943 /* The pBt is no longer on the sharing list, so we can access
2944 ** it without having to hold the mutex.
2946 ** Clean out and delete the BtShared object.
2948 assert( !pBt->pCursor );
2949 sqlite3PagerClose(pBt->pPager, p->db);
2950 if( pBt->xFreeSchema && pBt->pSchema ){
2951 pBt->xFreeSchema(pBt->pSchema);
2953 sqlite3DbFree(0, pBt->pSchema);
2954 freeTempSpace(pBt);
2955 sqlite3_free(pBt);
2958 #ifndef SQLITE_OMIT_SHARED_CACHE
2959 assert( p->wantToLock==0 );
2960 assert( p->locked==0 );
2961 if( p->pPrev ) p->pPrev->pNext = p->pNext;
2962 if( p->pNext ) p->pNext->pPrev = p->pPrev;
2963 #endif
2965 sqlite3_free(p);
2966 return SQLITE_OK;
2970 ** Change the "soft" limit on the number of pages in the cache.
2971 ** Unused and unmodified pages will be recycled when the number of
2972 ** pages in the cache exceeds this soft limit. But the size of the
2973 ** cache is allowed to grow larger than this limit if it contains
2974 ** dirty pages or pages still in active use.
2976 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2977 BtShared *pBt = p->pBt;
2978 assert( sqlite3_mutex_held(p->db->mutex) );
2979 sqlite3BtreeEnter(p);
2980 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2981 sqlite3BtreeLeave(p);
2982 return SQLITE_OK;
2986 ** Change the "spill" limit on the number of pages in the cache.
2987 ** If the number of pages exceeds this limit during a write transaction,
2988 ** the pager might attempt to "spill" pages to the journal early in
2989 ** order to free up memory.
2991 ** The value returned is the current spill size. If zero is passed
2992 ** as an argument, no changes are made to the spill size setting, so
2993 ** using mxPage of 0 is a way to query the current spill size.
2995 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2996 BtShared *pBt = p->pBt;
2997 int res;
2998 assert( sqlite3_mutex_held(p->db->mutex) );
2999 sqlite3BtreeEnter(p);
3000 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
3001 sqlite3BtreeLeave(p);
3002 return res;
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Change the limit on the amount of the database file that may be
** memory mapped.
**
** The new limit szMmap is passed through to sqlite3PagerSetMmapLimit().
** The caller must hold p->db->mutex.  Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  BtShared *pBt = p->pBt;
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
3021 ** Change the way data is synced to disk in order to increase or decrease
3022 ** how well the database resists damage due to OS crashes and power
3023 ** failures. Level 1 is the same as asynchronous (no syncs() occur and
3024 ** there is a high probability of damage) Level 2 is the default. There
3025 ** is a very low but non-zero probability of damage. Level 3 reduces the
3026 ** probability of damage to near zero but with a write performance reduction.
3028 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
3029 int sqlite3BtreeSetPagerFlags(
3030 Btree *p, /* The btree to set the safety level on */
3031 unsigned pgFlags /* Various PAGER_* flags */
3033 BtShared *pBt = p->pBt;
3034 assert( sqlite3_mutex_held(p->db->mutex) );
3035 sqlite3BtreeEnter(p);
3036 sqlite3PagerSetFlags(pBt->pPager, pgFlags);
3037 sqlite3BtreeLeave(p);
3038 return SQLITE_OK;
3040 #endif
3043 ** Change the default pages size and the number of reserved bytes per page.
3044 ** Or, if the page size has already been fixed, return SQLITE_READONLY
3045 ** without changing anything.
3047 ** The page size must be a power of 2 between 512 and 65536. If the page
3048 ** size supplied does not meet this constraint then the page size is not
3049 ** changed.
3051 ** Page sizes are constrained to be a power of two so that the region
3052 ** of the database file used for locking (beginning at PENDING_BYTE,
3053 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
3054 ** at the beginning of a page.
3056 ** If parameter nReserve is less than zero, then the number of reserved
3057 ** bytes per page is left unchanged.
3059 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
3060 ** and autovacuum mode can no longer be changed.
3062 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
3063 int rc = SQLITE_OK;
3064 int x;
3065 BtShared *pBt = p->pBt;
3066 assert( nReserve>=0 && nReserve<=255 );
3067 sqlite3BtreeEnter(p);
3068 pBt->nReserveWanted = nReserve;
3069 x = pBt->pageSize - pBt->usableSize;
3070 if( nReserve<x ) nReserve = x;
3071 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
3072 sqlite3BtreeLeave(p);
3073 return SQLITE_READONLY;
3075 assert( nReserve>=0 && nReserve<=255 );
3076 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
3077 ((pageSize-1)&pageSize)==0 ){
3078 assert( (pageSize & 7)==0 );
3079 assert( !pBt->pCursor );
3080 if( nReserve>32 && pageSize==512 ) pageSize = 1024;
3081 pBt->pageSize = (u32)pageSize;
3082 freeTempSpace(pBt);
3084 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
3085 pBt->usableSize = pBt->pageSize - (u16)nReserve;
3086 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3087 sqlite3BtreeLeave(p);
3088 return rc;
3092 ** Return the currently defined page size
3094 int sqlite3BtreeGetPageSize(Btree *p){
3095 return p->pBt->pageSize;
3099 ** This function is similar to sqlite3BtreeGetReserve(), except that it
3100 ** may only be called if it is guaranteed that the b-tree mutex is already
3101 ** held.
3103 ** This is useful in one special case in the backup API code where it is
3104 ** known that the shared b-tree mutex is held, but the mutex on the
3105 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
3106 ** were to be called, it might collide with some other operation on the
3107 ** database handle that owns *p, causing undefined behavior.
3109 int sqlite3BtreeGetReserveNoMutex(Btree *p){
3110 int n;
3111 assert( sqlite3_mutex_held(p->pBt->mutex) );
3112 n = p->pBt->pageSize - p->pBt->usableSize;
3113 return n;
3117 ** Return the number of bytes of space at the end of every page that
3118 ** are intentionally left unused. This is the "reserved" space that is
3119 ** sometimes used by extensions.
3121 ** The value returned is the larger of the current reserve size and
3122 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES.
3123 ** The amount of reserve can only grow - never shrink.
3125 int sqlite3BtreeGetRequestedReserve(Btree *p){
3126 int n1, n2;
3127 sqlite3BtreeEnter(p);
3128 n1 = (int)p->pBt->nReserveWanted;
3129 n2 = sqlite3BtreeGetReserveNoMutex(p);
3130 sqlite3BtreeLeave(p);
3131 return n1>n2 ? n1 : n2;
3136 ** Set the maximum page count for a database if mxPage is positive.
3137 ** No changes are made if mxPage is 0 or negative.
3138 ** Regardless of the value of mxPage, return the maximum page count.
3140 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){
3141 Pgno n;
3142 sqlite3BtreeEnter(p);
3143 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
3144 sqlite3BtreeLeave(p);
3145 return n;
3149 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
3151 ** newFlag==0 Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
3152 ** newFlag==1 BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
3153 ** newFlag==2 BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
3154 ** newFlag==(-1) No changes
3156 ** This routine acts as a query if newFlag is less than zero
3158 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
3159 ** freelist leaf pages are not written back to the database. Thus in-page
3160 ** deleted content is cleared, but freelist deleted content is not.
3162 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
3163 ** that freelist leaf pages are written back into the database, increasing
3164 ** the amount of disk I/O.
3166 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
3167 int b;
3168 if( p==0 ) return 0;
3169 sqlite3BtreeEnter(p);
3170 assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
3171 assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
3172 if( newFlag>=0 ){
3173 p->pBt->btsFlags &= ~BTS_FAST_SECURE;
3174 p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
3176 b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
3177 sqlite3BtreeLeave(p);
3178 return b;
3182 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
3183 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
3184 ** is disabled. The default value for the auto-vacuum property is
3185 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
3187 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
3188 #ifdef SQLITE_OMIT_AUTOVACUUM
3189 return SQLITE_READONLY;
3190 #else
3191 BtShared *pBt = p->pBt;
3192 int rc = SQLITE_OK;
3193 u8 av = (u8)autoVacuum;
3195 sqlite3BtreeEnter(p);
3196 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
3197 rc = SQLITE_READONLY;
3198 }else{
3199 pBt->autoVacuum = av ?1:0;
3200 pBt->incrVacuum = av==2 ?1:0;
3202 sqlite3BtreeLeave(p);
3203 return rc;
3204 #endif
3208 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
3209 ** enabled 1 is returned. Otherwise 0.
3211 int sqlite3BtreeGetAutoVacuum(Btree *p){
3212 #ifdef SQLITE_OMIT_AUTOVACUUM
3213 return BTREE_AUTOVACUUM_NONE;
3214 #else
3215 int rc;
3216 sqlite3BtreeEnter(p);
3217 rc = (
3218 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
3219 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
3220 BTREE_AUTOVACUUM_INCR
3222 sqlite3BtreeLeave(p);
3223 return rc;
3224 #endif
/*
** If the user has not set the safety-level for this database connection
** using "PRAGMA synchronous", and if the safety-level is not already
** set to the value passed to this function as the second parameter,
** set it so.
**
** The Db entry that owns pBt is located by scanning db->aDb[].  Nothing
** is changed if Db.bSyncSet is already set, or if the entry found is
** db->aDb[1].  When the level is changed, the new pager flags are
** pushed down via sqlite3PagerSetFlags().
**
** This routine compiles to a no-op macro unless the default rollback
** and WAL synchronous settings differ and WAL support is compiled in.
*/
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
    && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db;
  Db *pDb;
  if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){
    /* The scan has no explicit bound: it relies on pBt being attached
    ** to one of the entries of db->aDb[]. */
    while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){ pDb++; }
    if( pDb->bSyncSet==0
     && pDb->safety_level!=safety_level
     && pDb!=&db->aDb[1]
    ){
      pDb->safety_level = safety_level;
      sqlite3PagerSetFlags(pBt->pPager,
          pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
    }
  }
}
#else
# define setDefaultSyncFlag(pBt,safety_level)
#endif
3254 /* Forward declaration */
3255 static int newDatabase(BtShared*);
3259 ** Get a reference to pPage1 of the database file. This will
3260 ** also acquire a readlock on that file.
3262 ** SQLITE_OK is returned on success. If the file is not a
3263 ** well-formed database file, then SQLITE_CORRUPT is returned.
3264 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
3265 ** is returned if we run out of memory.
3267 static int lockBtree(BtShared *pBt){
3268 int rc; /* Result code from subfunctions */
3269 MemPage *pPage1; /* Page 1 of the database file */
3270 u32 nPage; /* Number of pages in the database */
3271 u32 nPageFile = 0; /* Number of pages in the database file */
3273 assert( sqlite3_mutex_held(pBt->mutex) );
3274 assert( pBt->pPage1==0 );
3275 rc = sqlite3PagerSharedLock(pBt->pPager);
3276 if( rc!=SQLITE_OK ) return rc;
3277 rc = btreeGetPage(pBt, 1, &pPage1, 0);
3278 if( rc!=SQLITE_OK ) return rc;
3280 /* Do some checking to help insure the file we opened really is
3281 ** a valid database file.
3283 nPage = get4byte(28+(u8*)pPage1->aData);
3284 sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
3285 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
3286 nPage = nPageFile;
3288 if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
3289 nPage = 0;
3291 if( nPage>0 ){
3292 u32 pageSize;
3293 u32 usableSize;
3294 u8 *page1 = pPage1->aData;
3295 rc = SQLITE_NOTADB;
3296 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
3297 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
3298 ** 61 74 20 33 00. */
3299 if( memcmp(page1, zMagicHeader, 16)!=0 ){
3300 goto page1_init_failed;
3303 #ifdef SQLITE_OMIT_WAL
3304 if( page1[18]>1 ){
3305 pBt->btsFlags |= BTS_READ_ONLY;
3307 if( page1[19]>1 ){
3308 goto page1_init_failed;
3310 #else
3311 if( page1[18]>2 ){
3312 pBt->btsFlags |= BTS_READ_ONLY;
3314 if( page1[19]>2 ){
3315 goto page1_init_failed;
3318 /* If the read version is set to 2, this database should be accessed
3319 ** in WAL mode. If the log is not already open, open it now. Then
3320 ** return SQLITE_OK and return without populating BtShared.pPage1.
3321 ** The caller detects this and calls this function again. This is
3322 ** required as the version of page 1 currently in the page1 buffer
3323 ** may not be the latest version - there may be a newer one in the log
3324 ** file.
3326 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
3327 int isOpen = 0;
3328 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
3329 if( rc!=SQLITE_OK ){
3330 goto page1_init_failed;
3331 }else{
3332 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
3333 if( isOpen==0 ){
3334 releasePageOne(pPage1);
3335 return SQLITE_OK;
3338 rc = SQLITE_NOTADB;
3339 }else{
3340 setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
3342 #endif
3344 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
3345 ** fractions and the leaf payload fraction values must be 64, 32, and 32.
3347 ** The original design allowed these amounts to vary, but as of
3348 ** version 3.6.0, we require them to be fixed.
3350 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
3351 goto page1_init_failed;
3353 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
3354 ** determined by the 2-byte integer located at an offset of 16 bytes from
3355 ** the beginning of the database file. */
3356 pageSize = (page1[16]<<8) | (page1[17]<<16);
3357 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
3358 ** between 512 and 65536 inclusive. */
3359 if( ((pageSize-1)&pageSize)!=0
3360 || pageSize>SQLITE_MAX_PAGE_SIZE
3361 || pageSize<=256
3363 goto page1_init_failed;
3365 assert( (pageSize & 7)==0 );
3366 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
3367 ** integer at offset 20 is the number of bytes of space at the end of
3368 ** each page to reserve for extensions.
3370 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
3371 ** determined by the one-byte unsigned integer found at an offset of 20
3372 ** into the database file header. */
3373 usableSize = pageSize - page1[20];
3374 if( (u32)pageSize!=pBt->pageSize ){
3375 /* After reading the first page of the database assuming a page size
3376 ** of BtShared.pageSize, we have discovered that the page-size is
3377 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
3378 ** zero and return SQLITE_OK. The caller will call this function
3379 ** again with the correct page-size.
3381 releasePageOne(pPage1);
3382 pBt->usableSize = usableSize;
3383 pBt->pageSize = pageSize;
3384 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3385 freeTempSpace(pBt);
3386 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
3387 pageSize-usableSize);
3388 return rc;
3390 if( nPage>nPageFile ){
3391 if( sqlite3WritableSchema(pBt->db)==0 ){
3392 rc = SQLITE_CORRUPT_BKPT;
3393 goto page1_init_failed;
3394 }else{
3395 nPage = nPageFile;
3398 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
3399 ** be less than 480. In other words, if the page size is 512, then the
3400 ** reserved space size cannot exceed 32. */
3401 if( usableSize<480 ){
3402 goto page1_init_failed;
3404 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3405 pBt->pageSize = pageSize;
3406 pBt->usableSize = usableSize;
3407 #ifndef SQLITE_OMIT_AUTOVACUUM
3408 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
3409 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
3410 #endif
3413 /* maxLocal is the maximum amount of payload to store locally for
3414 ** a cell. Make sure it is small enough so that at least minFanout
3415 ** cells can will fit on one page. We assume a 10-byte page header.
3416 ** Besides the payload, the cell must store:
3417 ** 2-byte pointer to the cell
3418 ** 4-byte child pointer
3419 ** 9-byte nKey value
3420 ** 4-byte nData value
3421 ** 4-byte overflow page pointer
3422 ** So a cell consists of a 2-byte pointer, a header which is as much as
3423 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
3424 ** page pointer.
3426 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
3427 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
3428 pBt->maxLeaf = (u16)(pBt->usableSize - 35);
3429 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
3430 if( pBt->maxLocal>127 ){
3431 pBt->max1bytePayload = 127;
3432 }else{
3433 pBt->max1bytePayload = (u8)pBt->maxLocal;
3435 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
3436 pBt->pPage1 = pPage1;
3437 pBt->nPage = nPage;
3438 return SQLITE_OK;
3440 page1_init_failed:
3441 releasePageOne(pPage1);
3442 pBt->pPage1 = 0;
3443 return rc;
3446 #ifndef NDEBUG
3448 ** Return the number of cursors open on pBt. This is for use
3449 ** in assert() expressions, so it is only compiled if NDEBUG is not
3450 ** defined.
3452 ** Only write cursors are counted if wrOnly is true. If wrOnly is
3453 ** false then all cursors are counted.
3455 ** For the purposes of this routine, a cursor is any cursor that
3456 ** is capable of reading or writing to the database. Cursors that
3457 ** have been tripped into the CURSOR_FAULT state are not counted.
3459 static int countValidCursors(BtShared *pBt, int wrOnly){
3460 BtCursor *pCur;
3461 int r = 0;
3462 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3463 if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3464 && pCur->eState!=CURSOR_FAULT ) r++;
3466 return r;
3468 #endif
3471 ** If there are no outstanding cursors and we are not in the middle
3472 ** of a transaction but there is a read lock on the database, then
3473 ** this routine unrefs the first page of the database file which
3474 ** has the effect of releasing the read lock.
3476 ** If there is a transaction in progress, this routine is a no-op.
3478 static void unlockBtreeIfUnused(BtShared *pBt){
3479 assert( sqlite3_mutex_held(pBt->mutex) );
3480 assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3481 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3482 MemPage *pPage1 = pBt->pPage1;
3483 assert( pPage1->aData );
3484 assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3485 pBt->pPage1 = 0;
3486 releasePageOne(pPage1);
3491 ** If pBt points to an empty file then convert that empty file
3492 ** into a new empty database by initializing the first page of
3493 ** the database.
3495 static int newDatabase(BtShared *pBt){
3496 MemPage *pP1;
3497 unsigned char *data;
3498 int rc;
3500 assert( sqlite3_mutex_held(pBt->mutex) );
3501 if( pBt->nPage>0 ){
3502 return SQLITE_OK;
3504 pP1 = pBt->pPage1;
3505 assert( pP1!=0 );
3506 data = pP1->aData;
3507 rc = sqlite3PagerWrite(pP1->pDbPage);
3508 if( rc ) return rc;
3509 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3510 assert( sizeof(zMagicHeader)==16 );
3511 data[16] = (u8)((pBt->pageSize>>8)&0xff);
3512 data[17] = (u8)((pBt->pageSize>>16)&0xff);
3513 data[18] = 1;
3514 data[19] = 1;
3515 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3516 data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3517 data[21] = 64;
3518 data[22] = 32;
3519 data[23] = 32;
3520 memset(&data[24], 0, 100-24);
3521 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3522 pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3523 #ifndef SQLITE_OMIT_AUTOVACUUM
3524 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3525 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3526 put4byte(&data[36 + 4*4], pBt->autoVacuum);
3527 put4byte(&data[36 + 7*4], pBt->incrVacuum);
3528 #endif
3529 pBt->nPage = 1;
3530 data[31] = 1;
3531 return SQLITE_OK;
3535 ** Initialize the first page of the database file (creating a database
3536 ** consisting of a single page and no schema objects). Return SQLITE_OK
3537 ** if successful, or an SQLite error code otherwise.
3539 int sqlite3BtreeNewDb(Btree *p){
3540 int rc;
3541 sqlite3BtreeEnter(p);
3542 p->pBt->nPage = 0;
3543 rc = newDatabase(p->pBt);
3544 sqlite3BtreeLeave(p);
3545 return rc;
3549 ** Attempt to start a new transaction. A write-transaction
3550 ** is started if the second argument is nonzero, otherwise a read-
3551 ** transaction. If the second argument is 2 or more and exclusive
3552 ** transaction is started, meaning that no other process is allowed
3553 ** to access the database. A preexisting transaction may not be
3554 ** upgraded to exclusive by calling this routine a second time - the
3555 ** exclusivity flag only works for a new transaction.
3557 ** A write-transaction must be started before attempting any
3558 ** changes to the database. None of the following routines
3559 ** will work unless a transaction is started first:
3561 ** sqlite3BtreeCreateTable()
3562 ** sqlite3BtreeCreateIndex()
3563 ** sqlite3BtreeClearTable()
3564 ** sqlite3BtreeDropTable()
3565 ** sqlite3BtreeInsert()
3566 ** sqlite3BtreeDelete()
3567 ** sqlite3BtreeUpdateMeta()
3569 ** If an initial attempt to acquire the lock fails because of lock contention
3570 ** and the database was previously unlocked, then invoke the busy handler
3571 ** if there is one. But if there was previously a read-lock, do not
3572 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
3573 ** returned when there is already a read-lock in order to avoid a deadlock.
3575 ** Suppose there are two processes A and B. A has a read lock and B has
3576 ** a reserved lock. B tries to promote to exclusive but is blocked because
3577 ** of A's read lock. A tries to promote to reserved but is blocked by B.
3578 ** One or the other of the two processes must give way or there can be
3579 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
3580 ** when A already has a read lock, we encourage A to give up and let B
3581 ** proceed.
3583 static SQLITE_NOINLINE int btreeBeginTrans(
3584 Btree *p, /* The btree in which to start the transaction */
3585 int wrflag, /* True to start a write transaction */
3586 int *pSchemaVersion /* Put schema version number here, if not NULL */
3588 BtShared *pBt = p->pBt;
3589 Pager *pPager = pBt->pPager;
3590 int rc = SQLITE_OK;
3592 sqlite3BtreeEnter(p);
3593 btreeIntegrity(p);
3595 /* If the btree is already in a write-transaction, or it
3596 ** is already in a read-transaction and a read-transaction
3597 ** is requested, this is a no-op.
3599 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
3600 goto trans_begun;
3602 assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
3604 if( (p->db->flags & SQLITE_ResetDatabase)
3605 && sqlite3PagerIsreadonly(pPager)==0
3607 pBt->btsFlags &= ~BTS_READ_ONLY;
3610 /* Write transactions are not possible on a read-only database */
3611 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
3612 rc = SQLITE_READONLY;
3613 goto trans_begun;
3616 #ifndef SQLITE_OMIT_SHARED_CACHE
3618 sqlite3 *pBlock = 0;
3619 /* If another database handle has already opened a write transaction
3620 ** on this shared-btree structure and a second write transaction is
3621 ** requested, return SQLITE_LOCKED.
3623 if( (wrflag && pBt->inTransaction==TRANS_WRITE)
3624 || (pBt->btsFlags & BTS_PENDING)!=0
3626 pBlock = pBt->pWriter->db;
3627 }else if( wrflag>1 ){
3628 BtLock *pIter;
3629 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
3630 if( pIter->pBtree!=p ){
3631 pBlock = pIter->pBtree->db;
3632 break;
3636 if( pBlock ){
3637 sqlite3ConnectionBlocked(p->db, pBlock);
3638 rc = SQLITE_LOCKED_SHAREDCACHE;
3639 goto trans_begun;
3642 #endif
3644 /* Any read-only or read-write transaction implies a read-lock on
3645 ** page 1. So if some other shared-cache client already has a write-lock
3646 ** on page 1, the transaction cannot be opened. */
3647 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
3648 if( SQLITE_OK!=rc ) goto trans_begun;
3650 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
3651 if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
3652 do {
3653 sqlite3PagerWalDb(pPager, p->db);
3655 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT
3656 /* If transitioning from no transaction directly to a write transaction,
3657 ** block for the WRITER lock first if possible. */
3658 if( pBt->pPage1==0 && wrflag ){
3659 assert( pBt->inTransaction==TRANS_NONE );
3660 rc = sqlite3PagerWalWriteLock(pPager, 1);
3661 if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break;
3663 #endif
3665 /* Call lockBtree() until either pBt->pPage1 is populated or
3666 ** lockBtree() returns something other than SQLITE_OK. lockBtree()
3667 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
3668 ** reading page 1 it discovers that the page-size of the database
3669 ** file is not pBt->pageSize. In this case lockBtree() will update
3670 ** pBt->pageSize to the page-size of the file on disk.
3672 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
3674 if( rc==SQLITE_OK && wrflag ){
3675 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
3676 rc = SQLITE_READONLY;
3677 }else{
3678 rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db));
3679 if( rc==SQLITE_OK ){
3680 rc = newDatabase(pBt);
3681 }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){
3682 /* if there was no transaction opened when this function was
3683 ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error
3684 ** code to SQLITE_BUSY. */
3685 rc = SQLITE_BUSY;
3690 if( rc!=SQLITE_OK ){
3691 (void)sqlite3PagerWalWriteLock(pPager, 0);
3692 unlockBtreeIfUnused(pBt);
3694 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
3695 btreeInvokeBusyHandler(pBt) );
3696 sqlite3PagerWalDb(pPager, 0);
3697 #ifdef SQLITE_ENABLE_SETLK_TIMEOUT
3698 if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY;
3699 #endif
3701 if( rc==SQLITE_OK ){
3702 if( p->inTrans==TRANS_NONE ){
3703 pBt->nTransaction++;
3704 #ifndef SQLITE_OMIT_SHARED_CACHE
3705 if( p->sharable ){
3706 assert( p->lock.pBtree==p && p->lock.iTable==1 );
3707 p->lock.eLock = READ_LOCK;
3708 p->lock.pNext = pBt->pLock;
3709 pBt->pLock = &p->lock;
3711 #endif
3713 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
3714 if( p->inTrans>pBt->inTransaction ){
3715 pBt->inTransaction = p->inTrans;
3717 if( wrflag ){
3718 MemPage *pPage1 = pBt->pPage1;
3719 #ifndef SQLITE_OMIT_SHARED_CACHE
3720 assert( !pBt->pWriter );
3721 pBt->pWriter = p;
3722 pBt->btsFlags &= ~BTS_EXCLUSIVE;
3723 if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
3724 #endif
3726 /* If the db-size header field is incorrect (as it may be if an old
3727 ** client has been writing the database file), update it now. Doing
3728 ** this sooner rather than later means the database size can safely
3729 ** be re-read from page 1 if a savepoint or transaction
3730 ** rollback occurs within the transaction.
3732 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
3733 rc = sqlite3PagerWrite(pPage1->pDbPage);
3734 if( rc==SQLITE_OK ){
3735 put4byte(&pPage1->aData[28], pBt->nPage);
3741 trans_begun:
3742 if( rc==SQLITE_OK ){
3743 if( pSchemaVersion ){
3744 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
3746 if( wrflag ){
3747 /* This call makes sure that the pager has the correct number of
3748 ** open savepoints. If the second parameter is greater than 0 and
3749 ** the sub-journal is not already open, then it will be opened here.
3751 rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint);
3755 btreeIntegrity(p);
3756 sqlite3BtreeLeave(p);
3757 return rc;
3759 int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){
3760 BtShared *pBt;
3761 if( p->sharable
3762 || p->inTrans==TRANS_NONE
3763 || (p->inTrans==TRANS_READ && wrflag!=0)
3765 return btreeBeginTrans(p,wrflag,pSchemaVersion);
3767 pBt = p->pBt;
3768 if( pSchemaVersion ){
3769 *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
3771 if( wrflag ){
3772 /* This call makes sure that the pager has the correct number of
3773 ** open savepoints. If the second parameter is greater than 0 and
3774 ** the sub-journal is not already open, then it will be opened here.
3776 return sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
3777 }else{
3778 return SQLITE_OK;
3782 #ifndef SQLITE_OMIT_AUTOVACUUM
3785 ** Set the pointer-map entries for all children of page pPage. Also, if
3786 ** pPage contains cells that point to overflow pages, set the pointer
3787 ** map entries for the overflow pages as well.
3789 static int setChildPtrmaps(MemPage *pPage){
3790 int i; /* Counter variable */
3791 int nCell; /* Number of cells in page pPage */
3792 int rc; /* Return code */
3793 BtShared *pBt = pPage->pBt;
3794 Pgno pgno = pPage->pgno;
3796 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3797 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3798 if( rc!=SQLITE_OK ) return rc;
3799 nCell = pPage->nCell;
3801 for(i=0; i<nCell; i++){
3802 u8 *pCell = findCell(pPage, i);
3804 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc);
3806 if( !pPage->leaf ){
3807 Pgno childPgno = get4byte(pCell);
3808 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3812 if( !pPage->leaf ){
3813 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3814 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3817 return rc;
3821 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so
3822 ** that it points to iTo. Parameter eType describes the type of pointer to
3823 ** be modified, as follows:
3825 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
3826 ** page of pPage.
3828 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3829 ** page pointed to by one of the cells on pPage.
3831 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3832 ** overflow page in the list.
3834 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3835 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3836 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3837 if( eType==PTRMAP_OVERFLOW2 ){
3838 /* The pointer is always the first 4 bytes of the page in this case. */
3839 if( get4byte(pPage->aData)!=iFrom ){
3840 return SQLITE_CORRUPT_PAGE(pPage);
3842 put4byte(pPage->aData, iTo);
3843 }else{
3844 int i;
3845 int nCell;
3846 int rc;
3848 rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3849 if( rc ) return rc;
3850 nCell = pPage->nCell;
3852 for(i=0; i<nCell; i++){
3853 u8 *pCell = findCell(pPage, i);
3854 if( eType==PTRMAP_OVERFLOW1 ){
3855 CellInfo info;
3856 pPage->xParseCell(pPage, pCell, &info);
3857 if( info.nLocal<info.nPayload ){
3858 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3859 return SQLITE_CORRUPT_PAGE(pPage);
3861 if( iFrom==get4byte(pCell+info.nSize-4) ){
3862 put4byte(pCell+info.nSize-4, iTo);
3863 break;
3866 }else{
3867 if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){
3868 return SQLITE_CORRUPT_PAGE(pPage);
3870 if( get4byte(pCell)==iFrom ){
3871 put4byte(pCell, iTo);
3872 break;
3877 if( i==nCell ){
3878 if( eType!=PTRMAP_BTREE ||
3879 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3880 return SQLITE_CORRUPT_PAGE(pPage);
3882 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3885 return SQLITE_OK;
3890 ** Move the open database page pDbPage to location iFreePage in the
3891 ** database. The pDbPage reference remains valid.
3893 ** The isCommit flag indicates that there is no need to remember that
3894 ** the journal needs to be sync()ed before database page pDbPage->pgno
3895 ** can be written to. The caller has already promised not to write to that
3896 ** page.
3898 static int relocatePage(
3899 BtShared *pBt, /* Btree */
3900 MemPage *pDbPage, /* Open page to move */
3901 u8 eType, /* Pointer map 'type' entry for pDbPage */
3902 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
3903 Pgno iFreePage, /* The location to move pDbPage to */
3904 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */
3906 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
3907 Pgno iDbPage = pDbPage->pgno;
3908 Pager *pPager = pBt->pPager;
3909 int rc;
3911 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3912 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3913 assert( sqlite3_mutex_held(pBt->mutex) );
3914 assert( pDbPage->pBt==pBt );
3915 if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT;
3917 /* Move page iDbPage from its current location to page number iFreePage */
3918 TRACE(("AUTOVACUUM: Moving %u to free page %u (ptr page %u type %u)\n",
3919 iDbPage, iFreePage, iPtrPage, eType));
3920 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3921 if( rc!=SQLITE_OK ){
3922 return rc;
3924 pDbPage->pgno = iFreePage;
3926 /* If pDbPage was a btree-page, then it may have child pages and/or cells
3927 ** that point to overflow pages. The pointer map entries for all these
3928 ** pages need to be changed.
3930 ** If pDbPage is an overflow page, then the first 4 bytes may store a
3931 ** pointer to a subsequent overflow page. If this is the case, then
3932 ** the pointer map needs to be updated for the subsequent overflow page.
3934 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3935 rc = setChildPtrmaps(pDbPage);
3936 if( rc!=SQLITE_OK ){
3937 return rc;
3939 }else{
3940 Pgno nextOvfl = get4byte(pDbPage->aData);
3941 if( nextOvfl!=0 ){
3942 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3943 if( rc!=SQLITE_OK ){
3944 return rc;
3949 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3950 ** that it points at iFreePage. Also fix the pointer map entry for
3951 ** iPtrPage.
3953 if( eType!=PTRMAP_ROOTPAGE ){
3954 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3955 if( rc!=SQLITE_OK ){
3956 return rc;
3958 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3959 if( rc!=SQLITE_OK ){
3960 releasePage(pPtrPage);
3961 return rc;
3963 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3964 releasePage(pPtrPage);
3965 if( rc==SQLITE_OK ){
3966 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3969 return rc;
3972 /* Forward declaration required by incrVacuumStep(). */
3973 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3976 ** Perform a single step of an incremental-vacuum. If successful, return
3977 ** SQLITE_OK. If there is no work to do (and therefore no point in
3978 ** calling this function again), return SQLITE_DONE. Or, if an error
3979 ** occurs, return some other error code.
3981 ** More specifically, this function attempts to re-organize the database so
3982 ** that the last page of the file currently in use is no longer in use.
3984 ** Parameter nFin is the number of pages that this database would contain
3985 ** were this function called until it returns SQLITE_DONE.
3987 ** If the bCommit parameter is non-zero, this function assumes that the
3988 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3989 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3990 ** operation, or false for an incremental vacuum.
3992 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3993 Pgno nFreeList; /* Number of pages still on the free-list */
3994 int rc;
3996 assert( sqlite3_mutex_held(pBt->mutex) );
3997 assert( iLastPg>nFin );
3999 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
4000 u8 eType;
4001 Pgno iPtrPage;
4003 nFreeList = get4byte(&pBt->pPage1->aData[36]);
4004 if( nFreeList==0 ){
4005 return SQLITE_DONE;
4008 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
4009 if( rc!=SQLITE_OK ){
4010 return rc;
4012 if( eType==PTRMAP_ROOTPAGE ){
4013 return SQLITE_CORRUPT_BKPT;
4016 if( eType==PTRMAP_FREEPAGE ){
4017 if( bCommit==0 ){
4018 /* Remove the page from the files free-list. This is not required
4019 ** if bCommit is non-zero. In that case, the free-list will be
4020 ** truncated to zero after this function returns, so it doesn't
4021 ** matter if it still contains some garbage entries.
4023 Pgno iFreePg;
4024 MemPage *pFreePg;
4025 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
4026 if( rc!=SQLITE_OK ){
4027 return rc;
4029 assert( iFreePg==iLastPg );
4030 releasePage(pFreePg);
4032 } else {
4033 Pgno iFreePg; /* Index of free page to move pLastPg to */
4034 MemPage *pLastPg;
4035 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */
4036 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */
4038 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
4039 if( rc!=SQLITE_OK ){
4040 return rc;
4043 /* If bCommit is zero, this loop runs exactly once and page pLastPg
4044 ** is swapped with the first free page pulled off the free list.
4046 ** On the other hand, if bCommit is greater than zero, then keep
4047 ** looping until a free-page located within the first nFin pages
4048 ** of the file is found.
4050 if( bCommit==0 ){
4051 eMode = BTALLOC_LE;
4052 iNear = nFin;
4054 do {
4055 MemPage *pFreePg;
4056 Pgno dbSize = btreePagecount(pBt);
4057 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
4058 if( rc!=SQLITE_OK ){
4059 releasePage(pLastPg);
4060 return rc;
4062 releasePage(pFreePg);
4063 if( iFreePg>dbSize ){
4064 releasePage(pLastPg);
4065 return SQLITE_CORRUPT_BKPT;
4067 }while( bCommit && iFreePg>nFin );
4068 assert( iFreePg<iLastPg );
4070 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
4071 releasePage(pLastPg);
4072 if( rc!=SQLITE_OK ){
4073 return rc;
4078 if( bCommit==0 ){
4079 do {
4080 iLastPg--;
4081 }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
4082 pBt->bDoTruncate = 1;
4083 pBt->nPage = iLastPg;
4085 return SQLITE_OK;
4089 ** The database opened by the first argument is an auto-vacuum database
4090 ** nOrig pages in size containing nFree free pages. Return the expected
4091 ** size of the database in pages following an auto-vacuum operation.
4093 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
4094 int nEntry; /* Number of entries on one ptrmap page */
4095 Pgno nPtrmap; /* Number of PtrMap pages to be freed */
4096 Pgno nFin; /* Return value */
4098 nEntry = pBt->usableSize/5;
4099 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
4100 nFin = nOrig - nFree - nPtrmap;
4101 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
4102 nFin--;
4104 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
4105 nFin--;
4108 return nFin;
4112 ** A write-transaction must be opened before calling this function.
4113 ** It performs a single unit of work towards an incremental vacuum.
4115 ** If the incremental vacuum is finished after this function has run,
4116 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
4117 ** SQLITE_OK is returned. Otherwise an SQLite error code.
4119 int sqlite3BtreeIncrVacuum(Btree *p){
4120 int rc;
4121 BtShared *pBt = p->pBt;
4123 sqlite3BtreeEnter(p);
4124 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
4125 if( !pBt->autoVacuum ){
4126 rc = SQLITE_DONE;
4127 }else{
4128 Pgno nOrig = btreePagecount(pBt);
4129 Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
4130 Pgno nFin = finalDbSize(pBt, nOrig, nFree);
4132 if( nOrig<nFin || nFree>=nOrig ){
4133 rc = SQLITE_CORRUPT_BKPT;
4134 }else if( nFree>0 ){
4135 rc = saveAllCursors(pBt, 0, 0);
4136 if( rc==SQLITE_OK ){
4137 invalidateAllOverflowCache(pBt);
4138 rc = incrVacuumStep(pBt, nFin, nOrig, 0);
4140 if( rc==SQLITE_OK ){
4141 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
4142 put4byte(&pBt->pPage1->aData[28], pBt->nPage);
4144 }else{
4145 rc = SQLITE_DONE;
4148 sqlite3BtreeLeave(p);
4149 return rc;
4153 ** This routine is called prior to sqlite3PagerCommit when a transaction
4154 ** is committed for an auto-vacuum database.
4156 static int autoVacuumCommit(Btree *p){
4157 int rc = SQLITE_OK;
4158 Pager *pPager;
4159 BtShared *pBt;
4160 sqlite3 *db;
4161 VVA_ONLY( int nRef );
4163 assert( p!=0 );
4164 pBt = p->pBt;
4165 pPager = pBt->pPager;
4166 VVA_ONLY( nRef = sqlite3PagerRefcount(pPager); )
4168 assert( sqlite3_mutex_held(pBt->mutex) );
4169 invalidateAllOverflowCache(pBt);
4170 assert(pBt->autoVacuum);
4171 if( !pBt->incrVacuum ){
4172 Pgno nFin; /* Number of pages in database after autovacuuming */
4173 Pgno nFree; /* Number of pages on the freelist initially */
4174 Pgno nVac; /* Number of pages to vacuum */
4175 Pgno iFree; /* The next page to be freed */
4176 Pgno nOrig; /* Database size before freeing */
4178 nOrig = btreePagecount(pBt);
4179 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
4180 /* It is not possible to create a database for which the final page
4181 ** is either a pointer-map page or the pending-byte page. If one
4182 ** is encountered, this indicates corruption.
4184 return SQLITE_CORRUPT_BKPT;
4187 nFree = get4byte(&pBt->pPage1->aData[36]);
4188 db = p->db;
4189 if( db->xAutovacPages ){
4190 int iDb;
4191 for(iDb=0; ALWAYS(iDb<db->nDb); iDb++){
4192 if( db->aDb[iDb].pBt==p ) break;
4194 nVac = db->xAutovacPages(
4195 db->pAutovacPagesArg,
4196 db->aDb[iDb].zDbSName,
4197 nOrig,
4198 nFree,
4199 pBt->pageSize
4201 if( nVac>nFree ){
4202 nVac = nFree;
4204 if( nVac==0 ){
4205 return SQLITE_OK;
4207 }else{
4208 nVac = nFree;
4210 nFin = finalDbSize(pBt, nOrig, nVac);
4211 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
4212 if( nFin<nOrig ){
4213 rc = saveAllCursors(pBt, 0, 0);
4215 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
4216 rc = incrVacuumStep(pBt, nFin, iFree, nVac==nFree);
4218 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
4219 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
4220 if( nVac==nFree ){
4221 put4byte(&pBt->pPage1->aData[32], 0);
4222 put4byte(&pBt->pPage1->aData[36], 0);
4224 put4byte(&pBt->pPage1->aData[28], nFin);
4225 pBt->bDoTruncate = 1;
4226 pBt->nPage = nFin;
4228 if( rc!=SQLITE_OK ){
4229 sqlite3PagerRollback(pPager);
4233 assert( nRef>=sqlite3PagerRefcount(pPager) );
4234 return rc;
4237 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
4238 # define setChildPtrmaps(x) SQLITE_OK
4239 #endif
4242 ** This routine does the first phase of a two-phase commit. This routine
4243 ** causes a rollback journal to be created (if it does not already exist)
4244 ** and populated with enough information so that if a power loss occurs
4245 ** the database can be restored to its original state by playing back
4246 ** the journal. Then the contents of the journal are flushed out to
4247 ** the disk. After the journal is safely on oxide, the changes to the
4248 ** database are written into the database file and flushed to oxide.
4249 ** At the end of this call, the rollback journal still exists on the
4250 ** disk and we are still holding all locks, so the transaction has not
4251 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the
4252 ** commit process.
4254 ** This call is a no-op if no write-transaction is currently active on pBt.
4256 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to
4257 ** the name of a super-journal file that should be written into the
4258 ** individual journal file, or is NULL, indicating no super-journal file
4259 ** (single database transaction).
4261 ** When this is called, the super-journal should already have been
4262 ** created, populated with this journal pointer and synced to disk.
4264 ** Once this is routine has returned, the only thing required to commit
4265 ** the write-transaction for this database file is to delete the journal.
4267 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){
4268 int rc = SQLITE_OK;
4269 if( p->inTrans==TRANS_WRITE ){
4270 BtShared *pBt = p->pBt;
4271 sqlite3BtreeEnter(p);
4272 #ifndef SQLITE_OMIT_AUTOVACUUM
4273 if( pBt->autoVacuum ){
4274 rc = autoVacuumCommit(p);
4275 if( rc!=SQLITE_OK ){
4276 sqlite3BtreeLeave(p);
4277 return rc;
4280 if( pBt->bDoTruncate ){
4281 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
4283 #endif
4284 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zSuperJrnl, 0);
4285 sqlite3BtreeLeave(p);
4287 return rc;
4291 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
4292 ** at the conclusion of a transaction.
4294 static void btreeEndTransaction(Btree *p){
4295 BtShared *pBt = p->pBt;
4296 sqlite3 *db = p->db;
4297 assert( sqlite3BtreeHoldsMutex(p) );
4299 #ifndef SQLITE_OMIT_AUTOVACUUM
4300 pBt->bDoTruncate = 0;
4301 #endif
4302 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
4303 /* If there are other active statements that belong to this database
4304 ** handle, downgrade to a read-only transaction. The other statements
4305 ** may still be reading from the database. */
4306 downgradeAllSharedCacheTableLocks(p);
4307 p->inTrans = TRANS_READ;
4308 }else{
4309 /* If the handle had any kind of transaction open, decrement the
4310 ** transaction count of the shared btree. If the transaction count
4311 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
4312 ** call below will unlock the pager. */
4313 if( p->inTrans!=TRANS_NONE ){
4314 clearAllSharedCacheTableLocks(p);
4315 pBt->nTransaction--;
4316 if( 0==pBt->nTransaction ){
4317 pBt->inTransaction = TRANS_NONE;
4321 /* Set the current transaction state to TRANS_NONE and unlock the
4322 ** pager if this call closed the only read or write transaction. */
4323 p->inTrans = TRANS_NONE;
4324 unlockBtreeIfUnused(pBt);
4327 btreeIntegrity(p);
4331 ** Commit the transaction currently in progress.
4333 ** This routine implements the second phase of a 2-phase commit. The
4334 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
4335 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()
4336 ** routine did all the work of writing information out to disk and flushing the
4337 ** contents so that they are written onto the disk platter. All this
4338 ** routine has to do is delete or truncate or zero the header in the
4339 ** the rollback journal (which causes the transaction to commit) and
4340 ** drop locks.
4342 ** Normally, if an error occurs while the pager layer is attempting to
4343 ** finalize the underlying journal file, this function returns an error and
4344 ** the upper layer will attempt a rollback. However, if the second argument
4345 ** is non-zero then this b-tree transaction is part of a multi-file
4346 ** transaction. In this case, the transaction has already been committed
4347 ** (by deleting a super-journal file) and the caller will ignore this
4348 ** functions return code. So, even if an error occurs in the pager layer,
4349 ** reset the b-tree objects internal state to indicate that the write
4350 ** transaction has been closed. This is quite safe, as the pager will have
4351 ** transitioned to the error state.
4353 ** This will release the write lock on the database file. If there
4354 ** are no active cursors, it also releases the read lock.
4356 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
4358 if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
4359 sqlite3BtreeEnter(p);
4360 btreeIntegrity(p);
4362 /* If the handle has a write-transaction open, commit the shared-btrees
4363 ** transaction and set the shared state to TRANS_READ.
4365 if( p->inTrans==TRANS_WRITE ){
4366 int rc;
4367 BtShared *pBt = p->pBt;
4368 assert( pBt->inTransaction==TRANS_WRITE );
4369 assert( pBt->nTransaction>0 );
4370 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
4371 if( rc!=SQLITE_OK && bCleanup==0 ){
4372 sqlite3BtreeLeave(p);
4373 return rc;
4375 p->iBDataVersion--; /* Compensate for pPager->iDataVersion++; */
4376 pBt->inTransaction = TRANS_READ;
4377 btreeClearHasContent(pBt);
4380 btreeEndTransaction(p);
4381 sqlite3BtreeLeave(p);
4382 return SQLITE_OK;
4386 ** Do both phases of a commit.
4388 int sqlite3BtreeCommit(Btree *p){
4389 int rc;
4390 sqlite3BtreeEnter(p);
4391 rc = sqlite3BtreeCommitPhaseOne(p, 0);
4392 if( rc==SQLITE_OK ){
4393 rc = sqlite3BtreeCommitPhaseTwo(p, 0);
4395 sqlite3BtreeLeave(p);
4396 return rc;
4400 ** This routine sets the state to CURSOR_FAULT and the error
4401 ** code to errCode for every cursor on any BtShared that pBtree
4402 ** references. Or if the writeOnly flag is set to 1, then only
4403 ** trip write cursors and leave read cursors unchanged.
4405 ** Every cursor is a candidate to be tripped, including cursors
4406 ** that belong to other database connections that happen to be
4407 ** sharing the cache with pBtree.
4409 ** This routine gets called when a rollback occurs. If the writeOnly
4410 ** flag is true, then only write-cursors need be tripped - read-only
4411 ** cursors save their current positions so that they may continue
4412 ** following the rollback. Or, if writeOnly is false, all cursors are
4413 ** tripped. In general, writeOnly is false if the transaction being
4414 ** rolled back modified the database schema. In this case b-tree root
4415 ** pages may be moved or deleted from the database altogether, making
4416 ** it unsafe for read cursors to continue.
4418 ** If the writeOnly flag is true and an error is encountered while
4419 ** saving the current position of a read-only cursor, all cursors,
4420 ** including all read-cursors are tripped.
4422 ** SQLITE_OK is returned if successful, or if an error occurs while
4423 ** saving a cursor position, an SQLite error code.
4425 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
4426 BtCursor *p;
4427 int rc = SQLITE_OK;
4429 assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
4430 if( pBtree ){
4431 sqlite3BtreeEnter(pBtree);
4432 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
4433 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
4434 if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
4435 rc = saveCursorPosition(p);
4436 if( rc!=SQLITE_OK ){
4437 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
4438 break;
4441 }else{
4442 sqlite3BtreeClearCursor(p);
4443 p->eState = CURSOR_FAULT;
4444 p->skipNext = errCode;
4446 btreeReleaseAllCursorPages(p);
4448 sqlite3BtreeLeave(pBtree);
4450 return rc;
4454 ** Set the pBt->nPage field correctly, according to the current
4455 ** state of the database. Assume pBt->pPage1 is valid.
4457 static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){
4458 int nPage = get4byte(&pPage1->aData[28]);
4459 testcase( nPage==0 );
4460 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
4461 testcase( pBt->nPage!=(u32)nPage );
4462 pBt->nPage = nPage;
4466 ** Rollback the transaction in progress.
4468 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
4469 ** Only write cursors are tripped if writeOnly is true but all cursors are
4470 ** tripped if writeOnly is false. Any attempt to use
4471 ** a tripped cursor will result in an error.
4473 ** This will release the write lock on the database file. If there
4474 ** are no active cursors, it also releases the read lock.
4476 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
4477 int rc;
4478 BtShared *pBt = p->pBt;
4479 MemPage *pPage1;
4481 assert( writeOnly==1 || writeOnly==0 );
4482 assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
4483 sqlite3BtreeEnter(p);
4484 if( tripCode==SQLITE_OK ){
4485 rc = tripCode = saveAllCursors(pBt, 0, 0);
4486 if( rc ) writeOnly = 0;
4487 }else{
4488 rc = SQLITE_OK;
4490 if( tripCode ){
4491 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
4492 assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
4493 if( rc2!=SQLITE_OK ) rc = rc2;
4495 btreeIntegrity(p);
4497 if( p->inTrans==TRANS_WRITE ){
4498 int rc2;
4500 assert( TRANS_WRITE==pBt->inTransaction );
4501 rc2 = sqlite3PagerRollback(pBt->pPager);
4502 if( rc2!=SQLITE_OK ){
4503 rc = rc2;
4506 /* The rollback may have destroyed the pPage1->aData value. So
4507 ** call btreeGetPage() on page 1 again to make
4508 ** sure pPage1->aData is set correctly. */
4509 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
4510 btreeSetNPage(pBt, pPage1);
4511 releasePageOne(pPage1);
4513 assert( countValidCursors(pBt, 1)==0 );
4514 pBt->inTransaction = TRANS_READ;
4515 btreeClearHasContent(pBt);
4518 btreeEndTransaction(p);
4519 sqlite3BtreeLeave(p);
4520 return rc;
4524 ** Start a statement subtransaction. The subtransaction can be rolled
4525 ** back independently of the main transaction. You must start a transaction
4526 ** before starting a subtransaction. The subtransaction is ended automatically
4527 ** if the main transaction commits or rolls back.
4529 ** Statement subtransactions are used around individual SQL statements
4530 ** that are contained within a BEGIN...COMMIT block. If a constraint
4531 ** error occurs within the statement, the effect of that one statement
4532 ** can be rolled back without having to rollback the entire transaction.
4534 ** A statement sub-transaction is implemented as an anonymous savepoint. The
4535 ** value passed as the second parameter is the total number of savepoints,
4536 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
4537 ** are no active savepoints and no other statement-transactions open,
4538 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4539 ** using the sqlite3BtreeSavepoint() function.
4541 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
4542 int rc;
4543 BtShared *pBt = p->pBt;
4544 sqlite3BtreeEnter(p);
4545 assert( p->inTrans==TRANS_WRITE );
4546 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
4547 assert( iStatement>0 );
4548 assert( iStatement>p->db->nSavepoint );
4549 assert( pBt->inTransaction==TRANS_WRITE );
4550 /* At the pager level, a statement transaction is a savepoint with
4551 ** an index greater than all savepoints created explicitly using
4552 ** SQL statements. It is illegal to open, release or rollback any
4553 ** such savepoints while the statement transaction savepoint is active.
4555 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
4556 sqlite3BtreeLeave(p);
4557 return rc;
4561 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4562 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4563 ** savepoint identified by parameter iSavepoint, depending on the value
4564 ** of op.
4566 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4567 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4568 ** contents of the entire transaction are rolled back. This is different
4569 ** from a normal transaction rollback, as no locks are released and the
4570 ** transaction remains open.
4572 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
4573 int rc = SQLITE_OK;
4574 if( p && p->inTrans==TRANS_WRITE ){
4575 BtShared *pBt = p->pBt;
4576 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
4577 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
4578 sqlite3BtreeEnter(p);
4579 if( op==SAVEPOINT_ROLLBACK ){
4580 rc = saveAllCursors(pBt, 0, 0);
4582 if( rc==SQLITE_OK ){
4583 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
4585 if( rc==SQLITE_OK ){
4586 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
4587 pBt->nPage = 0;
4589 rc = newDatabase(pBt);
4590 btreeSetNPage(pBt, pBt->pPage1);
4592 /* pBt->nPage might be zero if the database was corrupt when
4593 ** the transaction was started. Otherwise, it must be at least 1. */
4594 assert( CORRUPT_DB || pBt->nPage>0 );
4596 sqlite3BtreeLeave(p);
4598 return rc;
4602 ** Create a new cursor for the BTree whose root is on the page
4603 ** iTable. If a read-only cursor is requested, it is assumed that
4604 ** the caller already has at least a read-only transaction open
4605 ** on the database already. If a write-cursor is requested, then
4606 ** the caller is assumed to have an open write transaction.
4608 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4609 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor
4610 ** can be used for reading or for writing if other conditions for writing
4611 ** are also met. These are the conditions that must be met in order
4612 ** for writing to be allowed:
4614 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR
4616 ** 2: Other database connections that share the same pager cache
4617 ** but which are not in the READ_UNCOMMITTED state may not have
4618 ** cursors open with wrFlag==0 on the same table. Otherwise
4619 ** the changes made by this write cursor would be visible to
4620 ** the read cursors in the other database connection.
4622 ** 3: The database must be writable (not on read-only media)
4624 ** 4: There must be an active transaction.
4626 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4627 ** is set. If FORDELETE is set, that is a hint to the implementation that
4628 ** this cursor will only be used to seek to and delete entries of an index
4629 ** as part of a larger DELETE statement. The FORDELETE hint is not used by
4630 ** this implementation. But in a hypothetical alternative storage engine
4631 ** in which index entries are automatically deleted when corresponding table
4632 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4633 ** operations on this cursor can be no-ops and all READ operations can
4634 ** return a null row (2-bytes: 0x01 0x00).
4636 ** No checking is done to make sure that page iTable really is the
4637 ** root page of a b-tree. If it is not, then the cursor acquired
4638 ** will not work correctly.
4640 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4641 ** on pCur to initialize the memory space prior to invoking this routine.
4643 static int btreeCursor(
4644 Btree *p, /* The btree */
4645 Pgno iTable, /* Root page of table to open */
4646 int wrFlag, /* 1 to write. 0 read-only */
4647 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
4648 BtCursor *pCur /* Space for new cursor */
4650 BtShared *pBt = p->pBt; /* Shared b-tree handle */
4651 BtCursor *pX; /* Looping over other all cursors */
4653 assert( sqlite3BtreeHoldsMutex(p) );
4654 assert( wrFlag==0
4655 || wrFlag==BTREE_WRCSR
4656 || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
4659 /* The following assert statements verify that if this is a sharable
4660 ** b-tree database, the connection is holding the required table locks,
4661 ** and that no other connection has any open cursor that conflicts with
4662 ** this lock. The iTable<1 term disables the check for corrupt schemas. */
4663 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1))
4664 || iTable<1 );
4665 assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
4667 /* Assert that the caller has opened the required transaction. */
4668 assert( p->inTrans>TRANS_NONE );
4669 assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
4670 assert( pBt->pPage1 && pBt->pPage1->aData );
4671 assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
4673 if( iTable<=1 ){
4674 if( iTable<1 ){
4675 return SQLITE_CORRUPT_BKPT;
4676 }else if( btreePagecount(pBt)==0 ){
4677 assert( wrFlag==0 );
4678 iTable = 0;
4682 /* Now that no other errors can occur, finish filling in the BtCursor
4683 ** variables and link the cursor into the BtShared list. */
4684 pCur->pgnoRoot = iTable;
4685 pCur->iPage = -1;
4686 pCur->pKeyInfo = pKeyInfo;
4687 pCur->pBtree = p;
4688 pCur->pBt = pBt;
4689 pCur->curFlags = 0;
4690 /* If there are two or more cursors on the same btree, then all such
4691 ** cursors *must* have the BTCF_Multiple flag set. */
4692 for(pX=pBt->pCursor; pX; pX=pX->pNext){
4693 if( pX->pgnoRoot==iTable ){
4694 pX->curFlags |= BTCF_Multiple;
4695 pCur->curFlags = BTCF_Multiple;
4698 pCur->eState = CURSOR_INVALID;
4699 pCur->pNext = pBt->pCursor;
4700 pBt->pCursor = pCur;
4701 if( wrFlag ){
4702 pCur->curFlags |= BTCF_WriteFlag;
4703 pCur->curPagerFlags = 0;
4704 if( pBt->pTmpSpace==0 ) return allocateTempSpace(pBt);
4705 }else{
4706 pCur->curPagerFlags = PAGER_GET_READONLY;
4708 return SQLITE_OK;
4710 static int btreeCursorWithLock(
4711 Btree *p, /* The btree */
4712 Pgno iTable, /* Root page of table to open */
4713 int wrFlag, /* 1 to write. 0 read-only */
4714 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
4715 BtCursor *pCur /* Space for new cursor */
4717 int rc;
4718 sqlite3BtreeEnter(p);
4719 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4720 sqlite3BtreeLeave(p);
4721 return rc;
4723 int sqlite3BtreeCursor(
4724 Btree *p, /* The btree */
4725 Pgno iTable, /* Root page of table to open */
4726 int wrFlag, /* 1 to write. 0 read-only */
4727 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
4728 BtCursor *pCur /* Write new cursor here */
4730 if( p->sharable ){
4731 return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur);
4732 }else{
4733 return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
4738 ** Return the size of a BtCursor object in bytes.
4740 ** This interfaces is needed so that users of cursors can preallocate
4741 ** sufficient storage to hold a cursor. The BtCursor object is opaque
4742 ** to users so they cannot do the sizeof() themselves - they must call
4743 ** this routine.
4745 int sqlite3BtreeCursorSize(void){
4746 return ROUND8(sizeof(BtCursor));
4750 ** Initialize memory that will be converted into a BtCursor object.
4752 ** The simple approach here would be to memset() the entire object
4753 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays
4754 ** do not need to be zeroed and they are large, so we can save a lot
4755 ** of run-time by skipping the initialization of those elements.
4757 void sqlite3BtreeCursorZero(BtCursor *p){
4758 memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT));
4762 ** Close a cursor. The read lock on the database file is released
4763 ** when the last cursor is closed.
4765 int sqlite3BtreeCloseCursor(BtCursor *pCur){
4766 Btree *pBtree = pCur->pBtree;
4767 if( pBtree ){
4768 BtShared *pBt = pCur->pBt;
4769 sqlite3BtreeEnter(pBtree);
4770 assert( pBt->pCursor!=0 );
4771 if( pBt->pCursor==pCur ){
4772 pBt->pCursor = pCur->pNext;
4773 }else{
4774 BtCursor *pPrev = pBt->pCursor;
4776 if( pPrev->pNext==pCur ){
4777 pPrev->pNext = pCur->pNext;
4778 break;
4780 pPrev = pPrev->pNext;
4781 }while( ALWAYS(pPrev) );
4783 btreeReleaseAllCursorPages(pCur);
4784 unlockBtreeIfUnused(pBt);
4785 sqlite3_free(pCur->aOverflow);
4786 sqlite3_free(pCur->pKey);
4787 if( (pBt->openFlags & BTREE_SINGLE) && pBt->pCursor==0 ){
4788 /* Since the BtShared is not sharable, there is no need to
4789 ** worry about the missing sqlite3BtreeLeave() call here. */
4790 assert( pBtree->sharable==0 );
4791 sqlite3BtreeClose(pBtree);
4792 }else{
4793 sqlite3BtreeLeave(pBtree);
4795 pCur->pBtree = 0;
4797 return SQLITE_OK;
4801 ** Make sure the BtCursor* given in the argument has a valid
4802 ** BtCursor.info structure. If it is not already valid, call
4803 ** btreeParseCell() to fill it in.
4805 ** BtCursor.info is a cache of the information in the current cell.
4806 ** Using this cache reduces the number of calls to btreeParseCell().
4808 #ifndef NDEBUG
4809 static int cellInfoEqual(CellInfo *a, CellInfo *b){
4810 if( a->nKey!=b->nKey ) return 0;
4811 if( a->pPayload!=b->pPayload ) return 0;
4812 if( a->nPayload!=b->nPayload ) return 0;
4813 if( a->nLocal!=b->nLocal ) return 0;
4814 if( a->nSize!=b->nSize ) return 0;
4815 return 1;
4817 static void assertCellInfo(BtCursor *pCur){
4818 CellInfo info;
4819 memset(&info, 0, sizeof(info));
4820 btreeParseCell(pCur->pPage, pCur->ix, &info);
4821 assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) );
4823 #else
4824 #define assertCellInfo(x)
4825 #endif
4826 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4827 if( pCur->info.nSize==0 ){
4828 pCur->curFlags |= BTCF_ValidNKey;
4829 btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
4830 }else{
4831 assertCellInfo(pCur);
4835 #ifndef NDEBUG /* The next routine used only within assert() statements */
4837 ** Return true if the given BtCursor is valid. A valid cursor is one
4838 ** that is currently pointing to a row in a (non-empty) table.
4839 ** This is a verification routine is used only within assert() statements.
4841 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4842 return pCur && pCur->eState==CURSOR_VALID;
4844 #endif /* NDEBUG */
4845 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
4846 assert( pCur!=0 );
4847 return pCur->eState==CURSOR_VALID;
4851 ** Return the value of the integer key or "rowid" for a table btree.
4852 ** This routine is only valid for a cursor that is pointing into a
4853 ** ordinary table btree. If the cursor points to an index btree or
4854 ** is invalid, the result of this routine is undefined.
4856 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4857 assert( cursorHoldsMutex(pCur) );
4858 assert( pCur->eState==CURSOR_VALID );
4859 assert( pCur->curIntKey );
4860 getCellInfo(pCur);
4861 return pCur->info.nKey;
4865 ** Pin or unpin a cursor.
4867 void sqlite3BtreeCursorPin(BtCursor *pCur){
4868 assert( (pCur->curFlags & BTCF_Pinned)==0 );
4869 pCur->curFlags |= BTCF_Pinned;
4871 void sqlite3BtreeCursorUnpin(BtCursor *pCur){
4872 assert( (pCur->curFlags & BTCF_Pinned)!=0 );
4873 pCur->curFlags &= ~BTCF_Pinned;
4877 ** Return the offset into the database file for the start of the
4878 ** payload to which the cursor is pointing.
4880 i64 sqlite3BtreeOffset(BtCursor *pCur){
4881 assert( cursorHoldsMutex(pCur) );
4882 assert( pCur->eState==CURSOR_VALID );
4883 getCellInfo(pCur);
4884 return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) +
4885 (i64)(pCur->info.pPayload - pCur->pPage->aData);
4889 ** Return the number of bytes of payload for the entry that pCur is
4890 ** currently pointing to. For table btrees, this will be the amount
4891 ** of data. For index btrees, this will be the size of the key.
4893 ** The caller must guarantee that the cursor is pointing to a non-NULL
4894 ** valid entry. In other words, the calling procedure must guarantee
4895 ** that the cursor has Cursor.eState==CURSOR_VALID.
4897 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4898 assert( cursorHoldsMutex(pCur) );
4899 assert( pCur->eState==CURSOR_VALID );
4900 getCellInfo(pCur);
4901 return pCur->info.nPayload;
4905 ** Return an upper bound on the size of any record for the table
4906 ** that the cursor is pointing into.
4908 ** This is an optimization. Everything will still work if this
4909 ** routine always returns 2147483647 (which is the largest record
4910 ** that SQLite can handle) or more. But returning a smaller value might
4911 ** prevent large memory allocations when trying to interpret a
4912 ** corrupt database.
4914 ** The current implementation merely returns the size of the underlying
4915 ** database file.
4917 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){
4918 assert( cursorHoldsMutex(pCur) );
4919 assert( pCur->eState==CURSOR_VALID );
4920 return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage;
4924 ** Given the page number of an overflow page in the database (parameter
4925 ** ovfl), this function finds the page number of the next page in the
4926 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4927 ** pointer-map data instead of reading the content of page ovfl to do so.
4929 ** If an error occurs an SQLite error code is returned. Otherwise:
4931 ** The page number of the next overflow page in the linked list is
4932 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4933 ** list, *pPgnoNext is set to zero.
4935 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
4936 ** to page number pOvfl was obtained, then *ppPage is set to point to that
4937 ** reference. It is the responsibility of the caller to call releasePage()
4938 ** on *ppPage to free the reference. In no reference was obtained (because
4939 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4940 ** *ppPage is set to zero.
4942 static int getOverflowPage(
4943 BtShared *pBt, /* The database file */
4944 Pgno ovfl, /* Current overflow page number */
4945 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
4946 Pgno *pPgnoNext /* OUT: Next overflow page number */
4948 Pgno next = 0;
4949 MemPage *pPage = 0;
4950 int rc = SQLITE_OK;
4952 assert( sqlite3_mutex_held(pBt->mutex) );
4953 assert(pPgnoNext);
4955 #ifndef SQLITE_OMIT_AUTOVACUUM
4956 /* Try to find the next page in the overflow list using the
4957 ** autovacuum pointer-map pages. Guess that the next page in
4958 ** the overflow list is page number (ovfl+1). If that guess turns
4959 ** out to be wrong, fall back to loading the data of page
4960 ** number ovfl to determine the next page number.
4962 if( pBt->autoVacuum ){
4963 Pgno pgno;
4964 Pgno iGuess = ovfl+1;
4965 u8 eType;
4967 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
4968 iGuess++;
4971 if( iGuess<=btreePagecount(pBt) ){
4972 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
4973 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
4974 next = iGuess;
4975 rc = SQLITE_DONE;
4979 #endif
4981 assert( next==0 || rc==SQLITE_DONE );
4982 if( rc==SQLITE_OK ){
4983 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
4984 assert( rc==SQLITE_OK || pPage==0 );
4985 if( rc==SQLITE_OK ){
4986 next = get4byte(pPage->aData);
4990 *pPgnoNext = next;
4991 if( ppPage ){
4992 *ppPage = pPage;
4993 }else{
4994 releasePage(pPage);
4996 return (rc==SQLITE_DONE ? SQLITE_OK : rc);
5000 ** Copy data from a buffer to a page, or from a page to a buffer.
5002 ** pPayload is a pointer to data stored on database page pDbPage.
5003 ** If argument eOp is false, then nByte bytes of data are copied
5004 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
5005 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
5006 ** of data are copied from the buffer pBuf to pPayload.
5008 ** SQLITE_OK is returned on success, otherwise an error code.
5010 static int copyPayload(
5011 void *pPayload, /* Pointer to page data */
5012 void *pBuf, /* Pointer to buffer */
5013 int nByte, /* Number of bytes to copy */
5014 int eOp, /* 0 -> copy from page, 1 -> copy to page */
5015 DbPage *pDbPage /* Page containing pPayload */
5017 if( eOp ){
5018 /* Copy data from buffer to page (a write operation) */
5019 int rc = sqlite3PagerWrite(pDbPage);
5020 if( rc!=SQLITE_OK ){
5021 return rc;
5023 memcpy(pPayload, pBuf, nByte);
5024 }else{
5025 /* Copy data from page to buffer (a read operation) */
5026 memcpy(pBuf, pPayload, nByte);
5028 return SQLITE_OK;
/*
** This function is used to read or overwrite payload information
** for the entry that the pCur cursor is pointing to.  The eOp
** argument is interpreted as follows:
**
**   0: The operation is a read. Populate the overflow cache.
**   1: The operation is a write. Populate the overflow cache.
**
** A total of "amt" bytes are read or written beginning at "offset".
** Data is read to or from the buffer pBuf.
**
** The content being read or written might appear on the main page
** or be scattered out on multiple overflow pages.
**
** If the current cursor entry uses one or more overflow pages
** this function may allocate space for and lazily populate
** the overflow page-list cache array (BtCursor.aOverflow).
** Subsequent calls use this cache to make seeking to the supplied offset
** more efficient.
**
** Once an overflow page-list cache has been allocated, it must be
** invalidated if some other cursor writes to the same table, or if
** the cursor is moved to a different row.  Additionally, in auto-vacuum
** mode, the following events may invalidate an overflow page-list cache.
**
**   * An incremental vacuum,
**   * A commit in auto_vacuum="full" mode,
**   * Creating a table (may require moving an overflow page).
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the overflow cache
** cannot be allocated, SQLITE_CORRUPT if the cell or its overflow chain
** is found to be malformed, or an error code propagated from
** copyPayload(), getOverflowPage() or the pager.
*/
static int accessPayload(
  BtCursor *pCur,      /* Cursor pointing to entry to read from */
  u32 offset,          /* Begin reading this far into payload */
  u32 amt,             /* Read this many bytes */
  unsigned char *pBuf, /* Write the bytes into this buffer */
  int eOp              /* zero to read. non-zero to write. */
){
  unsigned char *aPayload;
  int rc = SQLITE_OK;
  int iIdx = 0;                               /* Index into pCur->aOverflow[] */
  MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
  BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
#ifdef SQLITE_DIRECT_OVERFLOW_READ
  unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
#endif

  assert( pPage );
  assert( eOp==0 || eOp==1 );
  assert( pCur->eState==CURSOR_VALID );
  if( pCur->ix>=pPage->nCell ){
    /* The cursor's cell index lies beyond the cells on the page */
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cursorHoldsMutex(pCur) );

  getCellInfo(pCur);
  aPayload = pCur->info.pPayload;
  assert( offset+amt <= pCur->info.nPayload );

  assert( aPayload > pPage->aData );
  if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
    /* Trying to read or write past the end of the data is an error.  The
    ** conditional above is really:
    **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
    ** but is recast into its current form to avoid integer overflow problems
    */
    return SQLITE_CORRUPT_PAGE(pPage);
  }

  /* Check if data must be read/written to/from the btree page itself. */
  if( offset<pCur->info.nLocal ){
    int a = amt;                /* Bytes to transfer on the btree page */
    if( a+offset>pCur->info.nLocal ){
      a = pCur->info.nLocal - offset;
    }
    rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
    offset = 0;
    pBuf += a;
    amt -= a;
  }else{
    /* From here on, offset is relative to the start of the overflow data */
    offset -= pCur->info.nLocal;
  }


  if( rc==SQLITE_OK && amt>0 ){
    const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
    Pgno nextPage;

    /* The first overflow page number is read from the 4 bytes that
    ** follow the local part of the payload. */
    nextPage = get4byte(&aPayload[pCur->info.nLocal]);

    /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
    **
    ** The aOverflow[] array is sized at one entry for each overflow page
    ** in the overflow chain. The page number of the first overflow page is
    ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
    ** means "not yet known" (the cache is lazily populated).
    */
    if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
      int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
      if( pCur->aOverflow==0
       || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
      ){
        Pgno *aNew;
        if( sqlite3FaultSim(413) ){
          aNew = 0;
        }else{
          /* Room for twice as many entries as are needed is requested */
          aNew = (Pgno*)sqlite3Realloc(pCur->aOverflow, nOvfl*2*sizeof(Pgno));
        }
        if( aNew==0 ){
          return SQLITE_NOMEM_BKPT;
        }else{
          pCur->aOverflow = aNew;
        }
      }
      memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
      pCur->curFlags |= BTCF_ValidOvfl;
    }else{
      /* Sanity check the validity of the overflow page cache */
      assert( pCur->aOverflow[0]==nextPage
           || pCur->aOverflow[0]==0
           || CORRUPT_DB );
      assert( pCur->aOverflow[0]!=0 || pCur->aOverflow[offset/ovflSize]==0 );

      /* If the overflow page-list cache has been allocated and the
      ** entry for the first required overflow page is valid, skip
      ** directly to it.
      */
      if( pCur->aOverflow[offset/ovflSize] ){
        iIdx = (offset/ovflSize);
        nextPage = pCur->aOverflow[iIdx];
        offset = (offset%ovflSize);
      }
    }

    assert( rc==SQLITE_OK && amt>0 );
    while( nextPage ){
      /* If required, populate the overflow page-list cache. */
      if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT;
      assert( pCur->aOverflow[iIdx]==0
              || pCur->aOverflow[iIdx]==nextPage
              || CORRUPT_DB );
      pCur->aOverflow[iIdx] = nextPage;

      if( offset>=ovflSize ){
        /* The only reason to read this page is to obtain the page
        ** number for the next page in the overflow chain. The page
        ** data is not required. So first try to lookup the overflow
        ** page-list cache, if any, then fall back to the getOverflowPage()
        ** function.
        */
        assert( pCur->curFlags & BTCF_ValidOvfl );
        assert( pCur->pBtree->db==pBt->db );
        if( pCur->aOverflow[iIdx+1] ){
          nextPage = pCur->aOverflow[iIdx+1];
        }else{
          rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
        }
        offset -= ovflSize;
      }else{
        /* Need to read this page properly. It contains some of the
        ** range of data that is being read (eOp==0) or written (eOp!=0).
        */
        int a = amt;            /* Bytes to transfer on this overflow page */
        if( a + offset > ovflSize ){
          a = ovflSize - offset;
        }

#ifdef SQLITE_DIRECT_OVERFLOW_READ
        /* If all the following are true:
        **
        **   1) this is a read operation, and
        **   2) data is required from the start of this overflow page, and
        **   3) there are no dirty pages in the page-cache
        **   4) the database is file-backed, and
        **   5) the page is not in the WAL file
        **   6) at least 4 bytes have already been read into the output buffer
        **
        ** then data can be read directly from the database file into the
        ** output buffer, bypassing the page-cache altogether. This speeds
        ** up loading large records that span many overflow pages.
        */
        if( eOp==0                                             /* (1) */
         && offset==0                                          /* (2) */
         && sqlite3PagerDirectReadOk(pBt->pPager, nextPage)    /* (3,4,5) */
         && &pBuf[-4]>=pBufStart                               /* (6) */
        ){
          sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
          u8 aSave[4];
          u8 *aWrite = &pBuf[-4];
          assert( aWrite>=pBufStart );                         /* due to (6) */
          /* The read below covers the 4-byte next-page pointer as well as
          ** the content, so it overwrites the last 4 bytes already placed
          ** in the output buffer.  Save them first and restore them once
          ** the next-page number has been extracted. */
          memcpy(aSave, aWrite, 4);
          rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
          nextPage = get4byte(aWrite);
          memcpy(aWrite, aSave, 4);
        }else
#endif

        {
          DbPage *pDbPage;
          rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
              (eOp==0 ? PAGER_GET_READONLY : 0)
          );
          if( rc==SQLITE_OK ){
            aPayload = sqlite3PagerGetData(pDbPage);
            /* First 4 bytes of the page hold the next page number;
            ** the content follows. */
            nextPage = get4byte(aPayload);
            rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
            sqlite3PagerUnref(pDbPage);
            offset = 0;
          }
        }
        amt -= a;
        if( amt==0 ) return rc;
        pBuf += a;
      }
      if( rc ) break;
      iIdx++;
    }
  }

  if( rc==SQLITE_OK && amt>0 ){
    /* Overflow chain ends prematurely */
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  return rc;
}
5256 ** Read part of the payload for the row at which that cursor pCur is currently
5257 ** pointing. "amt" bytes will be transferred into pBuf[]. The transfer
5258 ** begins at "offset".
5260 ** pCur can be pointing to either a table or an index b-tree.
5261 ** If pointing to a table btree, then the content section is read. If
5262 ** pCur is pointing to an index b-tree then the key section is read.
5264 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
5265 ** to a valid row in the table. For sqlite3BtreePayloadChecked(), the
5266 ** cursor might be invalid or might need to be restored before being read.
5268 ** Return SQLITE_OK on success or an error code if anything goes
5269 ** wrong. An error is returned if "offset+amt" is larger than
5270 ** the available payload.
5272 int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
5273 assert( cursorHoldsMutex(pCur) );
5274 assert( pCur->eState==CURSOR_VALID );
5275 assert( pCur->iPage>=0 && pCur->pPage );
5276 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
5280 ** This variant of sqlite3BtreePayload() works even if the cursor has not
5281 ** in the CURSOR_VALID state. It is only used by the sqlite3_blob_read()
5282 ** interface.
5284 #ifndef SQLITE_OMIT_INCRBLOB
5285 static SQLITE_NOINLINE int accessPayloadChecked(
5286 BtCursor *pCur,
5287 u32 offset,
5288 u32 amt,
5289 void *pBuf
5291 int rc;
5292 if ( pCur->eState==CURSOR_INVALID ){
5293 return SQLITE_ABORT;
5295 assert( cursorOwnsBtShared(pCur) );
5296 rc = btreeRestoreCursorPosition(pCur);
5297 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
5299 int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
5300 if( pCur->eState==CURSOR_VALID ){
5301 assert( cursorOwnsBtShared(pCur) );
5302 return accessPayload(pCur, offset, amt, pBuf, 0);
5303 }else{
5304 return accessPayloadChecked(pCur, offset, amt, pBuf);
5307 #endif /* SQLITE_OMIT_INCRBLOB */
5310 ** Return a pointer to payload information from the entry that the
5311 ** pCur cursor is pointing to. The pointer is to the beginning of
5312 ** the key if index btrees (pPage->intKey==0) and is the data for
5313 ** table btrees (pPage->intKey==1). The number of bytes of available
5314 ** key/data is written into *pAmt. If *pAmt==0, then the value
5315 ** returned will not be a valid pointer.
5317 ** This routine is an optimization. It is common for the entire key
5318 ** and data to fit on the local page and for there to be no overflow
5319 ** pages. When that is so, this routine can be used to access the
5320 ** key and data without making a copy. If the key and/or data spills
5321 ** onto overflow pages, then accessPayload() must be used to reassemble
5322 ** the key/data and copy it into a preallocated buffer.
5324 ** The pointer returned by this routine looks directly into the cached
5325 ** page of the database. The data might change or move the next time
5326 ** any btree routine is called.
5328 static const void *fetchPayload(
5329 BtCursor *pCur, /* Cursor pointing to entry to read from */
5330 u32 *pAmt /* Write the number of available bytes here */
5332 int amt;
5333 assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
5334 assert( pCur->eState==CURSOR_VALID );
5335 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5336 assert( cursorOwnsBtShared(pCur) );
5337 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB );
5338 assert( pCur->info.nSize>0 );
5339 assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
5340 assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
5341 amt = pCur->info.nLocal;
5342 if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
5343 /* There is too little space on the page for the expected amount
5344 ** of local content. Database must be corrupt. */
5345 assert( CORRUPT_DB );
5346 amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
5348 *pAmt = (u32)amt;
5349 return (void*)pCur->info.pPayload;
5354 ** For the entry that cursor pCur is point to, return as
5355 ** many bytes of the key or data as are available on the local
5356 ** b-tree page. Write the number of available bytes into *pAmt.
5358 ** The pointer returned is ephemeral. The key/data may move
5359 ** or be destroyed on the next call to any Btree routine,
5360 ** including calls from other threads against the same cache.
5361 ** Hence, a mutex on the BtShared should be held prior to calling
5362 ** this routine.
5364 ** These routines is used to get quick access to key and data
5365 ** in the common case where no overflow pages are used.
5367 const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
5368 return fetchPayload(pCur, pAmt);
5373 ** Move the cursor down to a new child page. The newPgno argument is the
5374 ** page number of the child page to move to.
5376 ** This function returns SQLITE_CORRUPT if the page-header flags field of
5377 ** the new child page does not match the flags field of the parent (i.e.
5378 ** if an intkey page appears to be the parent of a non-intkey page, or
5379 ** vice-versa).
5381 static int moveToChild(BtCursor *pCur, u32 newPgno){
5382 int rc;
5383 assert( cursorOwnsBtShared(pCur) );
5384 assert( pCur->eState==CURSOR_VALID );
5385 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
5386 assert( pCur->iPage>=0 );
5387 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
5388 return SQLITE_CORRUPT_BKPT;
5390 pCur->info.nSize = 0;
5391 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5392 pCur->aiIdx[pCur->iPage] = pCur->ix;
5393 pCur->apPage[pCur->iPage] = pCur->pPage;
5394 pCur->ix = 0;
5395 pCur->iPage++;
5396 rc = getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur->curPagerFlags);
5397 assert( pCur->pPage!=0 || rc!=SQLITE_OK );
5398 if( rc==SQLITE_OK
5399 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey)
5401 releasePage(pCur->pPage);
5402 rc = SQLITE_CORRUPT_PGNO(newPgno);
5404 if( rc ){
5405 pCur->pPage = pCur->apPage[--pCur->iPage];
5407 return rc;
#ifdef SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page.  This function
** asserts that page number iChild is the left-child of the iIdx'th
** cell in page pParent.  Or, if iIdx is equal to the total number of
** cells in pParent, that page number iChild is the right-child of
** the page.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  /* The conditions tested below might not be true in a corrupt database */
  if( CORRUPT_DB ){
    return;
  }
  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
5433 ** Move the cursor up to the parent page.
5435 ** pCur->idx is set to the cell index that contains the pointer
5436 ** to the page we are coming from. If we are coming from the
5437 ** right-most child page then pCur->idx is set to one more than
5438 ** the largest cell index.
5440 static void moveToParent(BtCursor *pCur){
5441 MemPage *pLeaf;
5442 assert( cursorOwnsBtShared(pCur) );
5443 assert( pCur->eState==CURSOR_VALID );
5444 assert( pCur->iPage>0 );
5445 assert( pCur->pPage );
5446 assertParentIndex(
5447 pCur->apPage[pCur->iPage-1],
5448 pCur->aiIdx[pCur->iPage-1],
5449 pCur->pPage->pgno
5451 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
5452 pCur->info.nSize = 0;
5453 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5454 pCur->ix = pCur->aiIdx[pCur->iPage-1];
5455 pLeaf = pCur->pPage;
5456 pCur->pPage = pCur->apPage[--pCur->iPage];
5457 releasePageNotNull(pLeaf);
/*
** Move the cursor to point to the root page of its b-tree structure.
**
** If the table has a virtual root page, then the cursor is moved to point
** to the virtual root page instead of the actual root page. A table has a
** virtual root page when the actual root page contains no cells and a
** single child page. This can only happen with the table rooted at page 1.
**
** If the b-tree structure is empty, the cursor state is set to
** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
** the cursor is set to point to the first cell located on the root
** (or virtual root) page and the cursor state is set to CURSOR_VALID.
**
** If this function returns successfully, it may be assumed that the
** page-header flags indicate that the [virtual] root-page is the expected
** kind of b-tree page (i.e. if when opening the cursor the caller did not
** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
** indicating a table b-tree, or if the caller did specify a KeyInfo
** structure the flags byte is set to 0x02 or 0x0A, indicating an index
** b-tree).
*/
static int moveToRoot(BtCursor *pCur){
  MemPage *pRoot;
  int rc = SQLITE_OK;

  assert( cursorOwnsBtShared(pCur) );
  assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
  assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
  assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
  assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
  assert( pCur->pgnoRoot>0 || pCur->iPage<0 );

  if( pCur->iPage>=0 ){
    /* The cursor already holds pages.  If it is below the root, release
    ** every page except the root (apPage[0]) and skip the root-page
    ** checks.  If it is already on the root (iPage==0), fall through to
    ** the checks below. */
    if( pCur->iPage ){
      releasePageNotNull(pCur->pPage);
      while( --pCur->iPage ){
        releasePageNotNull(pCur->apPage[pCur->iPage]);
      }
      pRoot = pCur->pPage = pCur->apPage[0];
      goto skip_init;
    }
  }else if( pCur->pgnoRoot==0 ){
    /* No root page: the b-tree is empty */
    pCur->eState = CURSOR_INVALID;
    return SQLITE_EMPTY;
  }else{
    /* No page is loaded yet: fetch the root page */
    assert( pCur->iPage==(-1) );
    if( pCur->eState>=CURSOR_REQUIRESEEK ){
      if( pCur->eState==CURSOR_FAULT ){
        /* A faulted cursor reports the error code saved in skipNext */
        assert( pCur->skipNext!=SQLITE_OK );
        return pCur->skipNext;
      }
      sqlite3BtreeClearCursor(pCur);
    }
    rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage,
                        pCur->curPagerFlags);
    if( rc!=SQLITE_OK ){
      pCur->eState = CURSOR_INVALID;
      return rc;
    }
    pCur->iPage = 0;
    pCur->curIntKey = pCur->pPage->intKey;
  }
  pRoot = pCur->pPage;
  assert( pRoot->pgno==pCur->pgnoRoot || CORRUPT_DB );

  /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
  ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
  ** NULL, the caller expects a table b-tree. If this is not the case,
  ** return an SQLITE_CORRUPT error.
  **
  ** Earlier versions of SQLite assumed that this test could not fail
  ** if the root page was already loaded when this function was called (i.e.
  ** if pCur->iPage>=0). But this is not so if the database is corrupted
  ** in such a way that page pRoot is linked into a second b-tree table
  ** (or the freelist).  */
  assert( pRoot->intKey==1 || pRoot->intKey==0 );
  if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
    return SQLITE_CORRUPT_PAGE(pCur->pPage);
  }

skip_init:
  /* Reset the position to the first cell and drop cached cell state */
  pCur->ix = 0;
  pCur->info.nSize = 0;
  pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);

  if( pRoot->nCell>0 ){
    pCur->eState = CURSOR_VALID;
  }else if( !pRoot->leaf ){
    /* Root has no cells but is not a leaf: descend to its right-child
    ** (the "virtual root").  This is only accepted for page 1. */
    Pgno subpage;
    if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
    subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
    pCur->eState = CURSOR_VALID;
    rc = moveToChild(pCur, subpage);
  }else{
    /* Root is an empty leaf: the b-tree has no entries */
    pCur->eState = CURSOR_INVALID;
    rc = SQLITE_EMPTY;
  }
  return rc;
}
5561 ** Move the cursor down to the left-most leaf entry beneath the
5562 ** entry to which it is currently pointing.
5564 ** The left-most leaf is the one with the smallest key - the first
5565 ** in ascending order.
5567 static int moveToLeftmost(BtCursor *pCur){
5568 Pgno pgno;
5569 int rc = SQLITE_OK;
5570 MemPage *pPage;
5572 assert( cursorOwnsBtShared(pCur) );
5573 assert( pCur->eState==CURSOR_VALID );
5574 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
5575 assert( pCur->ix<pPage->nCell );
5576 pgno = get4byte(findCell(pPage, pCur->ix));
5577 rc = moveToChild(pCur, pgno);
5579 return rc;
5583 ** Move the cursor down to the right-most leaf entry beneath the
5584 ** page to which it is currently pointing. Notice the difference
5585 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
5586 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
5587 ** finds the right-most entry beneath the *page*.
5589 ** The right-most entry is the one with the largest key - the last
5590 ** key in ascending order.
5592 static int moveToRightmost(BtCursor *pCur){
5593 Pgno pgno;
5594 int rc = SQLITE_OK;
5595 MemPage *pPage = 0;
5597 assert( cursorOwnsBtShared(pCur) );
5598 assert( pCur->eState==CURSOR_VALID );
5599 while( !(pPage = pCur->pPage)->leaf ){
5600 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5601 pCur->ix = pPage->nCell;
5602 rc = moveToChild(pCur, pgno);
5603 if( rc ) return rc;
5605 pCur->ix = pPage->nCell-1;
5606 assert( pCur->info.nSize==0 );
5607 assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
5608 return SQLITE_OK;
5611 /* Move the cursor to the first entry in the table. Return SQLITE_OK
5612 ** on success. Set *pRes to 0 if the cursor actually points to something
5613 ** or set *pRes to 1 if the table is empty.
5615 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
5616 int rc;
5618 assert( cursorOwnsBtShared(pCur) );
5619 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5620 rc = moveToRoot(pCur);
5621 if( rc==SQLITE_OK ){
5622 assert( pCur->pPage->nCell>0 );
5623 *pRes = 0;
5624 rc = moveToLeftmost(pCur);
5625 }else if( rc==SQLITE_EMPTY ){
5626 assert( pCur->pgnoRoot==0 || (pCur->pPage!=0 && pCur->pPage->nCell==0) );
5627 *pRes = 1;
5628 rc = SQLITE_OK;
5630 return rc;
#ifdef SQLITE_DEBUG
/* The cursor is CURSOR_VALID and has BTCF_AtLast set.  Verify that
** these flags are true for a consistent database.
**
** Returns non-zero if every ancestor level recorded in the cursor has an
** index equal to that page's nCell and the cursor is on the final cell
** of a leaf page.  Returns zero otherwise.
**
** This routine is called from within assert() statements only.
** It is an internal verification routine and does not appear in production
** builds.
*/
static int cursorIsAtLastEntry(BtCursor *pCur){
  int ii;
  for(ii=0; ii<pCur->iPage; ii++){
    if( pCur->aiIdx[ii]!=pCur->apPage[ii]->nCell ) return 0;
  }
  return pCur->ix==pCur->pPage->nCell-1 && pCur->pPage->leaf!=0;
}
#endif
5650 /* Move the cursor to the last entry in the table. Return SQLITE_OK
5651 ** on success. Set *pRes to 0 if the cursor actually points to something
5652 ** or set *pRes to 1 if the table is empty.
5654 static SQLITE_NOINLINE int btreeLast(BtCursor *pCur, int *pRes){
5655 int rc = moveToRoot(pCur);
5656 if( rc==SQLITE_OK ){
5657 assert( pCur->eState==CURSOR_VALID );
5658 *pRes = 0;
5659 rc = moveToRightmost(pCur);
5660 if( rc==SQLITE_OK ){
5661 pCur->curFlags |= BTCF_AtLast;
5662 }else{
5663 pCur->curFlags &= ~BTCF_AtLast;
5665 }else if( rc==SQLITE_EMPTY ){
5666 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5667 *pRes = 1;
5668 rc = SQLITE_OK;
5670 return rc;
5672 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
5673 assert( cursorOwnsBtShared(pCur) );
5674 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5676 /* If the cursor already points to the last entry, this is a no-op. */
5677 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5678 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB );
5679 *pRes = 0;
5680 return SQLITE_OK;
5682 return btreeLast(pCur, pRes);
5685 /* Move the cursor so that it points to an entry in a table (a.k.a INTKEY)
5686 ** table near the key intKey. Return a success code.
5688 ** If an exact match is not found, then the cursor is always
5689 ** left pointing at a leaf page which would hold the entry if it
5690 ** were present. The cursor might point to an entry that comes
5691 ** before or after the key.
5693 ** An integer is written into *pRes which is the result of
5694 ** comparing the key with the entry to which the cursor is
5695 ** pointing. The meaning of the integer written into
5696 ** *pRes is as follows:
5698 ** *pRes<0 The cursor is left pointing at an entry that
5699 ** is smaller than intKey or if the table is empty
5700 ** and the cursor is therefore left point to nothing.
5702 ** *pRes==0 The cursor is left pointing at an entry that
5703 ** exactly matches intKey.
5705 ** *pRes>0 The cursor is left pointing at an entry that
5706 ** is larger than intKey.
5708 int sqlite3BtreeTableMoveto(
5709 BtCursor *pCur, /* The cursor to be moved */
5710 i64 intKey, /* The table key */
5711 int biasRight, /* If true, bias the search to the high end */
5712 int *pRes /* Write search results here */
5714 int rc;
5716 assert( cursorOwnsBtShared(pCur) );
5717 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5718 assert( pRes );
5719 assert( pCur->pKeyInfo==0 );
5720 assert( pCur->eState!=CURSOR_VALID || pCur->curIntKey!=0 );
5722 /* If the cursor is already positioned at the point we are trying
5723 ** to move to, then just return without doing any work */
5724 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 ){
5725 if( pCur->info.nKey==intKey ){
5726 *pRes = 0;
5727 return SQLITE_OK;
5729 if( pCur->info.nKey<intKey ){
5730 if( (pCur->curFlags & BTCF_AtLast)!=0 ){
5731 assert( cursorIsAtLastEntry(pCur) || CORRUPT_DB );
5732 *pRes = -1;
5733 return SQLITE_OK;
5735 /* If the requested key is one more than the previous key, then
5736 ** try to get there using sqlite3BtreeNext() rather than a full
5737 ** binary search. This is an optimization only. The correct answer
5738 ** is still obtained without this case, only a little more slowly. */
5739 if( pCur->info.nKey+1==intKey ){
5740 *pRes = 0;
5741 rc = sqlite3BtreeNext(pCur, 0);
5742 if( rc==SQLITE_OK ){
5743 getCellInfo(pCur);
5744 if( pCur->info.nKey==intKey ){
5745 return SQLITE_OK;
5747 }else if( rc!=SQLITE_DONE ){
5748 return rc;
5754 #ifdef SQLITE_DEBUG
5755 pCur->pBtree->nSeek++; /* Performance measurement during testing */
5756 #endif
5758 rc = moveToRoot(pCur);
5759 if( rc ){
5760 if( rc==SQLITE_EMPTY ){
5761 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5762 *pRes = -1;
5763 return SQLITE_OK;
5765 return rc;
5767 assert( pCur->pPage );
5768 assert( pCur->pPage->isInit );
5769 assert( pCur->eState==CURSOR_VALID );
5770 assert( pCur->pPage->nCell > 0 );
5771 assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
5772 assert( pCur->curIntKey );
5774 for(;;){
5775 int lwr, upr, idx, c;
5776 Pgno chldPg;
5777 MemPage *pPage = pCur->pPage;
5778 u8 *pCell; /* Pointer to current cell in pPage */
5780 /* pPage->nCell must be greater than zero. If this is the root-page
5781 ** the cursor would have been INVALID above and this for(;;) loop
5782 ** not run. If this is not the root-page, then the moveToChild() routine
5783 ** would have already detected db corruption. Similarly, pPage must
5784 ** be the right kind (index or table) of b-tree page. Otherwise
5785 ** a moveToChild() or moveToRoot() call would have detected corruption. */
5786 assert( pPage->nCell>0 );
5787 assert( pPage->intKey );
5788 lwr = 0;
5789 upr = pPage->nCell-1;
5790 assert( biasRight==0 || biasRight==1 );
5791 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5792 for(;;){
5793 i64 nCellKey;
5794 pCell = findCellPastPtr(pPage, idx);
5795 if( pPage->intKeyLeaf ){
5796 while( 0x80 <= *(pCell++) ){
5797 if( pCell>=pPage->aDataEnd ){
5798 return SQLITE_CORRUPT_PAGE(pPage);
5802 getVarint(pCell, (u64*)&nCellKey);
5803 if( nCellKey<intKey ){
5804 lwr = idx+1;
5805 if( lwr>upr ){ c = -1; break; }
5806 }else if( nCellKey>intKey ){
5807 upr = idx-1;
5808 if( lwr>upr ){ c = +1; break; }
5809 }else{
5810 assert( nCellKey==intKey );
5811 pCur->ix = (u16)idx;
5812 if( !pPage->leaf ){
5813 lwr = idx;
5814 goto moveto_table_next_layer;
5815 }else{
5816 pCur->curFlags |= BTCF_ValidNKey;
5817 pCur->info.nKey = nCellKey;
5818 pCur->info.nSize = 0;
5819 *pRes = 0;
5820 return SQLITE_OK;
5823 assert( lwr+upr>=0 );
5824 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */
5826 assert( lwr==upr+1 || !pPage->leaf );
5827 assert( pPage->isInit );
5828 if( pPage->leaf ){
5829 assert( pCur->ix<pCur->pPage->nCell );
5830 pCur->ix = (u16)idx;
5831 *pRes = c;
5832 rc = SQLITE_OK;
5833 goto moveto_table_finish;
5835 moveto_table_next_layer:
5836 if( lwr>=pPage->nCell ){
5837 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5838 }else{
5839 chldPg = get4byte(findCell(pPage, lwr));
5841 pCur->ix = (u16)lwr;
5842 rc = moveToChild(pCur, chldPg);
5843 if( rc ) break;
5845 moveto_table_finish:
5846 pCur->info.nSize = 0;
5847 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5848 return rc;
5852 ** Compare the "idx"-th cell on the page the cursor pCur is currently
5853 ** pointing to to pIdxKey using xRecordCompare. Return negative or
5854 ** zero if the cell is less than or equal pIdxKey. Return positive
5855 ** if unknown.
5857 ** Return value negative: Cell at pCur[idx] less than pIdxKey
5859 ** Return value is zero: Cell at pCur[idx] equals pIdxKey
5861 ** Return value positive: Nothing is known about the relationship
5862 ** of the cell at pCur[idx] and pIdxKey.
5864 ** This routine is part of an optimization. It is always safe to return
5865 ** a positive value as that will cause the optimization to be skipped.
5867 static int indexCellCompare(
5868 BtCursor *pCur,
5869 int idx,
5870 UnpackedRecord *pIdxKey,
5871 RecordCompare xRecordCompare
5873 MemPage *pPage = pCur->pPage;
5874 int c;
5875 int nCell; /* Size of the pCell cell in bytes */
5876 u8 *pCell = findCellPastPtr(pPage, idx);
5878 nCell = pCell[0];
5879 if( nCell<=pPage->max1bytePayload ){
5880 /* This branch runs if the record-size field of the cell is a
5881 ** single byte varint and the record fits entirely on the main
5882 ** b-tree page. */
5883 testcase( pCell+nCell+1==pPage->aDataEnd );
5884 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5885 }else if( !(pCell[1] & 0x80)
5886 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5888 /* The record-size field is a 2 byte varint and the record
5889 ** fits entirely on the main b-tree page. */
5890 testcase( pCell+nCell+2==pPage->aDataEnd );
5891 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5892 }else{
5893 /* If the record extends into overflow pages, do not attempt
5894 ** the optimization. */
5895 c = 99;
5897 return c;
5901 ** Return true (non-zero) if pCur is current pointing to the last
5902 ** page of a table.
5904 static int cursorOnLastPage(BtCursor *pCur){
5905 int i;
5906 assert( pCur->eState==CURSOR_VALID );
5907 for(i=0; i<pCur->iPage; i++){
5908 MemPage *pPage = pCur->apPage[i];
5909 if( pCur->aiIdx[i]<pPage->nCell ) return 0;
5911 return 1;
5914 /* Move the cursor so that it points to an entry in an index table
5915 ** near the key pIdxKey. Return a success code.
5917 ** If an exact match is not found, then the cursor is always
5918 ** left pointing at a leaf page which would hold the entry if it
5919 ** were present. The cursor might point to an entry that comes
5920 ** before or after the key.
5922 ** An integer is written into *pRes which is the result of
5923 ** comparing the key with the entry to which the cursor is
5924 ** pointing. The meaning of the integer written into
5925 ** *pRes is as follows:
5927 ** *pRes<0 The cursor is left pointing at an entry that
5928 ** is smaller than pIdxKey or if the table is empty
5929 ** and the cursor is therefore left point to nothing.
5931 ** *pRes==0 The cursor is left pointing at an entry that
5932 ** exactly matches pIdxKey.
5934 ** *pRes>0 The cursor is left pointing at an entry that
5935 ** is larger than pIdxKey.
5937 ** The pIdxKey->eqSeen field is set to 1 if there
5938 ** exists an entry in the table that exactly matches pIdxKey.
5940 int sqlite3BtreeIndexMoveto(
5941 BtCursor *pCur, /* The cursor to be moved */
5942 UnpackedRecord *pIdxKey, /* Unpacked index key */
5943 int *pRes /* Write search results here */
5945 int rc;
5946 RecordCompare xRecordCompare;
5948 assert( cursorOwnsBtShared(pCur) );
5949 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5950 assert( pRes );
5951 assert( pCur->pKeyInfo!=0 );
5953 #ifdef SQLITE_DEBUG
5954 pCur->pBtree->nSeek++; /* Performance measurement during testing */
5955 #endif
5957 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5958 pIdxKey->errCode = 0;
5959 assert( pIdxKey->default_rc==1
5960 || pIdxKey->default_rc==0
5961 || pIdxKey->default_rc==-1
5965 /* Check to see if we can skip a lot of work. Two cases:
5967 ** (1) If the cursor is already pointing to the very last cell
5968 ** in the table and the pIdxKey search key is greater than or
5969 ** equal to that last cell, then no movement is required.
5971 ** (2) If the cursor is on the last page of the table and the first
5972 ** cell on that last page is less than or equal to the pIdxKey
5973 ** search key, then we can start the search on the current page
5974 ** without needing to go back to root.
5976 if( pCur->eState==CURSOR_VALID
5977 && pCur->pPage->leaf
5978 && cursorOnLastPage(pCur)
5980 int c;
5981 if( pCur->ix==pCur->pPage->nCell-1
5982 && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0
5983 && pIdxKey->errCode==SQLITE_OK
5985 *pRes = c;
5986 return SQLITE_OK; /* Cursor already pointing at the correct spot */
5988 if( pCur->iPage>0
5989 && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0
5990 && pIdxKey->errCode==SQLITE_OK
5992 pCur->curFlags &= ~BTCF_ValidOvfl;
5993 if( !pCur->pPage->isInit ){
5994 return SQLITE_CORRUPT_BKPT;
5996 goto bypass_moveto_root; /* Start search on the current page */
5998 pIdxKey->errCode = SQLITE_OK;
6001 rc = moveToRoot(pCur);
6002 if( rc ){
6003 if( rc==SQLITE_EMPTY ){
6004 assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
6005 *pRes = -1;
6006 return SQLITE_OK;
6008 return rc;
6011 bypass_moveto_root:
6012 assert( pCur->pPage );
6013 assert( pCur->pPage->isInit );
6014 assert( pCur->eState==CURSOR_VALID );
6015 assert( pCur->pPage->nCell > 0 );
6016 assert( pCur->curIntKey==0 );
6017 assert( pIdxKey!=0 );
6018 for(;;){
6019 int lwr, upr, idx, c;
6020 Pgno chldPg;
6021 MemPage *pPage = pCur->pPage;
6022 u8 *pCell; /* Pointer to current cell in pPage */
6024 /* pPage->nCell must be greater than zero. If this is the root-page
6025 ** the cursor would have been INVALID above and this for(;;) loop
6026 ** not run. If this is not the root-page, then the moveToChild() routine
6027 ** would have already detected db corruption. Similarly, pPage must
6028 ** be the right kind (index or table) of b-tree page. Otherwise
6029 ** a moveToChild() or moveToRoot() call would have detected corruption. */
6030 assert( pPage->nCell>0 );
6031 assert( pPage->intKey==0 );
6032 lwr = 0;
6033 upr = pPage->nCell-1;
6034 idx = upr>>1; /* idx = (lwr+upr)/2; */
6035 for(;;){
6036 int nCell; /* Size of the pCell cell in bytes */
6037 pCell = findCellPastPtr(pPage, idx);
6039 /* The maximum supported page-size is 65536 bytes. This means that
6040 ** the maximum number of record bytes stored on an index B-Tree
6041 ** page is less than 16384 bytes and may be stored as a 2-byte
6042 ** varint. This information is used to attempt to avoid parsing
6043 ** the entire cell by checking for the cases where the record is
6044 ** stored entirely within the b-tree page by inspecting the first
6045 ** 2 bytes of the cell.
6047 nCell = pCell[0];
6048 if( nCell<=pPage->max1bytePayload ){
6049 /* This branch runs if the record-size field of the cell is a
6050 ** single byte varint and the record fits entirely on the main
6051 ** b-tree page. */
6052 testcase( pCell+nCell+1==pPage->aDataEnd );
6053 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
6054 }else if( !(pCell[1] & 0x80)
6055 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
6057 /* The record-size field is a 2 byte varint and the record
6058 ** fits entirely on the main b-tree page. */
6059 testcase( pCell+nCell+2==pPage->aDataEnd );
6060 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
6061 }else{
6062 /* The record flows over onto one or more overflow pages. In
6063 ** this case the whole cell needs to be parsed, a buffer allocated
6064 ** and accessPayload() used to retrieve the record into the
6065 ** buffer before VdbeRecordCompare() can be called.
6067 ** If the record is corrupt, the xRecordCompare routine may read
6068 ** up to two varints past the end of the buffer. An extra 18
6069 ** bytes of padding is allocated at the end of the buffer in
6070 ** case this happens. */
6071 void *pCellKey;
6072 u8 * const pCellBody = pCell - pPage->childPtrSize;
6073 const int nOverrun = 18; /* Size of the overrun padding */
6074 pPage->xParseCell(pPage, pCellBody, &pCur->info);
6075 nCell = (int)pCur->info.nKey;
6076 testcase( nCell<0 ); /* True if key size is 2^32 or more */
6077 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */
6078 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */
6079 testcase( nCell==2 ); /* Minimum legal index key size */
6080 if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){
6081 rc = SQLITE_CORRUPT_PAGE(pPage);
6082 goto moveto_index_finish;
6084 pCellKey = sqlite3Malloc( nCell+nOverrun );
6085 if( pCellKey==0 ){
6086 rc = SQLITE_NOMEM_BKPT;
6087 goto moveto_index_finish;
6089 pCur->ix = (u16)idx;
6090 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
6091 memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */
6092 pCur->curFlags &= ~BTCF_ValidOvfl;
6093 if( rc ){
6094 sqlite3_free(pCellKey);
6095 goto moveto_index_finish;
6097 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
6098 sqlite3_free(pCellKey);
6100 assert(
6101 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
6102 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
6104 if( c<0 ){
6105 lwr = idx+1;
6106 }else if( c>0 ){
6107 upr = idx-1;
6108 }else{
6109 assert( c==0 );
6110 *pRes = 0;
6111 rc = SQLITE_OK;
6112 pCur->ix = (u16)idx;
6113 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
6114 goto moveto_index_finish;
6116 if( lwr>upr ) break;
6117 assert( lwr+upr>=0 );
6118 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */
6120 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
6121 assert( pPage->isInit );
6122 if( pPage->leaf ){
6123 assert( pCur->ix<pCur->pPage->nCell || CORRUPT_DB );
6124 pCur->ix = (u16)idx;
6125 *pRes = c;
6126 rc = SQLITE_OK;
6127 goto moveto_index_finish;
6129 if( lwr>=pPage->nCell ){
6130 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6131 }else{
6132 chldPg = get4byte(findCell(pPage, lwr));
6135 /* This block is similar to an in-lined version of:
6137 ** pCur->ix = (u16)lwr;
6138 ** rc = moveToChild(pCur, chldPg);
6139 ** if( rc ) break;
6141 pCur->info.nSize = 0;
6142 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
6143 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
6144 return SQLITE_CORRUPT_BKPT;
6146 pCur->aiIdx[pCur->iPage] = (u16)lwr;
6147 pCur->apPage[pCur->iPage] = pCur->pPage;
6148 pCur->ix = 0;
6149 pCur->iPage++;
6150 rc = getAndInitPage(pCur->pBt, chldPg, &pCur->pPage, pCur->curPagerFlags);
6151 if( rc==SQLITE_OK
6152 && (pCur->pPage->nCell<1 || pCur->pPage->intKey!=pCur->curIntKey)
6154 releasePage(pCur->pPage);
6155 rc = SQLITE_CORRUPT_PGNO(chldPg);
6157 if( rc ){
6158 pCur->pPage = pCur->apPage[--pCur->iPage];
6159 break;
6162 ***** End of in-lined moveToChild() call */
6164 moveto_index_finish:
6165 pCur->info.nSize = 0;
6166 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
6167 return rc;
6172 ** Return TRUE if the cursor is not pointing at an entry of the table.
6174 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
6175 ** past the last entry in the table or sqlite3BtreePrev() moves past
6176 ** the first entry. TRUE is also returned if the table is empty.
6178 int sqlite3BtreeEof(BtCursor *pCur){
6179 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
6180 ** have been deleted? This API will need to change to return an error code
6181 ** as well as the boolean result value.
6183 return (CURSOR_VALID!=pCur->eState);
6187 ** Return an estimate for the number of rows in the table that pCur is
6188 ** pointing to. Return a negative number if no estimate is currently
6189 ** available.
6191 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
6192 i64 n;
6193 u8 i;
6195 assert( cursorOwnsBtShared(pCur) );
6196 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
6198 /* Currently this interface is only called by the OP_IfSizeBetween
6199 ** opcode and the OP_Count opcode with P3=1. In either case,
6200 ** the cursor will always be valid unless the btree is empty. */
6201 if( pCur->eState!=CURSOR_VALID ) return 0;
6202 if( NEVER(pCur->pPage->leaf==0) ) return -1;
6204 n = pCur->pPage->nCell;
6205 for(i=0; i<pCur->iPage; i++){
6206 n *= pCur->apPage[i]->nCell;
6208 return n;
6212 ** Advance the cursor to the next entry in the database.
6213 ** Return value:
6215 ** SQLITE_OK success
6216 ** SQLITE_DONE cursor is already pointing at the last element
6217 ** otherwise some kind of error occurred
6219 ** The main entry point is sqlite3BtreeNext(). That routine is optimized
6220 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
6221 ** to the next cell on the current page. The (slower) btreeNext() helper
6222 ** routine is called when it is necessary to move to a different page or
6223 ** to restore the cursor.
6225 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
6226 ** cursor corresponds to an SQL index and this routine could have been
6227 ** skipped if the SQL index had been a unique index. The F argument
6228 ** is a hint to the implement. SQLite btree implementation does not use
6229 ** this hint, but COMDB2 does.
6231 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
6232 int rc;
6233 int idx;
6234 MemPage *pPage;
6236 assert( cursorOwnsBtShared(pCur) );
6237 if( pCur->eState!=CURSOR_VALID ){
6238 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
6239 rc = restoreCursorPosition(pCur);
6240 if( rc!=SQLITE_OK ){
6241 return rc;
6243 if( CURSOR_INVALID==pCur->eState ){
6244 return SQLITE_DONE;
6246 if( pCur->eState==CURSOR_SKIPNEXT ){
6247 pCur->eState = CURSOR_VALID;
6248 if( pCur->skipNext>0 ) return SQLITE_OK;
6252 pPage = pCur->pPage;
6253 idx = ++pCur->ix;
6254 if( sqlite3FaultSim(412) ) pPage->isInit = 0;
6255 if( !pPage->isInit ){
6256 return SQLITE_CORRUPT_BKPT;
6259 if( idx>=pPage->nCell ){
6260 if( !pPage->leaf ){
6261 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
6262 if( rc ) return rc;
6263 return moveToLeftmost(pCur);
6266 if( pCur->iPage==0 ){
6267 pCur->eState = CURSOR_INVALID;
6268 return SQLITE_DONE;
6270 moveToParent(pCur);
6271 pPage = pCur->pPage;
6272 }while( pCur->ix>=pPage->nCell );
6273 if( pPage->intKey ){
6274 return sqlite3BtreeNext(pCur, 0);
6275 }else{
6276 return SQLITE_OK;
6279 if( pPage->leaf ){
6280 return SQLITE_OK;
6281 }else{
6282 return moveToLeftmost(pCur);
6285 int sqlite3BtreeNext(BtCursor *pCur, int flags){
6286 MemPage *pPage;
6287 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */
6288 assert( cursorOwnsBtShared(pCur) );
6289 assert( flags==0 || flags==1 );
6290 pCur->info.nSize = 0;
6291 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
6292 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
6293 pPage = pCur->pPage;
6294 if( (++pCur->ix)>=pPage->nCell ){
6295 pCur->ix--;
6296 return btreeNext(pCur);
6298 if( pPage->leaf ){
6299 return SQLITE_OK;
6300 }else{
6301 return moveToLeftmost(pCur);
6306 ** Step the cursor to the back to the previous entry in the database.
6307 ** Return values:
6309 ** SQLITE_OK success
6310 ** SQLITE_DONE the cursor is already on the first element of the table
6311 ** otherwise some kind of error occurred
6313 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized
6314 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
6315 ** to the previous cell on the current page. The (slower) btreePrevious()
6316 ** helper routine is called when it is necessary to move to a different page
6317 ** or to restore the cursor.
6319 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
6320 ** the cursor corresponds to an SQL index and this routine could have been
6321 ** skipped if the SQL index had been a unique index. The F argument is a
6322 ** hint to the implement. The native SQLite btree implementation does not
6323 ** use this hint, but COMDB2 does.
6325 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
6326 int rc;
6327 MemPage *pPage;
6329 assert( cursorOwnsBtShared(pCur) );
6330 assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
6331 assert( pCur->info.nSize==0 );
6332 if( pCur->eState!=CURSOR_VALID ){
6333 rc = restoreCursorPosition(pCur);
6334 if( rc!=SQLITE_OK ){
6335 return rc;
6337 if( CURSOR_INVALID==pCur->eState ){
6338 return SQLITE_DONE;
6340 if( CURSOR_SKIPNEXT==pCur->eState ){
6341 pCur->eState = CURSOR_VALID;
6342 if( pCur->skipNext<0 ) return SQLITE_OK;
6346 pPage = pCur->pPage;
6347 if( sqlite3FaultSim(412) ) pPage->isInit = 0;
6348 if( !pPage->isInit ){
6349 return SQLITE_CORRUPT_BKPT;
6351 if( !pPage->leaf ){
6352 int idx = pCur->ix;
6353 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
6354 if( rc ) return rc;
6355 rc = moveToRightmost(pCur);
6356 }else{
6357 while( pCur->ix==0 ){
6358 if( pCur->iPage==0 ){
6359 pCur->eState = CURSOR_INVALID;
6360 return SQLITE_DONE;
6362 moveToParent(pCur);
6364 assert( pCur->info.nSize==0 );
6365 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
6367 pCur->ix--;
6368 pPage = pCur->pPage;
6369 if( pPage->intKey && !pPage->leaf ){
6370 rc = sqlite3BtreePrevious(pCur, 0);
6371 }else{
6372 rc = SQLITE_OK;
6375 return rc;
6377 int sqlite3BtreePrevious(BtCursor *pCur, int flags){
6378 assert( cursorOwnsBtShared(pCur) );
6379 assert( flags==0 || flags==1 );
6380 UNUSED_PARAMETER( flags ); /* Used in COMDB2 but not native SQLite */
6381 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
6382 pCur->info.nSize = 0;
6383 if( pCur->eState!=CURSOR_VALID
6384 || pCur->ix==0
6385 || pCur->pPage->leaf==0
6387 return btreePrevious(pCur);
6389 pCur->ix--;
6390 return SQLITE_OK;
6394 ** Allocate a new page from the database file.
6396 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
6397 ** has already been called on the new page.) The new page has also
6398 ** been referenced and the calling routine is responsible for calling
6399 ** sqlite3PagerUnref() on the new page when it is done.
6401 ** SQLITE_OK is returned on success. Any other return value indicates
6402 ** an error. *ppPage is set to NULL in the event of an error.
6404 ** If the "nearby" parameter is not 0, then an effort is made to
6405 ** locate a page close to the page number "nearby". This can be used in an
6406 ** attempt to keep related pages close to each other in the database file,
6407 ** which in turn can make database access faster.
6409 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
6410 ** anywhere on the free-list, then it is guaranteed to be returned. If
6411 ** eMode is BTALLOC_LE then the page returned will be less than or equal
6412 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there
6413 ** are no restrictions on which page is returned.
6415 static int allocateBtreePage(
6416 BtShared *pBt, /* The btree */
6417 MemPage **ppPage, /* Store pointer to the allocated page here */
6418 Pgno *pPgno, /* Store the page number here */
6419 Pgno nearby, /* Search for a page near this one */
6420 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
6422 MemPage *pPage1;
6423 int rc;
6424 u32 n; /* Number of pages on the freelist */
6425 u32 k; /* Number of leaves on the trunk of the freelist */
6426 MemPage *pTrunk = 0;
6427 MemPage *pPrevTrunk = 0;
6428 Pgno mxPage; /* Total size of the database file */
6430 assert( sqlite3_mutex_held(pBt->mutex) );
6431 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
6432 pPage1 = pBt->pPage1;
6433 mxPage = btreePagecount(pBt);
6434 /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36
6435 ** stores the total number of pages on the freelist. */
6436 n = get4byte(&pPage1->aData[36]);
6437 testcase( n==mxPage-1 );
6438 if( n>=mxPage ){
6439 return SQLITE_CORRUPT_BKPT;
6441 if( n>0 ){
6442 /* There are pages on the freelist. Reuse one of those pages. */
6443 Pgno iTrunk;
6444 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
6445 u32 nSearch = 0; /* Count of the number of search attempts */
6447 /* If eMode==BTALLOC_EXACT and a query of the pointer-map
6448 ** shows that the page 'nearby' is somewhere on the free-list, then
6449 ** the entire-list will be searched for that page.
6451 #ifndef SQLITE_OMIT_AUTOVACUUM
6452 if( eMode==BTALLOC_EXACT ){
6453 if( nearby<=mxPage ){
6454 u8 eType;
6455 assert( nearby>0 );
6456 assert( pBt->autoVacuum );
6457 rc = ptrmapGet(pBt, nearby, &eType, 0);
6458 if( rc ) return rc;
6459 if( eType==PTRMAP_FREEPAGE ){
6460 searchList = 1;
6463 }else if( eMode==BTALLOC_LE ){
6464 searchList = 1;
6466 #endif
6468 /* Decrement the free-list count by 1. Set iTrunk to the index of the
6469 ** first free-list trunk page. iPrevTrunk is initially 1.
6471 rc = sqlite3PagerWrite(pPage1->pDbPage);
6472 if( rc ) return rc;
6473 put4byte(&pPage1->aData[36], n-1);
6475 /* The code within this loop is run only once if the 'searchList' variable
6476 ** is not true. Otherwise, it runs once for each trunk-page on the
6477 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
6478 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
6480 do {
6481 pPrevTrunk = pTrunk;
6482 if( pPrevTrunk ){
6483 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
6484 ** is the page number of the next freelist trunk page in the list or
6485 ** zero if this is the last freelist trunk page. */
6486 iTrunk = get4byte(&pPrevTrunk->aData[0]);
6487 }else{
6488 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
6489 ** stores the page number of the first page of the freelist, or zero if
6490 ** the freelist is empty. */
6491 iTrunk = get4byte(&pPage1->aData[32]);
6493 testcase( iTrunk==mxPage );
6494 if( iTrunk>mxPage || nSearch++ > n ){
6495 rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
6496 }else{
6497 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
6499 if( rc ){
6500 pTrunk = 0;
6501 goto end_allocate_page;
6503 assert( pTrunk!=0 );
6504 assert( pTrunk->aData!=0 );
6505 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
6506 ** is the number of leaf page pointers to follow. */
6507 k = get4byte(&pTrunk->aData[4]);
6508 if( k==0 && !searchList ){
6509 /* The trunk has no leaves and the list is not being searched.
6510 ** So extract the trunk page itself and use it as the newly
6511 ** allocated page */
6512 assert( pPrevTrunk==0 );
6513 rc = sqlite3PagerWrite(pTrunk->pDbPage);
6514 if( rc ){
6515 goto end_allocate_page;
6517 *pPgno = iTrunk;
6518 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
6519 *ppPage = pTrunk;
6520 pTrunk = 0;
6521 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1));
6522 }else if( k>(u32)(pBt->usableSize/4 - 2) ){
6523 /* Value of k is out of range. Database corruption */
6524 rc = SQLITE_CORRUPT_PGNO(iTrunk);
6525 goto end_allocate_page;
6526 #ifndef SQLITE_OMIT_AUTOVACUUM
6527 }else if( searchList
6528 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
6530 /* The list is being searched and this trunk page is the page
6531 ** to allocate, regardless of whether it has leaves.
6533 *pPgno = iTrunk;
6534 *ppPage = pTrunk;
6535 searchList = 0;
6536 rc = sqlite3PagerWrite(pTrunk->pDbPage);
6537 if( rc ){
6538 goto end_allocate_page;
6540 if( k==0 ){
6541 if( !pPrevTrunk ){
6542 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
6543 }else{
6544 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6545 if( rc!=SQLITE_OK ){
6546 goto end_allocate_page;
6548 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
6550 }else{
6551 /* The trunk page is required by the caller but it contains
6552 ** pointers to free-list leaves. The first leaf becomes a trunk
6553 ** page in this case.
6555 MemPage *pNewTrunk;
6556 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
6557 if( iNewTrunk>mxPage ){
6558 rc = SQLITE_CORRUPT_PGNO(iTrunk);
6559 goto end_allocate_page;
6561 testcase( iNewTrunk==mxPage );
6562 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
6563 if( rc!=SQLITE_OK ){
6564 goto end_allocate_page;
6566 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
6567 if( rc!=SQLITE_OK ){
6568 releasePage(pNewTrunk);
6569 goto end_allocate_page;
6571 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
6572 put4byte(&pNewTrunk->aData[4], k-1);
6573 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
6574 releasePage(pNewTrunk);
6575 if( !pPrevTrunk ){
6576 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
6577 put4byte(&pPage1->aData[32], iNewTrunk);
6578 }else{
6579 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6580 if( rc ){
6581 goto end_allocate_page;
6583 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
6586 pTrunk = 0;
6587 TRACE(("ALLOCATE: %u trunk - %u free pages left\n", *pPgno, n-1));
6588 #endif
6589 }else if( k>0 ){
6590 /* Extract a leaf from the trunk */
6591 u32 closest;
6592 Pgno iPage;
6593 unsigned char *aData = pTrunk->aData;
6594 if( nearby>0 ){
6595 u32 i;
6596 closest = 0;
6597 if( eMode==BTALLOC_LE ){
6598 for(i=0; i<k; i++){
6599 iPage = get4byte(&aData[8+i*4]);
6600 if( iPage<=nearby ){
6601 closest = i;
6602 break;
6605 }else{
6606 int dist;
6607 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
6608 for(i=1; i<k; i++){
6609 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
6610 if( d2<dist ){
6611 closest = i;
6612 dist = d2;
6616 }else{
6617 closest = 0;
6620 iPage = get4byte(&aData[8+closest*4]);
6621 testcase( iPage==mxPage );
6622 if( iPage>mxPage || iPage<2 ){
6623 rc = SQLITE_CORRUPT_PGNO(iTrunk);
6624 goto end_allocate_page;
6626 testcase( iPage==mxPage );
6627 if( !searchList
6628 || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
6630 int noContent;
6631 *pPgno = iPage;
6632 TRACE(("ALLOCATE: %u was leaf %u of %u on trunk %u"
6633 ": %u more free pages\n",
6634 *pPgno, closest+1, k, pTrunk->pgno, n-1));
6635 rc = sqlite3PagerWrite(pTrunk->pDbPage);
6636 if( rc ) goto end_allocate_page;
6637 if( closest<k-1 ){
6638 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
6640 put4byte(&aData[4], k-1);
6641 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
6642 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
6643 if( rc==SQLITE_OK ){
6644 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6645 if( rc!=SQLITE_OK ){
6646 releasePage(*ppPage);
6647 *ppPage = 0;
6650 searchList = 0;
6653 releasePage(pPrevTrunk);
6654 pPrevTrunk = 0;
6655 }while( searchList );
6656 }else{
6657 /* There are no pages on the freelist, so append a new page to the
6658 ** database image.
6660 ** Normally, new pages allocated by this block can be requested from the
6661 ** pager layer with the 'no-content' flag set. This prevents the pager
6662 ** from trying to read the pages content from disk. However, if the
6663 ** current transaction has already run one or more incremental-vacuum
6664 ** steps, then the page we are about to allocate may contain content
6665 ** that is required in the event of a rollback. In this case, do
6666 ** not set the no-content flag. This causes the pager to load and journal
6667 ** the current page content before overwriting it.
6669 ** Note that the pager will not actually attempt to load or journal
6670 ** content for any page that really does lie past the end of the database
6671 ** file on disk. So the effects of disabling the no-content optimization
6672 ** here are confined to those pages that lie between the end of the
6673 ** database image and the end of the database file.
6675 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
6677 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6678 if( rc ) return rc;
6679 pBt->nPage++;
6680 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
6682 #ifndef SQLITE_OMIT_AUTOVACUUM
6683 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
6684 /* If *pPgno refers to a pointer-map page, allocate two new pages
6685 ** at the end of the file instead of one. The first allocated page
6686 ** becomes a new pointer-map page, the second is used by the caller.
6688 MemPage *pPg = 0;
6689 TRACE(("ALLOCATE: %u from end of file (pointer-map page)\n", pBt->nPage));
6690 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
6691 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
6692 if( rc==SQLITE_OK ){
6693 rc = sqlite3PagerWrite(pPg->pDbPage);
6694 releasePage(pPg);
6696 if( rc ) return rc;
6697 pBt->nPage++;
6698 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
6700 #endif
6701 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
6702 *pPgno = pBt->nPage;
6704 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
6705 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
6706 if( rc ) return rc;
6707 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6708 if( rc!=SQLITE_OK ){
6709 releasePage(*ppPage);
6710 *ppPage = 0;
6712 TRACE(("ALLOCATE: %u from end of file\n", *pPgno));
6715 assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) );
6717 end_allocate_page:
6718 releasePage(pTrunk);
6719 releasePage(pPrevTrunk);
6720 assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
6721 assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
6722 return rc;
6726 ** This function is used to add page iPage to the database file free-list.
6727 ** It is assumed that the page is not already a part of the free-list.
6729 ** The value passed as the second argument to this function is optional.
6730 ** If the caller happens to have a pointer to the MemPage object
6731 ** corresponding to page iPage handy, it may pass it as the second value.
6732 ** Otherwise, it may pass NULL.
6734 ** If a pointer to a MemPage object is passed as the second argument,
6735 ** its reference count is not altered by this function.
6737 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
6738 MemPage *pTrunk = 0; /* Free-list trunk page */
6739 Pgno iTrunk = 0; /* Page number of free-list trunk page */
6740 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
6741 MemPage *pPage; /* Page being freed. May be NULL. */
6742 int rc; /* Return Code */
6743 u32 nFree; /* Initial number of pages on free-list */
6745 assert( sqlite3_mutex_held(pBt->mutex) );
6746 assert( CORRUPT_DB || iPage>1 );
6747 assert( !pMemPage || pMemPage->pgno==iPage );
6749 if( iPage<2 || iPage>pBt->nPage ){
6750 return SQLITE_CORRUPT_BKPT;
6752 if( pMemPage ){
6753 pPage = pMemPage;
6754 sqlite3PagerRef(pPage->pDbPage);
6755 }else{
6756 pPage = btreePageLookup(pBt, iPage);
6759 /* Increment the free page count on pPage1 */
6760 rc = sqlite3PagerWrite(pPage1->pDbPage);
6761 if( rc ) goto freepage_out;
6762 nFree = get4byte(&pPage1->aData[36]);
6763 put4byte(&pPage1->aData[36], nFree+1);
6765 if( pBt->btsFlags & BTS_SECURE_DELETE ){
6766 /* If the secure_delete option is enabled, then
6767 ** always fully overwrite deleted information with zeros.
6769 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
6770 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
6772 goto freepage_out;
6774 memset(pPage->aData, 0, pPage->pBt->pageSize);
6777 /* If the database supports auto-vacuum, write an entry in the pointer-map
6778 ** to indicate that the page is free.
6780 if( ISAUTOVACUUM(pBt) ){
6781 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
6782 if( rc ) goto freepage_out;
6785 /* Now manipulate the actual database free-list structure. There are two
6786 ** possibilities. If the free-list is currently empty, or if the first
6787 ** trunk page in the free-list is full, then this page will become a
6788 ** new free-list trunk page. Otherwise, it will become a leaf of the
6789 ** first trunk page in the current free-list. This block tests if it
6790 ** is possible to add the page as a new free-list leaf.
6792 if( nFree!=0 ){
6793 u32 nLeaf; /* Initial number of leaf cells on trunk page */
6795 iTrunk = get4byte(&pPage1->aData[32]);
6796 if( iTrunk>btreePagecount(pBt) ){
6797 rc = SQLITE_CORRUPT_BKPT;
6798 goto freepage_out;
6800 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
6801 if( rc!=SQLITE_OK ){
6802 goto freepage_out;
6805 nLeaf = get4byte(&pTrunk->aData[4]);
6806 assert( pBt->usableSize>32 );
6807 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
6808 rc = SQLITE_CORRUPT_BKPT;
6809 goto freepage_out;
6811 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
6812 /* In this case there is room on the trunk page to insert the page
6813 ** being freed as a new leaf.
6815 ** Note that the trunk page is not really full until it contains
6816 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
6817 ** coded. But due to a coding error in versions of SQLite prior to
6818 ** 3.6.0, databases with freelist trunk pages holding more than
6819 ** usableSize/4 - 8 entries will be reported as corrupt. In order
6820 ** to maintain backwards compatibility with older versions of SQLite,
6821 ** we will continue to restrict the number of entries to usableSize/4 - 8
6822 ** for now. At some point in the future (once everyone has upgraded
6823 ** to 3.6.0 or later) we should consider fixing the conditional above
6824 ** to read "usableSize/4-2" instead of "usableSize/4-8".
6826 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
6827 ** avoid using the last six entries in the freelist trunk page array in
6828 ** order that database files created by newer versions of SQLite can be
6829 ** read by older versions of SQLite.
6831 rc = sqlite3PagerWrite(pTrunk->pDbPage);
6832 if( rc==SQLITE_OK ){
6833 put4byte(&pTrunk->aData[4], nLeaf+1);
6834 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
6835 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
6836 sqlite3PagerDontWrite(pPage->pDbPage);
6838 rc = btreeSetHasContent(pBt, iPage);
6840 TRACE(("FREE-PAGE: %u leaf on trunk page %u\n",pPage->pgno,pTrunk->pgno));
6841 goto freepage_out;
6845 /* If control flows to this point, then it was not possible to add the
6846 ** the page being freed as a leaf page of the first trunk in the free-list.
6847 ** Possibly because the free-list is empty, or possibly because the
6848 ** first trunk in the free-list is full. Either way, the page being freed
6849 ** will become the new first trunk page in the free-list.
6851 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
6852 goto freepage_out;
6854 rc = sqlite3PagerWrite(pPage->pDbPage);
6855 if( rc!=SQLITE_OK ){
6856 goto freepage_out;
6858 put4byte(pPage->aData, iTrunk);
6859 put4byte(&pPage->aData[4], 0);
6860 put4byte(&pPage1->aData[32], iPage);
6861 TRACE(("FREE-PAGE: %u new trunk page replacing %u\n", pPage->pgno, iTrunk));
6863 freepage_out:
6864 if( pPage ){
6865 pPage->isInit = 0;
6867 releasePage(pPage);
6868 releasePage(pTrunk);
6869 return rc;
6871 static void freePage(MemPage *pPage, int *pRC){
6872 if( (*pRC)==SQLITE_OK ){
6873 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
6878 ** Free the overflow pages associated with the given Cell.
6880 static SQLITE_NOINLINE int clearCellOverflow(
6881 MemPage *pPage, /* The page that contains the Cell */
6882 unsigned char *pCell, /* First byte of the Cell */
6883 CellInfo *pInfo /* Size information about the cell */
6885 BtShared *pBt;
6886 Pgno ovflPgno;
6887 int rc;
6888 int nOvfl;
6889 u32 ovflPageSize;
6891 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6892 assert( pInfo->nLocal!=pInfo->nPayload );
6893 testcase( pCell + pInfo->nSize == pPage->aDataEnd );
6894 testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd );
6895 if( pCell + pInfo->nSize > pPage->aDataEnd ){
6896 /* Cell extends past end of page */
6897 return SQLITE_CORRUPT_PAGE(pPage);
6899 ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6900 pBt = pPage->pBt;
6901 assert( pBt->usableSize > 4 );
6902 ovflPageSize = pBt->usableSize - 4;
6903 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6904 assert( nOvfl>0 ||
6905 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6907 while( nOvfl-- ){
6908 Pgno iNext = 0;
6909 MemPage *pOvfl = 0;
6910 if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6911 /* 0 is not a legal page number and page 1 cannot be an
6912 ** overflow page. Therefore if ovflPgno<2 or past the end of the
6913 ** file the database must be corrupt. */
6914 return SQLITE_CORRUPT_BKPT;
6916 if( nOvfl ){
6917 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6918 if( rc ) return rc;
6921 if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6922 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6924 /* There is no reason any cursor should have an outstanding reference
6925 ** to an overflow page belonging to a cell that is being deleted/updated.
6926 ** So if there exists more than one reference to this page, then it
6927 ** must not really be an overflow page and the database must be corrupt.
6928 ** It is helpful to detect this before calling freePage2(), as
6929 ** freePage2() may zero the page contents if secure-delete mode is
6930 ** enabled. If this 'overflow' page happens to be a page that the
6931 ** caller is iterating through or using in some other way, this
6932 ** can be problematic.
6934 rc = SQLITE_CORRUPT_BKPT;
6935 }else{
6936 rc = freePage2(pBt, pOvfl, ovflPgno);
6939 if( pOvfl ){
6940 sqlite3PagerUnref(pOvfl->pDbPage);
6942 if( rc ) return rc;
6943 ovflPgno = iNext;
6945 return SQLITE_OK;
/* Call xParseCell to compute the size of a cell.  If the cell contains
** overflow, then invoke clearCellOverflow() to clear out that overflow.
** Store the result code (SQLITE_OK or some error code) in rc.
**
** Implemented as macro to force inlining for performance.
**
** Note that the expansion is a sequence of two statements (a call followed
** by an if/else), so it must be used as a statement of its own and not as
** the unbraced body of an if, else, or loop.
*/
#define BTREE_CLEAR_CELL(rc, pPage, pCell, sInfo)   \
  pPage->xParseCell(pPage, pCell, &sInfo);          \
  if( sInfo.nLocal!=sInfo.nPayload ){               \
    rc = clearCellOverflow(pPage, pCell, &sInfo);   \
  }else{                                            \
    rc = SQLITE_OK;                                 \
  }
6964 ** Create the byte sequence used to represent a cell on page pPage
6965 ** and write that byte sequence into pCell[]. Overflow pages are
6966 ** allocated and filled in as necessary. The calling procedure
6967 ** is responsible for making sure sufficient space has been allocated
6968 ** for pCell[].
6970 ** Note that pCell does not necessary need to point to the pPage->aData
6971 ** area. pCell might point to some temporary storage. The cell will
6972 ** be constructed in this temporary area then copied into pPage->aData
6973 ** later.
6975 static int fillInCell(
6976 MemPage *pPage, /* The page that contains the cell */
6977 unsigned char *pCell, /* Complete text of the cell */
6978 const BtreePayload *pX, /* Payload with which to construct the cell */
6979 int *pnSize /* Write cell size here */
6981 int nPayload;
6982 const u8 *pSrc;
6983 int nSrc, n, rc, mn;
6984 int spaceLeft;
6985 MemPage *pToRelease;
6986 unsigned char *pPrior;
6987 unsigned char *pPayload;
6988 BtShared *pBt;
6989 Pgno pgnoOvfl;
6990 int nHeader;
6992 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6994 /* pPage is not necessarily writeable since pCell might be auxiliary
6995 ** buffer space that is separate from the pPage buffer area */
6996 assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
6997 || sqlite3PagerIswriteable(pPage->pDbPage) );
6999 /* Fill in the header. */
7000 nHeader = pPage->childPtrSize;
7001 if( pPage->intKey ){
7002 nPayload = pX->nData + pX->nZero;
7003 pSrc = pX->pData;
7004 nSrc = pX->nData;
7005 assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
7006 nHeader += putVarint32(&pCell[nHeader], nPayload);
7007 nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
7008 }else{
7009 assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
7010 nSrc = nPayload = (int)pX->nKey;
7011 pSrc = pX->pKey;
7012 nHeader += putVarint32(&pCell[nHeader], nPayload);
7015 /* Fill in the payload */
7016 pPayload = &pCell[nHeader];
7017 if( nPayload<=pPage->maxLocal ){
7018 /* This is the common case where everything fits on the btree page
7019 ** and no overflow pages are required. */
7020 n = nHeader + nPayload;
7021 testcase( n==3 );
7022 testcase( n==4 );
7023 if( n<4 ){
7024 n = 4;
7025 pPayload[nPayload] = 0;
7027 *pnSize = n;
7028 assert( nSrc<=nPayload );
7029 testcase( nSrc<nPayload );
7030 memcpy(pPayload, pSrc, nSrc);
7031 memset(pPayload+nSrc, 0, nPayload-nSrc);
7032 return SQLITE_OK;
7035 /* If we reach this point, it means that some of the content will need
7036 ** to spill onto overflow pages.
7038 mn = pPage->minLocal;
7039 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
7040 testcase( n==pPage->maxLocal );
7041 testcase( n==pPage->maxLocal+1 );
7042 if( n > pPage->maxLocal ) n = mn;
7043 spaceLeft = n;
7044 *pnSize = n + nHeader + 4;
7045 pPrior = &pCell[nHeader+n];
7046 pToRelease = 0;
7047 pgnoOvfl = 0;
7048 pBt = pPage->pBt;
7050 /* At this point variables should be set as follows:
7052 ** nPayload Total payload size in bytes
7053 ** pPayload Begin writing payload here
7054 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft,
7055 ** that means content must spill into overflow pages.
7056 ** *pnSize Size of the local cell (not counting overflow pages)
7057 ** pPrior Where to write the pgno of the first overflow page
7059 ** Use a call to btreeParseCellPtr() to verify that the values above
7060 ** were computed correctly.
7062 #ifdef SQLITE_DEBUG
7064 CellInfo info;
7065 pPage->xParseCell(pPage, pCell, &info);
7066 assert( nHeader==(int)(info.pPayload - pCell) );
7067 assert( info.nKey==pX->nKey );
7068 assert( *pnSize == info.nSize );
7069 assert( spaceLeft == info.nLocal );
7071 #endif
7073 /* Write the payload into the local Cell and any extra into overflow pages */
7074 while( 1 ){
7075 n = nPayload;
7076 if( n>spaceLeft ) n = spaceLeft;
7078 /* If pToRelease is not zero than pPayload points into the data area
7079 ** of pToRelease. Make sure pToRelease is still writeable. */
7080 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
7082 /* If pPayload is part of the data area of pPage, then make sure pPage
7083 ** is still writeable */
7084 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
7085 || sqlite3PagerIswriteable(pPage->pDbPage) );
7087 if( nSrc>=n ){
7088 memcpy(pPayload, pSrc, n);
7089 }else if( nSrc>0 ){
7090 n = nSrc;
7091 memcpy(pPayload, pSrc, n);
7092 }else{
7093 memset(pPayload, 0, n);
7095 nPayload -= n;
7096 if( nPayload<=0 ) break;
7097 pPayload += n;
7098 pSrc += n;
7099 nSrc -= n;
7100 spaceLeft -= n;
7101 if( spaceLeft==0 ){
7102 MemPage *pOvfl = 0;
7103 #ifndef SQLITE_OMIT_AUTOVACUUM
7104 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
7105 if( pBt->autoVacuum ){
7107 pgnoOvfl++;
7108 } while(
7109 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
7112 #endif
7113 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
7114 #ifndef SQLITE_OMIT_AUTOVACUUM
7115 /* If the database supports auto-vacuum, and the second or subsequent
7116 ** overflow page is being allocated, add an entry to the pointer-map
7117 ** for that page now.
7119 ** If this is the first overflow page, then write a partial entry
7120 ** to the pointer-map. If we write nothing to this pointer-map slot,
7121 ** then the optimistic overflow chain processing in clearCell()
7122 ** may misinterpret the uninitialized values and delete the
7123 ** wrong pages from the database.
7125 if( pBt->autoVacuum && rc==SQLITE_OK ){
7126 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
7127 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
7128 if( rc ){
7129 releasePage(pOvfl);
7132 #endif
7133 if( rc ){
7134 releasePage(pToRelease);
7135 return rc;
7138 /* If pToRelease is not zero than pPrior points into the data area
7139 ** of pToRelease. Make sure pToRelease is still writeable. */
7140 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
7142 /* If pPrior is part of the data area of pPage, then make sure pPage
7143 ** is still writeable */
7144 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
7145 || sqlite3PagerIswriteable(pPage->pDbPage) );
7147 put4byte(pPrior, pgnoOvfl);
7148 releasePage(pToRelease);
7149 pToRelease = pOvfl;
7150 pPrior = pOvfl->aData;
7151 put4byte(pPrior, 0);
7152 pPayload = &pOvfl->aData[4];
7153 spaceLeft = pBt->usableSize - 4;
7156 releasePage(pToRelease);
7157 return SQLITE_OK;
7161 ** Remove the i-th cell from pPage. This routine effects pPage only.
7162 ** The cell content is not freed or deallocated. It is assumed that
7163 ** the cell content has been copied someplace else. This routine just
7164 ** removes the reference to the cell from pPage.
7166 ** "sz" must be the number of bytes in the cell.
7168 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
7169 u32 pc; /* Offset to cell content of cell being deleted */
7170 u8 *data; /* pPage->aData */
7171 u8 *ptr; /* Used to move bytes around within data[] */
7172 int rc; /* The return code */
7173 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */
7175 if( *pRC ) return;
7176 assert( idx>=0 );
7177 assert( idx<pPage->nCell );
7178 assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
7179 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
7180 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
7181 assert( pPage->nFree>=0 );
7182 data = pPage->aData;
7183 ptr = &pPage->aCellIdx[2*idx];
7184 assert( pPage->pBt->usableSize > (u32)(ptr-data) );
7185 pc = get2byte(ptr);
7186 hdr = pPage->hdrOffset;
7187 testcase( pc==(u32)get2byte(&data[hdr+5]) );
7188 testcase( pc+sz==pPage->pBt->usableSize );
7189 if( pc+sz > pPage->pBt->usableSize ){
7190 *pRC = SQLITE_CORRUPT_BKPT;
7191 return;
7193 rc = freeSpace(pPage, pc, sz);
7194 if( rc ){
7195 *pRC = rc;
7196 return;
7198 pPage->nCell--;
7199 if( pPage->nCell==0 ){
7200 memset(&data[hdr+1], 0, 4);
7201 data[hdr+7] = 0;
7202 put2byte(&data[hdr+5], pPage->pBt->usableSize);
7203 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
7204 - pPage->childPtrSize - 8;
7205 }else{
7206 memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
7207 put2byte(&data[hdr+3], pPage->nCell);
7208 pPage->nFree += 2;
7213 ** Insert a new cell on pPage at cell index "i". pCell points to the
7214 ** content of the cell.
7216 ** If the cell content will fit on the page, then put it there. If it
7217 ** will not fit, then make a copy of the cell content into pTemp if
7218 ** pTemp is not null. Regardless of pTemp, allocate a new entry
7219 ** in pPage->apOvfl[] and make it point to the cell content (either
7220 ** in pTemp or the original pCell) and also record its index.
7221 ** Allocating a new entry in pPage->aCell[] implies that
7222 ** pPage->nOverflow is incremented.
7224 ** The insertCellFast() routine below works exactly the same as
7225 ** insertCell() except that it lacks the pTemp and iChild parameters
7226 ** which are assumed zero. Other than that, the two routines are the
7227 ** same.
7229 ** Fixes or enhancements to this routine should be reflected in
7230 ** insertCellFast()!
7232 static int insertCell(
7233 MemPage *pPage, /* Page into which we are copying */
7234 int i, /* New cell becomes the i-th cell of the page */
7235 u8 *pCell, /* Content of the new cell */
7236 int sz, /* Bytes of content in pCell */
7237 u8 *pTemp, /* Temp storage space for pCell, if needed */
7238 Pgno iChild /* If non-zero, replace first 4 bytes with this value */
7240 int idx = 0; /* Where to write new cell content in data[] */
7241 int j; /* Loop counter */
7242 u8 *data; /* The content of the whole page */
7243 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */
7245 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
7246 assert( MX_CELL(pPage->pBt)<=10921 );
7247 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
7248 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
7249 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
7250 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
7251 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
7252 assert( pPage->nFree>=0 );
7253 assert( iChild>0 );
7254 if( pPage->nOverflow || sz+2>pPage->nFree ){
7255 if( pTemp ){
7256 memcpy(pTemp, pCell, sz);
7257 pCell = pTemp;
7259 put4byte(pCell, iChild);
7260 j = pPage->nOverflow++;
7261 /* Comparison against ArraySize-1 since we hold back one extra slot
7262 ** as a contingency. In other words, never need more than 3 overflow
7263 ** slots but 4 are allocated, just to be safe. */
7264 assert( j < ArraySize(pPage->apOvfl)-1 );
7265 pPage->apOvfl[j] = pCell;
7266 pPage->aiOvfl[j] = (u16)i;
7268 /* When multiple overflows occur, they are always sequential and in
7269 ** sorted order. This invariants arise because multiple overflows can
7270 ** only occur when inserting divider cells into the parent page during
7271 ** balancing, and the dividers are adjacent and sorted.
7273 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
7274 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */
7275 }else{
7276 int rc = sqlite3PagerWrite(pPage->pDbPage);
7277 if( NEVER(rc!=SQLITE_OK) ){
7278 return rc;
7280 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
7281 data = pPage->aData;
7282 assert( &data[pPage->cellOffset]==pPage->aCellIdx );
7283 rc = allocateSpace(pPage, sz, &idx);
7284 if( rc ){ return rc; }
7285 /* The allocateSpace() routine guarantees the following properties
7286 ** if it returns successfully */
7287 assert( idx >= 0 );
7288 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
7289 assert( idx+sz <= (int)pPage->pBt->usableSize );
7290 pPage->nFree -= (u16)(2 + sz);
7291 /* In a corrupt database where an entry in the cell index section of
7292 ** a btree page has a value of 3 or less, the pCell value might point
7293 ** as many as 4 bytes in front of the start of the aData buffer for
7294 ** the source page. Make sure this does not cause problems by not
7295 ** reading the first 4 bytes */
7296 memcpy(&data[idx+4], pCell+4, sz-4);
7297 put4byte(&data[idx], iChild);
7298 pIns = pPage->aCellIdx + i*2;
7299 memmove(pIns+2, pIns, 2*(pPage->nCell - i));
7300 put2byte(pIns, idx);
7301 pPage->nCell++;
7302 /* increment the cell count */
7303 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
7304 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
7305 #ifndef SQLITE_OMIT_AUTOVACUUM
7306 if( pPage->pBt->autoVacuum ){
7307 int rc2 = SQLITE_OK;
7308 /* The cell may contain a pointer to an overflow page. If so, write
7309 ** the entry for the overflow page into the pointer map.
7311 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2);
7312 if( rc2 ) return rc2;
7314 #endif
7316 return SQLITE_OK;
7320 ** This variant of insertCell() assumes that the pTemp and iChild
7321 ** parameters are both zero. Use this variant in sqlite3BtreeInsert()
7322 ** for performance improvement, and also so that this variant is only
7323 ** called from that one place, and is thus inlined, and thus runs must
7324 ** faster.
7326 ** Fixes or enhancements to this routine should be reflected into
7327 ** the insertCell() routine.
7329 static int insertCellFast(
7330 MemPage *pPage, /* Page into which we are copying */
7331 int i, /* New cell becomes the i-th cell of the page */
7332 u8 *pCell, /* Content of the new cell */
7333 int sz /* Bytes of content in pCell */
7335 int idx = 0; /* Where to write new cell content in data[] */
7336 int j; /* Loop counter */
7337 u8 *data; /* The content of the whole page */
7338 u8 *pIns; /* The point in pPage->aCellIdx[] where no cell inserted */
7340 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
7341 assert( MX_CELL(pPage->pBt)<=10921 );
7342 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
7343 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
7344 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
7345 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
7346 assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
7347 assert( pPage->nFree>=0 );
7348 assert( pPage->nOverflow==0 );
7349 if( sz+2>pPage->nFree ){
7350 j = pPage->nOverflow++;
7351 /* Comparison against ArraySize-1 since we hold back one extra slot
7352 ** as a contingency. In other words, never need more than 3 overflow
7353 ** slots but 4 are allocated, just to be safe. */
7354 assert( j < ArraySize(pPage->apOvfl)-1 );
7355 pPage->apOvfl[j] = pCell;
7356 pPage->aiOvfl[j] = (u16)i;
7358 /* When multiple overflows occur, they are always sequential and in
7359 ** sorted order. This invariants arise because multiple overflows can
7360 ** only occur when inserting divider cells into the parent page during
7361 ** balancing, and the dividers are adjacent and sorted.
7363 assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
7364 assert( j==0 || i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */
7365 }else{
7366 int rc = sqlite3PagerWrite(pPage->pDbPage);
7367 if( rc!=SQLITE_OK ){
7368 return rc;
7370 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
7371 data = pPage->aData;
7372 assert( &data[pPage->cellOffset]==pPage->aCellIdx );
7373 rc = allocateSpace(pPage, sz, &idx);
7374 if( rc ){ return rc; }
7375 /* The allocateSpace() routine guarantees the following properties
7376 ** if it returns successfully */
7377 assert( idx >= 0 );
7378 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
7379 assert( idx+sz <= (int)pPage->pBt->usableSize );
7380 pPage->nFree -= (u16)(2 + sz);
7381 memcpy(&data[idx], pCell, sz);
7382 pIns = pPage->aCellIdx + i*2;
7383 memmove(pIns+2, pIns, 2*(pPage->nCell - i));
7384 put2byte(pIns, idx);
7385 pPage->nCell++;
7386 /* increment the cell count */
7387 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
7388 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
7389 #ifndef SQLITE_OMIT_AUTOVACUUM
7390 if( pPage->pBt->autoVacuum ){
7391 int rc2 = SQLITE_OK;
7392 /* The cell may contain a pointer to an overflow page. If so, write
7393 ** the entry for the overflow page into the pointer map.
7395 ptrmapPutOvflPtr(pPage, pPage, pCell, &rc2);
7396 if( rc2 ) return rc2;
7398 #endif
7400 return SQLITE_OK;
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** A value of 1 for NN appears to give the best results overall.
**
** (Later:) The description above makes it seem as if these values are
** tunable - as if you could change them and recompile and it would all work.
** But that is unlikely.  NB has been 3 since the inception of SQLite and
** we have never tested any other value.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB 3             /* (NN*2+1): Total pages involved in the balance */
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** The cells in this array are the divider cell or cells from the pParent
** page plus up to three child pages.  There are a total of nCell cells.
**
** pRef is a pointer to one of the pages that contributes cells.  This is
** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
** which should be common to all pages that contribute cells to this array.
**
** apCell[] and szCell[] hold, respectively, pointers to the start of each
** cell and the size of each cell.  Some of the apCell[] pointers might refer
** to overflow cells.  In other words, some apCell[] pointers might not point
** to content area of the pages.
**
** A szCell[] of zero means the size of that cell has not yet been computed.
**
** The cells come from as many as four different pages:
**
**             -----------
**             | Parent  |
**             -----------
**            /     |     \
**           /      |      \
**  ---------   ---------   ---------
**  |Child-1|   |Child-2|   |Child-3|
**  ---------   ---------   ---------
**
** The order of cells in the array, for an index btree, is:
**
**       1.  All cells from Child-1 in order
**       2.  The first divider cell from Parent
**       3.  All cells from Child-2 in order
**       4.  The second divider cell from Parent
**       5.  All cells from Child-3 in order
**
** For a table-btree (with rowids) the items 2 and 4 are empty because
** content exists only in leaves and there are no divider cells.
**
** For an index btree, the apEnd[] array holds pointer to the end of page
** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
** respectively. The ixNx[] array holds the number of cells contained in
** each of these 5 stages, and all stages to the left.  Hence:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
**    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
**    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
**    ixNx[4] = Total number of cells.
**
** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
** are used and they point to the leaf pages only, and the ixNx value are:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 and Child-2.
**    ixNx[2] = Total number of cells.
**
** Sometimes when deleting, a child page can have zero cells.  In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down.  The end result is that each ixNx[] entry should
** be larger than the previous.
*/
typedef struct CellArray CellArray;
7487 struct CellArray {
7488 int nCell; /* Number of cells in apCell[] */
7489 MemPage *pRef; /* Reference page */
7490 u8 **apCell; /* All cells begin balanced */
7491 u16 *szCell; /* Local size of all cells in apCell[] */
7492 u8 *apEnd[NB*2]; /* MemPage.aDataEnd values */
7493 int ixNx[NB*2]; /* Index of at which we move to the next apEnd[] */
7497 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
7498 ** computed.
7500 static void populateCellCache(CellArray *p, int idx, int N){
7501 MemPage *pRef = p->pRef;
7502 u16 *szCell = p->szCell;
7503 assert( idx>=0 && idx+N<=p->nCell );
7504 while( N>0 ){
7505 assert( p->apCell[idx]!=0 );
7506 if( szCell[idx]==0 ){
7507 szCell[idx] = pRef->xCellSize(pRef, p->apCell[idx]);
7508 }else{
7509 assert( CORRUPT_DB ||
7510 szCell[idx]==pRef->xCellSize(pRef, p->apCell[idx]) );
7512 idx++;
7513 N--;
7518 ** Return the size of the Nth element of the cell array
7520 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
7521 assert( N>=0 && N<p->nCell );
7522 assert( p->szCell[N]==0 );
7523 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
7524 return p->szCell[N];
7526 static u16 cachedCellSize(CellArray *p, int N){
7527 assert( N>=0 && N<p->nCell );
7528 if( p->szCell[N] ) return p->szCell[N];
7529 return computeCellSize(p, N);
7533 ** Array apCell[] contains pointers to nCell b-tree page cells. The
7534 ** szCell[] array contains the size in bytes of each cell. This function
7535 ** replaces the current contents of page pPg with the contents of the cell
7536 ** array.
7538 ** Some of the cells in apCell[] may currently be stored in pPg. This
7539 ** function works around problems caused by this by making a copy of any
7540 ** such cells before overwriting the page data.
7542 ** The MemPage.nFree field is invalidated by this function. It is the
7543 ** responsibility of the caller to set it correctly.
7545 static int rebuildPage(
7546 CellArray *pCArray, /* Content to be added to page pPg */
7547 int iFirst, /* First cell in pCArray to use */
7548 int nCell, /* Final number of cells on page */
7549 MemPage *pPg /* The page to be reconstructed */
7551 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */
7552 u8 * const aData = pPg->aData; /* Pointer to data for pPg */
7553 const int usableSize = pPg->pBt->usableSize;
7554 u8 * const pEnd = &aData[usableSize];
7555 int i = iFirst; /* Which cell to copy from pCArray*/
7556 u32 j; /* Start of cell content area */
7557 int iEnd = i+nCell; /* Loop terminator */
7558 u8 *pCellptr = pPg->aCellIdx;
7559 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
7560 u8 *pData;
7561 int k; /* Current slot in pCArray->apEnd[] */
7562 u8 *pSrcEnd; /* Current pCArray->apEnd[k] value */
7564 assert( nCell>0 );
7565 assert( i<iEnd );
7566 j = get2byte(&aData[hdr+5]);
7567 if( j>(u32)usableSize ){ j = 0; }
7568 memcpy(&pTmp[j], &aData[j], usableSize - j);
7570 for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i; k++){}
7571 pSrcEnd = pCArray->apEnd[k];
7573 pData = pEnd;
7574 while( 1/*exit by break*/ ){
7575 u8 *pCell = pCArray->apCell[i];
7576 u16 sz = pCArray->szCell[i];
7577 assert( sz>0 );
7578 if( SQLITE_WITHIN(pCell,aData+j,pEnd) ){
7579 if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
7580 pCell = &pTmp[pCell - aData];
7581 }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
7582 && (uptr)(pCell)<(uptr)pSrcEnd
7584 return SQLITE_CORRUPT_BKPT;
7587 pData -= sz;
7588 put2byte(pCellptr, (pData - aData));
7589 pCellptr += 2;
7590 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
7591 memmove(pData, pCell, sz);
7592 assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
7593 i++;
7594 if( i>=iEnd ) break;
7595 if( pCArray->ixNx[k]<=i ){
7596 k++;
7597 pSrcEnd = pCArray->apEnd[k];
7601 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
7602 pPg->nCell = nCell;
7603 pPg->nOverflow = 0;
7605 put2byte(&aData[hdr+1], 0);
7606 put2byte(&aData[hdr+3], pPg->nCell);
7607 put2byte(&aData[hdr+5], pData - aData);
7608 aData[hdr+7] = 0x00;
7609 return SQLITE_OK;
7613 ** The pCArray objects contains pointers to b-tree cells and the cell sizes.
7614 ** This function attempts to add the cells stored in the array to page pPg.
7615 ** If it cannot (because the page needs to be defragmented before the cells
7616 ** will fit), non-zero is returned. Otherwise, if the cells are added
7617 ** successfully, zero is returned.
7619 ** Argument pCellptr points to the first entry in the cell-pointer array
7620 ** (part of page pPg) to populate. After cell apCell[0] is written to the
7621 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
7622 ** cell in the array. It is the responsibility of the caller to ensure
7623 ** that it is safe to overwrite this part of the cell-pointer array.
7625 ** When this function is called, *ppData points to the start of the
7626 ** content area on page pPg. If the size of the content area is extended,
7627 ** *ppData is updated to point to the new start of the content area
7628 ** before returning.
7630 ** Finally, argument pBegin points to the byte immediately following the
7631 ** end of the space required by this page for the cell-pointer area (for
7632 ** all cells - not just those inserted by the current call). If the content
7633 ** area must be extended to before this point in order to accommodate all
7634 ** cells in apCell[], then the cells do not fit and non-zero is returned.
7636 static int pageInsertArray(
7637 MemPage *pPg, /* Page to add cells to */
7638 u8 *pBegin, /* End of cell-pointer array */
7639 u8 **ppData, /* IN/OUT: Page content-area pointer */
7640 u8 *pCellptr, /* Pointer to cell-pointer area */
7641 int iFirst, /* Index of first cell to add */
7642 int nCell, /* Number of cells to add to pPg */
7643 CellArray *pCArray /* Array of cells */
7645 int i = iFirst; /* Loop counter - cell index to insert */
7646 u8 *aData = pPg->aData; /* Complete page */
7647 u8 *pData = *ppData; /* Content area. A subset of aData[] */
7648 int iEnd = iFirst + nCell; /* End of loop. One past last cell to ins */
7649 int k; /* Current slot in pCArray->apEnd[] */
7650 u8 *pEnd; /* Maximum extent of cell data */
7651 assert( CORRUPT_DB || pPg->hdrOffset==0 ); /* Never called on page 1 */
7652 if( iEnd<=iFirst ) return 0;
7653 for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i ; k++){}
7654 pEnd = pCArray->apEnd[k];
7655 while( 1 /*Exit by break*/ ){
7656 int sz, rc;
7657 u8 *pSlot;
7658 assert( pCArray->szCell[i]!=0 );
7659 sz = pCArray->szCell[i];
7660 if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
7661 if( (pData - pBegin)<sz ) return 1;
7662 pData -= sz;
7663 pSlot = pData;
7665 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
7666 ** database. But they might for a corrupt database. Hence use memmove()
7667 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
7668 assert( (pSlot+sz)<=pCArray->apCell[i]
7669 || pSlot>=(pCArray->apCell[i]+sz)
7670 || CORRUPT_DB );
7671 if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
7672 && (uptr)(pCArray->apCell[i])<(uptr)pEnd
7674 assert( CORRUPT_DB );
7675 (void)SQLITE_CORRUPT_BKPT;
7676 return 1;
7678 memmove(pSlot, pCArray->apCell[i], sz);
7679 put2byte(pCellptr, (pSlot - aData));
7680 pCellptr += 2;
7681 i++;
7682 if( i>=iEnd ) break;
7683 if( pCArray->ixNx[k]<=i ){
7684 k++;
7685 pEnd = pCArray->apEnd[k];
7688 *ppData = pData;
7689 return 0;
7693 ** The pCArray object contains pointers to b-tree cells and their sizes.
7695 ** This function adds the space associated with each cell in the array
7696 ** that is currently stored within the body of pPg to the pPg free-list.
7697 ** The cell-pointers and other fields of the page are not updated.
7699 ** This function returns the total number of cells added to the free-list.
7701 static int pageFreeArray(
7702 MemPage *pPg, /* Page to edit */
7703 int iFirst, /* First cell to delete */
7704 int nCell, /* Cells to delete */
7705 CellArray *pCArray /* Array of cells */
7707 u8 * const aData = pPg->aData;
7708 u8 * const pEnd = &aData[pPg->pBt->usableSize];
7709 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
7710 int nRet = 0;
7711 int i, j;
7712 int iEnd = iFirst + nCell;
7713 int nFree = 0;
7714 int aOfst[10];
7715 int aAfter[10];
7717 for(i=iFirst; i<iEnd; i++){
7718 u8 *pCell = pCArray->apCell[i];
7719 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
7720 int sz;
7721 int iAfter;
7722 int iOfst;
7723 /* No need to use cachedCellSize() here. The sizes of all cells that
7724 ** are to be freed have already been computing while deciding which
7725 ** cells need freeing */
7726 sz = pCArray->szCell[i]; assert( sz>0 );
7727 iOfst = (u16)(pCell - aData);
7728 iAfter = iOfst+sz;
7729 for(j=0; j<nFree; j++){
7730 if( aOfst[j]==iAfter ){
7731 aOfst[j] = iOfst;
7732 break;
7733 }else if( aAfter[j]==iOfst ){
7734 aAfter[j] = iAfter;
7735 break;
7738 if( j>=nFree ){
7739 if( nFree>=(int)(sizeof(aOfst)/sizeof(aOfst[0])) ){
7740 for(j=0; j<nFree; j++){
7741 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]);
7743 nFree = 0;
7745 aOfst[nFree] = iOfst;
7746 aAfter[nFree] = iAfter;
7747 if( &aData[iAfter]>pEnd ) return 0;
7748 nFree++;
7750 nRet++;
7753 for(j=0; j<nFree; j++){
7754 freeSpace(pPg, aOfst[j], aAfter[j]-aOfst[j]);
7756 return nRet;
7760 ** pCArray contains pointers to and sizes of all cells in the page being
7761 ** balanced. The current page, pPg, has pPg->nCell cells starting with
7762 ** pCArray->apCell[iOld]. After balancing, this page should hold nNew cells
7763 ** starting at apCell[iNew].
7765 ** This routine makes the necessary adjustments to pPg so that it contains
7766 ** the correct cells after being balanced.
7768 ** The pPg->nFree field is invalid when this function returns. It is the
7769 ** responsibility of the caller to set it correctly.
7771 static int editPage(
7772 MemPage *pPg, /* Edit this page */
7773 int iOld, /* Index of first cell currently on page */
7774 int iNew, /* Index of new first cell on page */
7775 int nNew, /* Final number of cells on page */
7776 CellArray *pCArray /* Array of cells and sizes */
7778 u8 * const aData = pPg->aData;
7779 const int hdr = pPg->hdrOffset;
7780 u8 *pBegin = &pPg->aCellIdx[nNew * 2];
7781 int nCell = pPg->nCell; /* Cells stored on pPg */
7782 u8 *pData;
7783 u8 *pCellptr;
7784 int i;
7785 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
7786 int iNewEnd = iNew + nNew;
7788 #ifdef SQLITE_DEBUG
7789 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
7790 memcpy(pTmp, aData, pPg->pBt->usableSize);
7791 #endif
7793 /* Remove cells from the start and end of the page */
7794 assert( nCell>=0 );
7795 if( iOld<iNew ){
7796 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
7797 if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT;
7798 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
7799 nCell -= nShift;
7801 if( iNewEnd < iOldEnd ){
7802 int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
7803 assert( nCell>=nTail );
7804 nCell -= nTail;
7807 pData = &aData[get2byte(&aData[hdr+5])];
7808 if( pData<pBegin ) goto editpage_fail;
7809 if( NEVER(pData>pPg->aDataEnd) ) goto editpage_fail;
7811 /* Add cells to the start of the page */
7812 if( iNew<iOld ){
7813 int nAdd = MIN(nNew,iOld-iNew);
7814 assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
7815 assert( nAdd>=0 );
7816 pCellptr = pPg->aCellIdx;
7817 memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
7818 if( pageInsertArray(
7819 pPg, pBegin, &pData, pCellptr,
7820 iNew, nAdd, pCArray
7821 ) ) goto editpage_fail;
7822 nCell += nAdd;
7825 /* Add any overflow cells */
7826 for(i=0; i<pPg->nOverflow; i++){
7827 int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
7828 if( iCell>=0 && iCell<nNew ){
7829 pCellptr = &pPg->aCellIdx[iCell * 2];
7830 if( nCell>iCell ){
7831 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
7833 nCell++;
7834 cachedCellSize(pCArray, iCell+iNew);
7835 if( pageInsertArray(
7836 pPg, pBegin, &pData, pCellptr,
7837 iCell+iNew, 1, pCArray
7838 ) ) goto editpage_fail;
7842 /* Append cells to the end of the page */
7843 assert( nCell>=0 );
7844 pCellptr = &pPg->aCellIdx[nCell*2];
7845 if( pageInsertArray(
7846 pPg, pBegin, &pData, pCellptr,
7847 iNew+nCell, nNew-nCell, pCArray
7848 ) ) goto editpage_fail;
7850 pPg->nCell = nNew;
7851 pPg->nOverflow = 0;
7853 put2byte(&aData[hdr+3], pPg->nCell);
7854 put2byte(&aData[hdr+5], pData - aData);
7856 #ifdef SQLITE_DEBUG
7857 for(i=0; i<nNew && !CORRUPT_DB; i++){
7858 u8 *pCell = pCArray->apCell[i+iNew];
7859 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
7860 if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
7861 pCell = &pTmp[pCell - aData];
7863 assert( 0==memcmp(pCell, &aData[iOff],
7864 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
7866 #endif
7868 return SQLITE_OK;
7869 editpage_fail:
7870 /* Unable to edit this page. Rebuild it from scratch instead. */
7871 if( nNew<1 ) return SQLITE_CORRUPT_BKPT;
7872 populateCellCache(pCArray, iNew, nNew);
7873 return rebuildPage(pCArray, iNew, nNew, pPg);
7877 #ifndef SQLITE_OMIT_QUICKBALANCE
7879 ** This version of balance() handles the common special case where
7880 ** a new entry is being inserted on the extreme right-end of the
7881 ** tree, in other words, when the new entry will become the largest
7882 ** entry in the tree.
7884 ** Instead of trying to balance the 3 right-most leaf pages, just add
7885 ** a new page to the right-hand side and put the one new entry in
7886 ** that page. This leaves the right side of the tree somewhat
7887 ** unbalanced. But odds are that we will be inserting new entries
7888 ** at the end soon afterwards so the nearly empty page will quickly
7889 ** fill up. On average.
7891 ** pPage is the leaf page which is the right-most page in the tree.
7892 ** pParent is its parent. pPage must have a single overflow entry
7893 ** which is also the right-most entry on the page.
7895 ** The pSpace buffer is used to store a temporary copy of the divider
7896 ** cell that will be inserted into pParent. Such a cell consists of a 4
7897 ** byte page number followed by a variable length integer. In other
7898 ** words, at most 13 bytes. Hence the pSpace buffer must be at
7899 ** least 13 bytes in size.
7901 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
7902 BtShared *const pBt = pPage->pBt; /* B-Tree Database */
7903 MemPage *pNew; /* Newly allocated page */
7904 int rc; /* Return Code */
7905 Pgno pgnoNew; /* Page number of pNew */
7907 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
7908 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7909 assert( pPage->nOverflow==1 );
7911 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; /* dbfuzz001.test */
7912 assert( pPage->nFree>=0 );
7913 assert( pParent->nFree>=0 );
7915 /* Allocate a new page. This page will become the right-sibling of
7916 ** pPage. Make the parent page writable, so that the new divider cell
7917 ** may be inserted. If both these operations are successful, proceed.
7919 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
7921 if( rc==SQLITE_OK ){
7923 u8 *pOut = &pSpace[4];
7924 u8 *pCell = pPage->apOvfl[0];
7925 u16 szCell = pPage->xCellSize(pPage, pCell);
7926 u8 *pStop;
7927 CellArray b;
7929 assert( sqlite3PagerIswriteable(pNew->pDbPage) );
7930 assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
7931 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
7932 b.nCell = 1;
7933 b.pRef = pPage;
7934 b.apCell = &pCell;
7935 b.szCell = &szCell;
7936 b.apEnd[0] = pPage->aDataEnd;
7937 b.ixNx[0] = 2;
7938 rc = rebuildPage(&b, 0, 1, pNew);
7939 if( NEVER(rc) ){
7940 releasePage(pNew);
7941 return rc;
7943 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
7945 /* If this is an auto-vacuum database, update the pointer map
7946 ** with entries for the new page, and any pointer from the
7947 ** cell on the page to an overflow page. If either of these
7948 ** operations fails, the return code is set, but the contents
7949 ** of the parent page are still manipulated by the code below.
7950 ** That is Ok, at this point the parent page is guaranteed to
7951 ** be marked as dirty. Returning an error code will cause a
7952 ** rollback, undoing any changes made to the parent page.
7954 if( ISAUTOVACUUM(pBt) ){
7955 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
7956 if( szCell>pNew->minLocal ){
7957 ptrmapPutOvflPtr(pNew, pNew, pCell, &rc);
7961 /* Create a divider cell to insert into pParent. The divider cell
7962 ** consists of a 4-byte page number (the page number of pPage) and
7963 ** a variable length key value (which must be the same value as the
7964 ** largest key on pPage).
7966 ** To find the largest key value on pPage, first find the right-most
7967 ** cell on pPage. The first two fields of this cell are the
7968 ** record-length (a variable length integer at most 32-bits in size)
7969 ** and the key value (a variable length integer, may have any value).
7970 ** The first of the while(...) loops below skips over the record-length
7971 ** field. The second while(...) loop copies the key value from the
7972 ** cell on pPage into the pSpace buffer.
7974 pCell = findCell(pPage, pPage->nCell-1);
7975 pStop = &pCell[9];
7976 while( (*(pCell++)&0x80) && pCell<pStop );
7977 pStop = &pCell[9];
7978 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
7980 /* Insert the new divider cell into pParent. */
7981 if( rc==SQLITE_OK ){
7982 rc = insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
7983 0, pPage->pgno);
7986 /* Set the right-child pointer of pParent to point to the new page. */
7987 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
7989 /* Release the reference to the new page. */
7990 releasePage(pNew);
7993 return rc;
7995 #endif /* SQLITE_OMIT_QUICKBALANCE */
#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** It is sometimes activated temporarily while debugging code responsible
** for setting pointer-map entries.
**
** For each of the nPage pages in apPage[], assert that the pointer-map
** entry of every overflow page referenced by a cell, and of every child
** page, names that page as its parent with the expected entry type.
** Always returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int i, j;
  for(i=0; i<nPage; i++){
    Pgno n;
    u8 e;
    MemPage *pPage = apPage[i];
    BtShared *pBt = pPage->pBt;
    assert( pPage->isInit );

    for(j=0; j<pPage->nCell; j++){
      CellInfo info;
      u8 *z;

      z = findCell(pPage, j);
      pPage->xParseCell(pPage, z, &info);
      /* Payload spills off the page: check the overflow-page pointer */
      if( info.nLocal<info.nPayload ){
        Pgno ovfl = get4byte(&z[info.nSize-4]);
        ptrmapGet(pBt, ovfl, &e, &n);
        assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
      }
      /* Interior page: check the child pointer at the start of the cell */
      if( !pPage->leaf ){
        Pgno child = get4byte(z);
        ptrmapGet(pBt, child, &e, &n);
        assert( n==pPage->pgno && e==PTRMAP_BTREE );
      }
    }
    /* Interior page: also check the right-most child pointer in the header */
    if( !pPage->leaf ){
      Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
      ptrmapGet(pBt, child, &e, &n);
      assert( n==pPage->pgno && e==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
8040 ** This function is used to copy the contents of the b-tree node stored
8041 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
8042 ** the pointer-map entries for each child page are updated so that the
8043 ** parent page stored in the pointer map is page pTo. If pFrom contained
8044 ** any cells with overflow page pointers, then the corresponding pointer
8045 ** map entries are also updated so that the parent page is page pTo.
8047 ** If pFrom is currently carrying any overflow cells (entries in the
8048 ** MemPage.apOvfl[] array), they are not copied to pTo.
8050 ** Before returning, page pTo is reinitialized using btreeInitPage().
8052 ** The performance of this function is not critical. It is only used by
8053 ** the balance_shallower() and balance_deeper() procedures, neither of
8054 ** which are called often under normal circumstances.
8056 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
8057 if( (*pRC)==SQLITE_OK ){
8058 BtShared * const pBt = pFrom->pBt;
8059 u8 * const aFrom = pFrom->aData;
8060 u8 * const aTo = pTo->aData;
8061 int const iFromHdr = pFrom->hdrOffset;
8062 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
8063 int rc;
8064 int iData;
8067 assert( pFrom->isInit );
8068 assert( pFrom->nFree>=iToHdr );
8069 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
8071 /* Copy the b-tree node content from page pFrom to page pTo. */
8072 iData = get2byte(&aFrom[iFromHdr+5]);
8073 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
8074 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
8076 /* Reinitialize page pTo so that the contents of the MemPage structure
8077 ** match the new data. The initialization of pTo can actually fail under
8078 ** fairly obscure circumstances, even though it is a copy of initialized
8079 ** page pFrom.
8081 pTo->isInit = 0;
8082 rc = btreeInitPage(pTo);
8083 if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo);
8084 if( rc!=SQLITE_OK ){
8085 *pRC = rc;
8086 return;
8089 /* If this is an auto-vacuum database, update the pointer-map entries
8090 ** for any b-tree or overflow pages that pTo now contains the pointers to.
8092 if( ISAUTOVACUUM(pBt) ){
8093 *pRC = setChildPtrmaps(pTo);
8099 ** This routine redistributes cells on the iParentIdx'th child of pParent
8100 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
8101 ** same amount of free space. Usually a single sibling on either side of the
8102 ** page is used in the balancing, though both siblings might come from one
8103 ** side if the page is the first or last child of its parent. If the page
8104 ** has fewer than 2 siblings (something which can only happen if the page
8105 ** is a root page or a child of a root page) then all available siblings
8106 ** participate in the balancing.
8108 ** The number of siblings of the page might be increased or decreased by
8109 ** one or two in an effort to keep pages nearly full but not over full.
8111 ** Note that when this routine is called, some of the cells on the page
8112 ** might not actually be stored in MemPage.aData[]. This can happen
8113 ** if the page is overfull. This routine ensures that all cells allocated
8114 ** to the page and its siblings fit into MemPage.aData[] before returning.
8116 ** In the course of balancing the page and its siblings, cells may be
8117 ** inserted into or removed from the parent page (pParent). Doing so
8118 ** may cause the parent page to become overfull or underfull. If this
8119 ** happens, it is the responsibility of the caller to invoke the correct
8120 ** balancing routine to fix this problem (see the balance() routine).
8122 ** If this routine fails for any reason, it might leave the database
8123 ** in a corrupted state. So if this routine fails, the database should
8124 ** be rolled back.
8126 ** The third argument to this function, aOvflSpace, is a pointer to a
8127 ** buffer big enough to hold one page. If while inserting cells into the parent
8128 ** page (pParent) the parent page becomes overfull, this buffer is
8129 ** used to store the parent's overflow cells. Because this function inserts
8130 ** a maximum of four divider cells into the parent page, and the maximum
8131 ** size of a cell stored within an internal node is always less than 1/4
8132 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
8133 ** enough for all overflow cells.
8135 ** If aOvflSpace is set to a null pointer, this function returns
8136 ** SQLITE_NOMEM.
8138 static int balance_nonroot(
8139 MemPage *pParent, /* Parent page of siblings being balanced */
8140 int iParentIdx, /* Index of "the page" in pParent */
8141 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
8142 int isRoot, /* True if pParent is a root-page */
8143 int bBulk /* True if this call is part of a bulk load */
8145 BtShared *pBt; /* The whole database */
8146 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
8147 int nNew = 0; /* Number of pages in apNew[] */
8148 int nOld; /* Number of pages in apOld[] */
8149 int i, j, k; /* Loop counters */
8150 int nxDiv; /* Next divider slot in pParent->aCell[] */
8151 int rc = SQLITE_OK; /* The return code */
8152 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */
8153 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
8154 int usableSpace; /* Bytes in pPage beyond the header */
8155 int pageFlags; /* Value of pPage->aData[0] */
8156 int iSpace1 = 0; /* First unused byte of aSpace1[] */
8157 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
8158 int szScratch; /* Size of scratch memory requested */
8159 MemPage *apOld[NB]; /* pPage and up to two siblings */
8160 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
8161 u8 *pRight; /* Location in parent of right-sibling pointer */
8162 u8 *apDiv[NB-1]; /* Divider cells in pParent */
8163 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */
8164 int cntOld[NB+2]; /* Old index in b.apCell[] */
8165 int szNew[NB+2]; /* Combined size of cells placed on i-th page */
8166 u8 *aSpace1; /* Space for copies of dividers cells */
8167 Pgno pgno; /* Temp var to store a page number in */
8168 u8 abDone[NB+2]; /* True after i'th new page is populated */
8169 Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */
8170 CellArray b; /* Parsed information on cells being balanced */
8172 memset(abDone, 0, sizeof(abDone));
8173 memset(&b, 0, sizeof(b));
8174 pBt = pParent->pBt;
8175 assert( sqlite3_mutex_held(pBt->mutex) );
8176 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8178 /* At this point pParent may have at most one overflow cell. And if
8179 ** this overflow cell is present, it must be the cell with
8180 ** index iParentIdx. This scenario comes about when this function
8181 ** is called (indirectly) from sqlite3BtreeDelete().
8183 assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
8184 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
8186 if( !aOvflSpace ){
8187 return SQLITE_NOMEM_BKPT;
8189 assert( pParent->nFree>=0 );
8191 /* Find the sibling pages to balance. Also locate the cells in pParent
8192 ** that divide the siblings. An attempt is made to find NN siblings on
8193 ** either side of pPage. More siblings are taken from one side, however,
8194 ** if there are fewer than NN siblings on the other side. If pParent
8195 ** has NB or fewer children then all children of pParent are taken.
8197 ** This loop also drops the divider cells from the parent page. This
8198 ** way, the remainder of the function does not have to deal with any
8199 ** overflow cells in the parent page, since if any existed they will
8200 ** have already been removed.
8202 i = pParent->nOverflow + pParent->nCell;
8203 if( i<2 ){
8204 nxDiv = 0;
8205 }else{
8206 assert( bBulk==0 || bBulk==1 );
8207 if( iParentIdx==0 ){
8208 nxDiv = 0;
8209 }else if( iParentIdx==i ){
8210 nxDiv = i-2+bBulk;
8211 }else{
8212 nxDiv = iParentIdx-1;
8214 i = 2-bBulk;
8216 nOld = i+1;
8217 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
8218 pRight = &pParent->aData[pParent->hdrOffset+8];
8219 }else{
8220 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
8222 pgno = get4byte(pRight);
8223 while( 1 ){
8224 if( rc==SQLITE_OK ){
8225 rc = getAndInitPage(pBt, pgno, &apOld[i], 0);
8227 if( rc ){
8228 memset(apOld, 0, (i+1)*sizeof(MemPage*));
8229 goto balance_cleanup;
8231 if( apOld[i]->nFree<0 ){
8232 rc = btreeComputeFreeSpace(apOld[i]);
8233 if( rc ){
8234 memset(apOld, 0, (i)*sizeof(MemPage*));
8235 goto balance_cleanup;
8238 nMaxCells += apOld[i]->nCell + ArraySize(pParent->apOvfl);
8239 if( (i--)==0 ) break;
8241 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
8242 apDiv[i] = pParent->apOvfl[0];
8243 pgno = get4byte(apDiv[i]);
8244 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
8245 pParent->nOverflow = 0;
8246 }else{
8247 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
8248 pgno = get4byte(apDiv[i]);
8249 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
8251 /* Drop the cell from the parent page. apDiv[i] still points to
8252 ** the cell within the parent, even though it has been dropped.
8253 ** This is safe because dropping a cell only overwrites the first
8254 ** four bytes of it, and this function does not need the first
8255 ** four bytes of the divider cell. So the pointer is safe to use
8256 ** later on.
8258 ** But not if we are in secure-delete mode. In secure-delete mode,
8259 ** the dropCell() routine will overwrite the entire cell with zeroes.
8260 ** In this case, temporarily copy the cell into the aOvflSpace[]
8261 ** buffer. It will be copied out again as soon as the aSpace[] buffer
8262 ** is allocated. */
8263 if( pBt->btsFlags & BTS_FAST_SECURE ){
8264 int iOff;
8266 /* If the following if() condition is not true, the db is corrupted.
8267 ** The call to dropCell() below will detect this. */
8268 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
8269 if( (iOff+szNew[i])<=(int)pBt->usableSize ){
8270 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
8271 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
8274 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
8278 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
8279 ** alignment */
8280 nMaxCells = (nMaxCells + 3)&~3;
8283 ** Allocate space for memory structures
8285 szScratch =
8286 nMaxCells*sizeof(u8*) /* b.apCell */
8287 + nMaxCells*sizeof(u16) /* b.szCell */
8288 + pBt->pageSize; /* aSpace1 */
8290 assert( szScratch<=7*(int)pBt->pageSize );
8291 b.apCell = sqlite3StackAllocRaw(0, szScratch );
8292 if( b.apCell==0 ){
8293 rc = SQLITE_NOMEM_BKPT;
8294 goto balance_cleanup;
8296 b.szCell = (u16*)&b.apCell[nMaxCells];
8297 aSpace1 = (u8*)&b.szCell[nMaxCells];
8298 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
8301 ** Load pointers to all cells on sibling pages and the divider cells
8302 ** into the local b.apCell[] array. Make copies of the divider cells
8303 ** into space obtained from aSpace1[]. The divider cells have already
8304 ** been removed from pParent.
8306 ** If the siblings are on leaf pages, then the child pointers of the
8307 ** divider cells are stripped from the cells before they are copied
8308 ** into aSpace1[]. In this way, all cells in b.apCell[] are without
8309 ** child pointers. If siblings are not leaves, then all cell in
8310 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[]
8311 ** are alike.
8313 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
8314 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
8316 b.pRef = apOld[0];
8317 leafCorrection = b.pRef->leaf*4;
8318 leafData = b.pRef->intKeyLeaf;
8319 for(i=0; i<nOld; i++){
8320 MemPage *pOld = apOld[i];
8321 int limit = pOld->nCell;
8322 u8 *aData = pOld->aData;
8323 u16 maskPage = pOld->maskPage;
8324 u8 *piCell = aData + pOld->cellOffset;
8325 u8 *piEnd;
8326 VVA_ONLY( int nCellAtStart = b.nCell; )
8328 /* Verify that all sibling pages are of the same "type" (table-leaf,
8329 ** table-interior, index-leaf, or index-interior).
8331 if( pOld->aData[0]!=apOld[0]->aData[0] ){
8332 rc = SQLITE_CORRUPT_PAGE(pOld);
8333 goto balance_cleanup;
8336 /* Load b.apCell[] with pointers to all cells in pOld. If pOld
8337 ** contains overflow cells, include them in the b.apCell[] array
8338 ** in the correct spot.
8340 ** Note that when there are multiple overflow cells, it is always the
8341 ** case that they are sequential and adjacent. This invariant arises
8342 because multiple overflows can only occur when inserting divider
8343 ** cells into a parent on a prior balance, and divider cells are always
8344 ** adjacent and are inserted in order. There is an assert() tagged
8345 ** with "NOTE 1" in the overflow cell insertion loop to prove this
8346 ** invariant.
8348 ** This must be done in advance. Once the balance starts, the cell
8349 ** offset section of the btree page will be overwritten and we will no
8350 ** long be able to find the cells if a pointer to each cell is not saved
8351 ** first.
8353 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
8354 if( pOld->nOverflow>0 ){
8355 if( NEVER(limit<pOld->aiOvfl[0]) ){
8356 rc = SQLITE_CORRUPT_PAGE(pOld);
8357 goto balance_cleanup;
8359 limit = pOld->aiOvfl[0];
8360 for(j=0; j<limit; j++){
8361 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
8362 piCell += 2;
8363 b.nCell++;
8365 for(k=0; k<pOld->nOverflow; k++){
8366 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
8367 b.apCell[b.nCell] = pOld->apOvfl[k];
8368 b.nCell++;
8371 piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
8372 while( piCell<piEnd ){
8373 assert( b.nCell<nMaxCells );
8374 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
8375 piCell += 2;
8376 b.nCell++;
8378 assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
8380 cntOld[i] = b.nCell;
8381 if( i<nOld-1 && !leafData){
8382 u16 sz = (u16)szNew[i];
8383 u8 *pTemp;
8384 assert( b.nCell<nMaxCells );
8385 b.szCell[b.nCell] = sz;
8386 pTemp = &aSpace1[iSpace1];
8387 iSpace1 += sz;
8388 assert( sz<=pBt->maxLocal+23 );
8389 assert( iSpace1 <= (int)pBt->pageSize );
8390 memcpy(pTemp, apDiv[i], sz);
8391 b.apCell[b.nCell] = pTemp+leafCorrection;
8392 assert( leafCorrection==0 || leafCorrection==4 );
8393 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
8394 if( !pOld->leaf ){
8395 assert( leafCorrection==0 );
8396 assert( pOld->hdrOffset==0 || CORRUPT_DB );
8397 /* The right pointer of the child page pOld becomes the left
8398 ** pointer of the divider cell */
8399 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
8400 }else{
8401 assert( leafCorrection==4 );
8402 while( b.szCell[b.nCell]<4 ){
8403 /* Do not allow any cells smaller than 4 bytes. If a smaller cell
8404 ** does exist, pad it with 0x00 bytes. */
8405 assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
8406 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
8407 aSpace1[iSpace1++] = 0x00;
8408 b.szCell[b.nCell]++;
8411 b.nCell++;
8416 ** Figure out the number of pages needed to hold all b.nCell cells.
8417 ** Store this number in "k". Also compute szNew[] which is the total
8418 ** size of all cells on the i-th page and cntNew[] which is the index
8419 ** in b.apCell[] of the cell that divides page i from page i+1.
8420 ** cntNew[k] should equal b.nCell.
8422 ** Values computed by this block:
8424 ** k: The total number of sibling pages
8425 ** szNew[i]: Space used on the i-th sibling page.
8426 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
8427 ** the right of the i-th sibling page.
8428 ** usableSpace: Number of bytes of space available on each sibling.
8431 usableSpace = pBt->usableSize - 12 + leafCorrection;
8432 for(i=k=0; i<nOld; i++, k++){
8433 MemPage *p = apOld[i];
8434 b.apEnd[k] = p->aDataEnd;
8435 b.ixNx[k] = cntOld[i];
8436 if( k && b.ixNx[k]==b.ixNx[k-1] ){
8437 k--; /* Omit b.ixNx[] entry for child pages with no cells */
8439 if( !leafData ){
8440 k++;
8441 b.apEnd[k] = pParent->aDataEnd;
8442 b.ixNx[k] = cntOld[i]+1;
8444 assert( p->nFree>=0 );
8445 szNew[i] = usableSpace - p->nFree;
8446 for(j=0; j<p->nOverflow; j++){
8447 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
8449 cntNew[i] = cntOld[i];
8451 k = nOld;
8452 for(i=0; i<k; i++){
8453 int sz;
8454 while( szNew[i]>usableSpace ){
8455 if( i+1>=k ){
8456 k = i+2;
8457 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
8458 szNew[k-1] = 0;
8459 cntNew[k-1] = b.nCell;
8461 sz = 2 + cachedCellSize(&b, cntNew[i]-1);
8462 szNew[i] -= sz;
8463 if( !leafData ){
8464 if( cntNew[i]<b.nCell ){
8465 sz = 2 + cachedCellSize(&b, cntNew[i]);
8466 }else{
8467 sz = 0;
8470 szNew[i+1] += sz;
8471 cntNew[i]--;
8473 while( cntNew[i]<b.nCell ){
8474 sz = 2 + cachedCellSize(&b, cntNew[i]);
8475 if( szNew[i]+sz>usableSpace ) break;
8476 szNew[i] += sz;
8477 cntNew[i]++;
8478 if( !leafData ){
8479 if( cntNew[i]<b.nCell ){
8480 sz = 2 + cachedCellSize(&b, cntNew[i]);
8481 }else{
8482 sz = 0;
8485 szNew[i+1] -= sz;
8487 if( cntNew[i]>=b.nCell ){
8488 k = i+1;
8489 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
8490 rc = SQLITE_CORRUPT_BKPT;
8491 goto balance_cleanup;
8496 ** The packing computed by the previous block is biased toward the siblings
8497 ** on the left side (siblings with smaller keys). The left siblings are
8498 ** always nearly full, while the right-most sibling might be nearly empty.
8499 ** The next block of code attempts to adjust the packing of siblings to
8500 ** get a better balance.
8502 ** This adjustment is more than an optimization. The packing above might
8503 ** be so out of balance as to be illegal. For example, the right-most
8504 ** sibling might be completely empty. This adjustment is not optional.
8506 for(i=k-1; i>0; i--){
8507 int szRight = szNew[i]; /* Size of sibling on the right */
8508 int szLeft = szNew[i-1]; /* Size of sibling on the left */
8509 int r; /* Index of right-most cell in left sibling */
8510 int d; /* Index of first cell to the left of right sibling */
8512 r = cntNew[i-1] - 1;
8513 d = r + 1 - leafData;
8514 (void)cachedCellSize(&b, d);
8516 int szR, szD;
8517 assert( d<nMaxCells );
8518 assert( r<nMaxCells );
8519 szR = cachedCellSize(&b, r);
8520 szD = b.szCell[d];
8521 if( szRight!=0
8522 && (bBulk || szRight+szD+2 > szLeft-(szR+(i==k-1?0:2)))){
8523 break;
8525 szRight += szD + 2;
8526 szLeft -= szR + 2;
8527 cntNew[i-1] = r;
8528 r--;
8529 d--;
8530 }while( r>=0 );
8531 szNew[i] = szRight;
8532 szNew[i-1] = szLeft;
8533 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
8534 rc = SQLITE_CORRUPT_BKPT;
8535 goto balance_cleanup;
8539 /* Sanity check: For a non-corrupt database file one of the following
8540 ** must be true:
8541 ** (1) We found one or more cells (cntNew[0])>0), or
8542 ** (2) pPage is a virtual root page. A virtual root page is when
8543 ** the real root page is page 1 and we are the only child of
8544 ** that page.
8546 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
8547 TRACE(("BALANCE: old: %u(nc=%u) %u(nc=%u) %u(nc=%u)\n",
8548 apOld[0]->pgno, apOld[0]->nCell,
8549 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
8550 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
8554 ** Allocate k new pages. Reuse old pages where possible.
8556 pageFlags = apOld[0]->aData[0];
8557 for(i=0; i<k; i++){
8558 MemPage *pNew;
8559 if( i<nOld ){
8560 pNew = apNew[i] = apOld[i];
8561 apOld[i] = 0;
8562 rc = sqlite3PagerWrite(pNew->pDbPage);
8563 nNew++;
8564 if( sqlite3PagerPageRefcount(pNew->pDbPage)!=1+(i==(iParentIdx-nxDiv))
8565 && rc==SQLITE_OK
8567 rc = SQLITE_CORRUPT_BKPT;
8569 if( rc ) goto balance_cleanup;
8570 }else{
8571 assert( i>0 );
8572 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
8573 if( rc ) goto balance_cleanup;
8574 zeroPage(pNew, pageFlags);
8575 apNew[i] = pNew;
8576 nNew++;
8577 cntOld[i] = b.nCell;
8579 /* Set the pointer-map entry for the new sibling page. */
8580 if( ISAUTOVACUUM(pBt) ){
8581 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
8582 if( rc!=SQLITE_OK ){
8583 goto balance_cleanup;
8590 ** Reassign page numbers so that the new pages are in ascending order.
8591 ** This helps to keep entries in the disk file in order so that a scan
8592 ** of the table is closer to a linear scan through the file. That in turn
8593 ** helps the operating system to deliver pages from the disk more rapidly.
8595 ** An O(N*N) sort algorithm is used, but since N is never more than NB+2
8596 ** (5), that is not a performance concern.
8598 ** When NB==3, this one optimization makes the database about 25% faster
8599 ** for large insertions and deletions.
8601 for(i=0; i<nNew; i++){
8602 aPgno[i] = apNew[i]->pgno;
8603 assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE );
8604 assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY );
8606 for(i=0; i<nNew-1; i++){
8607 int iB = i;
8608 for(j=i+1; j<nNew; j++){
8609 if( apNew[j]->pgno < apNew[iB]->pgno ) iB = j;
8612 /* If apNew[i] has a page number that is bigger than any of the
8613 ** subsequent apNew[i] entries, then swap apNew[i] with the subsequent
8614 ** entry that has the smallest page number (which we know to be
8615 ** entry apNew[iB]).
8617 if( iB!=i ){
8618 Pgno pgnoA = apNew[i]->pgno;
8619 Pgno pgnoB = apNew[iB]->pgno;
8620 Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1;
8621 u16 fgA = apNew[i]->pDbPage->flags;
8622 u16 fgB = apNew[iB]->pDbPage->flags;
8623 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB);
8624 sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA);
8625 sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB);
8626 apNew[i]->pgno = pgnoB;
8627 apNew[iB]->pgno = pgnoA;
8631 TRACE(("BALANCE: new: %u(%u nc=%u) %u(%u nc=%u) %u(%u nc=%u) "
8632 "%u(%u nc=%u) %u(%u nc=%u)\n",
8633 apNew[0]->pgno, szNew[0], cntNew[0],
8634 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
8635 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
8636 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
8637 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
8638 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
8639 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
8640 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
8641 nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
8644 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8645 assert( nNew>=1 && nNew<=ArraySize(apNew) );
8646 assert( apNew[nNew-1]!=0 );
8647 put4byte(pRight, apNew[nNew-1]->pgno);
8649 /* If the sibling pages are not leaves, ensure that the right-child pointer
8650 ** of the right-most new sibling page is set to the value that was
8651 ** originally in the same field of the right-most old sibling page. */
8652 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
8653 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
8654 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
8657 /* Make any required updates to pointer map entries associated with
8658 ** cells stored on sibling pages following the balance operation. Pointer
8659 ** map entries associated with divider cells are set by the insertCell()
8660 ** routine. The associated pointer map entries are:
8662 ** a) if the cell contains a reference to an overflow chain, the
8663 ** entry associated with the first page in the overflow chain, and
8665 ** b) if the sibling pages are not leaves, the child page associated
8666 ** with the cell.
8668 ** If the sibling pages are not leaves, then the pointer map entry
8669 ** associated with the right-child of each sibling may also need to be
8670 ** updated. This happens below, after the sibling pages have been
8671 ** populated, not here.
8673 if( ISAUTOVACUUM(pBt) ){
8674 MemPage *pOld;
8675 MemPage *pNew = pOld = apNew[0];
8676 int cntOldNext = pNew->nCell + pNew->nOverflow;
8677 int iNew = 0;
8678 int iOld = 0;
8680 for(i=0; i<b.nCell; i++){
8681 u8 *pCell = b.apCell[i];
8682 while( i==cntOldNext ){
8683 iOld++;
8684 assert( iOld<nNew || iOld<nOld );
8685 assert( iOld>=0 && iOld<NB );
8686 pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
8687 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
8689 if( i==cntNew[iNew] ){
8690 pNew = apNew[++iNew];
8691 if( !leafData ) continue;
8694 /* Cell pCell is destined for new sibling page pNew. Originally, it
8695 ** was either part of sibling page iOld (possibly an overflow cell),
8696 ** or else the divider cell to the left of sibling page iOld. So,
8697 ** if sibling page iOld had the same page number as pNew, and if
8698 ** pCell really was a part of sibling page iOld (not a divider or
8699 ** overflow cell), we can skip updating the pointer map entries. */
8700 if( iOld>=nNew
8701 || pNew->pgno!=aPgno[iOld]
8702 || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
8704 if( !leafCorrection ){
8705 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
8707 if( cachedCellSize(&b,i)>pNew->minLocal ){
8708 ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
8710 if( rc ) goto balance_cleanup;
8715 /* Insert new divider cells into pParent. */
8716 for(i=0; i<nNew-1; i++){
8717 u8 *pCell;
8718 u8 *pTemp;
8719 int sz;
8720 u8 *pSrcEnd;
8721 MemPage *pNew = apNew[i];
8722 j = cntNew[i];
8724 assert( j<nMaxCells );
8725 assert( b.apCell[j]!=0 );
8726 pCell = b.apCell[j];
8727 sz = b.szCell[j] + leafCorrection;
8728 pTemp = &aOvflSpace[iOvflSpace];
8729 if( !pNew->leaf ){
8730 memcpy(&pNew->aData[8], pCell, 4);
8731 }else if( leafData ){
8732 /* If the tree is a leaf-data tree, and the siblings are leaves,
8733 ** then there is no divider cell in b.apCell[]. Instead, the divider
8734 ** cell consists of the integer key for the right-most cell of
8735 ** the sibling-page assembled above only.
8737 CellInfo info;
8738 j--;
8739 pNew->xParseCell(pNew, b.apCell[j], &info);
8740 pCell = pTemp;
8741 sz = 4 + putVarint(&pCell[4], info.nKey);
8742 pTemp = 0;
8743 }else{
8744 pCell -= 4;
8745 /* Obscure case for non-leaf-data trees: If the cell at pCell was
8746 ** previously stored on a leaf node, and its reported size was 4
8747 ** bytes, then it may actually be smaller than this
8748 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
8749 ** any cell). But it is important to pass the correct size to
8750 ** insertCell(), so reparse the cell now.
8752 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
8753 ** and WITHOUT ROWID tables with exactly one column which is the
8754 ** primary key.
8756 if( b.szCell[j]==4 ){
8757 assert(leafCorrection==4);
8758 sz = pParent->xCellSize(pParent, pCell);
8761 iOvflSpace += sz;
8762 assert( sz<=pBt->maxLocal+23 );
8763 assert( iOvflSpace <= (int)pBt->pageSize );
8764 for(k=0; ALWAYS(k<NB*2) && b.ixNx[k]<=j; k++){}
8765 pSrcEnd = b.apEnd[k];
8766 if( SQLITE_OVERFLOW(pSrcEnd, pCell, pCell+sz) ){
8767 rc = SQLITE_CORRUPT_BKPT;
8768 goto balance_cleanup;
8770 rc = insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno);
8771 if( rc!=SQLITE_OK ) goto balance_cleanup;
8772 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8775 /* Now update the actual sibling pages. The order in which they are updated
8776 ** is important, as this code needs to avoid disrupting any page from which
8777 ** cells may still need to be read. In practice, this means:
8779 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
8780 ** then it is not safe to update page apNew[iPg] until after
8781 ** the left-hand sibling apNew[iPg-1] has been updated.
8783 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
8784 ** then it is not safe to update page apNew[iPg] until after
8785 ** the right-hand sibling apNew[iPg+1] has been updated.
8787 ** If neither of the above apply, the page is safe to update.
8789 ** The iPg value in the following loop starts at nNew-1 goes down
8790 ** to 0, then back up to nNew-1 again, thus making two passes over
8791 ** the pages. On the initial downward pass, only condition (1) above
8792 ** needs to be tested because (2) will always be true from the previous
8793 ** step. On the upward pass, both conditions are always true, so the
8794 ** upwards pass simply processes pages that were missed on the downward
8795 ** pass.
8797 for(i=1-nNew; i<nNew; i++){
8798 int iPg = i<0 ? -i : i;
8799 assert( iPg>=0 && iPg<nNew );
8800 assert( iPg>=1 || i>=0 );
8801 assert( iPg<ArraySize(cntOld) );
8802 if( abDone[iPg] ) continue; /* Skip pages already processed */
8803 if( i>=0 /* On the upwards pass, or... */
8804 || cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */
8806 int iNew;
8807 int iOld;
8808 int nNewCell;
8810 /* Verify condition (1): If cells are moving left, update iPg
8811 ** only after iPg-1 has already been updated. */
8812 assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
8814 /* Verify condition (2): If cells are moving right, update iPg
8815 ** only after iPg+1 has already been updated. */
8816 assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
8818 if( iPg==0 ){
8819 iNew = iOld = 0;
8820 nNewCell = cntNew[0];
8821 }else{
8822 iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
8823 iNew = cntNew[iPg-1] + !leafData;
8824 nNewCell = cntNew[iPg] - iNew;
8827 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
8828 if( rc ) goto balance_cleanup;
8829 abDone[iPg]++;
8830 apNew[iPg]->nFree = usableSpace-szNew[iPg];
8831 assert( apNew[iPg]->nOverflow==0 );
8832 assert( apNew[iPg]->nCell==nNewCell );
8836 /* All pages have been processed exactly once */
8837 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
8839 assert( nOld>0 );
8840 assert( nNew>0 );
8842 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
8843 /* The root page of the b-tree now contains no cells. The only sibling
8844 ** page is the right-child of the parent. Copy the contents of the
8845 ** child page into the parent, decreasing the overall height of the
8846 ** b-tree structure by one. This is described as the "balance-shallower"
8847 ** sub-algorithm in some documentation.
8849 ** If this is an auto-vacuum database, the call to copyNodeContent()
8850 ** sets all pointer-map entries corresponding to database image pages
8851 ** for which the pointer is stored within the content being copied.
8853 ** It is critical that the child page be defragmented before being
8854 ** copied into the parent, because if the parent is page 1 then it will
8855 ** be smaller than the child due to the database header, and so all the
8856 ** free space needs to be up front.
8858 assert( nNew==1 || CORRUPT_DB );
8859 rc = defragmentPage(apNew[0], -1);
8860 testcase( rc!=SQLITE_OK );
8861 assert( apNew[0]->nFree ==
8862 (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
8863 - apNew[0]->nCell*2)
8864 || rc!=SQLITE_OK
8866 copyNodeContent(apNew[0], pParent, &rc);
8867 freePage(apNew[0], &rc);
8868 }else if( ISAUTOVACUUM(pBt) && !leafCorrection ){
8869 /* Fix the pointer map entries associated with the right-child of each
8870 ** sibling page. All other pointer map entries have already been taken
8871 ** care of. */
8872 for(i=0; i<nNew; i++){
8873 u32 key = get4byte(&apNew[i]->aData[8]);
8874 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
8878 assert( pParent->isInit );
8879 TRACE(("BALANCE: finished: old=%u new=%u cells=%u\n",
8880 nOld, nNew, b.nCell));
8882 /* Free any old pages that were not reused as new pages.
8884 for(i=nNew; i<nOld; i++){
8885 freePage(apOld[i], &rc);
8888 #if 0
8889 if( ISAUTOVACUUM(pBt) && rc==SQLITE_OK && apNew[0]->isInit ){
8890 /* The ptrmapCheckPages() contains assert() statements that verify that
8891 ** all pointer map pages are set correctly. This is helpful while
8892 ** debugging. This is usually disabled because a corrupt database may
8893 ** cause an assert() statement to fail. */
8894 ptrmapCheckPages(apNew, nNew);
8895 ptrmapCheckPages(&pParent, 1);
8897 #endif
8900 ** Cleanup before returning.
8902 balance_cleanup:
8903 sqlite3StackFree(0, b.apCell);
8904 for(i=0; i<nOld; i++){
8905 releasePage(apOld[i]);
8907 for(i=0; i<nNew; i++){
8908 releasePage(apNew[i]);
8911 return rc;
8916 ** This function is called when the root page of a b-tree structure is
8917 ** overfull (has one or more overflow pages).
8919 ** A new child page is allocated and the contents of the current root
8920 ** page, including overflow cells, are copied into the child. The root
8921 ** page is then overwritten to make it an empty page with the right-child
8922 ** pointer pointing to the new page.
8924 ** Before returning, all pointer-map entries corresponding to pages
8925 ** that the new child-page now contains pointers to are updated. The
8926 ** entry corresponding to the new right-child pointer of the root
8927 ** page is also updated.
8929 ** If successful, *ppChild is set to contain a reference to the child
8930 ** page and SQLITE_OK is returned. In this case the caller is required
8931 ** to call releasePage() on *ppChild exactly once. If an error occurs,
8932 ** an error code is returned and *ppChild is set to 0.
8934 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
8935 int rc; /* Return value from subprocedures */
8936 MemPage *pChild = 0; /* Pointer to a new child page */
8937 Pgno pgnoChild = 0; /* Page number of the new child page */
8938 BtShared *pBt = pRoot->pBt; /* The BTree */
8940 assert( pRoot->nOverflow>0 );
8941 assert( sqlite3_mutex_held(pBt->mutex) );
8943 /* Make pRoot, the root page of the b-tree, writable. Allocate a new
8944 ** page that will become the new right-child of pPage. Copy the contents
8945 ** of the node stored on pRoot into the new child page.
8947 rc = sqlite3PagerWrite(pRoot->pDbPage);
8948 if( rc==SQLITE_OK ){
8949 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
8950 copyNodeContent(pRoot, pChild, &rc);
8951 if( ISAUTOVACUUM(pBt) ){
8952 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
8955 if( rc ){
8956 *ppChild = 0;
8957 releasePage(pChild);
8958 return rc;
8960 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
8961 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
8962 assert( pChild->nCell==pRoot->nCell || CORRUPT_DB );
8964 TRACE(("BALANCE: copy root %u into %u\n", pRoot->pgno, pChild->pgno));
8966 /* Copy the overflow cells from pRoot to pChild */
8967 memcpy(pChild->aiOvfl, pRoot->aiOvfl,
8968 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
8969 memcpy(pChild->apOvfl, pRoot->apOvfl,
8970 pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
8971 pChild->nOverflow = pRoot->nOverflow;
8973 /* Zero the contents of pRoot. Then install pChild as the right-child. */
8974 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
8975 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
8977 *ppChild = pChild;
8978 return SQLITE_OK;
8982 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid
8983 ** on the same B-tree as pCur.
8985 ** This can occur if a database is corrupt with two or more SQL tables
8986 ** pointing to the same b-tree. If an insert occurs on one SQL table
8987 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL
8988 ** table linked to the same b-tree. If the secondary insert causes a
8989 ** rebalance, that can change content out from under the cursor on the
8990 ** first SQL table, violating invariants on the first insert.
8992 static int anotherValidCursor(BtCursor *pCur){
8993 BtCursor *pOther;
8994 for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){
8995 if( pOther!=pCur
8996 && pOther->eState==CURSOR_VALID
8997 && pOther->pPage==pCur->pPage
8999 return SQLITE_CORRUPT_PAGE(pCur->pPage);
9002 return SQLITE_OK;
9006 ** The page that pCur currently points to has just been modified in
9007 ** some way. This function figures out if this modification means the
9008 ** tree needs to be balanced, and if so calls the appropriate balancing
9009 ** routine. Balancing routines are:
9011 ** balance_quick()
9012 ** balance_deeper()
9013 ** balance_nonroot()
9015 static int balance(BtCursor *pCur){
9016 int rc = SQLITE_OK;
9017 u8 aBalanceQuickSpace[13];
9018 u8 *pFree = 0;
9020 VVA_ONLY( int balance_quick_called = 0 );
9021 VVA_ONLY( int balance_deeper_called = 0 );
9023 do {
9024 int iPage;
9025 MemPage *pPage = pCur->pPage;
9027 if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break;
9028 if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){
9029 /* No rebalance required as long as:
9030 ** (1) There are no overflow cells
9031 ** (2) The amount of free space on the page is less than 2/3rds of
9032 ** the total usable space on the page. */
9033 break;
9034 }else if( (iPage = pCur->iPage)==0 ){
9035 if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){
9036 /* The root page of the b-tree is overfull. In this case call the
9037 ** balance_deeper() function to create a new child for the root-page
9038 ** and copy the current contents of the root-page to it. The
9039 ** next iteration of the do-loop will balance the child page.
9041 assert( balance_deeper_called==0 );
9042 VVA_ONLY( balance_deeper_called++ );
9043 rc = balance_deeper(pPage, &pCur->apPage[1]);
9044 if( rc==SQLITE_OK ){
9045 pCur->iPage = 1;
9046 pCur->ix = 0;
9047 pCur->aiIdx[0] = 0;
9048 pCur->apPage[0] = pPage;
9049 pCur->pPage = pCur->apPage[1];
9050 assert( pCur->pPage->nOverflow );
9052 }else{
9053 break;
9055 }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){
9056 /* The page being written is not a root page, and there is currently
9057 ** more than one reference to it. This only happens if the page is one
9058 ** of its own ancestor pages. Corruption. */
9059 rc = SQLITE_CORRUPT_PAGE(pPage);
9060 }else{
9061 MemPage * const pParent = pCur->apPage[iPage-1];
9062 int const iIdx = pCur->aiIdx[iPage-1];
9064 rc = sqlite3PagerWrite(pParent->pDbPage);
9065 if( rc==SQLITE_OK && pParent->nFree<0 ){
9066 rc = btreeComputeFreeSpace(pParent);
9068 if( rc==SQLITE_OK ){
9069 #ifndef SQLITE_OMIT_QUICKBALANCE
9070 if( pPage->intKeyLeaf
9071 && pPage->nOverflow==1
9072 && pPage->aiOvfl[0]==pPage->nCell
9073 && pParent->pgno!=1
9074 && pParent->nCell==iIdx
9076 /* Call balance_quick() to create a new sibling of pPage on which
9077 ** to store the overflow cell. balance_quick() inserts a new cell
9078 ** into pParent, which may cause pParent overflow. If this
9079 ** happens, the next iteration of the do-loop will balance pParent
9080 ** use either balance_nonroot() or balance_deeper(). Until this
9081 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
9082 ** buffer.
9084 ** The purpose of the following assert() is to check that only a
9085 ** single call to balance_quick() is made for each call to this
9086 ** function. If this were not verified, a subtle bug involving reuse
9087 ** of the aBalanceQuickSpace[] might sneak in.
9089 assert( balance_quick_called==0 );
9090 VVA_ONLY( balance_quick_called++ );
9091 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
9092 }else
9093 #endif
9095 /* In this case, call balance_nonroot() to redistribute cells
9096 ** between pPage and up to 2 of its sibling pages. This involves
9097 ** modifying the contents of pParent, which may cause pParent to
9098 ** become overfull or underfull. The next iteration of the do-loop
9099 ** will balance the parent page to correct this.
9101 ** If the parent page becomes overfull, the overflow cell or cells
9102 ** are stored in the pSpace buffer allocated immediately below.
9103 ** A subsequent iteration of the do-loop will deal with this by
9104 ** calling balance_nonroot() (balance_deeper() may be called first,
9105 ** but it doesn't deal with overflow cells - just moves them to a
9106 ** different page). Once this subsequent call to balance_nonroot()
9107 ** has completed, it is safe to release the pSpace buffer used by
9108 ** the previous call, as the overflow cell data will have been
9109 ** copied either into the body of a database page or into the new
9110 ** pSpace buffer passed to the latter call to balance_nonroot().
9112 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
9113 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
9114 pCur->hints&BTREE_BULKLOAD);
9115 if( pFree ){
9116 /* If pFree is not NULL, it points to the pSpace buffer used
9117 ** by a previous call to balance_nonroot(). Its contents are
9118 ** now stored either on real database pages or within the
9119 ** new pSpace buffer, so it may be safely freed here. */
9120 sqlite3PageFree(pFree);
9123 /* The pSpace buffer will be freed after the next call to
9124 ** balance_nonroot(), or just before this function returns, whichever
9125 ** comes first. */
9126 pFree = pSpace;
9130 pPage->nOverflow = 0;
9132 /* The next iteration of the do-loop balances the parent page. */
9133 releasePage(pPage);
9134 pCur->iPage--;
9135 assert( pCur->iPage>=0 );
9136 pCur->pPage = pCur->apPage[pCur->iPage];
9138 }while( rc==SQLITE_OK );
9140 if( pFree ){
9141 sqlite3PageFree(pFree);
9143 return rc;
9146 /* Overwrite content from pX into pDest. Only do the write if the
9147 ** content is different from what is already there.
9149 static int btreeOverwriteContent(
9150 MemPage *pPage, /* MemPage on which writing will occur */
9151 u8 *pDest, /* Pointer to the place to start writing */
9152 const BtreePayload *pX, /* Source of data to write */
9153 int iOffset, /* Offset of first byte to write */
9154 int iAmt /* Number of bytes to be written */
9156 int nData = pX->nData - iOffset;
9157 if( nData<=0 ){
9158 /* Overwriting with zeros */
9159 int i;
9160 for(i=0; i<iAmt && pDest[i]==0; i++){}
9161 if( i<iAmt ){
9162 int rc = sqlite3PagerWrite(pPage->pDbPage);
9163 if( rc ) return rc;
9164 memset(pDest + i, 0, iAmt - i);
9166 }else{
9167 if( nData<iAmt ){
9168 /* Mixed read data and zeros at the end. Make a recursive call
9169 ** to write the zeros then fall through to write the real data */
9170 int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData,
9171 iAmt-nData);
9172 if( rc ) return rc;
9173 iAmt = nData;
9175 if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){
9176 int rc = sqlite3PagerWrite(pPage->pDbPage);
9177 if( rc ) return rc;
9178 /* In a corrupt database, it is possible for the source and destination
9179 ** buffers to overlap. This is harmless since the database is already
9180 ** corrupt but it does cause valgrind and ASAN warnings. So use
9181 ** memmove(). */
9182 memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt);
9185 return SQLITE_OK;
9189 ** Overwrite the cell that cursor pCur is pointing to with fresh content
9190 ** contained in pX. In this variant, pCur is pointing to an overflow
9191 ** cell.
9193 static SQLITE_NOINLINE int btreeOverwriteOverflowCell(
9194 BtCursor *pCur, /* Cursor pointing to cell to overwrite */
9195 const BtreePayload *pX /* Content to write into the cell */
9197 int iOffset; /* Next byte of pX->pData to write */
9198 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
9199 int rc; /* Return code */
9200 MemPage *pPage = pCur->pPage; /* Page being written */
9201 BtShared *pBt; /* Btree */
9202 Pgno ovflPgno; /* Next overflow page to write */
9203 u32 ovflPageSize; /* Size to write on overflow page */
9205 assert( pCur->info.nLocal<nTotal ); /* pCur is an overflow cell */
9207 /* Overwrite the local portion first */
9208 rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
9209 0, pCur->info.nLocal);
9210 if( rc ) return rc;
9212 /* Now overwrite the overflow pages */
9213 iOffset = pCur->info.nLocal;
9214 assert( nTotal>=0 );
9215 assert( iOffset>=0 );
9216 ovflPgno = get4byte(pCur->info.pPayload + iOffset);
9217 pBt = pPage->pBt;
9218 ovflPageSize = pBt->usableSize - 4;
9220 rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
9221 if( rc ) return rc;
9222 if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 || pPage->isInit ){
9223 rc = SQLITE_CORRUPT_PAGE(pPage);
9224 }else{
9225 if( iOffset+ovflPageSize<(u32)nTotal ){
9226 ovflPgno = get4byte(pPage->aData);
9227 }else{
9228 ovflPageSize = nTotal - iOffset;
9230 rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
9231 iOffset, ovflPageSize);
9233 sqlite3PagerUnref(pPage->pDbPage);
9234 if( rc ) return rc;
9235 iOffset += ovflPageSize;
9236 }while( iOffset<nTotal );
9237 return SQLITE_OK;
9241 ** Overwrite the cell that cursor pCur is pointing to with fresh content
9242 ** contained in pX.
9244 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
9245 int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
9246 MemPage *pPage = pCur->pPage; /* Page being written */
9248 if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd
9249 || pCur->info.pPayload < pPage->aData + pPage->cellOffset
9251 return SQLITE_CORRUPT_PAGE(pPage);
9253 if( pCur->info.nLocal==nTotal ){
9254 /* The entire cell is local */
9255 return btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
9256 0, pCur->info.nLocal);
9257 }else{
9258 /* The cell contains overflow content */
9259 return btreeOverwriteOverflowCell(pCur, pX);
9265 ** Insert a new record into the BTree. The content of the new record
9266 ** is described by the pX object. The pCur cursor is used only to
9267 ** define what table the record should be inserted into, and is left
9268 ** pointing at a random location.
9270 ** For a table btree (used for rowid tables), only the pX.nKey value of
9271 ** the key is used. The pX.pKey value must be NULL. The pX.nKey is the
9272 ** rowid or INTEGER PRIMARY KEY of the row. The pX.nData,pData,nZero fields
9273 ** hold the content of the row.
9275 ** For an index btree (used for indexes and WITHOUT ROWID tables), the
9276 ** key is an arbitrary byte sequence stored in pX.pKey,nKey. The
9277 ** pX.pData,nData,nZero fields must be zero.
9279 ** If the seekResult parameter is non-zero, then a successful call to
9280 ** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already
9281 ** been performed. In other words, if seekResult!=0 then the cursor
9282 ** is currently pointing to a cell that will be adjacent to the cell
9283 ** to be inserted. If seekResult<0 then pCur points to a cell that is
9284 ** smaller then (pKey,nKey). If seekResult>0 then pCur points to a cell
9285 ** that is larger than (pKey,nKey).
9287 ** If seekResult==0, that means pCur is pointing at some unknown location.
9288 ** In that case, this routine must seek the cursor to the correct insertion
9289 ** point for (pKey,nKey) before doing the insertion. For index btrees,
9290 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
9291 ** key values and pX->aMem can be used instead of pX->pKey to avoid having
9292 ** to decode the key.
9294 int sqlite3BtreeInsert(
9295 BtCursor *pCur, /* Insert data into the table of this cursor */
9296 const BtreePayload *pX, /* Content of the row to be inserted */
9297 int flags, /* True if this is likely an append */
9298 int seekResult /* Result of prior IndexMoveto() call */
9300 int rc;
9301 int loc = seekResult; /* -1: before desired location +1: after */
9302 int szNew = 0;
9303 int idx;
9304 MemPage *pPage;
9305 Btree *p = pCur->pBtree;
9306 unsigned char *oldCell;
9307 unsigned char *newCell = 0;
9309 assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND|BTREE_PREFORMAT))==flags );
9310 assert( (flags & BTREE_PREFORMAT)==0 || seekResult || pCur->pKeyInfo==0 );
9312 /* Save the positions of any other cursors open on this table.
9314 ** In some cases, the call to btreeMoveto() below is a no-op. For
9315 ** example, when inserting data into a table with auto-generated integer
9316 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
9317 ** integer key to use. It then calls this function to actually insert the
9318 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
9319 ** that the cursor is already where it needs to be and returns without
9320 ** doing any work. To avoid thwarting these optimizations, it is important
9321 ** not to clear the cursor here.
9323 if( pCur->curFlags & BTCF_Multiple ){
9324 rc = saveAllCursors(p->pBt, pCur->pgnoRoot, pCur);
9325 if( rc ) return rc;
9326 if( loc && pCur->iPage<0 ){
9327 /* This can only happen if the schema is corrupt such that there is more
9328 ** than one table or index with the same root page as used by the cursor.
9329 ** Which can only happen if the SQLITE_NoSchemaError flag was set when
9330 ** the schema was loaded. This cannot be asserted though, as a user might
9331 ** set the flag, load the schema, and then unset the flag. */
9332 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot);
9336 /* Ensure that the cursor is not in the CURSOR_FAULT state and that it
9337 ** points to a valid cell.
9339 if( pCur->eState>=CURSOR_REQUIRESEEK ){
9340 testcase( pCur->eState==CURSOR_REQUIRESEEK );
9341 testcase( pCur->eState==CURSOR_FAULT );
9342 rc = moveToRoot(pCur);
9343 if( rc && rc!=SQLITE_EMPTY ) return rc;
9346 assert( cursorOwnsBtShared(pCur) );
9347 assert( (pCur->curFlags & BTCF_WriteFlag)!=0
9348 && p->pBt->inTransaction==TRANS_WRITE
9349 && (p->pBt->btsFlags & BTS_READ_ONLY)==0 );
9350 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
9352 /* Assert that the caller has been consistent. If this cursor was opened
9353 ** expecting an index b-tree, then the caller should be inserting blob
9354 ** keys with no associated data. If the cursor was opened expecting an
9355 ** intkey table, the caller should be inserting integer keys with a
9356 ** blob of associated data. */
9357 assert( (flags & BTREE_PREFORMAT) || (pX->pKey==0)==(pCur->pKeyInfo==0) );
9359 if( pCur->pKeyInfo==0 ){
9360 assert( pX->pKey==0 );
9361 /* If this is an insert into a table b-tree, invalidate any incrblob
9362 ** cursors open on the row being replaced */
9363 if( p->hasIncrblobCur ){
9364 invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
9367 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
9368 ** to a row with the same key as the new entry being inserted.
9370 #ifdef SQLITE_DEBUG
9371 if( flags & BTREE_SAVEPOSITION ){
9372 assert( pCur->curFlags & BTCF_ValidNKey );
9373 assert( pX->nKey==pCur->info.nKey );
9374 assert( loc==0 );
9376 #endif
9378 /* On the other hand, BTREE_SAVEPOSITION==0 does not imply
9379 ** that the cursor is not pointing to a row to be overwritten.
9380 ** So do a complete check.
9382 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
9383 /* The cursor is pointing to the entry that is to be
9384 ** overwritten */
9385 assert( pX->nData>=0 && pX->nZero>=0 );
9386 if( pCur->info.nSize!=0
9387 && pCur->info.nPayload==(u32)pX->nData+pX->nZero
9389 /* New entry is the same size as the old. Do an overwrite */
9390 return btreeOverwriteCell(pCur, pX);
9392 assert( loc==0 );
9393 }else if( loc==0 ){
9394 /* The cursor is *not* pointing to the cell to be overwritten, nor
9395 ** to an adjacent cell. Move the cursor so that it is pointing either
9396 ** to the cell to be overwritten or an adjacent cell.
9398 rc = sqlite3BtreeTableMoveto(pCur, pX->nKey,
9399 (flags & BTREE_APPEND)!=0, &loc);
9400 if( rc ) return rc;
9402 }else{
9403 /* This is an index or a WITHOUT ROWID table */
9405 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
9406 ** to a row with the same key as the new entry being inserted.
9408 assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 );
9410 /* If the cursor is not already pointing either to the cell to be
9411 ** overwritten, or if a new cell is being inserted, if the cursor is
9412 ** not pointing to an immediately adjacent cell, then move the cursor
9413 ** so that it does.
9415 if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
9416 if( pX->nMem ){
9417 UnpackedRecord r;
9418 r.pKeyInfo = pCur->pKeyInfo;
9419 r.aMem = pX->aMem;
9420 r.nField = pX->nMem;
9421 r.default_rc = 0;
9422 r.eqSeen = 0;
9423 rc = sqlite3BtreeIndexMoveto(pCur, &r, &loc);
9424 }else{
9425 rc = btreeMoveto(pCur, pX->pKey, pX->nKey,
9426 (flags & BTREE_APPEND)!=0, &loc);
9428 if( rc ) return rc;
9431 /* If the cursor is currently pointing to an entry to be overwritten
9432 ** and the new content is the same as as the old, then use the
9433 ** overwrite optimization.
9435 if( loc==0 ){
9436 getCellInfo(pCur);
9437 if( pCur->info.nKey==pX->nKey ){
9438 BtreePayload x2;
9439 x2.pData = pX->pKey;
9440 x2.nData = pX->nKey;
9441 x2.nZero = 0;
9442 return btreeOverwriteCell(pCur, &x2);
9446 assert( pCur->eState==CURSOR_VALID
9447 || (pCur->eState==CURSOR_INVALID && loc) || CORRUPT_DB );
9449 pPage = pCur->pPage;
9450 assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) );
9451 assert( pPage->leaf || !pPage->intKey );
9452 if( pPage->nFree<0 ){
9453 if( NEVER(pCur->eState>CURSOR_INVALID) ){
9454 /* ^^^^^--- due to the moveToRoot() call above */
9455 rc = SQLITE_CORRUPT_PAGE(pPage);
9456 }else{
9457 rc = btreeComputeFreeSpace(pPage);
9459 if( rc ) return rc;
9462 TRACE(("INSERT: table=%u nkey=%lld ndata=%u page=%u %s\n",
9463 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
9464 loc==0 ? "overwrite" : "new entry"));
9465 assert( pPage->isInit || CORRUPT_DB );
9466 newCell = p->pBt->pTmpSpace;
9467 assert( newCell!=0 );
9468 assert( BTREE_PREFORMAT==OPFLAG_PREFORMAT );
9469 if( flags & BTREE_PREFORMAT ){
9470 rc = SQLITE_OK;
9471 szNew = p->pBt->nPreformatSize;
9472 if( szNew<4 ){
9473 szNew = 4;
9474 newCell[3] = 0;
9476 if( ISAUTOVACUUM(p->pBt) && szNew>pPage->maxLocal ){
9477 CellInfo info;
9478 pPage->xParseCell(pPage, newCell, &info);
9479 if( info.nPayload!=info.nLocal ){
9480 Pgno ovfl = get4byte(&newCell[szNew-4]);
9481 ptrmapPut(p->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, &rc);
9482 if( NEVER(rc) ) goto end_insert;
9485 }else{
9486 rc = fillInCell(pPage, newCell, pX, &szNew);
9487 if( rc ) goto end_insert;
9489 assert( szNew==pPage->xCellSize(pPage, newCell) );
9490 assert( szNew <= MX_CELL_SIZE(p->pBt) );
9491 idx = pCur->ix;
9492 pCur->info.nSize = 0;
9493 if( loc==0 ){
9494 CellInfo info;
9495 assert( idx>=0 );
9496 if( idx>=pPage->nCell ){
9497 return SQLITE_CORRUPT_PAGE(pPage);
9499 rc = sqlite3PagerWrite(pPage->pDbPage);
9500 if( rc ){
9501 goto end_insert;
9503 oldCell = findCell(pPage, idx);
9504 if( !pPage->leaf ){
9505 memcpy(newCell, oldCell, 4);
9507 BTREE_CLEAR_CELL(rc, pPage, oldCell, info);
9508 testcase( pCur->curFlags & BTCF_ValidOvfl );
9509 invalidateOverflowCache(pCur);
9510 if( info.nSize==szNew && info.nLocal==info.nPayload
9511 && (!ISAUTOVACUUM(p->pBt) || szNew<pPage->minLocal)
9513 /* Overwrite the old cell with the new if they are the same size.
9514 ** We could also try to do this if the old cell is smaller, then add
9515 ** the leftover space to the free list. But experiments show that
9516 ** doing that is no faster then skipping this optimization and just
9517 ** calling dropCell() and insertCell().
9519 ** This optimization cannot be used on an autovacuum database if the
9520 ** new entry uses overflow pages, as the insertCell() call below is
9521 ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry. */
9522 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
9523 if( oldCell < pPage->aData+pPage->hdrOffset+10 ){
9524 return SQLITE_CORRUPT_PAGE(pPage);
9526 if( oldCell+szNew > pPage->aDataEnd ){
9527 return SQLITE_CORRUPT_PAGE(pPage);
9529 memcpy(oldCell, newCell, szNew);
9530 return SQLITE_OK;
9532 dropCell(pPage, idx, info.nSize, &rc);
9533 if( rc ) goto end_insert;
9534 }else if( loc<0 && pPage->nCell>0 ){
9535 assert( pPage->leaf );
9536 idx = ++pCur->ix;
9537 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
9538 }else{
9539 assert( pPage->leaf );
9541 rc = insertCellFast(pPage, idx, newCell, szNew);
9542 assert( pPage->nOverflow==0 || rc==SQLITE_OK );
9543 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
9545 /* If no error has occurred and pPage has an overflow cell, call balance()
9546 ** to redistribute the cells within the tree. Since balance() may move
9547 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
9548 ** variables.
9550 ** Previous versions of SQLite called moveToRoot() to move the cursor
9551 ** back to the root page as balance() used to invalidate the contents
9552 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
9553 ** set the cursor state to "invalid". This makes common insert operations
9554 ** slightly faster.
9556 ** There is a subtle but important optimization here too. When inserting
9557 ** multiple records into an intkey b-tree using a single cursor (as can
9558 ** happen while processing an "INSERT INTO ... SELECT" statement), it
9559 ** is advantageous to leave the cursor pointing to the last entry in
9560 ** the b-tree if possible. If the cursor is left pointing to the last
9561 ** entry in the table, and the next row inserted has an integer key
9562 ** larger than the largest existing key, it is possible to insert the
9563 ** row without seeking the cursor. This can be a big performance boost.
9565 if( pPage->nOverflow ){
9566 assert( rc==SQLITE_OK );
9567 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
9568 rc = balance(pCur);
9570 /* Must make sure nOverflow is reset to zero even if the balance()
9571 ** fails. Internal data structure corruption will result otherwise.
9572 ** Also, set the cursor state to invalid. This stops saveCursorPosition()
9573 ** from trying to save the current position of the cursor. */
9574 pCur->pPage->nOverflow = 0;
9575 pCur->eState = CURSOR_INVALID;
9576 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
9577 btreeReleaseAllCursorPages(pCur);
9578 if( pCur->pKeyInfo ){
9579 assert( pCur->pKey==0 );
9580 pCur->pKey = sqlite3Malloc( pX->nKey );
9581 if( pCur->pKey==0 ){
9582 rc = SQLITE_NOMEM;
9583 }else{
9584 memcpy(pCur->pKey, pX->pKey, pX->nKey);
9587 pCur->eState = CURSOR_REQUIRESEEK;
9588 pCur->nKey = pX->nKey;
9591 assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
9593 end_insert:
9594 return rc;
9598 ** This function is used as part of copying the current row from cursor
9599 ** pSrc into cursor pDest. If the cursors are open on intkey tables, then
9600 ** parameter iKey is used as the rowid value when the record is copied
9601 ** into pDest. Otherwise, the record is copied verbatim.
9603 ** This function does not actually write the new value to cursor pDest.
9604 ** Instead, it creates and populates any required overflow pages and
9605 ** writes the data for the new cell into the BtShared.pTmpSpace buffer
9606 ** for the destination database. The size of the cell, in bytes, is left
9607 ** in BtShared.nPreformatSize. The caller completes the insertion by
9608 ** calling sqlite3BtreeInsert() with the BTREE_PREFORMAT flag specified.
9610 ** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
9612 int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 iKey){
9613 BtShared *pBt = pDest->pBt;
9614 u8 *aOut = pBt->pTmpSpace; /* Pointer to next output buffer */
9615 const u8 *aIn; /* Pointer to next input buffer */
9616 u32 nIn; /* Size of input buffer aIn[] */
9617 u32 nRem; /* Bytes of data still to copy */
9619 getCellInfo(pSrc);
9620 if( pSrc->info.nPayload<0x80 ){
9621 *(aOut++) = pSrc->info.nPayload;
9622 }else{
9623 aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload);
9625 if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey);
9626 nIn = pSrc->info.nLocal;
9627 aIn = pSrc->info.pPayload;
9628 if( aIn+nIn>pSrc->pPage->aDataEnd ){
9629 return SQLITE_CORRUPT_PAGE(pSrc->pPage);
9631 nRem = pSrc->info.nPayload;
9632 if( nIn==nRem && nIn<pDest->pPage->maxLocal ){
9633 memcpy(aOut, aIn, nIn);
9634 pBt->nPreformatSize = nIn + (aOut - pBt->pTmpSpace);
9635 return SQLITE_OK;
9636 }else{
9637 int rc = SQLITE_OK;
9638 Pager *pSrcPager = pSrc->pBt->pPager;
9639 u8 *pPgnoOut = 0;
9640 Pgno ovflIn = 0;
9641 DbPage *pPageIn = 0;
9642 MemPage *pPageOut = 0;
9643 u32 nOut; /* Size of output buffer aOut[] */
9645 nOut = btreePayloadToLocal(pDest->pPage, pSrc->info.nPayload);
9646 pBt->nPreformatSize = nOut + (aOut - pBt->pTmpSpace);
9647 if( nOut<pSrc->info.nPayload ){
9648 pPgnoOut = &aOut[nOut];
9649 pBt->nPreformatSize += 4;
9652 if( nRem>nIn ){
9653 if( aIn+nIn+4>pSrc->pPage->aDataEnd ){
9654 return SQLITE_CORRUPT_PAGE(pSrc->pPage);
9656 ovflIn = get4byte(&pSrc->info.pPayload[nIn]);
9659 do {
9660 nRem -= nOut;
9662 assert( nOut>0 );
9663 if( nIn>0 ){
9664 int nCopy = MIN(nOut, nIn);
9665 memcpy(aOut, aIn, nCopy);
9666 nOut -= nCopy;
9667 nIn -= nCopy;
9668 aOut += nCopy;
9669 aIn += nCopy;
9671 if( nOut>0 ){
9672 sqlite3PagerUnref(pPageIn);
9673 pPageIn = 0;
9674 rc = sqlite3PagerGet(pSrcPager, ovflIn, &pPageIn, PAGER_GET_READONLY);
9675 if( rc==SQLITE_OK ){
9676 aIn = (const u8*)sqlite3PagerGetData(pPageIn);
9677 ovflIn = get4byte(aIn);
9678 aIn += 4;
9679 nIn = pSrc->pBt->usableSize - 4;
9682 }while( rc==SQLITE_OK && nOut>0 );
9684 if( rc==SQLITE_OK && nRem>0 && ALWAYS(pPgnoOut) ){
9685 Pgno pgnoNew;
9686 MemPage *pNew = 0;
9687 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
9688 put4byte(pPgnoOut, pgnoNew);
9689 if( ISAUTOVACUUM(pBt) && pPageOut ){
9690 ptrmapPut(pBt, pgnoNew, PTRMAP_OVERFLOW2, pPageOut->pgno, &rc);
9692 releasePage(pPageOut);
9693 pPageOut = pNew;
9694 if( pPageOut ){
9695 pPgnoOut = pPageOut->aData;
9696 put4byte(pPgnoOut, 0);
9697 aOut = &pPgnoOut[4];
9698 nOut = MIN(pBt->usableSize - 4, nRem);
9701 }while( nRem>0 && rc==SQLITE_OK );
9703 releasePage(pPageOut);
9704 sqlite3PagerUnref(pPageIn);
9705 return rc;
9710 ** Delete the entry that the cursor is pointing to.
9712 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
9713 ** the cursor is left pointing at an arbitrary location after the delete.
9714 ** But if that bit is set, then the cursor is left in a state such that
9715 ** the next call to BtreeNext() or BtreePrev() moves it to the same row
9716 ** as it would have been on if the call to BtreeDelete() had been omitted.
9718 ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
9719 ** associated with a single table entry and its indexes. Only one of those
9720 ** deletes is considered the "primary" delete. The primary delete occurs
9721 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete
9722 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
9723 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
9724 ** but which might be used by alternative storage engines.
9726 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
9727 Btree *p = pCur->pBtree;
9728 BtShared *pBt = p->pBt;
9729 int rc; /* Return code */
9730 MemPage *pPage; /* Page to delete cell from */
9731 unsigned char *pCell; /* Pointer to cell to delete */
9732 int iCellIdx; /* Index of cell to delete */
9733 int iCellDepth; /* Depth of node containing pCell */
9734 CellInfo info; /* Size of the cell being deleted */
9735 u8 bPreserve; /* Keep cursor valid. 2 for CURSOR_SKIPNEXT */
9737 assert( cursorOwnsBtShared(pCur) );
9738 assert( pBt->inTransaction==TRANS_WRITE );
9739 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
9740 assert( pCur->curFlags & BTCF_WriteFlag );
9741 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
9742 assert( !hasReadConflicts(p, pCur->pgnoRoot) );
9743 assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
9744 if( pCur->eState!=CURSOR_VALID ){
9745 if( pCur->eState>=CURSOR_REQUIRESEEK ){
9746 rc = btreeRestoreCursorPosition(pCur);
9747 assert( rc!=SQLITE_OK || CORRUPT_DB || pCur->eState==CURSOR_VALID );
9748 if( rc || pCur->eState!=CURSOR_VALID ) return rc;
9749 }else{
9750 return SQLITE_CORRUPT_PGNO(pCur->pgnoRoot);
9753 assert( pCur->eState==CURSOR_VALID );
9755 iCellDepth = pCur->iPage;
9756 iCellIdx = pCur->ix;
9757 pPage = pCur->pPage;
9758 if( pPage->nCell<=iCellIdx ){
9759 return SQLITE_CORRUPT_PAGE(pPage);
9761 pCell = findCell(pPage, iCellIdx);
9762 if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ){
9763 return SQLITE_CORRUPT_PAGE(pPage);
9765 if( pCell<&pPage->aCellIdx[pPage->nCell] ){
9766 return SQLITE_CORRUPT_PAGE(pPage);
9769 /* If the BTREE_SAVEPOSITION bit is on, then the cursor position must
9770 ** be preserved following this delete operation. If the current delete
9771 ** will cause a b-tree rebalance, then this is done by saving the cursor
9772 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
9773 ** returning.
9775 ** If the current delete will not cause a rebalance, then the cursor
9776 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
9777 ** before or after the deleted entry.
9779 ** The bPreserve value records which path is required:
9781 ** bPreserve==0 Not necessary to save the cursor position
9782 ** bPreserve==1 Use CURSOR_REQUIRESEEK to save the cursor position
9783 ** bPreserve==2 Cursor won't move. Set CURSOR_SKIPNEXT.
9785 bPreserve = (flags & BTREE_SAVEPOSITION)!=0;
9786 if( bPreserve ){
9787 if( !pPage->leaf
9788 || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) >
9789 (int)(pBt->usableSize*2/3)
9790 || pPage->nCell==1 /* See dbfuzz001.test for a test case */
9792 /* A b-tree rebalance will be required after deleting this entry.
9793 ** Save the cursor key. */
9794 rc = saveCursorKey(pCur);
9795 if( rc ) return rc;
9796 }else{
9797 bPreserve = 2;
9801 /* If the page containing the entry to delete is not a leaf page, move
9802 ** the cursor to the largest entry in the tree that is smaller than
9803 ** the entry being deleted. This cell will replace the cell being deleted
9804 ** from the internal node. The 'previous' entry is used for this instead
9805 ** of the 'next' entry, as the previous entry is always a part of the
9806 ** sub-tree headed by the child page of the cell being deleted. This makes
9807 ** balancing the tree following the delete operation easier. */
9808 if( !pPage->leaf ){
9809 rc = sqlite3BtreePrevious(pCur, 0);
9810 assert( rc!=SQLITE_DONE );
9811 if( rc ) return rc;
9814 /* Save the positions of any other cursors open on this table before
9815 ** making any modifications. */
9816 if( pCur->curFlags & BTCF_Multiple ){
9817 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
9818 if( rc ) return rc;
9821 /* If this is a delete operation to remove a row from a table b-tree,
9822 ** invalidate any incrblob cursors open on the row being deleted. */
9823 if( pCur->pKeyInfo==0 && p->hasIncrblobCur ){
9824 invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
9827 /* Make the page containing the entry to be deleted writable. Then free any
9828 ** overflow pages associated with the entry and finally remove the cell
9829 ** itself from within the page. */
9830 rc = sqlite3PagerWrite(pPage->pDbPage);
9831 if( rc ) return rc;
9832 BTREE_CLEAR_CELL(rc, pPage, pCell, info);
9833 dropCell(pPage, iCellIdx, info.nSize, &rc);
9834 if( rc ) return rc;
9836 /* If the cell deleted was not located on a leaf page, then the cursor
9837 ** is currently pointing to the largest entry in the sub-tree headed
9838 ** by the child-page of the cell that was just deleted from an internal
9839 ** node. The cell from the leaf node needs to be moved to the internal
9840 ** node to replace the deleted cell. */
9841 if( !pPage->leaf ){
9842 MemPage *pLeaf = pCur->pPage;
9843 int nCell;
9844 Pgno n;
9845 unsigned char *pTmp;
9847 if( pLeaf->nFree<0 ){
9848 rc = btreeComputeFreeSpace(pLeaf);
9849 if( rc ) return rc;
9851 if( iCellDepth<pCur->iPage-1 ){
9852 n = pCur->apPage[iCellDepth+1]->pgno;
9853 }else{
9854 n = pCur->pPage->pgno;
9856 pCell = findCell(pLeaf, pLeaf->nCell-1);
9857 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_PAGE(pLeaf);
9858 nCell = pLeaf->xCellSize(pLeaf, pCell);
9859 assert( MX_CELL_SIZE(pBt) >= nCell );
9860 pTmp = pBt->pTmpSpace;
9861 assert( pTmp!=0 );
9862 rc = sqlite3PagerWrite(pLeaf->pDbPage);
9863 if( rc==SQLITE_OK ){
9864 rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n);
9866 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
9867 if( rc ) return rc;
9870 /* Balance the tree. If the entry deleted was located on a leaf page,
9871 ** then the cursor still points to that page. In this case the first
9872 ** call to balance() repairs the tree, and the if(...) condition is
9873 ** never true.
9875 ** Otherwise, if the entry deleted was on an internal node page, then
9876 ** pCur is pointing to the leaf page from which a cell was removed to
9877 ** replace the cell deleted from the internal node. This is slightly
9878 ** tricky as the leaf node may be underfull, and the internal node may
9879 ** be either under or overfull. In this case run the balancing algorithm
9880 ** on the leaf node first. If the balance proceeds far enough up the
9881 ** tree that we can be sure that any problem in the internal node has
9882 ** been corrected, so be it. Otherwise, after balancing the leaf node,
9883 ** walk the cursor up the tree to the internal node and balance it as
9884 ** well. */
9885 assert( pCur->pPage->nOverflow==0 );
9886 assert( pCur->pPage->nFree>=0 );
9887 if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){
9888 /* Optimization: If the free space is less than 2/3rds of the page,
9889 ** then balance() will always be a no-op. No need to invoke it. */
9890 rc = SQLITE_OK;
9891 }else{
9892 rc = balance(pCur);
9894 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
9895 releasePageNotNull(pCur->pPage);
9896 pCur->iPage--;
9897 while( pCur->iPage>iCellDepth ){
9898 releasePage(pCur->apPage[pCur->iPage--]);
9900 pCur->pPage = pCur->apPage[pCur->iPage];
9901 rc = balance(pCur);
9904 if( rc==SQLITE_OK ){
9905 if( bPreserve>1 ){
9906 assert( (pCur->iPage==iCellDepth || CORRUPT_DB) );
9907 assert( pPage==pCur->pPage || CORRUPT_DB );
9908 assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
9909 pCur->eState = CURSOR_SKIPNEXT;
9910 if( iCellIdx>=pPage->nCell ){
9911 pCur->skipNext = -1;
9912 pCur->ix = pPage->nCell-1;
9913 }else{
9914 pCur->skipNext = 1;
9916 }else{
9917 rc = moveToRoot(pCur);
9918 if( bPreserve ){
9919 btreeReleaseAllCursorPages(pCur);
9920 pCur->eState = CURSOR_REQUIRESEEK;
9922 if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
9925 return rc;
9929 ** Create a new BTree table. Write into *piTable the page
9930 ** number for the root page of the new table.
9932 ** The type of type is determined by the flags parameter. Only the
9933 ** following values of flags are currently in use. Other values for
9934 ** flags might not work:
9936 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
9937 ** BTREE_ZERODATA Used for SQL indices
9939 static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){
9940 BtShared *pBt = p->pBt;
9941 MemPage *pRoot;
9942 Pgno pgnoRoot;
9943 int rc;
9944 int ptfFlags; /* Page-type flags for the root page of new table */
9946 assert( sqlite3BtreeHoldsMutex(p) );
9947 assert( pBt->inTransaction==TRANS_WRITE );
9948 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
9950 #ifdef SQLITE_OMIT_AUTOVACUUM
9951 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
9952 if( rc ){
9953 return rc;
9955 #else
9956 if( pBt->autoVacuum ){
9957 Pgno pgnoMove; /* Move a page here to make room for the root-page */
9958 MemPage *pPageMove; /* The page to move to. */
9960 /* Creating a new table may probably require moving an existing database
9961 ** to make room for the new tables root page. In case this page turns
9962 ** out to be an overflow page, delete all overflow page-map caches
9963 ** held by open cursors.
9965 invalidateAllOverflowCache(pBt);
9967 /* Read the value of meta[3] from the database to determine where the
9968 ** root page of the new table should go. meta[3] is the largest root-page
9969 ** created so far, so the new root-page is (meta[3]+1).
9971 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
9972 if( pgnoRoot>btreePagecount(pBt) ){
9973 return SQLITE_CORRUPT_PGNO(pgnoRoot);
9975 pgnoRoot++;
9977 /* The new root-page may not be allocated on a pointer-map page, or the
9978 ** PENDING_BYTE page.
9980 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
9981 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
9982 pgnoRoot++;
9984 assert( pgnoRoot>=3 );
9986 /* Allocate a page. The page that currently resides at pgnoRoot will
9987 ** be moved to the allocated page (unless the allocated page happens
9988 ** to reside at pgnoRoot).
9990 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
9991 if( rc!=SQLITE_OK ){
9992 return rc;
9995 if( pgnoMove!=pgnoRoot ){
9996 /* pgnoRoot is the page that will be used for the root-page of
9997 ** the new table (assuming an error did not occur). But we were
9998 ** allocated pgnoMove. If required (i.e. if it was not allocated
9999 ** by extending the file), the current page at position pgnoMove
10000 ** is already journaled.
10002 u8 eType = 0;
10003 Pgno iPtrPage = 0;
10005 /* Save the positions of any open cursors. This is required in
10006 ** case they are holding a reference to an xFetch reference
10007 ** corresponding to page pgnoRoot. */
10008 rc = saveAllCursors(pBt, 0, 0);
10009 releasePage(pPageMove);
10010 if( rc!=SQLITE_OK ){
10011 return rc;
10014 /* Move the page currently at pgnoRoot to pgnoMove. */
10015 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
10016 if( rc!=SQLITE_OK ){
10017 return rc;
10019 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
10020 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
10021 rc = SQLITE_CORRUPT_PGNO(pgnoRoot);
10023 if( rc!=SQLITE_OK ){
10024 releasePage(pRoot);
10025 return rc;
10027 assert( eType!=PTRMAP_ROOTPAGE );
10028 assert( eType!=PTRMAP_FREEPAGE );
10029 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
10030 releasePage(pRoot);
10032 /* Obtain the page at pgnoRoot */
10033 if( rc!=SQLITE_OK ){
10034 return rc;
10036 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
10037 if( rc!=SQLITE_OK ){
10038 return rc;
10040 rc = sqlite3PagerWrite(pRoot->pDbPage);
10041 if( rc!=SQLITE_OK ){
10042 releasePage(pRoot);
10043 return rc;
10045 }else{
10046 pRoot = pPageMove;
10049 /* Update the pointer-map and meta-data with the new root-page number. */
10050 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
10051 if( rc ){
10052 releasePage(pRoot);
10053 return rc;
10056 /* When the new root page was allocated, page 1 was made writable in
10057 ** order either to increase the database filesize, or to decrement the
10058 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
10060 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
10061 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
10062 if( NEVER(rc) ){
10063 releasePage(pRoot);
10064 return rc;
10067 }else{
10068 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
10069 if( rc ) return rc;
10071 #endif
10072 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
10073 if( createTabFlags & BTREE_INTKEY ){
10074 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
10075 }else{
10076 ptfFlags = PTF_ZERODATA | PTF_LEAF;
10078 zeroPage(pRoot, ptfFlags);
10079 sqlite3PagerUnref(pRoot->pDbPage);
10080 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
10081 *piTable = pgnoRoot;
10082 return SQLITE_OK;
10084 int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){
10085 int rc;
10086 sqlite3BtreeEnter(p);
10087 rc = btreeCreateTable(p, piTable, flags);
10088 sqlite3BtreeLeave(p);
10089 return rc;
10093 ** Erase the given database page and all its children. Return
10094 ** the page to the freelist.
10096 static int clearDatabasePage(
10097 BtShared *pBt, /* The BTree that contains the table */
10098 Pgno pgno, /* Page number to clear */
10099 int freePageFlag, /* Deallocate page if true */
10100 i64 *pnChange /* Add number of Cells freed to this counter */
10102 MemPage *pPage;
10103 int rc;
10104 unsigned char *pCell;
10105 int i;
10106 int hdr;
10107 CellInfo info;
10109 assert( sqlite3_mutex_held(pBt->mutex) );
10110 if( pgno>btreePagecount(pBt) ){
10111 return SQLITE_CORRUPT_PGNO(pgno);
10113 rc = getAndInitPage(pBt, pgno, &pPage, 0);
10114 if( rc ) return rc;
10115 if( (pBt->openFlags & BTREE_SINGLE)==0
10116 && sqlite3PagerPageRefcount(pPage->pDbPage) != (1 + (pgno==1))
10118 rc = SQLITE_CORRUPT_PAGE(pPage);
10119 goto cleardatabasepage_out;
10121 hdr = pPage->hdrOffset;
10122 for(i=0; i<pPage->nCell; i++){
10123 pCell = findCell(pPage, i);
10124 if( !pPage->leaf ){
10125 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
10126 if( rc ) goto cleardatabasepage_out;
10128 BTREE_CLEAR_CELL(rc, pPage, pCell, info);
10129 if( rc ) goto cleardatabasepage_out;
10131 if( !pPage->leaf ){
10132 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
10133 if( rc ) goto cleardatabasepage_out;
10134 if( pPage->intKey ) pnChange = 0;
10136 if( pnChange ){
10137 testcase( !pPage->intKey );
10138 *pnChange += pPage->nCell;
10140 if( freePageFlag ){
10141 freePage(pPage, &rc);
10142 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
10143 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
10146 cleardatabasepage_out:
10147 releasePage(pPage);
10148 return rc;
10152 ** Delete all information from a single table in the database. iTable is
10153 ** the page number of the root of the table. After this routine returns,
10154 ** the root page is empty, but still exists.
10156 ** This routine will fail with SQLITE_LOCKED if there are any open
10157 ** read cursors on the table. Open write cursors are moved to the
10158 ** root of the table.
10160 ** If pnChange is not NULL, then the integer value pointed to by pnChange
10161 ** is incremented by the number of entries in the table.
10163 int sqlite3BtreeClearTable(Btree *p, int iTable, i64 *pnChange){
10164 int rc;
10165 BtShared *pBt = p->pBt;
10166 sqlite3BtreeEnter(p);
10167 assert( p->inTrans==TRANS_WRITE );
10169 rc = saveAllCursors(pBt, (Pgno)iTable, 0);
10171 if( SQLITE_OK==rc ){
10172 /* Invalidate all incrblob cursors open on table iTable (assuming iTable
10173 ** is the root of a table b-tree - if it is not, the following call is
10174 ** a no-op). */
10175 if( p->hasIncrblobCur ){
10176 invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
10178 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
10180 sqlite3BtreeLeave(p);
10181 return rc;
10185 ** Delete all information from the single table that pCur is open on.
10187 ** This routine only work for pCur on an ephemeral table.
10189 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
10190 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
10194 ** Erase all information in a table and add the root of the table to
10195 ** the freelist. Except, the root of the principle table (the one on
10196 ** page 1) is never added to the freelist.
10198 ** This routine will fail with SQLITE_LOCKED if there are any open
10199 ** cursors on the table.
10201 ** If AUTOVACUUM is enabled and the page at iTable is not the last
10202 ** root page in the database file, then the last root page
10203 ** in the database file is moved into the slot formerly occupied by
10204 ** iTable and that last slot formerly occupied by the last root page
10205 ** is added to the freelist instead of iTable. In this say, all
10206 ** root pages are kept at the beginning of the database file, which
10207 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
10208 ** page number that used to be the last root page in the file before
10209 ** the move. If no page gets moved, *piMoved is set to 0.
10210 ** The last root page is recorded in meta[3] and the value of
10211 ** meta[3] is updated by this procedure.
10213 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
10214 int rc;
10215 MemPage *pPage = 0;
10216 BtShared *pBt = p->pBt;
10218 assert( sqlite3BtreeHoldsMutex(p) );
10219 assert( p->inTrans==TRANS_WRITE );
10220 assert( iTable>=2 );
10221 if( iTable>btreePagecount(pBt) ){
10222 return SQLITE_CORRUPT_PGNO(iTable);
10225 rc = sqlite3BtreeClearTable(p, iTable, 0);
10226 if( rc ) return rc;
10227 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
10228 if( NEVER(rc) ){
10229 releasePage(pPage);
10230 return rc;
10233 *piMoved = 0;
10235 #ifdef SQLITE_OMIT_AUTOVACUUM
10236 freePage(pPage, &rc);
10237 releasePage(pPage);
10238 #else
10239 if( pBt->autoVacuum ){
10240 Pgno maxRootPgno;
10241 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
10243 if( iTable==maxRootPgno ){
10244 /* If the table being dropped is the table with the largest root-page
10245 ** number in the database, put the root page on the free list.
10247 freePage(pPage, &rc);
10248 releasePage(pPage);
10249 if( rc!=SQLITE_OK ){
10250 return rc;
10252 }else{
10253 /* The table being dropped does not have the largest root-page
10254 ** number in the database. So move the page that does into the
10255 ** gap left by the deleted root-page.
10257 MemPage *pMove;
10258 releasePage(pPage);
10259 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
10260 if( rc!=SQLITE_OK ){
10261 return rc;
10263 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
10264 releasePage(pMove);
10265 if( rc!=SQLITE_OK ){
10266 return rc;
10268 pMove = 0;
10269 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
10270 freePage(pMove, &rc);
10271 releasePage(pMove);
10272 if( rc!=SQLITE_OK ){
10273 return rc;
10275 *piMoved = maxRootPgno;
10278 /* Set the new 'max-root-page' value in the database header. This
10279 ** is the old value less one, less one more if that happens to
10280 ** be a root-page number, less one again if that is the
10281 ** PENDING_BYTE_PAGE.
10283 maxRootPgno--;
10284 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
10285 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
10286 maxRootPgno--;
10288 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
10290 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
10291 }else{
10292 freePage(pPage, &rc);
10293 releasePage(pPage);
10295 #endif
10296 return rc;
10298 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
10299 int rc;
10300 sqlite3BtreeEnter(p);
10301 rc = btreeDropTable(p, iTable, piMoved);
10302 sqlite3BtreeLeave(p);
10303 return rc;
10308 ** This function may only be called if the b-tree connection already
10309 ** has a read or write transaction open on the database.
10311 ** Read the meta-information out of a database file. Meta[0]
10312 ** is the number of free pages currently in the database. Meta[1]
10313 ** through meta[15] are available for use by higher layers. Meta[0]
10314 ** is read-only, the others are read/write.
10316 ** The schema layer numbers meta values differently. At the schema
10317 ** layer (and the SetCookie and ReadCookie opcodes) the number of
10318 ** free pages is not visible. So Cookie[0] is the same as Meta[1].
10320 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead
10321 ** of reading the value out of the header, it instead loads the "DataVersion"
10322 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the
10323 ** database file. It is a number computed by the pager. But its access
10324 ** pattern is the same as header meta values, and so it is convenient to
10325 ** read it from this routine.
10327 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
10328 BtShared *pBt = p->pBt;
10330 sqlite3BtreeEnter(p);
10331 assert( p->inTrans>TRANS_NONE );
10332 assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) );
10333 assert( pBt->pPage1 );
10334 assert( idx>=0 && idx<=15 );
10336 if( idx==BTREE_DATA_VERSION ){
10337 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iBDataVersion;
10338 }else{
10339 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
10342 /* If auto-vacuum is disabled in this build and this is an auto-vacuum
10343 ** database, mark the database as read-only. */
10344 #ifdef SQLITE_OMIT_AUTOVACUUM
10345 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
10346 pBt->btsFlags |= BTS_READ_ONLY;
10348 #endif
10350 sqlite3BtreeLeave(p);
10354 ** Write meta-information back into the database. Meta[0] is
10355 ** read-only and may not be written.
10357 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
10358 BtShared *pBt = p->pBt;
10359 unsigned char *pP1;
10360 int rc;
10361 assert( idx>=1 && idx<=15 );
10362 sqlite3BtreeEnter(p);
10363 assert( p->inTrans==TRANS_WRITE );
10364 assert( pBt->pPage1!=0 );
10365 pP1 = pBt->pPage1->aData;
10366 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
10367 if( rc==SQLITE_OK ){
10368 put4byte(&pP1[36 + idx*4], iMeta);
10369 #ifndef SQLITE_OMIT_AUTOVACUUM
10370 if( idx==BTREE_INCR_VACUUM ){
10371 assert( pBt->autoVacuum || iMeta==0 );
10372 assert( iMeta==0 || iMeta==1 );
10373 pBt->incrVacuum = (u8)iMeta;
10375 #endif
10377 sqlite3BtreeLeave(p);
10378 return rc;
10382 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
10383 ** number of entries in the b-tree and write the result to *pnEntry.
10385 ** SQLITE_OK is returned if the operation is successfully executed.
10386 ** Otherwise, if an error is encountered (i.e. an IO error or database
10387 ** corruption) an SQLite error code is returned.
10389 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){
10390 i64 nEntry = 0; /* Value to return in *pnEntry */
10391 int rc; /* Return code */
10393 rc = moveToRoot(pCur);
10394 if( rc==SQLITE_EMPTY ){
10395 *pnEntry = 0;
10396 return SQLITE_OK;
10399 /* Unless an error occurs, the following loop runs one iteration for each
10400 ** page in the B-Tree structure (not including overflow pages).
10402 while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){
10403 int iIdx; /* Index of child node in parent */
10404 MemPage *pPage; /* Current page of the b-tree */
10406 /* If this is a leaf page or the tree is not an int-key tree, then
10407 ** this page contains countable entries. Increment the entry counter
10408 ** accordingly.
10410 pPage = pCur->pPage;
10411 if( pPage->leaf || !pPage->intKey ){
10412 nEntry += pPage->nCell;
10415 /* pPage is a leaf node. This loop navigates the cursor so that it
10416 ** points to the first interior cell that it points to the parent of
10417 ** the next page in the tree that has not yet been visited. The
10418 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
10419 ** of the page, or to the number of cells in the page if the next page
10420 ** to visit is the right-child of its parent.
10422 ** If all pages in the tree have been visited, return SQLITE_OK to the
10423 ** caller.
10425 if( pPage->leaf ){
10426 do {
10427 if( pCur->iPage==0 ){
10428 /* All pages of the b-tree have been visited. Return successfully. */
10429 *pnEntry = nEntry;
10430 return moveToRoot(pCur);
10432 moveToParent(pCur);
10433 }while ( pCur->ix>=pCur->pPage->nCell );
10435 pCur->ix++;
10436 pPage = pCur->pPage;
10439 /* Descend to the child node of the cell that the cursor currently
10440 ** points at. This is the right-child if (iIdx==pPage->nCell).
10442 iIdx = pCur->ix;
10443 if( iIdx==pPage->nCell ){
10444 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
10445 }else{
10446 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
10450 /* An error has occurred. Return an error code. */
10451 return rc;
10455 ** Return the pager associated with a BTree. This routine is used for
10456 ** testing and debugging only.
10458 Pager *sqlite3BtreePager(Btree *p){
10459 return p->pBt->pPager;
10462 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
10464 ** Record an OOM error during integrity_check
10466 static void checkOom(IntegrityCk *pCheck){
10467 pCheck->rc = SQLITE_NOMEM;
10468 pCheck->mxErr = 0; /* Causes integrity_check processing to stop */
10469 if( pCheck->nErr==0 ) pCheck->nErr++;
10473 ** Invoke the progress handler, if appropriate. Also check for an
10474 ** interrupt.
10476 static void checkProgress(IntegrityCk *pCheck){
10477 sqlite3 *db = pCheck->db;
10478 if( AtomicLoad(&db->u1.isInterrupted) ){
10479 pCheck->rc = SQLITE_INTERRUPT;
10480 pCheck->nErr++;
10481 pCheck->mxErr = 0;
10483 #ifndef SQLITE_OMIT_PROGRESS_CALLBACK
10484 if( db->xProgress ){
10485 assert( db->nProgressOps>0 );
10486 pCheck->nStep++;
10487 if( (pCheck->nStep % db->nProgressOps)==0
10488 && db->xProgress(db->pProgressArg)
10490 pCheck->rc = SQLITE_INTERRUPT;
10491 pCheck->nErr++;
10492 pCheck->mxErr = 0;
10495 #endif
10499 ** Append a message to the error message string.
10501 static void checkAppendMsg(
10502 IntegrityCk *pCheck,
10503 const char *zFormat,
10506 va_list ap;
10507 checkProgress(pCheck);
10508 if( !pCheck->mxErr ) return;
10509 pCheck->mxErr--;
10510 pCheck->nErr++;
10511 va_start(ap, zFormat);
10512 if( pCheck->errMsg.nChar ){
10513 sqlite3_str_append(&pCheck->errMsg, "\n", 1);
10515 if( pCheck->zPfx ){
10516 sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx,
10517 pCheck->v0, pCheck->v1, pCheck->v2);
10519 sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap);
10520 va_end(ap);
10521 if( pCheck->errMsg.accError==SQLITE_NOMEM ){
10522 checkOom(pCheck);
10525 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10527 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
10530 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
10531 ** corresponds to page iPg is already set.
10533 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
10534 assert( pCheck->aPgRef!=0 );
10535 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 );
10536 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
10540 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
10542 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
10543 assert( pCheck->aPgRef!=0 );
10544 assert( iPg<=pCheck->nCkPage && sizeof(pCheck->aPgRef[0])==1 );
10545 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
10550 ** Add 1 to the reference count for page iPage. If this is the second
10551 ** reference to the page, add an error message to pCheck->zErrMsg.
10552 ** Return 1 if there are 2 or more references to the page and 0 if
10553 ** if this is the first reference to the page.
10555 ** Also check that the page number is in bounds.
10557 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
10558 if( iPage>pCheck->nCkPage || iPage==0 ){
10559 checkAppendMsg(pCheck, "invalid page number %u", iPage);
10560 return 1;
10562 if( getPageReferenced(pCheck, iPage) ){
10563 checkAppendMsg(pCheck, "2nd reference to page %u", iPage);
10564 return 1;
10566 setPageReferenced(pCheck, iPage);
10567 return 0;
10570 #ifndef SQLITE_OMIT_AUTOVACUUM
10572 ** Check that the entry in the pointer-map for page iChild maps to
10573 ** page iParent, pointer type ptrType. If not, append an error message
10574 ** to pCheck.
10576 static void checkPtrmap(
10577 IntegrityCk *pCheck, /* Integrity check context */
10578 Pgno iChild, /* Child page number */
10579 u8 eType, /* Expected pointer map type */
10580 Pgno iParent /* Expected pointer map parent page number */
10582 int rc;
10583 u8 ePtrmapType;
10584 Pgno iPtrmapParent;
10586 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
10587 if( rc!=SQLITE_OK ){
10588 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) checkOom(pCheck);
10589 checkAppendMsg(pCheck, "Failed to read ptrmap key=%u", iChild);
10590 return;
10593 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
10594 checkAppendMsg(pCheck,
10595 "Bad ptr map entry key=%u expected=(%u,%u) got=(%u,%u)",
10596 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
10599 #endif
10602 ** Check the integrity of the freelist or of an overflow page list.
10603 ** Verify that the number of pages on the list is N.
10605 static void checkList(
10606 IntegrityCk *pCheck, /* Integrity checking context */
10607 int isFreeList, /* True for a freelist. False for overflow page list */
10608 Pgno iPage, /* Page number for first page in the list */
10609 u32 N /* Expected number of pages in the list */
10611 int i;
10612 u32 expected = N;
10613 int nErrAtStart = pCheck->nErr;
10614 while( iPage!=0 && pCheck->mxErr ){
10615 DbPage *pOvflPage;
10616 unsigned char *pOvflData;
10617 if( checkRef(pCheck, iPage) ) break;
10618 N--;
10619 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
10620 checkAppendMsg(pCheck, "failed to get page %u", iPage);
10621 break;
10623 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
10624 if( isFreeList ){
10625 u32 n = (u32)get4byte(&pOvflData[4]);
10626 #ifndef SQLITE_OMIT_AUTOVACUUM
10627 if( pCheck->pBt->autoVacuum ){
10628 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
10630 #endif
10631 if( n>pCheck->pBt->usableSize/4-2 ){
10632 checkAppendMsg(pCheck,
10633 "freelist leaf count too big on page %u", iPage);
10634 N--;
10635 }else{
10636 for(i=0; i<(int)n; i++){
10637 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
10638 #ifndef SQLITE_OMIT_AUTOVACUUM
10639 if( pCheck->pBt->autoVacuum ){
10640 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
10642 #endif
10643 checkRef(pCheck, iFreePage);
10645 N -= n;
10648 #ifndef SQLITE_OMIT_AUTOVACUUM
10649 else{
10650 /* If this database supports auto-vacuum and iPage is not the last
10651 ** page in this overflow list, check that the pointer-map entry for
10652 ** the following page matches iPage.
10654 if( pCheck->pBt->autoVacuum && N>0 ){
10655 i = get4byte(pOvflData);
10656 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
10659 #endif
10660 iPage = get4byte(pOvflData);
10661 sqlite3PagerUnref(pOvflPage);
10663 if( N && nErrAtStart==pCheck->nErr ){
10664 checkAppendMsg(pCheck,
10665 "%s is %u but should be %u",
10666 isFreeList ? "size" : "overflow list length",
10667 expected-N, expected);
10670 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
/*
** An implementation of a min-heap.
**
** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
** and aHeap[N*2+1].
**
** The heap property is this:  Every node is less than or equal to both
** of its daughter nodes.  A consequence of the heap property is that the
** root node aHeap[1] is always the minimum value currently in the heap.
**
** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
** the heap, preserving the heap property.  The btreeHeapPull() routine
** removes the root element from the heap (the minimum value in the heap)
** and then moves other nodes around as necessary to preserve the heap
** property.
**
** This heap is used for cell overlap and coverage testing.  Each u32
** entry represents the span of a cell or freeblock on a btree page.
** The upper 16 bits are the index of the first byte of a range and the
** lower 16 bits are the index of the last byte of that range.
*/
static void btreeHeapInsert(u32 *aHeap, u32 x){
  u32 iNode;                 /* Slot currently holding the new value */
  u32 iParent;               /* Parent slot of iNode */

  assert( aHeap!=0 );
  aHeap[0]++;
  iNode = aHeap[0];
  aHeap[iNode] = x;

  /* Sift the new value up while it is smaller than its parent */
  for(iParent=iNode/2;
      iParent>0 && aHeap[iParent]>aHeap[iNode];
      iParent=iNode/2
  ){
    u32 tmp = aHeap[iParent];
    aHeap[iParent] = aHeap[iNode];
    aHeap[iNode] = tmp;
    iNode = iParent;
  }
}
/*
** Remove the minimum element (the root, aHeap[1]) from the min-heap and
** write it to *pOut.  Return 1 on success, or 0 without touching *pOut
** if the heap is empty.
**
** The last element is moved to the root and sifted down.  The slot it
** vacated is overwritten with 0xffffffff so that reading aHeap[j+1] for
** a node that has only one child compares against the largest u32 value.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 nOld = aHeap[0];       /* Element count before the pull */
  u32 iNode = 1;             /* Slot being sifted down */
  u32 iKid;                  /* Smaller child of iNode */

  if( nOld==0 ) return 0;
  *pOut = aHeap[1];
  aHeap[1] = aHeap[nOld];
  aHeap[nOld] = 0xffffffff;
  aHeap[0]--;

  for(iKid=iNode*2; iKid<=aHeap[0]; iKid=iNode*2){
    u32 tmp;
    if( aHeap[iKid]>aHeap[iKid+1] ) iKid++;
    if( aHeap[iNode]<aHeap[iKid] ) break;
    tmp = aHeap[iNode];
    aHeap[iNode] = aHeap[iKid];
    aHeap[iKid] = tmp;
    iNode = iKid;
  }
  return 1;
}
10725 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
10727 ** Do various sanity checks on a single page of a tree. Return
10728 ** the tree depth. Root pages return 0. Parents of root pages
10729 ** return 1, and so forth.
10731 ** These checks are done:
10733 ** 1. Make sure that cells and freeblocks do not overlap
10734 ** but combine to completely cover the page.
10735 ** 2. Make sure integer cell keys are in order.
10736 ** 3. Check the integrity of overflow pages.
10737 ** 4. Recursively call checkTreePage on all children.
10738 ** 5. Verify that the depth of all children is the same.
10740 static int checkTreePage(
10741 IntegrityCk *pCheck, /* Context for the sanity check */
10742 Pgno iPage, /* Page number of the page to check */
10743 i64 *piMinKey, /* Write minimum integer primary key here */
10744 i64 maxKey /* Error if integer primary key greater than this */
10746 MemPage *pPage = 0; /* The page being analyzed */
10747 int i; /* Loop counter */
10748 int rc; /* Result code from subroutine call */
10749 int depth = -1, d2; /* Depth of a subtree */
10750 int pgno; /* Page number */
10751 int nFrag; /* Number of fragmented bytes on the page */
10752 int hdr; /* Offset to the page header */
10753 int cellStart; /* Offset to the start of the cell pointer array */
10754 int nCell; /* Number of cells */
10755 int doCoverageCheck = 1; /* True if cell coverage checking should be done */
10756 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey
10757 ** False if IPK must be strictly less than maxKey */
10758 u8 *data; /* Page content */
10759 u8 *pCell; /* Cell content */
10760 u8 *pCellIdx; /* Next element of the cell pointer array */
10761 BtShared *pBt; /* The BtShared object that owns pPage */
10762 u32 pc; /* Address of a cell */
10763 u32 usableSize; /* Usable size of the page */
10764 u32 contentOffset; /* Offset to the start of the cell content area */
10765 u32 *heap = 0; /* Min-heap used for checking cell coverage */
10766 u32 x, prev = 0; /* Next and previous entry on the min-heap */
10767 const char *saved_zPfx = pCheck->zPfx;
10768 int saved_v1 = pCheck->v1;
10769 int saved_v2 = pCheck->v2;
10770 u8 savedIsInit = 0;
10772 /* Check that the page exists
10774 checkProgress(pCheck);
10775 if( pCheck->mxErr==0 ) goto end_of_check;
10776 pBt = pCheck->pBt;
10777 usableSize = pBt->usableSize;
10778 if( iPage==0 ) return 0;
10779 if( checkRef(pCheck, iPage) ) return 0;
10780 pCheck->zPfx = "Tree %u page %u: ";
10781 pCheck->v1 = iPage;
10782 if( (rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){
10783 checkAppendMsg(pCheck,
10784 "unable to get the page. error code=%d", rc);
10785 if( rc==SQLITE_IOERR_NOMEM ) pCheck->rc = SQLITE_NOMEM;
10786 goto end_of_check;
10789 /* Clear MemPage.isInit to make sure the corruption detection code in
10790 ** btreeInitPage() is executed. */
10791 savedIsInit = pPage->isInit;
10792 pPage->isInit = 0;
10793 if( (rc = btreeInitPage(pPage))!=0 ){
10794 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */
10795 checkAppendMsg(pCheck,
10796 "btreeInitPage() returns error code %d", rc);
10797 goto end_of_check;
10799 if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
10800 assert( rc==SQLITE_CORRUPT );
10801 checkAppendMsg(pCheck, "free space corruption", rc);
10802 goto end_of_check;
10804 data = pPage->aData;
10805 hdr = pPage->hdrOffset;
10807 /* Set up for cell analysis */
10808 pCheck->zPfx = "Tree %u page %u cell %u: ";
10809 contentOffset = get2byteNotZero(&data[hdr+5]);
10810 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */
10812 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
10813 ** number of cells on the page. */
10814 nCell = get2byte(&data[hdr+3]);
10815 assert( pPage->nCell==nCell );
10816 if( pPage->leaf || pPage->intKey==0 ){
10817 pCheck->nRow += nCell;
10820 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
10821 ** immediately follows the b-tree page header. */
10822 cellStart = hdr + 12 - 4*pPage->leaf;
10823 assert( pPage->aCellIdx==&data[cellStart] );
10824 pCellIdx = &data[cellStart + 2*(nCell-1)];
10826 if( !pPage->leaf ){
10827 /* Analyze the right-child page of internal pages */
10828 pgno = get4byte(&data[hdr+8]);
10829 #ifndef SQLITE_OMIT_AUTOVACUUM
10830 if( pBt->autoVacuum ){
10831 pCheck->zPfx = "Tree %u page %u right child: ";
10832 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
10834 #endif
10835 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
10836 keyCanBeEqual = 0;
10837 }else{
10838 /* For leaf pages, the coverage check will occur in the same loop
10839 ** as the other cell checks, so initialize the heap. */
10840 heap = pCheck->heap;
10841 heap[0] = 0;
10844 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
10845 ** integer offsets to the cell contents. */
10846 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
10847 CellInfo info;
10849 /* Check cell size */
10850 pCheck->v2 = i;
10851 assert( pCellIdx==&data[cellStart + i*2] );
10852 pc = get2byteAligned(pCellIdx);
10853 pCellIdx -= 2;
10854 if( pc<contentOffset || pc>usableSize-4 ){
10855 checkAppendMsg(pCheck, "Offset %u out of range %u..%u",
10856 pc, contentOffset, usableSize-4);
10857 doCoverageCheck = 0;
10858 continue;
10860 pCell = &data[pc];
10861 pPage->xParseCell(pPage, pCell, &info);
10862 if( pc+info.nSize>usableSize ){
10863 checkAppendMsg(pCheck, "Extends off end of page");
10864 doCoverageCheck = 0;
10865 continue;
10868 /* Check for integer primary key out of range */
10869 if( pPage->intKey ){
10870 if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
10871 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
10873 maxKey = info.nKey;
10874 keyCanBeEqual = 0; /* Only the first key on the page may ==maxKey */
10877 /* Check the content overflow list */
10878 if( info.nPayload>info.nLocal ){
10879 u32 nPage; /* Number of pages on the overflow chain */
10880 Pgno pgnoOvfl; /* First page of the overflow chain */
10881 assert( pc + info.nSize - 4 <= usableSize );
10882 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
10883 pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
10884 #ifndef SQLITE_OMIT_AUTOVACUUM
10885 if( pBt->autoVacuum ){
10886 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
10888 #endif
10889 checkList(pCheck, 0, pgnoOvfl, nPage);
10892 if( !pPage->leaf ){
10893 /* Check sanity of left child page for internal pages */
10894 pgno = get4byte(pCell);
10895 #ifndef SQLITE_OMIT_AUTOVACUUM
10896 if( pBt->autoVacuum ){
10897 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
10899 #endif
10900 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
10901 keyCanBeEqual = 0;
10902 if( d2!=depth ){
10903 checkAppendMsg(pCheck, "Child page depth differs");
10904 depth = d2;
10906 }else{
10907 /* Populate the coverage-checking heap for leaf pages */
10908 btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
10911 *piMinKey = maxKey;
10913 /* Check for complete coverage of the page
10915 pCheck->zPfx = 0;
10916 if( doCoverageCheck && pCheck->mxErr>0 ){
10917 /* For leaf pages, the min-heap has already been initialized and the
10918 ** cells have already been inserted. But for internal pages, that has
10919 ** not yet been done, so do it now */
10920 if( !pPage->leaf ){
10921 heap = pCheck->heap;
10922 heap[0] = 0;
10923 for(i=nCell-1; i>=0; i--){
10924 u32 size;
10925 pc = get2byteAligned(&data[cellStart+i*2]);
10926 size = pPage->xCellSize(pPage, &data[pc]);
10927 btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
10930 assert( heap!=0 );
10931 /* Add the freeblocks to the min-heap
10933 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
10934 ** is the offset of the first freeblock, or zero if there are no
10935 ** freeblocks on the page.
10937 i = get2byte(&data[hdr+1]);
10938 while( i>0 ){
10939 int size, j;
10940 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10941 size = get2byte(&data[i+2]);
10942 assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */
10943 btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
10944 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
10945 ** big-endian integer which is the offset in the b-tree page of the next
10946 ** freeblock in the chain, or zero if the freeblock is the last on the
10947 ** chain. */
10948 j = get2byte(&data[i]);
10949 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
10950 ** increasing offset. */
10951 assert( j==0 || j>i+size ); /* Enforced by btreeComputeFreeSpace() */
10952 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10953 i = j;
10955 /* Analyze the min-heap looking for overlap between cells and/or
10956 ** freeblocks, and counting the number of untracked bytes in nFrag.
10958 ** Each min-heap entry is of the form: (start_address<<16)|end_address.
10959 ** There is an implied first entry the covers the page header, the cell
10960 ** pointer index, and the gap between the cell pointer index and the start
10961 ** of cell content.
10963 ** The loop below pulls entries from the min-heap in order and compares
10964 ** the start_address against the previous end_address. If there is an
10965 ** overlap, that means bytes are used multiple times. If there is a gap,
10966 ** that gap is added to the fragmentation count.
10968 nFrag = 0;
10969 prev = contentOffset - 1; /* Implied first min-heap entry */
10970 while( btreeHeapPull(heap,&x) ){
10971 if( (prev&0xffff)>=(x>>16) ){
10972 checkAppendMsg(pCheck,
10973 "Multiple uses for byte %u of page %u", x>>16, iPage);
10974 break;
10975 }else{
10976 nFrag += (x>>16) - (prev&0xffff) - 1;
10977 prev = x;
10980 nFrag += usableSize - (prev&0xffff) - 1;
10981 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
10982 ** is stored in the fifth field of the b-tree page header.
10983 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
10984 ** number of fragmented free bytes within the cell content area.
10986 if( heap[0]==0 && nFrag!=data[hdr+7] ){
10987 checkAppendMsg(pCheck,
10988 "Fragmentation of %u bytes reported as %u on page %u",
10989 nFrag, data[hdr+7], iPage);
10993 end_of_check:
10994 if( !doCoverageCheck ) pPage->isInit = savedIsInit;
10995 releasePage(pPage);
10996 pCheck->zPfx = saved_zPfx;
10997 pCheck->v1 = saved_v1;
10998 pCheck->v2 = saved_v2;
10999 return depth+1;
11001 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
11003 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
11005 ** This routine does a complete check of the given BTree file. aRoot[] is
11006 ** an array of page numbers where each page number is the root page of
11007 ** a table. nRoot is the number of entries in aRoot.
11009 ** A read-only or read-write transaction must be opened before calling
11010 ** this function.
11012 ** Write the number of errors seen in *pnErr. Except for some memory
11013 ** allocation errors, an error message held in memory obtained from
11014 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
11015 ** returned. If a memory allocation error occurs, NULL is returned.
11017 ** If the first entry in aRoot[] is 0, that indicates that the list of
11018 ** root pages is incomplete. This is a "partial integrity-check". This
11019 ** happens when performing an integrity check on a single table. The
11020 ** zero is skipped, of course. But in addition, the freelist checks
11021 ** and the checks to make sure every page is referenced are also skipped,
11022 ** since obviously it is not possible to know which pages are covered by
11023 ** the unverified btrees. Except, if aRoot[1] is 1, then the freelist
11024 ** checks are still performed.
11026 int sqlite3BtreeIntegrityCheck(
11027 sqlite3 *db, /* Database connection that is running the check */
11028 Btree *p, /* The btree to be checked */
11029 Pgno *aRoot, /* An array of root pages numbers for individual trees */
11030 Mem *aCnt, /* Memory cells to write counts for each tree to */
11031 int nRoot, /* Number of entries in aRoot[] */
11032 int mxErr, /* Stop reporting errors after this many */
11033 int *pnErr, /* OUT: Write number of errors seen to this variable */
11034 char **pzOut /* OUT: Write the error message string here */
11036 Pgno i;
11037 IntegrityCk sCheck;
11038 BtShared *pBt = p->pBt;
11039 u64 savedDbFlags = pBt->db->flags;
11040 char zErr[100];
11041 int bPartial = 0; /* True if not checking all btrees */
11042 int bCkFreelist = 1; /* True to scan the freelist */
11043 VVA_ONLY( int nRef );
11045 assert( nRoot>0 );
11046 assert( aCnt!=0 );
11048 /* aRoot[0]==0 means this is a partial check */
11049 if( aRoot[0]==0 ){
11050 assert( nRoot>1 );
11051 bPartial = 1;
11052 if( aRoot[1]!=1 ) bCkFreelist = 0;
11055 sqlite3BtreeEnter(p);
11056 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
11057 VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
11058 assert( nRef>=0 );
11059 memset(&sCheck, 0, sizeof(sCheck));
11060 sCheck.db = db;
11061 sCheck.pBt = pBt;
11062 sCheck.pPager = pBt->pPager;
11063 sCheck.nCkPage = btreePagecount(sCheck.pBt);
11064 sCheck.mxErr = mxErr;
11065 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
11066 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
11067 if( sCheck.nCkPage==0 ){
11068 goto integrity_ck_cleanup;
11071 sCheck.aPgRef = sqlite3MallocZero((sCheck.nCkPage / 8)+ 1);
11072 if( !sCheck.aPgRef ){
11073 checkOom(&sCheck);
11074 goto integrity_ck_cleanup;
11076 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
11077 if( sCheck.heap==0 ){
11078 checkOom(&sCheck);
11079 goto integrity_ck_cleanup;
11082 i = PENDING_BYTE_PAGE(pBt);
11083 if( i<=sCheck.nCkPage ) setPageReferenced(&sCheck, i);
11085 /* Check the integrity of the freelist
11087 if( bCkFreelist ){
11088 sCheck.zPfx = "Freelist: ";
11089 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
11090 get4byte(&pBt->pPage1->aData[36]));
11091 sCheck.zPfx = 0;
11094 /* Check all the tables.
11096 #ifndef SQLITE_OMIT_AUTOVACUUM
11097 if( !bPartial ){
11098 if( pBt->autoVacuum ){
11099 Pgno mx = 0;
11100 Pgno mxInHdr;
11101 for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i];
11102 mxInHdr = get4byte(&pBt->pPage1->aData[52]);
11103 if( mx!=mxInHdr ){
11104 checkAppendMsg(&sCheck,
11105 "max rootpage (%u) disagrees with header (%u)",
11106 mx, mxInHdr
11109 }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){
11110 checkAppendMsg(&sCheck,
11111 "incremental_vacuum enabled with a max rootpage of zero"
11115 #endif
11116 testcase( pBt->db->flags & SQLITE_CellSizeCk );
11117 pBt->db->flags &= ~(u64)SQLITE_CellSizeCk;
11118 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
11119 sCheck.nRow = 0;
11120 if( aRoot[i] ){
11121 i64 notUsed;
11122 #ifndef SQLITE_OMIT_AUTOVACUUM
11123 if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){
11124 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
11126 #endif
11127 sCheck.v0 = aRoot[i];
11128 checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
11130 sqlite3MemSetArrayInt64(aCnt, i, sCheck.nRow);
11132 pBt->db->flags = savedDbFlags;
11134 /* Make sure every page in the file is referenced
11136 if( !bPartial ){
11137 for(i=1; i<=sCheck.nCkPage && sCheck.mxErr; i++){
11138 #ifdef SQLITE_OMIT_AUTOVACUUM
11139 if( getPageReferenced(&sCheck, i)==0 ){
11140 checkAppendMsg(&sCheck, "Page %u: never used", i);
11142 #else
11143 /* If the database supports auto-vacuum, make sure no tables contain
11144 ** references to pointer-map pages.
11146 if( getPageReferenced(&sCheck, i)==0 &&
11147 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
11148 checkAppendMsg(&sCheck, "Page %u: never used", i);
11150 if( getPageReferenced(&sCheck, i)!=0 &&
11151 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
11152 checkAppendMsg(&sCheck, "Page %u: pointer map referenced", i);
11154 #endif
11158 /* Clean up and report errors.
11160 integrity_ck_cleanup:
11161 sqlite3PageFree(sCheck.heap);
11162 sqlite3_free(sCheck.aPgRef);
11163 *pnErr = sCheck.nErr;
11164 if( sCheck.nErr==0 ){
11165 sqlite3_str_reset(&sCheck.errMsg);
11166 *pzOut = 0;
11167 }else{
11168 *pzOut = sqlite3StrAccumFinish(&sCheck.errMsg);
11170 /* Make sure this analysis did not leave any unref() pages. */
11171 assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
11172 sqlite3BtreeLeave(p);
11173 return sCheck.rc;
11175 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
11178 ** Return the full pathname of the underlying database file. Return
11179 ** an empty string if the database is in-memory or a TEMP database.
11181 ** The pager filename is invariant as long as the pager is
11182 ** open so it is safe to access without the BtShared mutex.
11184 const char *sqlite3BtreeGetFilename(Btree *p){
11185 assert( p->pBt->pPager!=0 );
11186 return sqlite3PagerFilename(p->pBt->pPager, 1);
11190 ** Return the pathname of the journal file for this database. The return
11191 ** value of this routine is the same regardless of whether the journal file
11192 ** has been created or not.
11194 ** The pager journal filename is invariant as long as the pager is
11195 ** open so it is safe to access without the BtShared mutex.
11197 const char *sqlite3BtreeGetJournalname(Btree *p){
11198 assert( p->pBt->pPager!=0 );
11199 return sqlite3PagerJournalname(p->pBt->pPager);
11203 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE
11204 ** to describe the current transaction state of Btree p.
11206 int sqlite3BtreeTxnState(Btree *p){
11207 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
11208 return p ? p->inTrans : 0;
11211 #ifndef SQLITE_OMIT_WAL
11213 ** Run a checkpoint on the Btree passed as the first argument.
11215 ** Return SQLITE_LOCKED if this or any other connection has an open
11216 ** transaction on the shared-cache the argument Btree is connected to.
11218 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
11220 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
11221 int rc = SQLITE_OK;
11222 if( p ){
11223 BtShared *pBt = p->pBt;
11224 sqlite3BtreeEnter(p);
11225 if( pBt->inTransaction!=TRANS_NONE ){
11226 rc = SQLITE_LOCKED;
11227 }else{
11228 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
11230 sqlite3BtreeLeave(p);
11232 return rc;
11234 #endif
11237 ** Return true if there is currently a backup running on Btree p.
11239 int sqlite3BtreeIsInBackup(Btree *p){
11240 assert( p );
11241 assert( sqlite3_mutex_held(p->db->mutex) );
11242 return p->nBackup!=0;
11246 ** This function returns a pointer to a blob of memory associated with
11247 ** a single shared-btree. The memory is used by client code for its own
11248 ** purposes (for example, to store a high-level schema associated with
11249 ** the shared-btree). The btree layer manages reference counting issues.
11251 ** The first time this is called on a shared-btree, nBytes bytes of memory
11252 ** are allocated, zeroed, and returned to the caller. For each subsequent
11253 ** call the nBytes parameter is ignored and a pointer to the same blob
11254 ** of memory returned.
11256 ** If the nBytes parameter is 0 and the blob of memory has not yet been
11257 ** allocated, a null pointer is returned. If the blob has already been
11258 ** allocated, it is returned as normal.
11260 ** Just before the shared-btree is closed, the function passed as the
11261 ** xFree argument when the memory allocation was made is invoked on the
11262 ** blob of allocated memory. The xFree function should not call sqlite3_free()
11263 ** on the memory, the btree layer does that.
11265 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
11266 BtShared *pBt = p->pBt;
11267 sqlite3BtreeEnter(p);
11268 if( !pBt->pSchema && nBytes ){
11269 pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
11270 pBt->xFreeSchema = xFree;
11272 sqlite3BtreeLeave(p);
11273 return pBt->pSchema;
11277 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
11278 ** btree as the argument handle holds an exclusive lock on the
11279 ** sqlite_schema table. Otherwise SQLITE_OK.
11281 int sqlite3BtreeSchemaLocked(Btree *p){
11282 int rc;
11283 assert( sqlite3_mutex_held(p->db->mutex) );
11284 sqlite3BtreeEnter(p);
11285 rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
11286 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
11287 sqlite3BtreeLeave(p);
11288 return rc;
11292 #ifndef SQLITE_OMIT_SHARED_CACHE
11294 ** Obtain a lock on the table whose root page is iTab. The
11295 ** lock is a write lock if isWritelock is true or a read lock
11296 ** if it is false.
11298 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
11299 int rc = SQLITE_OK;
11300 assert( p->inTrans!=TRANS_NONE );
11301 if( p->sharable ){
11302 u8 lockType = READ_LOCK + isWriteLock;
11303 assert( READ_LOCK+1==WRITE_LOCK );
11304 assert( isWriteLock==0 || isWriteLock==1 );
11306 sqlite3BtreeEnter(p);
11307 rc = querySharedCacheTableLock(p, iTab, lockType);
11308 if( rc==SQLITE_OK ){
11309 rc = setSharedCacheTableLock(p, iTab, lockType);
11311 sqlite3BtreeLeave(p);
11313 return rc;
11315 #endif
11317 #ifndef SQLITE_OMIT_INCRBLOB
11319 ** Argument pCsr must be a cursor opened for writing on an
11320 ** INTKEY table currently pointing at a valid table entry.
11321 ** This function modifies the data stored as part of that entry.
11323 ** Only the data content may only be modified, it is not possible to
11324 ** change the length of the data stored. If this function is called with
11325 ** parameters that attempt to write past the end of the existing data,
11326 ** no modifications are made and SQLITE_CORRUPT is returned.
11328 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
11329 int rc;
11330 assert( cursorOwnsBtShared(pCsr) );
11331 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
11332 assert( pCsr->curFlags & BTCF_Incrblob );
11334 rc = restoreCursorPosition(pCsr);
11335 if( rc!=SQLITE_OK ){
11336 return rc;
11338 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
11339 if( pCsr->eState!=CURSOR_VALID ){
11340 return SQLITE_ABORT;
11343 /* Save the positions of all other cursors open on this table. This is
11344 ** required in case any of them are holding references to an xFetch
11345 ** version of the b-tree page modified by the accessPayload call below.
11347 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
11348 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
11349 ** saveAllCursors can only return SQLITE_OK.
11351 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
11352 assert( rc==SQLITE_OK );
11354 /* Check some assumptions:
11355 ** (a) the cursor is open for writing,
11356 ** (b) there is a read/write transaction open,
11357 ** (c) the connection holds a write-lock on the table (if required),
11358 ** (d) there are no conflicting read-locks, and
11359 ** (e) the cursor points at a valid row of an intKey table.
11361 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
11362 return SQLITE_READONLY;
11364 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
11365 && pCsr->pBt->inTransaction==TRANS_WRITE );
11366 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
11367 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
11368 assert( pCsr->pPage->intKey );
11370 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
11374 ** Mark this cursor as an incremental blob cursor.
11376 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
11377 pCur->curFlags |= BTCF_Incrblob;
11378 pCur->pBtree->hasIncrblobCur = 1;
11380 #endif
11383 ** Set both the "read version" (single byte at byte offset 18) and
11384 ** "write version" (single byte at byte offset 19) fields in the database
11385 ** header to iVersion.
11387 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
11388 BtShared *pBt = pBtree->pBt;
11389 int rc; /* Return code */
11391 assert( iVersion==1 || iVersion==2 );
11393 /* If setting the version fields to 1, do not automatically open the
11394 ** WAL connection, even if the version fields are currently set to 2.
11396 pBt->btsFlags &= ~BTS_NO_WAL;
11397 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
11399 rc = sqlite3BtreeBeginTrans(pBtree, 0, 0);
11400 if( rc==SQLITE_OK ){
11401 u8 *aData = pBt->pPage1->aData;
11402 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
11403 rc = sqlite3BtreeBeginTrans(pBtree, 2, 0);
11404 if( rc==SQLITE_OK ){
11405 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
11406 if( rc==SQLITE_OK ){
11407 aData[18] = (u8)iVersion;
11408 aData[19] = (u8)iVersion;
11414 pBt->btsFlags &= ~BTS_NO_WAL;
11415 return rc;
11419 ** Return true if the cursor has a hint specified. This routine is
11420 ** only used from within assert() statements
11422 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
11423 return (pCsr->hints & mask)!=0;
11427 ** Return true if the given Btree is read-only.
11429 int sqlite3BtreeIsReadonly(Btree *p){
11430 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
11434 ** Return the size of the header added to each page by this module.
11436 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
11439 ** If no transaction is active and the database is not a temp-db, clear
11440 ** the in-memory pager cache.
11442 void sqlite3BtreeClearCache(Btree *p){
11443 BtShared *pBt = p->pBt;
11444 if( pBt->inTransaction==TRANS_NONE ){
11445 sqlite3PagerClearCache(pBt->pPager);
11449 #if !defined(SQLITE_OMIT_SHARED_CACHE)
11451 ** Return true if the Btree passed as the only argument is sharable.
11453 int sqlite3BtreeSharable(Btree *p){
11454 return p->sharable;
11458 ** Return the number of connections to the BtShared object accessed by
11459 ** the Btree handle passed as the only argument. For private caches
11460 ** this is always 1. For shared caches it may be 1 or greater.
11462 int sqlite3BtreeConnectionCount(Btree *p){
11463 testcase( p->sharable );
11464 return p->pBt->nRef;
11466 #endif