Snapshot of upstream SQLite 3.40.1
[sqlcipher.git] / ext / fts5 / fts5_hash.c
blobbc9244fc0163aa4527ed6d59b4617cbc05728294
1 /*
2 ** 2014 August 11
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
17 #include "fts5Int.h"
19 typedef struct Fts5HashEntry Fts5HashEntry;
22 ** This file contains the implementation of an in-memory hash table used
23 ** to accumuluate "term -> doclist" content before it is flused to a level-0
24 ** segment.
28 struct Fts5Hash {
29 int eDetail; /* Copy of Fts5Config.eDetail */
30 int *pnByte; /* Pointer to bytes counter */
31 int nEntry; /* Number of entries currently in hash */
32 int nSlot; /* Size of aSlot[] array */
33 Fts5HashEntry *pScan; /* Current ordered scan item */
34 Fts5HashEntry **aSlot; /* Array of hash slots */
38 ** Each entry in the hash table is represented by an object of the
39 ** following type. Each object, its key (a nul-terminated string) and
40 ** its current data are stored in a single memory allocation. The
41 ** key immediately follows the object in memory. The position list
42 ** data immediately follows the key data in memory.
44 ** The data that follows the key is in a similar, but not identical format
45 ** to the doclist data stored in the database. It is:
47 ** * Rowid, as a varint
48 ** * Position list, without 0x00 terminator.
49 ** * Size of previous position list and rowid, as a 4 byte
50 ** big-endian integer.
52 ** iRowidOff:
53 ** Offset of last rowid written to data area. Relative to first byte of
54 ** structure.
56 ** nData:
57 ** Bytes of data written since iRowidOff.
59 struct Fts5HashEntry {
60 Fts5HashEntry *pHashNext; /* Next hash entry with same hash-key */
61 Fts5HashEntry *pScanNext; /* Next entry in sorted order */
63 int nAlloc; /* Total size of allocation */
64 int iSzPoslist; /* Offset of space for 4-byte poslist size */
65 int nData; /* Total bytes of data (incl. structure) */
66 int nKey; /* Length of key in bytes */
67 u8 bDel; /* Set delete-flag @ iSzPoslist */
68 u8 bContent; /* Set content-flag (detail=none mode) */
69 i16 iCol; /* Column of last value written */
70 int iPos; /* Position of last value written */
71 i64 iRowid; /* Rowid of last value written */
75 ** Eqivalent to:
77 ** char *fts5EntryKey(Fts5HashEntry *pEntry){ return zKey; }
79 #define fts5EntryKey(p) ( ((char *)(&(p)[1])) )
83 ** Allocate a new hash table.
85 int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){
86 int rc = SQLITE_OK;
87 Fts5Hash *pNew;
89 *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash));
90 if( pNew==0 ){
91 rc = SQLITE_NOMEM;
92 }else{
93 sqlite3_int64 nByte;
94 memset(pNew, 0, sizeof(Fts5Hash));
95 pNew->pnByte = pnByte;
96 pNew->eDetail = pConfig->eDetail;
98 pNew->nSlot = 1024;
99 nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
100 pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc64(nByte);
101 if( pNew->aSlot==0 ){
102 sqlite3_free(pNew);
103 *ppNew = 0;
104 rc = SQLITE_NOMEM;
105 }else{
106 memset(pNew->aSlot, 0, (size_t)nByte);
109 return rc;
113 ** Free a hash table object.
115 void sqlite3Fts5HashFree(Fts5Hash *pHash){
116 if( pHash ){
117 sqlite3Fts5HashClear(pHash);
118 sqlite3_free(pHash->aSlot);
119 sqlite3_free(pHash);
124 ** Empty (but do not delete) a hash table.
126 void sqlite3Fts5HashClear(Fts5Hash *pHash){
127 int i;
128 for(i=0; i<pHash->nSlot; i++){
129 Fts5HashEntry *pNext;
130 Fts5HashEntry *pSlot;
131 for(pSlot=pHash->aSlot[i]; pSlot; pSlot=pNext){
132 pNext = pSlot->pHashNext;
133 sqlite3_free(pSlot);
136 memset(pHash->aSlot, 0, pHash->nSlot * sizeof(Fts5HashEntry*));
137 pHash->nEntry = 0;
140 static unsigned int fts5HashKey(int nSlot, const u8 *p, int n){
141 int i;
142 unsigned int h = 13;
143 for(i=n-1; i>=0; i--){
144 h = (h << 3) ^ h ^ p[i];
146 return (h % nSlot);
149 static unsigned int fts5HashKey2(int nSlot, u8 b, const u8 *p, int n){
150 int i;
151 unsigned int h = 13;
152 for(i=n-1; i>=0; i--){
153 h = (h << 3) ^ h ^ p[i];
155 h = (h << 3) ^ h ^ b;
156 return (h % nSlot);
160 ** Resize the hash table by doubling the number of slots.
162 static int fts5HashResize(Fts5Hash *pHash){
163 int nNew = pHash->nSlot*2;
164 int i;
165 Fts5HashEntry **apNew;
166 Fts5HashEntry **apOld = pHash->aSlot;
168 apNew = (Fts5HashEntry**)sqlite3_malloc64(nNew*sizeof(Fts5HashEntry*));
169 if( !apNew ) return SQLITE_NOMEM;
170 memset(apNew, 0, nNew*sizeof(Fts5HashEntry*));
172 for(i=0; i<pHash->nSlot; i++){
173 while( apOld[i] ){
174 unsigned int iHash;
175 Fts5HashEntry *p = apOld[i];
176 apOld[i] = p->pHashNext;
177 iHash = fts5HashKey(nNew, (u8*)fts5EntryKey(p),
178 (int)strlen(fts5EntryKey(p)));
179 p->pHashNext = apNew[iHash];
180 apNew[iHash] = p;
184 sqlite3_free(apOld);
185 pHash->nSlot = nNew;
186 pHash->aSlot = apNew;
187 return SQLITE_OK;
190 static int fts5HashAddPoslistSize(
191 Fts5Hash *pHash,
192 Fts5HashEntry *p,
193 Fts5HashEntry *p2
195 int nRet = 0;
196 if( p->iSzPoslist ){
197 u8 *pPtr = p2 ? (u8*)p2 : (u8*)p;
198 int nData = p->nData;
199 if( pHash->eDetail==FTS5_DETAIL_NONE ){
200 assert( nData==p->iSzPoslist );
201 if( p->bDel ){
202 pPtr[nData++] = 0x00;
203 if( p->bContent ){
204 pPtr[nData++] = 0x00;
207 }else{
208 int nSz = (nData - p->iSzPoslist - 1); /* Size in bytes */
209 int nPos = nSz*2 + p->bDel; /* Value of nPos field */
211 assert( p->bDel==0 || p->bDel==1 );
212 if( nPos<=127 ){
213 pPtr[p->iSzPoslist] = (u8)nPos;
214 }else{
215 int nByte = sqlite3Fts5GetVarintLen((u32)nPos);
216 memmove(&pPtr[p->iSzPoslist + nByte], &pPtr[p->iSzPoslist + 1], nSz);
217 sqlite3Fts5PutVarint(&pPtr[p->iSzPoslist], nPos);
218 nData += (nByte-1);
222 nRet = nData - p->nData;
223 if( p2==0 ){
224 p->iSzPoslist = 0;
225 p->bDel = 0;
226 p->bContent = 0;
227 p->nData = nData;
230 return nRet;
234 ** Add an entry to the in-memory hash table. The key is the concatenation
235 ** of bByte and (pToken/nToken). The value is (iRowid/iCol/iPos).
237 ** (bByte || pToken) -> (iRowid,iCol,iPos)
239 ** Or, if iCol is negative, then the value is a delete marker.
241 int sqlite3Fts5HashWrite(
242 Fts5Hash *pHash,
243 i64 iRowid, /* Rowid for this entry */
244 int iCol, /* Column token appears in (-ve -> delete) */
245 int iPos, /* Position of token within column */
246 char bByte, /* First byte of token */
247 const char *pToken, int nToken /* Token to add or remove to or from index */
249 unsigned int iHash;
250 Fts5HashEntry *p;
251 u8 *pPtr;
252 int nIncr = 0; /* Amount to increment (*pHash->pnByte) by */
253 int bNew; /* If non-delete entry should be written */
255 bNew = (pHash->eDetail==FTS5_DETAIL_FULL);
257 /* Attempt to locate an existing hash entry */
258 iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
259 for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
260 char *zKey = fts5EntryKey(p);
261 if( zKey[0]==bByte
262 && p->nKey==nToken
263 && memcmp(&zKey[1], pToken, nToken)==0
265 break;
269 /* If an existing hash entry cannot be found, create a new one. */
270 if( p==0 ){
271 /* Figure out how much space to allocate */
272 char *zKey;
273 sqlite3_int64 nByte = sizeof(Fts5HashEntry) + (nToken+1) + 1 + 64;
274 if( nByte<128 ) nByte = 128;
276 /* Grow the Fts5Hash.aSlot[] array if necessary. */
277 if( (pHash->nEntry*2)>=pHash->nSlot ){
278 int rc = fts5HashResize(pHash);
279 if( rc!=SQLITE_OK ) return rc;
280 iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
283 /* Allocate new Fts5HashEntry and add it to the hash table. */
284 p = (Fts5HashEntry*)sqlite3_malloc64(nByte);
285 if( !p ) return SQLITE_NOMEM;
286 memset(p, 0, sizeof(Fts5HashEntry));
287 p->nAlloc = (int)nByte;
288 zKey = fts5EntryKey(p);
289 zKey[0] = bByte;
290 memcpy(&zKey[1], pToken, nToken);
291 assert( iHash==fts5HashKey(pHash->nSlot, (u8*)zKey, nToken+1) );
292 p->nKey = nToken;
293 zKey[nToken+1] = '\0';
294 p->nData = nToken+1 + 1 + sizeof(Fts5HashEntry);
295 p->pHashNext = pHash->aSlot[iHash];
296 pHash->aSlot[iHash] = p;
297 pHash->nEntry++;
299 /* Add the first rowid field to the hash-entry */
300 p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid);
301 p->iRowid = iRowid;
303 p->iSzPoslist = p->nData;
304 if( pHash->eDetail!=FTS5_DETAIL_NONE ){
305 p->nData += 1;
306 p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
309 }else{
311 /* Appending to an existing hash-entry. Check that there is enough
312 ** space to append the largest possible new entry. Worst case scenario
313 ** is:
315 ** + 9 bytes for a new rowid,
316 ** + 4 byte reserved for the "poslist size" varint.
317 ** + 1 byte for a "new column" byte,
318 ** + 3 bytes for a new column number (16-bit max) as a varint,
319 ** + 5 bytes for the new position offset (32-bit max).
321 if( (p->nAlloc - p->nData) < (9 + 4 + 1 + 3 + 5) ){
322 sqlite3_int64 nNew = p->nAlloc * 2;
323 Fts5HashEntry *pNew;
324 Fts5HashEntry **pp;
325 pNew = (Fts5HashEntry*)sqlite3_realloc64(p, nNew);
326 if( pNew==0 ) return SQLITE_NOMEM;
327 pNew->nAlloc = (int)nNew;
328 for(pp=&pHash->aSlot[iHash]; *pp!=p; pp=&(*pp)->pHashNext);
329 *pp = pNew;
330 p = pNew;
332 nIncr -= p->nData;
334 assert( (p->nAlloc - p->nData) >= (9 + 4 + 1 + 3 + 5) );
336 pPtr = (u8*)p;
338 /* If this is a new rowid, append the 4-byte size field for the previous
339 ** entry, and the new rowid for this entry. */
340 if( iRowid!=p->iRowid ){
341 u64 iDiff = (u64)iRowid - (u64)p->iRowid;
342 fts5HashAddPoslistSize(pHash, p, 0);
343 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iDiff);
344 p->iRowid = iRowid;
345 bNew = 1;
346 p->iSzPoslist = p->nData;
347 if( pHash->eDetail!=FTS5_DETAIL_NONE ){
348 p->nData += 1;
349 p->iCol = (pHash->eDetail==FTS5_DETAIL_FULL ? 0 : -1);
350 p->iPos = 0;
354 if( iCol>=0 ){
355 if( pHash->eDetail==FTS5_DETAIL_NONE ){
356 p->bContent = 1;
357 }else{
358 /* Append a new column value, if necessary */
359 assert_nc( iCol>=p->iCol );
360 if( iCol!=p->iCol ){
361 if( pHash->eDetail==FTS5_DETAIL_FULL ){
362 pPtr[p->nData++] = 0x01;
363 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
364 p->iCol = (i16)iCol;
365 p->iPos = 0;
366 }else{
367 bNew = 1;
368 p->iCol = (i16)(iPos = iCol);
372 /* Append the new position offset, if necessary */
373 if( bNew ){
374 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
375 p->iPos = iPos;
378 }else{
379 /* This is a delete. Set the delete flag. */
380 p->bDel = 1;
383 nIncr += p->nData;
384 *pHash->pnByte += nIncr;
385 return SQLITE_OK;
390 ** Arguments pLeft and pRight point to linked-lists of hash-entry objects,
391 ** each sorted in key order. This function merges the two lists into a
392 ** single list and returns a pointer to its first element.
394 static Fts5HashEntry *fts5HashEntryMerge(
395 Fts5HashEntry *pLeft,
396 Fts5HashEntry *pRight
398 Fts5HashEntry *p1 = pLeft;
399 Fts5HashEntry *p2 = pRight;
400 Fts5HashEntry *pRet = 0;
401 Fts5HashEntry **ppOut = &pRet;
403 while( p1 || p2 ){
404 if( p1==0 ){
405 *ppOut = p2;
406 p2 = 0;
407 }else if( p2==0 ){
408 *ppOut = p1;
409 p1 = 0;
410 }else{
411 int i = 0;
412 char *zKey1 = fts5EntryKey(p1);
413 char *zKey2 = fts5EntryKey(p2);
414 while( zKey1[i]==zKey2[i] ) i++;
416 if( ((u8)zKey1[i])>((u8)zKey2[i]) ){
417 /* p2 is smaller */
418 *ppOut = p2;
419 ppOut = &p2->pScanNext;
420 p2 = p2->pScanNext;
421 }else{
422 /* p1 is smaller */
423 *ppOut = p1;
424 ppOut = &p1->pScanNext;
425 p1 = p1->pScanNext;
427 *ppOut = 0;
431 return pRet;
435 ** Extract all tokens from hash table iHash and link them into a list
436 ** in sorted order. The hash table is cleared before returning. It is
437 ** the responsibility of the caller to free the elements of the returned
438 ** list.
440 static int fts5HashEntrySort(
441 Fts5Hash *pHash,
442 const char *pTerm, int nTerm, /* Query prefix, if any */
443 Fts5HashEntry **ppSorted
445 const int nMergeSlot = 32;
446 Fts5HashEntry **ap;
447 Fts5HashEntry *pList;
448 int iSlot;
449 int i;
451 *ppSorted = 0;
452 ap = sqlite3_malloc64(sizeof(Fts5HashEntry*) * nMergeSlot);
453 if( !ap ) return SQLITE_NOMEM;
454 memset(ap, 0, sizeof(Fts5HashEntry*) * nMergeSlot);
456 for(iSlot=0; iSlot<pHash->nSlot; iSlot++){
457 Fts5HashEntry *pIter;
458 for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){
459 if( pTerm==0
460 || (pIter->nKey+1>=nTerm && 0==memcmp(fts5EntryKey(pIter), pTerm, nTerm))
462 Fts5HashEntry *pEntry = pIter;
463 pEntry->pScanNext = 0;
464 for(i=0; ap[i]; i++){
465 pEntry = fts5HashEntryMerge(pEntry, ap[i]);
466 ap[i] = 0;
468 ap[i] = pEntry;
473 pList = 0;
474 for(i=0; i<nMergeSlot; i++){
475 pList = fts5HashEntryMerge(pList, ap[i]);
478 pHash->nEntry = 0;
479 sqlite3_free(ap);
480 *ppSorted = pList;
481 return SQLITE_OK;
485 ** Query the hash table for a doclist associated with term pTerm/nTerm.
487 int sqlite3Fts5HashQuery(
488 Fts5Hash *pHash, /* Hash table to query */
489 int nPre,
490 const char *pTerm, int nTerm, /* Query term */
491 void **ppOut, /* OUT: Pointer to new object */
492 int *pnDoclist /* OUT: Size of doclist in bytes */
494 unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm);
495 char *zKey = 0;
496 Fts5HashEntry *p;
498 for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
499 zKey = fts5EntryKey(p);
500 assert( p->nKey+1==(int)strlen(zKey) );
501 if( nTerm==p->nKey+1 && memcmp(zKey, pTerm, nTerm)==0 ) break;
504 if( p ){
505 int nHashPre = sizeof(Fts5HashEntry) + nTerm + 1;
506 int nList = p->nData - nHashPre;
507 u8 *pRet = (u8*)(*ppOut = sqlite3_malloc64(nPre + nList + 10));
508 if( pRet ){
509 Fts5HashEntry *pFaux = (Fts5HashEntry*)&pRet[nPre-nHashPre];
510 memcpy(&pRet[nPre], &((u8*)p)[nHashPre], nList);
511 nList += fts5HashAddPoslistSize(pHash, p, pFaux);
512 *pnDoclist = nList;
513 }else{
514 *pnDoclist = 0;
515 return SQLITE_NOMEM;
517 }else{
518 *ppOut = 0;
519 *pnDoclist = 0;
522 return SQLITE_OK;
525 int sqlite3Fts5HashScanInit(
526 Fts5Hash *p, /* Hash table to query */
527 const char *pTerm, int nTerm /* Query prefix */
529 return fts5HashEntrySort(p, pTerm, nTerm, &p->pScan);
532 void sqlite3Fts5HashScanNext(Fts5Hash *p){
533 assert( !sqlite3Fts5HashScanEof(p) );
534 p->pScan = p->pScan->pScanNext;
537 int sqlite3Fts5HashScanEof(Fts5Hash *p){
538 return (p->pScan==0);
541 void sqlite3Fts5HashScanEntry(
542 Fts5Hash *pHash,
543 const char **pzTerm, /* OUT: term (nul-terminated) */
544 const u8 **ppDoclist, /* OUT: pointer to doclist */
545 int *pnDoclist /* OUT: size of doclist in bytes */
547 Fts5HashEntry *p;
548 if( (p = pHash->pScan) ){
549 char *zKey = fts5EntryKey(p);
550 int nTerm = (int)strlen(zKey);
551 fts5HashAddPoslistSize(pHash, p, 0);
552 *pzTerm = zKey;
553 *ppDoclist = (const u8*)&zKey[nTerm+1];
554 *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm + 1);
555 }else{
556 *pzTerm = 0;
557 *ppDoclist = 0;
558 *pnDoclist = 0;