2 ** This C program extracts all "words" from an input document and adds them
3 ** to an SQLite database. A "word" is any contiguous sequence of alphabetic
4 ** characters. All digits, punctuation, and whitespace characters are
5 ** word separators. The database stores a single entry for each distinct
6 ** word together with a count of the number of occurrences of that word.
7 ** A fresh database is created automatically on each run.
9 ** wordcount DATABASE INPUTFILE
11 ** The INPUTFILE name can be omitted, in which case input it taken from
16 ** --without-rowid Use a WITHOUT ROWID table to store the words.
17 ** --insert Use INSERT mode (the default)
18 ** --replace Use REPLACE mode
19 ** --select Use SELECT mode
20 ** --update Use UPDATE mode
21 ** --delete Use DELETE mode
22 ** --query Use QUERY mode
23 ** --nocase Add the NOCASE collating sequence to the words.
24 ** --trace Enable sqlite3_trace() output.
25 ** --summary Show summary information on the collected data.
26 ** --stats Show sqlite3_status() results at the end.
27 ** --pagesize NNN Use a page size of NNN
28 ** --cachesize NNN Use a cache size of NNN
29 ** --commit NNN Commit after every NNN operations
30 ** --nosync Use PRAGMA synchronous=OFF
31 ** --journal MMMM Use PRAGMA journal_mode=MMMM
32 ** --timer Time the operation of this program
37 ** (1) INSERT OR IGNORE INTO wordcount VALUES($new,1)
38 ** (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new -- if (1) is a noop
41 ** (1) INSERT OR IGNORE INTO wordcount VALUES($new,0)
42 ** (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new
44 ** Replace mode means:
45 ** (1) REPLACE INTO wordcount
46 ** VALUES($new,ifnull((SELECT cnt FROM wordcount WHERE word=$new),0)+1);
49 ** (1) SELECT 1 FROM wordcount WHERE word=$new
50 ** (2) INSERT INTO wordcount VALUES($new,1) -- if (1) returns nothing
51 ** (3) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new --if (1) return TRUE
54 ** (1) DELETE FROM wordcount WHERE word=$new
57 ** (1) SELECT cnt FROM wordcount WHERE word=$new
59 ** Note that delete mode and query mode are only useful for preexisting
60 ** databases. The wordcount table is created using IF NOT EXISTS so this
61 ** utility can be run multiple times on the same database file. The
62 ** --without-rowid, --nocase, and --pagesize parameters are only effective
63 ** when creating a new database and are harmless no-ops on preexisting
66 ******************************************************************************
68 ** Compile as follows:
70 ** gcc -I. wordcount.c sqlite3.c -ldl -lpthreads
74 ** gcc -I. -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
75 ** wordcount.c sqlite3.c
84 /* Return the current wall-clock time */
85 static sqlite3_int64
realTime(void){
86 static sqlite3_vfs
*clockVfs
= 0;
88 if( clockVfs
==0 ) clockVfs
= sqlite3_vfs_find(0);
89 if( clockVfs
->iVersion
>=1 && clockVfs
->xCurrentTimeInt64
!=0 ){
90 clockVfs
->xCurrentTimeInt64(clockVfs
, &t
);
93 clockVfs
->xCurrentTime(clockVfs
, &r
);
94 t
= (sqlite3_int64
)(r
*86400000.0);
99 /* Print an error message and exit */
100 static void fatal_error(const char *zMsg
, ...){
103 vfprintf(stderr
, zMsg
, ap
);
108 /* The sqlite3_trace() callback function */
109 static void traceCallback(void *NotUsed
, const char *zSql
){
110 printf("%s;\n", zSql
);
113 /* An sqlite3_exec() callback that prints results on standard output,
114 ** each column separated by a single space. */
115 static int printResult(void *NotUsed
, int nArg
, char **azArg
, char **azNm
){
118 for(i
=0; i
<nArg
; i
++){
119 printf(" %s", azArg
[i
] ? azArg
[i
] : "(null)");
127 ** Add one character to a hash
129 static void addCharToHash(unsigned int *a
, unsigned char x
){
131 a
[1] = (a
[1]<<8) | x
;
134 a
[2] = (a
[2]<<8) | x
;
139 a
[0] = a
[1] = a
[2] = 0;
145 ** Compute the final hash value.
147 static void finalHash(unsigned int *a
, char *z
){
148 a
[3] += a
[1] + a
[4] + a
[0];
150 sqlite3_snprintf(17, z
, "%08x%08x", a
[3], a
[4]);
155 ** Implementation of a checksum() aggregate SQL function
157 static void checksumStep(
158 sqlite3_context
*context
,
162 const unsigned char *zVal
;
165 a
= (unsigned*)sqlite3_aggregate_context(context
, sizeof(unsigned int)*5);
168 for(i
=0; i
<argc
; i
++){
169 nVal
= sqlite3_value_bytes(argv
[i
]);
170 zVal
= (const unsigned char*)sqlite3_value_text(argv
[i
]);
171 if( zVal
) for(j
=0; j
<nVal
; j
++) addCharToHash(a
, zVal
[j
]);
172 addCharToHash(a
, '|');
174 addCharToHash(a
, '\n');
177 static void checksumFinalize(sqlite3_context
*context
){
180 a
= sqlite3_aggregate_context(context
, 0);
182 finalHash(a
, zResult
);
183 sqlite3_result_text(context
, zResult
, -1, SQLITE_TRANSIENT
);
188 /* Define operating modes */
189 #define MODE_INSERT 0
190 #define MODE_REPLACE 1
191 #define MODE_SELECT 2
192 #define MODE_UPDATE 3
193 #define MODE_DELETE 4
196 int main(int argc
, char **argv
){
197 const char *zFileToRead
= 0; /* Input file. NULL for stdin */
198 const char *zDbName
= 0; /* Name of the database file to create */
199 int useWithoutRowid
= 0; /* True for --without-rowid */
200 int iMode
= MODE_INSERT
; /* One of MODE_xxxxx */
201 int useNocase
= 0; /* True for --nocase */
202 int doTrace
= 0; /* True for --trace */
203 int showStats
= 0; /* True for --stats */
204 int showSummary
= 0; /* True for --summary */
205 int showTimer
= 0; /* True for --timer */
206 int cacheSize
= 0; /* Desired cache size. 0 means default */
207 int pageSize
= 0; /* Desired page size. 0 means default */
208 int commitInterval
= 0; /* How often to commit. 0 means never */
209 int noSync
= 0; /* True for --nosync */
210 const char *zJMode
= 0; /* Journal mode */
211 int nOp
= 0; /* Operation counter */
212 int i
, j
; /* Loop counters */
213 sqlite3
*db
; /* The SQLite database connection */
214 char *zSql
; /* Constructed SQL statement */
215 sqlite3_stmt
*pInsert
= 0; /* The INSERT statement */
216 sqlite3_stmt
*pUpdate
= 0; /* The UPDATE statement */
217 sqlite3_stmt
*pSelect
= 0; /* The SELECT statement */
218 sqlite3_stmt
*pDelete
= 0; /* The DELETE statement */
219 FILE *in
; /* The open input file */
220 int rc
; /* Return code from an SQLite interface */
221 int iCur
, iHiwtr
; /* Statistics values, current and "highwater" */
222 sqlite3_int64 sumCnt
= 0; /* Sum in QUERY mode */
223 sqlite3_int64 startTime
;
224 char zInput
[2000]; /* A single line of input */
226 /* Process command-line arguments */
227 for(i
=1; i
<argc
; i
++){
228 const char *z
= argv
[i
];
230 do{ z
++; }while( z
[0]=='-' );
231 if( strcmp(z
,"without-rowid")==0 ){
233 }else if( strcmp(z
,"replace")==0 ){
234 iMode
= MODE_REPLACE
;
235 }else if( strcmp(z
,"select")==0 ){
237 }else if( strcmp(z
,"insert")==0 ){
239 }else if( strcmp(z
,"update")==0 ){
241 }else if( strcmp(z
,"delete")==0 ){
243 }else if( strcmp(z
,"query")==0 ){
245 }else if( strcmp(z
,"nocase")==0 ){
247 }else if( strcmp(z
,"trace")==0 ){
249 }else if( strcmp(z
,"nosync")==0 ){
251 }else if( strcmp(z
,"stats")==0 ){
253 }else if( strcmp(z
,"summary")==0 ){
255 }else if( strcmp(z
,"timer")==0 ){
257 }else if( strcmp(z
,"cachesize")==0 && i
<argc
-1 ){
259 cacheSize
= atoi(argv
[i
]);
260 }else if( strcmp(z
,"pagesize")==0 && i
<argc
-1 ){
262 pageSize
= atoi(argv
[i
]);
263 }else if( strcmp(z
,"commit")==0 && i
<argc
-1 ){
265 commitInterval
= atoi(argv
[i
]);
266 }else if( strcmp(z
,"journal")==0 && i
<argc
-1 ){
269 fatal_error("unknown option: %s\n", argv
[i
]);
271 }else if( zDbName
==0 ){
273 }else if( zFileToRead
==0 ){
274 zFileToRead
= argv
[i
];
276 fatal_error("surplus argument: %s\n", argv
[i
]);
280 fatal_error("Usage: %s [--options] DATABASE [INPUTFILE]\n", argv
[0]);
282 startTime
= realTime();
284 /* Open the database and the input file */
285 if( sqlite3_open(zDbName
, &db
) ){
286 fatal_error("Cannot open database file: %s\n", zDbName
);
289 in
= fopen(zFileToRead
, "rb");
291 fatal_error("Could not open input file \"%s\"\n", zFileToRead
);
297 /* Set database connection options */
298 if( doTrace
) sqlite3_trace(db
, traceCallback
, 0);
300 zSql
= sqlite3_mprintf("PRAGMA page_size=%d", pageSize
);
301 sqlite3_exec(db
, zSql
, 0, 0, 0);
305 zSql
= sqlite3_mprintf("PRAGMA cache_size=%d", cacheSize
);
306 sqlite3_exec(db
, zSql
, 0, 0, 0);
309 if( noSync
) sqlite3_exec(db
, "PRAGMA synchronous=OFF", 0, 0, 0);
311 zSql
= sqlite3_mprintf("PRAGMA journal_mode=%s", zJMode
);
312 sqlite3_exec(db
, zSql
, 0, 0, 0);
317 /* Construct the "wordcount" table into which to put the words */
318 if( sqlite3_exec(db
, "BEGIN IMMEDIATE", 0, 0, 0) ){
319 fatal_error("Could not start a transaction\n");
321 zSql
= sqlite3_mprintf(
322 "CREATE TABLE IF NOT EXISTS wordcount(\n"
323 " word TEXT PRIMARY KEY COLLATE %s,\n"
326 useNocase
? "nocase" : "binary",
327 useWithoutRowid
? " WITHOUT ROWID" : ""
329 if( zSql
==0 ) fatal_error("out of memory\n");
330 rc
= sqlite3_exec(db
, zSql
, 0, 0, 0);
331 if( rc
) fatal_error("Could not create the wordcount table: %s.\n",
335 /* Prepare SQL statements that will be needed */
336 if( iMode
==MODE_QUERY
){
337 rc
= sqlite3_prepare_v2(db
,
338 "SELECT cnt FROM wordcount WHERE word=?1",
340 if( rc
) fatal_error("Could not prepare the SELECT statement: %s\n",
343 if( iMode
==MODE_SELECT
){
344 rc
= sqlite3_prepare_v2(db
,
345 "SELECT 1 FROM wordcount WHERE word=?1",
347 if( rc
) fatal_error("Could not prepare the SELECT statement: %s\n",
349 rc
= sqlite3_prepare_v2(db
,
350 "INSERT INTO wordcount(word,cnt) VALUES(?1,1)",
352 if( rc
) fatal_error("Could not prepare the INSERT statement: %s\n",
355 if( iMode
==MODE_SELECT
|| iMode
==MODE_UPDATE
|| iMode
==MODE_INSERT
){
356 rc
= sqlite3_prepare_v2(db
,
357 "UPDATE wordcount SET cnt=cnt+1 WHERE word=?1",
359 if( rc
) fatal_error("Could not prepare the UPDATE statement: %s\n",
362 if( iMode
==MODE_INSERT
){
363 rc
= sqlite3_prepare_v2(db
,
364 "INSERT OR IGNORE INTO wordcount(word,cnt) VALUES(?1,1)",
366 if( rc
) fatal_error("Could not prepare the INSERT statement: %s\n",
369 if( iMode
==MODE_UPDATE
){
370 rc
= sqlite3_prepare_v2(db
,
371 "INSERT OR IGNORE INTO wordcount(word,cnt) VALUES(?1,0)",
373 if( rc
) fatal_error("Could not prepare the INSERT statement: %s\n",
376 if( iMode
==MODE_REPLACE
){
377 rc
= sqlite3_prepare_v2(db
,
378 "REPLACE INTO wordcount(word,cnt)"
379 "VALUES(?1,coalesce((SELECT cnt FROM wordcount WHERE word=?1),0)+1)",
381 if( rc
) fatal_error("Could not prepare the REPLACE statement: %s\n",
384 if( iMode
==MODE_DELETE
){
385 rc
= sqlite3_prepare_v2(db
,
386 "DELETE FROM wordcount WHERE word=?1",
388 if( rc
) fatal_error("Could not prepare the DELETE statement: %s\n",
392 /* Process the input file */
393 while( fgets(zInput
, sizeof(zInput
), in
) ){
394 for(i
=0; zInput
[i
]; i
++){
395 if( !isalpha(zInput
[i
]) ) continue;
396 for(j
=i
+1; isalpha(zInput
[j
]); j
++){}
398 /* Found a new word at zInput[i] that is j-i bytes long.
399 ** Process it into the wordcount table. */
400 if( iMode
==MODE_DELETE
){
401 sqlite3_bind_text(pDelete
, 1, zInput
+i
, j
-i
, SQLITE_STATIC
);
402 if( sqlite3_step(pDelete
)!=SQLITE_DONE
){
403 fatal_error("DELETE failed: %s\n", sqlite3_errmsg(db
));
405 sqlite3_reset(pDelete
);
406 }else if( iMode
==MODE_SELECT
){
407 sqlite3_bind_text(pSelect
, 1, zInput
+i
, j
-i
, SQLITE_STATIC
);
408 rc
= sqlite3_step(pSelect
);
409 sqlite3_reset(pSelect
);
410 if( rc
==SQLITE_ROW
){
411 sqlite3_bind_text(pUpdate
, 1, zInput
+i
, j
-i
, SQLITE_STATIC
);
412 if( sqlite3_step(pUpdate
)!=SQLITE_DONE
){
413 fatal_error("UPDATE failed: %s\n", sqlite3_errmsg(db
));
415 sqlite3_reset(pUpdate
);
416 }else if( rc
==SQLITE_DONE
){
417 sqlite3_bind_text(pInsert
, 1, zInput
+i
, j
-i
, SQLITE_STATIC
);
418 if( sqlite3_step(pInsert
)!=SQLITE_DONE
){
419 fatal_error("Insert failed: %s\n", sqlite3_errmsg(db
));
421 sqlite3_reset(pInsert
);
423 fatal_error("SELECT failed: %s\n", sqlite3_errmsg(db
));
425 }else if( iMode
==MODE_QUERY
){
426 sqlite3_bind_text(pSelect
, 1, zInput
+i
, j
-i
, SQLITE_STATIC
);
427 if( sqlite3_step(pSelect
)==SQLITE_ROW
){
428 sumCnt
+= sqlite3_column_int64(pSelect
, 0);
430 sqlite3_reset(pSelect
);
432 sqlite3_bind_text(pInsert
, 1, zInput
+i
, j
-i
, SQLITE_STATIC
);
433 if( sqlite3_step(pInsert
)!=SQLITE_DONE
){
434 fatal_error("INSERT failed: %s\n", sqlite3_errmsg(db
));
436 sqlite3_reset(pInsert
);
437 if( iMode
==MODE_UPDATE
438 || (iMode
==MODE_INSERT
&& sqlite3_changes(db
)==0)
440 sqlite3_bind_text(pUpdate
, 1, zInput
+i
, j
-i
, SQLITE_STATIC
);
441 if( sqlite3_step(pUpdate
)!=SQLITE_DONE
){
442 fatal_error("UPDATE failed: %s\n", sqlite3_errmsg(db
));
444 sqlite3_reset(pUpdate
);
449 /* Increment the operation counter. Do a COMMIT if it is time. */
451 if( commitInterval
>0 && (nOp
%commitInterval
)==0 ){
452 sqlite3_exec(db
, "COMMIT; BEGIN IMMEDIATE", 0, 0, 0);
456 sqlite3_exec(db
, "COMMIT", 0, 0, 0);
457 if( zFileToRead
) fclose(in
);
458 sqlite3_finalize(pInsert
);
459 sqlite3_finalize(pUpdate
);
460 sqlite3_finalize(pSelect
);
461 sqlite3_finalize(pDelete
);
463 if( iMode
==MODE_QUERY
){
464 printf("sum of cnt: %lld\n", sumCnt
);
465 rc
= sqlite3_prepare_v2(db
,"SELECT sum(cnt*cnt) FROM wordcount", -1,
467 if( rc
==SQLITE_OK
&& sqlite3_step(pSelect
)==SQLITE_ROW
){
468 printf("double-check: %lld\n", sqlite3_column_int64(pSelect
, 0));
470 sqlite3_finalize(pSelect
);
475 sqlite3_int64 elapseTime
= realTime() - startTime
;
476 fprintf(stderr
, "%3d.%03d wordcount", (int)(elapseTime
/1000),
477 (int)(elapseTime
%1000));
478 for(i
=1; i
<argc
; i
++) if( i
!=showTimer
) fprintf(stderr
, " %s", argv
[i
]);
479 fprintf(stderr
, "\n");
483 sqlite3_create_function(db
, "checksum", -1, SQLITE_UTF8
, 0,
484 0, checksumStep
, checksumFinalize
);
486 "SELECT 'count(*): ', count(*) FROM wordcount;\n"
487 "SELECT 'sum(cnt): ', sum(cnt) FROM wordcount;\n"
488 "SELECT 'max(cnt): ', max(cnt) FROM wordcount;\n"
489 "SELECT 'avg(cnt): ', avg(cnt) FROM wordcount;\n"
490 "SELECT 'sum(cnt=1):', sum(cnt=1) FROM wordcount;\n"
491 "SELECT 'top 10: ', group_concat(word, ', ') FROM "
492 "(SELECT word FROM wordcount ORDER BY cnt DESC, word LIMIT 10);\n"
493 "SELECT 'checksum: ', checksum(word, cnt) FROM "
494 "(SELECT word, cnt FROM wordcount ORDER BY word);\n"
495 "PRAGMA integrity_check;\n",
499 /* Database connection statistics printed after both prepared statements
500 ** have been finalized */
502 sqlite3_db_status(db
, SQLITE_DBSTATUS_LOOKASIDE_USED
, &iCur
, &iHiwtr
, 0);
503 printf("-- Lookaside Slots Used: %d (max %d)\n", iCur
,iHiwtr
);
504 sqlite3_db_status(db
, SQLITE_DBSTATUS_LOOKASIDE_HIT
, &iCur
, &iHiwtr
, 0);
505 printf("-- Successful lookasides: %d\n", iHiwtr
);
506 sqlite3_db_status(db
, SQLITE_DBSTATUS_LOOKASIDE_MISS_SIZE
, &iCur
,&iHiwtr
,0);
507 printf("-- Lookaside size faults: %d\n", iHiwtr
);
508 sqlite3_db_status(db
, SQLITE_DBSTATUS_LOOKASIDE_MISS_FULL
, &iCur
,&iHiwtr
,0);
509 printf("-- Lookaside OOM faults: %d\n", iHiwtr
);
510 sqlite3_db_status(db
, SQLITE_DBSTATUS_CACHE_USED
, &iCur
, &iHiwtr
, 0);
511 printf("-- Pager Heap Usage: %d bytes\n", iCur
);
512 sqlite3_db_status(db
, SQLITE_DBSTATUS_CACHE_HIT
, &iCur
, &iHiwtr
, 1);
513 printf("-- Page cache hits: %d\n", iCur
);
514 sqlite3_db_status(db
, SQLITE_DBSTATUS_CACHE_MISS
, &iCur
, &iHiwtr
, 1);
515 printf("-- Page cache misses: %d\n", iCur
);
516 sqlite3_db_status(db
, SQLITE_DBSTATUS_CACHE_WRITE
, &iCur
, &iHiwtr
, 1);
517 printf("-- Page cache writes: %d\n", iCur
);
518 sqlite3_db_status(db
, SQLITE_DBSTATUS_SCHEMA_USED
, &iCur
, &iHiwtr
, 0);
519 printf("-- Schema Heap Usage: %d bytes\n", iCur
);
520 sqlite3_db_status(db
, SQLITE_DBSTATUS_STMT_USED
, &iCur
, &iHiwtr
, 0);
521 printf("-- Statement Heap Usage: %d bytes\n", iCur
);
526 /* Global memory usage statistics printed after the database connection
527 ** has closed. Memory usage should be zero at this point. */
529 sqlite3_status(SQLITE_STATUS_MEMORY_USED
, &iCur
, &iHiwtr
, 0);
530 printf("-- Memory Used (bytes): %d (max %d)\n", iCur
,iHiwtr
);
531 sqlite3_status(SQLITE_STATUS_MALLOC_COUNT
, &iCur
, &iHiwtr
, 0);
532 printf("-- Outstanding Allocations: %d (max %d)\n", iCur
,iHiwtr
);
533 sqlite3_status(SQLITE_STATUS_PAGECACHE_OVERFLOW
, &iCur
, &iHiwtr
, 0);
534 printf("-- Pcache Overflow Bytes: %d (max %d)\n", iCur
,iHiwtr
);
535 sqlite3_status(SQLITE_STATUS_SCRATCH_OVERFLOW
, &iCur
, &iHiwtr
, 0);
536 printf("-- Scratch Overflow Bytes: %d (max %d)\n", iCur
,iHiwtr
);
537 sqlite3_status(SQLITE_STATUS_MALLOC_SIZE
, &iCur
, &iHiwtr
, 0);
538 printf("-- Largest Allocation: %d bytes\n",iHiwtr
);
539 sqlite3_status(SQLITE_STATUS_PAGECACHE_SIZE
, &iCur
, &iHiwtr
, 0);
540 printf("-- Largest Pcache Allocation: %d bytes\n",iHiwtr
);
541 sqlite3_status(SQLITE_STATUS_SCRATCH_SIZE
, &iCur
, &iHiwtr
, 0);
542 printf("-- Largest Scratch Allocation: %d bytes\n", iHiwtr
);