Fixes default log output to console for macOS
[sqlcipher.git] / ext / misc / normalize.c
blob08d7733b96090aab0c3d871cf36d7312a6bce2e9
1 /*
2 ** 2018-01-08
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
13 ** This file contains code to implement the sqlite3_normalize() function.
15 ** char *sqlite3_normalize(const char *zSql);
17 ** This function takes an SQL string as input and returns a "normalized"
18 ** version of that string in memory obtained from sqlite3_malloc64(). The
19 ** caller is responsible for ensuring that the returned memory is freed.
21 ** If a memory allocation error occurs, this routine returns NULL.
23 ** The normalization consists of the following transformations:
25 ** (1) Convert every literal (string, blob literal, numeric constant,
26 ** or "NULL" constant) into a ?
28 ** (2) Remove all superfluous whitespace, including comments. Change
29 ** all required whitespace to a single space character.
31 ** (3) Lowercase all ASCII characters.
33 ** (4) If an IN or NOT IN operator is followed by a list of 1 or more
34 ** values, convert that list into "(?,?,?)".
36 ** The purpose of normalization is two-fold:
38 ** (1) Sanitize queries by removing potentially private or sensitive
39 ** information contained in literals.
41 ** (2) Identify structurally identical queries by comparing their
42 ** normalized forms.
44 ** Command-Line Utility
45 ** --------------------
47 ** This file also contains code for a command-line utility that converts
48 ** SQL queries in text files into their normalized forms. To build the
49 ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
50 ** and link it against the SQLite library.
52 #include <sqlite3.h>
53 #include <string.h>
56 ** Implementation note:
58 ** Much of the tokenizer logic is copied out of the tokenize.c source file
59 ** of SQLite. That logic could be simplified for this particular application,
60 ** but that would impose a risk of introducing subtle errors. It is best to
61 ** keep the code as close to the original as possible.
63 ** The tokenize code is in sync with the SQLite core as of 2018-01-08.
64 ** Any future changes to the core tokenizer might require corresponding
65 ** adjustments to the tokenizer logic in this module.
69 /* Character classes for tokenizing
71 ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
72 ** using a lookup table, whereas a switch() directly on c uses a binary search.
73 ** The lookup table is much faster. To maximize speed, and to ensure that
74 ** a lookup table is used, all of the classes need to be small integers and
75 ** all of them need to be used within the switch.
77 #define CC_X 0 /* The letter 'x', or start of BLOB literal */
78 #define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */
79 #define CC_ID 2 /* unicode characters usable in IDs */
80 #define CC_DIGIT 3 /* Digits */
81 #define CC_DOLLAR 4 /* '$' */
82 #define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */
83 #define CC_VARNUM 6 /* '?'. Numeric SQL variables */
84 #define CC_SPACE 7 /* Space characters */
85 #define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */
86 #define CC_QUOTE2 9 /* '['. [...] style quoted ids */
87 #define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */
88 #define CC_MINUS 11 /* '-'. Minus or SQL-style comment */
89 #define CC_LT 12 /* '<'. Part of < or <= or <> */
90 #define CC_GT 13 /* '>'. Part of > or >= */
91 #define CC_EQ 14 /* '='. Part of = or == */
92 #define CC_BANG 15 /* '!'. Part of != */
93 #define CC_SLASH 16 /* '/'. / or c-style comment */
94 #define CC_LP 17 /* '(' */
95 #define CC_RP 18 /* ')' */
96 #define CC_SEMI 19 /* ';' */
97 #define CC_PLUS 20 /* '+' */
98 #define CC_STAR 21 /* '*' */
99 #define CC_PERCENT 22 /* '%' */
100 #define CC_COMMA 23 /* ',' */
101 #define CC_AND 24 /* '&' */
102 #define CC_TILDA 25 /* '~' */
103 #define CC_DOT 26 /* '.' */
104 #define CC_ILLEGAL 27 /* Illegal character */
106 static const unsigned char aiClass[] = {
107 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
108 /* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27,
109 /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
110 /* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16,
111 /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6,
112 /* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1,
114 /* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
115 /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27,
116 /* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
117 /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
118 /* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
119 /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
120 /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
121 /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
122 /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
123 /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
126 /* An array to map all upper-case characters into their corresponding
127 ** lower-case character.
129 ** SQLite only considers US-ASCII (or EBCDIC) characters. We do not
130 ** handle case conversions for the UTF character set since the tables
131 ** involved are nearly as big or bigger than SQLite itself.
133 static const unsigned char sqlite3UpperToLower[] = {
134 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
135 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
136 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
137 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
138 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
139 122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
140 108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
141 126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
142 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
143 162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
144 180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
145 198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
146 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
147 234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
148 252,253,254,255
152 ** The following 256 byte lookup table is used to support SQLites built-in
153 ** equivalents to the following standard library functions:
155 ** isspace() 0x01
156 ** isalpha() 0x02
157 ** isdigit() 0x04
158 ** isalnum() 0x06
159 ** isxdigit() 0x08
160 ** toupper() 0x20
161 ** SQLite identifier character 0x40
162 ** Quote character 0x80
164 ** Bit 0x20 is set if the mapped character requires translation to upper
165 ** case. i.e. if the character is a lower-case ASCII character.
166 ** If x is a lower-case ASCII character, then its upper-case equivalent
167 ** is (x - 0x20). Therefore toupper() can be implemented as:
169 ** (x & ~(map[x]&0x20))
171 ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
172 ** array. tolower() is used more often than toupper() by SQLite.
174 ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
175 ** SQLite identifier. Identifiers are alphanumerics, "_", "$", and any
176 ** non-ASCII UTF character. Hence the test for whether or not a character is
177 ** part of an identifier is 0x46.
179 static const unsigned char sqlite3CtypeMap[256] = {
180 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */
181 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */
182 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */
183 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */
184 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */
185 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */
186 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */
187 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */
189 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */
190 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */
191 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */
192 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */
193 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */
194 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */
195 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */
196 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */
198 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */
199 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */
200 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */
201 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */
202 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */
203 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */
204 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */
205 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ */
207 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */
208 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */
209 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */
210 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */
211 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */
212 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */
213 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */
214 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */
216 #define sqlite3Toupper(x) ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
217 #define sqlite3Isspace(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
218 #define sqlite3Isalnum(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
219 #define sqlite3Isalpha(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
220 #define sqlite3Isdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
221 #define sqlite3Isxdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
222 #define sqlite3Tolower(x) (sqlite3UpperToLower[(unsigned char)(x)])
223 #define sqlite3Isquote(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
227 ** If X is a character that can be used in an identifier then
228 ** IdChar(X) will be true. Otherwise it is false.
230 ** For ASCII, any character with the high-order bit set is
231 ** allowed in an identifier. For 7-bit characters,
232 ** sqlite3IsIdChar[X] must be 1.
234 ** For EBCDIC, the rules are more complex but have the same
235 ** end result.
237 ** Ticket #1066. the SQL standard does not allow '$' in the
238 ** middle of identifiers. But many SQL implementations do.
239 ** SQLite will allow '$' in identifiers for compatibility.
240 ** But the feature is undocumented.
242 #define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
245 ** Ignore testcase() macros
247 #define testcase(X)
250 ** Token values
252 #define TK_SPACE 0
253 #define TK_NAME 1
254 #define TK_LITERAL 2
255 #define TK_PUNCT 3
256 #define TK_ERROR 4
258 #define TK_MINUS TK_PUNCT
259 #define TK_LP TK_PUNCT
260 #define TK_RP TK_PUNCT
261 #define TK_SEMI TK_PUNCT
262 #define TK_PLUS TK_PUNCT
263 #define TK_STAR TK_PUNCT
264 #define TK_SLASH TK_PUNCT
265 #define TK_REM TK_PUNCT
266 #define TK_EQ TK_PUNCT
267 #define TK_LE TK_PUNCT
268 #define TK_NE TK_PUNCT
269 #define TK_LSHIFT TK_PUNCT
270 #define TK_LT TK_PUNCT
271 #define TK_GE TK_PUNCT
272 #define TK_RSHIFT TK_PUNCT
273 #define TK_GT TK_PUNCT
274 #define TK_GE TK_PUNCT
275 #define TK_BITOR TK_PUNCT
276 #define TK_CONCAT TK_PUNCT
277 #define TK_COMMA TK_PUNCT
278 #define TK_BITAND TK_PUNCT
279 #define TK_BITNOT TK_PUNCT
280 #define TK_STRING TK_LITERAL
281 #define TK_ID TK_NAME
282 #define TK_ILLEGAL TK_ERROR
283 #define TK_DOT TK_PUNCT
284 #define TK_INTEGER TK_LITERAL
285 #define TK_FLOAT TK_LITERAL
286 #define TK_VARIABLE TK_LITERAL
287 #define TK_BLOB TK_LITERAL
289 /* Disable nuisence warnings about case fall-through */
290 #if !defined(deliberate_fall_through) && defined(__GCC__) && __GCC__>=7
291 # define deliberate_fall_through __attribute__((fallthrough));
292 #else
293 # define deliberate_fall_through
294 #endif
297 ** Return the length (in bytes) of the token that begins at z[0].
298 ** Store the token type in *tokenType before returning.
300 static int sqlite3GetToken(const unsigned char *z, int *tokenType){
301 int i, c;
302 switch( aiClass[*z] ){ /* Switch on the character-class of the first byte
303 ** of the token. See the comment on the CC_ defines
304 ** above. */
305 case CC_SPACE: {
306 for(i=1; sqlite3Isspace(z[i]); i++){}
307 *tokenType = TK_SPACE;
308 return i;
310 case CC_MINUS: {
311 if( z[1]=='-' ){
312 for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
313 *tokenType = TK_SPACE;
314 return i;
316 *tokenType = TK_MINUS;
317 return 1;
319 case CC_LP: {
320 *tokenType = TK_LP;
321 return 1;
323 case CC_RP: {
324 *tokenType = TK_RP;
325 return 1;
327 case CC_SEMI: {
328 *tokenType = TK_SEMI;
329 return 1;
331 case CC_PLUS: {
332 *tokenType = TK_PLUS;
333 return 1;
335 case CC_STAR: {
336 *tokenType = TK_STAR;
337 return 1;
339 case CC_SLASH: {
340 if( z[1]!='*' || z[2]==0 ){
341 *tokenType = TK_SLASH;
342 return 1;
344 for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
345 if( c ) i++;
346 *tokenType = TK_SPACE;
347 return i;
349 case CC_PERCENT: {
350 *tokenType = TK_REM;
351 return 1;
353 case CC_EQ: {
354 *tokenType = TK_EQ;
355 return 1 + (z[1]=='=');
357 case CC_LT: {
358 if( (c=z[1])=='=' ){
359 *tokenType = TK_LE;
360 return 2;
361 }else if( c=='>' ){
362 *tokenType = TK_NE;
363 return 2;
364 }else if( c=='<' ){
365 *tokenType = TK_LSHIFT;
366 return 2;
367 }else{
368 *tokenType = TK_LT;
369 return 1;
372 case CC_GT: {
373 if( (c=z[1])=='=' ){
374 *tokenType = TK_GE;
375 return 2;
376 }else if( c=='>' ){
377 *tokenType = TK_RSHIFT;
378 return 2;
379 }else{
380 *tokenType = TK_GT;
381 return 1;
384 case CC_BANG: {
385 if( z[1]!='=' ){
386 *tokenType = TK_ILLEGAL;
387 return 1;
388 }else{
389 *tokenType = TK_NE;
390 return 2;
393 case CC_PIPE: {
394 if( z[1]!='|' ){
395 *tokenType = TK_BITOR;
396 return 1;
397 }else{
398 *tokenType = TK_CONCAT;
399 return 2;
402 case CC_COMMA: {
403 *tokenType = TK_COMMA;
404 return 1;
406 case CC_AND: {
407 *tokenType = TK_BITAND;
408 return 1;
410 case CC_TILDA: {
411 *tokenType = TK_BITNOT;
412 return 1;
414 case CC_QUOTE: {
415 int delim = z[0];
416 testcase( delim=='`' );
417 testcase( delim=='\'' );
418 testcase( delim=='"' );
419 for(i=1; (c=z[i])!=0; i++){
420 if( c==delim ){
421 if( z[i+1]==delim ){
422 i++;
423 }else{
424 break;
428 if( c=='\'' ){
429 *tokenType = TK_STRING;
430 return i+1;
431 }else if( c!=0 ){
432 *tokenType = TK_ID;
433 return i+1;
434 }else{
435 *tokenType = TK_ILLEGAL;
436 return i;
439 case CC_DOT: {
440 if( !sqlite3Isdigit(z[1]) ){
441 *tokenType = TK_DOT;
442 return 1;
444 /* If the next character is a digit, this is a floating point
445 ** number that begins with ".". Fall thru into the next case */
446 /* no break */ deliberate_fall_through
448 case CC_DIGIT: {
449 *tokenType = TK_INTEGER;
450 if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
451 for(i=3; sqlite3Isxdigit(z[i]); i++){}
452 return i;
454 for(i=0; sqlite3Isdigit(z[i]); i++){}
455 if( z[i]=='.' ){
456 i++;
457 while( sqlite3Isdigit(z[i]) ){ i++; }
458 *tokenType = TK_FLOAT;
460 if( (z[i]=='e' || z[i]=='E') &&
461 ( sqlite3Isdigit(z[i+1])
462 || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
465 i += 2;
466 while( sqlite3Isdigit(z[i]) ){ i++; }
467 *tokenType = TK_FLOAT;
469 while( IdChar(z[i]) ){
470 *tokenType = TK_ILLEGAL;
471 i++;
473 return i;
475 case CC_QUOTE2: {
476 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
477 *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
478 return i;
480 case CC_VARNUM: {
481 *tokenType = TK_VARIABLE;
482 for(i=1; sqlite3Isdigit(z[i]); i++){}
483 return i;
485 case CC_DOLLAR:
486 case CC_VARALPHA: {
487 int n = 0;
488 testcase( z[0]=='$' ); testcase( z[0]=='@' );
489 testcase( z[0]==':' ); testcase( z[0]=='#' );
490 *tokenType = TK_VARIABLE;
491 for(i=1; (c=z[i])!=0; i++){
492 if( IdChar(c) ){
493 n++;
494 }else if( c=='(' && n>0 ){
496 i++;
497 }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
498 if( c==')' ){
499 i++;
500 }else{
501 *tokenType = TK_ILLEGAL;
503 break;
504 }else if( c==':' && z[i+1]==':' ){
505 i++;
506 }else{
507 break;
510 if( n==0 ) *tokenType = TK_ILLEGAL;
511 return i;
513 case CC_KYWD: {
514 for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
515 if( IdChar(z[i]) ){
516 /* This token started out using characters that can appear in keywords,
517 ** but z[i] is a character not allowed within keywords, so this must
518 ** be an identifier instead */
519 i++;
520 break;
522 *tokenType = TK_ID;
523 return i;
525 case CC_X: {
526 testcase( z[0]=='x' ); testcase( z[0]=='X' );
527 if( z[1]=='\'' ){
528 *tokenType = TK_BLOB;
529 for(i=2; sqlite3Isxdigit(z[i]); i++){}
530 if( z[i]!='\'' || i%2 ){
531 *tokenType = TK_ILLEGAL;
532 while( z[i] && z[i]!='\'' ){ i++; }
534 if( z[i] ) i++;
535 return i;
537 /* If it is not a BLOB literal, then it must be an ID, since no
538 ** SQL keywords start with the letter 'x'. Fall through */
539 /* no break */ deliberate_fall_through
541 case CC_ID: {
542 i = 1;
543 break;
545 default: {
546 *tokenType = TK_ILLEGAL;
547 return 1;
550 while( IdChar(z[i]) ){ i++; }
551 *tokenType = TK_ID;
552 return i;
555 char *sqlite3_normalize(const char *zSql){
556 char *z; /* The output string */
557 sqlite3_int64 nZ; /* Size of the output string in bytes */
558 sqlite3_int64 nSql; /* Size of the input string in bytes */
559 int i; /* Next character to read from zSql[] */
560 int j; /* Next slot to fill in on z[] */
561 int tokenType; /* Type of the next token */
562 int n; /* Size of the next token */
563 int k; /* Loop counter */
565 nSql = strlen(zSql);
566 nZ = nSql;
567 z = sqlite3_malloc64( nZ+2 );
568 if( z==0 ) return 0;
569 for(i=j=0; zSql[i]; i += n){
570 n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
571 switch( tokenType ){
572 case TK_SPACE: {
573 break;
575 case TK_ERROR: {
576 sqlite3_free(z);
577 return 0;
579 case TK_LITERAL: {
580 z[j++] = '?';
581 break;
583 case TK_PUNCT:
584 case TK_NAME: {
585 if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
586 if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
587 || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
589 /* NULL is a keyword in this case, not a literal value */
590 }else{
591 /* Here the NULL is a literal value */
592 z[j++] = '?';
593 break;
596 if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
597 for(k=0; k<n; k++){
598 z[j++] = sqlite3Tolower(zSql[i+k]);
600 break;
604 while( j>0 && z[j-1]==' ' ){ j--; }
605 if( j>0 && z[j-1]!=';' ){ z[j++] = ';'; }
606 z[j] = 0;
608 /* Make a second pass converting "in(...)" where the "..." is not a
609 ** SELECT statement into "in(?,?,?)" */
610 for(i=0; i<j; i=n){
611 char *zIn = strstr(z+i, "in(");
612 int nParen;
613 if( zIn==0 ) break;
614 n = (int)(zIn-z)+3; /* Index of first char past "in(" */
615 if( n && IdChar(zIn[-1]) ) continue;
616 if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
617 if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
618 for(nParen=1, k=0; z[n+k]; k++){
619 if( z[n+k]=='(' ) nParen++;
620 if( z[n+k]==')' ){
621 nParen--;
622 if( nParen==0 ) break;
625 /* k is the number of bytes in the "..." within "in(...)" */
626 if( k<5 ){
627 z = sqlite3_realloc64(z, j+(5-k)+1);
628 if( z==0 ) return 0;
629 memmove(z+n+5, z+n+k, j-(n+k));
630 }else if( k>5 ){
631 memmove(z+n+5, z+n+k, j-(n+k));
633 j = j-k+5;
634 z[j] = 0;
635 memcpy(z+n, "?,?,?", 5);
637 return z;
641 ** For testing purposes, or to build a stand-alone SQL normalizer program,
642 ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
643 ** it against any SQLite library. The resulting command-line program will
644 ** run sqlite3_normalize() over the text of all files named on the command-
645 ** line and show the result on standard output.
647 #ifdef SQLITE_NORMALIZE_CLI
648 #include <stdio.h>
649 #include <stdlib.h>
652 ** Break zIn up into separate SQL statements and run sqlite3_normalize()
653 ** on each one. Print the result of each run.
655 static void normalizeFile(char *zIn){
656 int i;
657 if( zIn==0 ) return;
658 for(i=0; zIn[i]; i++){
659 char cSaved;
660 if( zIn[i]!=';' ) continue;
661 cSaved = zIn[i+1];
662 zIn[i+1] = 0;
663 if( sqlite3_complete(zIn) ){
664 char *zOut = sqlite3_normalize(zIn);
665 if( zOut ){
666 printf("%s\n", zOut);
667 sqlite3_free(zOut);
668 }else{
669 fprintf(stderr, "ERROR: %s\n", zIn);
671 zIn[i+1] = cSaved;
672 zIn += i+1;
673 i = -1;
674 }else{
675 zIn[i+1] = cSaved;
681 ** The main routine for "sql_normalize". Read files named on the
682 ** command-line and run the text of each through sqlite3_normalize().
684 int main(int argc, char **argv){
685 int i;
686 FILE *in;
687 char *zBuf = 0;
688 sqlite3_int64 sz, got;
690 for(i=1; i<argc; i++){
691 in = fopen(argv[i], "rb");
692 if( in==0 ){
693 fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
694 continue;
696 fseek(in, 0, SEEK_END);
697 sz = ftell(in);
698 rewind(in);
699 zBuf = sqlite3_realloc64(zBuf, sz+1);
700 if( zBuf==0 ){
701 fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
702 exit(1);
704 got = fread(zBuf, 1, sz, in);
705 fclose(in);
706 if( got!=sz ){
707 fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
708 got, sz, argv[i]);
709 }else{
710 zBuf[got] = 0;
711 normalizeFile(zBuf);
714 sqlite3_free(zBuf);
716 #endif /* SQLITE_NORMALIZE_CLI */