ext/misc/normalize.c

   1 /*
   2 ** 2018-01-08
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 **
  13 ** This file contains code to implement the sqlite3_normalize() function.
  14 **
  15 **    char *sqlite3_normalize(const char *zSql);
  16 **
  17 ** This function takes an SQL string as input and returns a "normalized"
  18 ** version of that string in memory obtained from sqlite3_malloc64().  The
  19 ** caller is responsible for ensuring that the returned memory is freed.
  20 **
  21 ** If a memory allocation error occurs, this routine returns NULL.
  22 **
  23 ** The normalization consists of the following transformations:
  24 **
  25 **   (1)  Convert every literal (string, blob literal, numeric constant,
  26 **        or "NULL" constant) into a ?
  27 **
  28 **   (2)  Remove all superfluous whitespace, including comments.  Change
  29 **        all required whitespace to a single space character.
  30 **
  31 **   (3)  Lowercase all ASCII characters.
  32 **
  33 **   (4)  If an IN or NOT IN operator is followed by a list of 1 or more
  34 **        values, convert that list into "(?,?,?)".
  35 **
  36 ** The purpose of normalization is two-fold:
  37 **
  38 **   (1)  Sanitize queries by removing potentially private or sensitive
  39 **        information contained in literals.
  40 **
  41 **   (2)  Identify structurally identical queries by comparing their
  42 **        normalized forms.
  43 **
  44 ** Command-Line Utility
  45 ** --------------------
  46 **
  47 ** This file also contains code for a command-line utility that converts
  48 ** SQL queries in text files into their normalized forms.  To build the
  49 ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
  50 ** and link it against the SQLite library.
  51 */
  52 #include <sqlite3.h>
  53 #include <string.h>
  54
  55 /*
  56 ** Implementation note:
  57 **
  58 ** Much of the tokenizer logic is copied out of the tokenize.c source file
  59 ** of SQLite.  That logic could be simplified for this particular application,
  60 ** but that would impose a risk of introducing subtle errors.  It is best to
  61 ** keep the code as close to the original as possible.
  62 **
  63 ** The tokenize code is in sync with the SQLite core as of 2018-01-08.
  64 ** Any future changes to the core tokenizer might require corresponding
  65 ** adjustments to the tokenizer logic in this module.
  66 */
  67
  68
  69 /* Character classes for tokenizing
  70 **
  71 ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
  72 ** using a lookup table, whereas a switch() directly on c uses a binary search.
  73 ** The lookup table is much faster.  To maximize speed, and to ensure that
  74 ** a lookup table is used, all of the classes need to be small integers and
  75 ** all of them need to be used within the switch.
  76 */
  77 #define CC_X          0    /* The letter 'x', or start of BLOB literal */
  78 #define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
  79 #define CC_ID         2    /* unicode characters usable in IDs */
  80 #define CC_DIGIT      3    /* Digits */
  81 #define CC_DOLLAR     4    /* '$' */
  82 #define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
  83 #define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
  84 #define CC_SPACE      7    /* Space characters */
  85 #define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
  86 #define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
  87 #define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
  88 #define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
  89 #define CC_LT        12    /* '<'.  Part of < or <= or <> */
  90 #define CC_GT        13    /* '>'.  Part of > or >= */
  91 #define CC_EQ        14    /* '='.  Part of = or == */
  92 #define CC_BANG      15    /* '!'.  Part of != */
  93 #define CC_SLASH     16    /* '/'.  / or c-style comment */
  94 #define CC_LP        17    /* '(' */
  95 #define CC_RP        18    /* ')' */
  96 #define CC_SEMI      19    /* ';' */
  97 #define CC_PLUS      20    /* '+' */
  98 #define CC_STAR      21    /* '*' */
  99 #define CC_PERCENT   22    /* '%' */
 100 #define CC_COMMA     23    /* ',' */
 101 #define CC_AND       24    /* '&' */
 102 #define CC_TILDA     25    /* '~' */
 103 #define CC_DOT       26    /* '.' */
 104 #define CC_ILLEGAL   27    /* Illegal character */
 105
 106 static const unsigned char aiClass[] = {
 107 /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
 108 /* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
 109 /* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
 110 /* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
 111 /* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
 112 /* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 113 /* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
 114 /* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 115 /* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
 116 /* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 117 /* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 118 /* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 119 /* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 120 /* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 121 /* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 122 /* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 123 /* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
 124 };
 125
 126 /* An array to map all upper-case characters into their corresponding
 127 ** lower-case character.
 128 **
 129 ** SQLite only considers US-ASCII (or EBCDIC) characters.  We do not
 130 ** handle case conversions for the UTF character set since the tables
 131 ** involved are nearly as big or bigger than SQLite itself.
 132 */
 133 static const unsigned char sqlite3UpperToLower[] = {
 134       0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
 135      18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
 136      36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
 137      54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
 138     104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
 139     122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
 140     108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
 141     126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
 142     144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
 143     162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
 144     180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
 145     198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
 146     216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
 147     234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
 148     252,253,254,255
 149 };
 150
 151 /*
 152 ** The following 256 byte lookup table is used to support SQLites built-in
 153 ** equivalents to the following standard library functions:
 154 **
 155 **   isspace()                        0x01
 156 **   isalpha()                        0x02
 157 **   isdigit()                        0x04
 158 **   isalnum()                        0x06
 159 **   isxdigit()                       0x08
 160 **   toupper()                        0x20
 161 **   SQLite identifier character      0x40
 162 **   Quote character                  0x80
 163 **
 164 ** Bit 0x20 is set if the mapped character requires translation to upper
 165 ** case. i.e. if the character is a lower-case ASCII character.
 166 ** If x is a lower-case ASCII character, then its upper-case equivalent
 167 ** is (x - 0x20). Therefore toupper() can be implemented as:
 168 **
 169 **   (x & ~(map[x]&0x20))
 170 **
 171 ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
 172 ** array. tolower() is used more often than toupper() by SQLite.
 173 **
 174 ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
 175 ** SQLite identifier.  Identifiers are alphanumerics, "_", "$", and any
 176 ** non-ASCII UTF character. Hence the test for whether or not a character is
 177 ** part of an identifier is 0x46.
 178 */
 179 static const unsigned char sqlite3CtypeMap[256] = {
 180   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 00..07    ........ */
 181   0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  /* 08..0f    ........ */
 182   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 10..17    ........ */
 183   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 18..1f    ........ */
 184   0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  /* 20..27     !"#$%&' */
 185   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 28..2f    ()*+,-./ */
 186   0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  /* 30..37    01234567 */
 187   0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 38..3f    89:;<=>? */
 188
 189   0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  /* 40..47    @ABCDEFG */
 190   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 48..4f    HIJKLMNO */
 191   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 50..57    PQRSTUVW */
 192   0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  /* 58..5f    XYZ[\]^_ */
 193   0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  /* 60..67    `abcdefg */
 194   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 68..6f    hijklmno */
 195   0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 70..77    pqrstuvw */
 196   0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 78..7f    xyz{|}~. */
 197
 198   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 80..87    ........ */
 199   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 88..8f    ........ */
 200   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 90..97    ........ */
 201   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 98..9f    ........ */
 202   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a0..a7    ........ */
 203   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a8..af    ........ */
 204   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b0..b7    ........ */
 205   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b8..bf    ........ */
 206
 207   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c0..c7    ........ */
 208   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c8..cf    ........ */
 209   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d0..d7    ........ */
 210   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d8..df    ........ */
 211   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e0..e7    ........ */
 212   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e8..ef    ........ */
 213   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* f0..f7    ........ */
 214   0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   /* f8..ff    ........ */
 215 };
 216 #define sqlite3Toupper(x)   ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
 217 #define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
 218 #define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
 219 #define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
 220 #define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
 221 #define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
 222 #define sqlite3Tolower(x)   (sqlite3UpperToLower[(unsigned char)(x)])
 223 #define sqlite3Isquote(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
 224
 225
 226 /*
 227 ** If X is a character that can be used in an identifier then
 228 ** IdChar(X) will be true.  Otherwise it is false.
 229 **
 230 ** For ASCII, any character with the high-order bit set is
 231 ** allowed in an identifier.  For 7-bit characters,
 232 ** sqlite3IsIdChar[X] must be 1.
 233 **
 234 ** For EBCDIC, the rules are more complex but have the same
 235 ** end result.
 236 **
 237 ** Ticket #1066.  the SQL standard does not allow '$' in the
 238 ** middle of identifiers.  But many SQL implementations do.
 239 ** SQLite will allow '$' in identifiers for compatibility.
 240 ** But the feature is undocumented.
 241 */
 242 #define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
 243
 244 /*
 245 ** Ignore testcase() macros
 246 */
 247 #define testcase(X)
 248
 249 /*
 250 ** Token values
 251 */
 252 #define TK_SPACE    0
 253 #define TK_NAME     1
 254 #define TK_LITERAL  2
 255 #define TK_PUNCT    3
 256 #define TK_ERROR    4
 257
 258 #define TK_MINUS    TK_PUNCT
 259 #define TK_LP       TK_PUNCT
 260 #define TK_RP       TK_PUNCT
 261 #define TK_SEMI     TK_PUNCT
 262 #define TK_PLUS     TK_PUNCT
 263 #define TK_STAR     TK_PUNCT
 264 #define TK_SLASH    TK_PUNCT
 265 #define TK_REM      TK_PUNCT
 266 #define TK_EQ       TK_PUNCT
 267 #define TK_LE       TK_PUNCT
 268 #define TK_NE       TK_PUNCT
 269 #define TK_LSHIFT   TK_PUNCT
 270 #define TK_LT       TK_PUNCT
 271 #define TK_GE       TK_PUNCT
 272 #define TK_RSHIFT   TK_PUNCT
 273 #define TK_GT       TK_PUNCT
 274 #define TK_GE       TK_PUNCT
 275 #define TK_BITOR    TK_PUNCT
 276 #define TK_CONCAT   TK_PUNCT
 277 #define TK_COMMA    TK_PUNCT
 278 #define TK_BITAND   TK_PUNCT
 279 #define TK_BITNOT   TK_PUNCT
 280 #define TK_STRING   TK_LITERAL
 281 #define TK_ID       TK_NAME
 282 #define TK_ILLEGAL  TK_ERROR
 283 #define TK_DOT      TK_PUNCT
 284 #define TK_INTEGER  TK_LITERAL
 285 #define TK_FLOAT    TK_LITERAL
 286 #define TK_VARIABLE TK_LITERAL
 287 #define TK_BLOB     TK_LITERAL
 288
 289 /* Disable nuisence warnings about case fall-through */
 290 #if !defined(deliberate_fall_through) && defined(__GCC__) && __GCC__>=7
 291 # define deliberate_fall_through __attribute__((fallthrough));
 292 #else
 293 # define deliberate_fall_through
 294 #endif
 295
 296 /*
 297 ** Return the length (in bytes) of the token that begins at z[0].
 298 ** Store the token type in *tokenType before returning.
 299 */
 300 static int sqlite3GetToken(const unsigned char *z, int *tokenType){
 301   int i, c;
 302   switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
 303                           ** of the token. See the comment on the CC_ defines
 304                           ** above. */
 305     case CC_SPACE: {
 306       for(i=1; sqlite3Isspace(z[i]); i++){}
 307       *tokenType = TK_SPACE;
 308       return i;
 309     }
 310     case CC_MINUS: {
 311       if( z[1]=='-' ){
 312         for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
 313         *tokenType = TK_SPACE;
 314         return i;
 315       }
 316       *tokenType = TK_MINUS;
 317       return 1;
 318     }
 319     case CC_LP: {
 320       *tokenType = TK_LP;
 321       return 1;
 322     }
 323     case CC_RP: {
 324       *tokenType = TK_RP;
 325       return 1;
 326     }
 327     case CC_SEMI: {
 328       *tokenType = TK_SEMI;
 329       return 1;
 330     }
 331     case CC_PLUS: {
 332       *tokenType = TK_PLUS;
 333       return 1;
 334     }
 335     case CC_STAR: {
 336       *tokenType = TK_STAR;
 337       return 1;
 338     }
 339     case CC_SLASH: {
 340       if( z[1]!='*' || z[2]==0 ){
 341         *tokenType = TK_SLASH;
 342         return 1;
 343       }
 344       for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
 345       if( c ) i++;
 346       *tokenType = TK_SPACE;
 347       return i;
 348     }
 349     case CC_PERCENT: {
 350       *tokenType = TK_REM;
 351       return 1;
 352     }
 353     case CC_EQ: {
 354       *tokenType = TK_EQ;
 355       return 1 + (z[1]=='=');
 356     }
 357     case CC_LT: {
 358       if( (c=z[1])=='=' ){
 359         *tokenType = TK_LE;
 360         return 2;
 361       }else if( c=='>' ){
 362         *tokenType = TK_NE;
 363         return 2;
 364       }else if( c=='<' ){
 365         *tokenType = TK_LSHIFT;
 366         return 2;
 367       }else{
 368         *tokenType = TK_LT;
 369         return 1;
 370       }
 371     }
 372     case CC_GT: {
 373       if( (c=z[1])=='=' ){
 374         *tokenType = TK_GE;
 375         return 2;
 376       }else if( c=='>' ){
 377         *tokenType = TK_RSHIFT;
 378         return 2;
 379       }else{
 380         *tokenType = TK_GT;
 381         return 1;
 382       }
 383     }
 384     case CC_BANG: {
 385       if( z[1]!='=' ){
 386         *tokenType = TK_ILLEGAL;
 387         return 1;
 388       }else{
 389         *tokenType = TK_NE;
 390         return 2;
 391       }
 392     }
 393     case CC_PIPE: {
 394       if( z[1]!='|' ){
 395         *tokenType = TK_BITOR;
 396         return 1;
 397       }else{
 398         *tokenType = TK_CONCAT;
 399         return 2;
 400       }
 401     }
 402     case CC_COMMA: {
 403       *tokenType = TK_COMMA;
 404       return 1;
 405     }
 406     case CC_AND: {
 407       *tokenType = TK_BITAND;
 408       return 1;
 409     }
 410     case CC_TILDA: {
 411       *tokenType = TK_BITNOT;
 412       return 1;
 413     }
 414     case CC_QUOTE: {
 415       int delim = z[0];
 416       testcase( delim=='`' );
 417       testcase( delim=='\'' );
 418       testcase( delim=='"' );
 419       for(i=1; (c=z[i])!=0; i++){
 420         if( c==delim ){
 421           if( z[i+1]==delim ){
 422             i++;
 423           }else{
 424             break;
 425           }
 426         }
 427       }
 428       if( c=='\'' ){
 429         *tokenType = TK_STRING;
 430         return i+1;
 431       }else if( c!=0 ){
 432         *tokenType = TK_ID;
 433         return i+1;
 434       }else{
 435         *tokenType = TK_ILLEGAL;
 436         return i;
 437       }
 438     }
 439     case CC_DOT: {
 440       if( !sqlite3Isdigit(z[1]) ){
 441         *tokenType = TK_DOT;
 442         return 1;
 443       }
 444       /* If the next character is a digit, this is a floating point
 445       ** number that begins with ".".  Fall thru into the next case */
 446       /* no break */ deliberate_fall_through
 447     }
 448     case CC_DIGIT: {
 449       *tokenType = TK_INTEGER;
 450       if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
 451         for(i=3; sqlite3Isxdigit(z[i]); i++){}
 452         return i;
 453       }
 454       for(i=0; sqlite3Isdigit(z[i]); i++){}
 455       if( z[i]=='.' ){
 456         i++;
 457         while( sqlite3Isdigit(z[i]) ){ i++; }
 458         *tokenType = TK_FLOAT;
 459       }
 460       if( (z[i]=='e' || z[i]=='E') &&
 461            ( sqlite3Isdigit(z[i+1])
 462             || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
 463            )
 464       ){
 465         i += 2;
 466         while( sqlite3Isdigit(z[i]) ){ i++; }
 467         *tokenType = TK_FLOAT;
 468       }
 469       while( IdChar(z[i]) ){
 470         *tokenType = TK_ILLEGAL;
 471         i++;
 472       }
 473       return i;
 474     }
 475     case CC_QUOTE2: {
 476       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
 477       *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
 478       return i;
 479     }
 480     case CC_VARNUM: {
 481       *tokenType = TK_VARIABLE;
 482       for(i=1; sqlite3Isdigit(z[i]); i++){}
 483       return i;
 484     }
 485     case CC_DOLLAR:
 486     case CC_VARALPHA: {
 487       int n = 0;
 488       testcase( z[0]=='$' );  testcase( z[0]=='@' );
 489       testcase( z[0]==':' );  testcase( z[0]=='#' );
 490       *tokenType = TK_VARIABLE;
 491       for(i=1; (c=z[i])!=0; i++){
 492         if( IdChar(c) ){
 493           n++;
 494         }else if( c=='(' && n>0 ){
 495           do{
 496             i++;
 497           }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
 498           if( c==')' ){
 499             i++;
 500           }else{
 501             *tokenType = TK_ILLEGAL;
 502           }
 503           break;
 504         }else if( c==':' && z[i+1]==':' ){
 505           i++;
 506         }else{
 507           break;
 508         }
 509       }
 510       if( n==0 ) *tokenType = TK_ILLEGAL;
 511       return i;
 512     }
 513     case CC_KYWD: {
 514       for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
 515       if( IdChar(z[i]) ){
 516         /* This token started out using characters that can appear in keywords,
 517         ** but z[i] is a character not allowed within keywords, so this must
 518         ** be an identifier instead */
 519         i++;
 520         break;
 521       }
 522       *tokenType = TK_ID;
 523       return i;
 524     }
 525     case CC_X: {
 526       testcase( z[0]=='x' ); testcase( z[0]=='X' );
 527       if( z[1]=='\'' ){
 528         *tokenType = TK_BLOB;
 529         for(i=2; sqlite3Isxdigit(z[i]); i++){}
 530         if( z[i]!='\'' || i%2 ){
 531           *tokenType = TK_ILLEGAL;
 532           while( z[i] && z[i]!='\'' ){ i++; }
 533         }
 534         if( z[i] ) i++;
 535         return i;
 536       }
 537       /* If it is not a BLOB literal, then it must be an ID, since no
 538       ** SQL keywords start with the letter 'x'.  Fall through */
 539       /* no break */ deliberate_fall_through
 540     }
 541     case CC_ID: {
 542       i = 1;
 543       break;
 544     }
 545     default: {
 546       *tokenType = TK_ILLEGAL;
 547       return 1;
 548     }
 549   }
 550   while( IdChar(z[i]) ){ i++; }
 551   *tokenType = TK_ID;
 552   return i;
 553 }
 554
 555 char *sqlite3_normalize(const char *zSql){
 556   char *z;              /* The output string */
 557   sqlite3_int64 nZ;     /* Size of the output string in bytes */
 558   sqlite3_int64 nSql;   /* Size of the input string in bytes */
 559   int i;                /* Next character to read from zSql[] */
 560   int j;                /* Next slot to fill in on z[] */
 561   int tokenType;        /* Type of the next token */
 562   int n;                /* Size of the next token */
 563   int k;                /* Loop counter */
 564
 565   nSql = strlen(zSql);
 566   nZ = nSql;
 567   z = sqlite3_malloc64( nZ+2 );
 568   if( z==0 ) return 0;
 569   for(i=j=0; zSql[i]; i += n){
 570     n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
 571     switch( tokenType ){
 572       case TK_SPACE: {
 573         break;
 574       }
 575       case TK_ERROR: {
 576         sqlite3_free(z);
 577         return 0;
 578       }
 579       case TK_LITERAL: {
 580         z[j++] = '?';
 581         break;
 582       }
 583       case TK_PUNCT:
 584       case TK_NAME: {
 585         if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
 586           if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
 587            || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
 588           ){
 589             /* NULL is a keyword in this case, not a literal value */
 590           }else{
 591             /* Here the NULL is a literal value */
 592             z[j++] = '?';
 593             break;
 594           }
 595         }
 596         if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
 597         for(k=0; k<n; k++){
 598           z[j++] = sqlite3Tolower(zSql[i+k]);
 599         }
 600         break;
 601       }
 602     }
 603   }
 604   while( j>0 && z[j-1]==' ' ){ j--; }
 605   if( j>0 && z[j-1]!=';' ){ z[j++] = ';'; }
 606   z[j] = 0;
 607
 608   /* Make a second pass converting "in(...)" where the "..." is not a
 609   ** SELECT statement into "in(?,?,?)" */
 610   for(i=0; i<j; i=n){
 611     char *zIn = strstr(z+i, "in(");
 612     int nParen;
 613     if( zIn==0 ) break;
 614     n = (int)(zIn-z)+3;  /* Index of first char past "in(" */
 615     if( n && IdChar(zIn[-1]) ) continue;
 616     if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
 617     if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
 618     for(nParen=1, k=0; z[n+k]; k++){
 619       if( z[n+k]=='(' ) nParen++;
 620       if( z[n+k]==')' ){
 621         nParen--;
 622         if( nParen==0 ) break;
 623       }
 624     }
 625     /* k is the number of bytes in the "..." within "in(...)" */
 626     if( k<5 ){
 627       z = sqlite3_realloc64(z, j+(5-k)+1);
 628       if( z==0 ) return 0;
 629       memmove(z+n+5, z+n+k, j-(n+k));
 630     }else if( k>5 ){
 631       memmove(z+n+5, z+n+k, j-(n+k));
 632     }
 633     j = j-k+5;
 634     z[j] = 0;
 635     memcpy(z+n, "?,?,?", 5);
 636   }
 637   return z;
 638 }
 639
 640 /*
 641 ** For testing purposes, or to build a stand-alone SQL normalizer program,
 642 ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
 643 ** it against any SQLite library.  The resulting command-line program will
 644 ** run sqlite3_normalize() over the text of all files named on the command-
 645 ** line and show the result on standard output.
 646 */
 647 #ifdef SQLITE_NORMALIZE_CLI
 648 #include <stdio.h>
 649 #include <stdlib.h>
 650
 651 /*
 652 ** Break zIn up into separate SQL statements and run sqlite3_normalize()
 653 ** on each one.  Print the result of each run.
 654 */
 655 static void normalizeFile(char *zIn){
 656   int i;
 657   if( zIn==0 ) return;
 658   for(i=0; zIn[i]; i++){
 659     char cSaved;
 660     if( zIn[i]!=';' ) continue;
 661     cSaved = zIn[i+1];
 662     zIn[i+1] = 0;
 663     if( sqlite3_complete(zIn) ){
 664       char *zOut = sqlite3_normalize(zIn);
 665       if( zOut ){
 666         printf("%s\n", zOut);
 667         sqlite3_free(zOut);
 668       }else{
 669         fprintf(stderr, "ERROR: %s\n", zIn);
 670       }
 671       zIn[i+1] = cSaved;
 672       zIn += i+1;
 673       i = -1;
 674     }else{
 675       zIn[i+1] = cSaved;
 676     }
 677   }
 678 }
 679
 680 /*
 681 ** The main routine for "sql_normalize".  Read files named on the
 682 ** command-line and run the text of each through sqlite3_normalize().
 683 */
 684 int main(int argc, char **argv){
 685   int i;
 686   FILE *in;
 687   char *zBuf = 0;
 688   sqlite3_int64 sz, got;
 689
 690   for(i=1; i<argc; i++){
 691     in = fopen(argv[i], "rb");
 692     if( in==0 ){
 693       fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
 694       continue;
 695     }
 696     fseek(in, 0, SEEK_END);
 697     sz = ftell(in);
 698     rewind(in);
 699     zBuf = sqlite3_realloc64(zBuf, sz+1);
 700     if( zBuf==0 ){
 701       fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
 702       exit(1);
 703     }
 704     got = fread(zBuf, 1, sz, in);
 705     fclose(in);
 706     if( got!=sz ){
 707       fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
 708               got, sz, argv[i]);
 709     }else{
 710       zBuf[got] = 0;
 711       normalizeFile(zBuf);
 712     }
 713   }
 714   sqlite3_free(zBuf);
 715 }
 716 #endif /* SQLITE_NORMALIZE_CLI */