pgworksheet_1.9/pgw/Lexical.py

   1 #!/usr/bin/env python
   2 # -*- coding: latin-1; -*-
   3 #
   4 # PgWorksheet - PostgreSQL Front End
   5 # http://pgworksheet.projects.postgresql.org/
   6 #
   7 # Copyright © 2004-2008 Henri Michelon & CML http://www.e-cml.org/
   8 #
   9 # This program is free software; you can redistribute it and/or
  10 # modify it under the terms of the GNU General Public License
  11 # as published by the Free Software Foundation; either version 2
  12 # of the License, or (at your option) any later version.
  13 #
  14 # This program is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 # GNU General Public License for more details (read LICENSE.txt).
  18 #
  19 # You should have received a copy of the GNU General Public License
  20 # along with this program; if not, write to the Free Software
  21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  22 #
  23 # $Id: Lexical.py,v 1.9 2008/03/12 20:26:23 hmichelon Exp $
  24 #
  25
  26 # http://www.postgresql.org/docs/8.0/static/sql-syntax.html
  27
  28 # basic characters sets
  29 SPACES = [ ' ', '\t', '\n' ]
  30 DIGITS = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ]
  31 NUMERIC = DIGITS + [ 'e', '.', '+', '-' ]
  32 OPERATOR_CHARS = [ '+', '-', '*', '/', '<', '>', '=', '~', '!', \
  33                    '@', '#', '%', '^', '&', '|', '`', '?' ]
  34 SPECIAL_CHARS = [ '(', ')', '[', ']', ',', ';', ':', '*', '.' ]
  35 OPERATORS = OPERATOR_CHARS + SPECIAL_CHARS
  36
  37 # not the first character of an identifier
  38 NOT_IDENT_START = SPECIAL_CHARS + OPERATOR_CHARS + DIGITS + [ '$' ]
  39 # not a character of an identifier
  40 NOT_IDENT_CHAR = SPECIAL_CHARS + OPERATOR_CHARS + SPACES + [ "'" ]
  41 # not a character of a dollar quoted string
  42 NOT_DOLLAR_QUOTED = [ '$' ] + SPACES
  43
  44
  45 class Token:
  46
  47   def __init__(self, token, start_iter, end_iter, value=None):
  48     self.token = token
  49     self.start_iter = start_iter
  50     self.end_iter = end_iter
  51     self.value = value
  52
  53
  54 class Eob:
  55   """End of Buffer Exception"""
  56
  57
  58 class Lexical:
  59   """Simplified lexical analyser"""
  60
  61   def analyse(self, buffer, start, end):
  62     """Run the lexical analyser"""
  63     self.buffer = buffer
  64     self.current = start.copy()
  65     self.tokens = [];
  66     try:
  67       self.lexical_analyser(end.copy())
  68     except Eob:
  69       pass
  70     return self.tokens
  71
  72
  73   def next_char(self):
  74     """Returns the next character to analyse"""
  75     if (self.current.is_end()):
  76       raise Eob()
  77     c = self.current.get_char()
  78     self.current.forward_char()
  79     return c
  80
  81
  def skip_spaces(self, c):
    """Return the first character, starting with c, that is not in SPACES.

    c is returned unchanged when it is not a space; otherwise characters
    are read with next_char() until a non-space one is found.  Eob raised
    by next_char() is not caught here.
    """
    while True:
      if c not in SPACES:
        return c
      c = self.next_char()
  87
  88
  def string(self):
    """Single quoted strings

    The token starts one character before the current position, i.e. on
    the opening quote that has already been read.  A doubled quote ('')
    and a backslash-escaped character do not end the string.

    A 'string' token is appended to self.tokens and the character that
    follows the closing quote is returned.  When the buffer ends first the
    token extends to the end of the buffer and Eob is re-raised.
    """
    start = self.current.copy()
    start.backward_char()
    # True when the previous character was an unescaped backslash; tracking
    # the state (rather than the previous character) makes an escaped
    # backslash such as 'C:\\' end correctly on the following quote
    escaped = False
    try:
      # read inside the try block so that a quote that is the last
      # character of the buffer still produces a token
      c = self.next_char()
      while (True):
        if (escaped):
          # this character is escaped, whatever it is
          escaped = False
        elif (c == '\\'):
          escaped = True
        elif (c == "'"):
          c = self.next_char()
          if (c != "'"):
            # a single quote: end of the string, c is the lookahead
            break
          # a doubled quote: part of the string, skipped below
        c = self.next_char()
    except Eob:
      end = self.current.copy()
      self.tokens.append(Token('string', start, end))
      raise
    # the cursor is one character past the lookahead
    end = self.current.copy()
    end.backward_char()
    self.tokens.append(Token('string', start, end))
    return c
 111
 112
 113   def dollar_string(self):
 114     """Dollar-quoted strings"""
 115     # first bound
 116     start = self.current.copy()
 117     start.backward_char()
 118     c = self.next_char()
 119     string_tag = ''
 120     try:
 121       while (c not in NOT_DOLLAR_QUOTED):
 122         string_tag = string_tag + c
 123         c = self.next_char()
 124     except Eob:
 125       end = self.current.copy()
 126       self.tokens.append(Token('identifier', start, end, string_tag.upper()))
 127       raise
 128     end = self.current.copy()
 129     end.backward_char()
 130     if (c != '$'):
 131       self.tokens.append(Token('identifier', start, end, string_tag.upper()))
 132       return c
 133     self.tokens.append(Token('dollarquote', start, end, string_tag.upper()))
 134
 135     # string content
 136     start = self.current.copy()
 137     try:
 138       c = self.next_char()
 139     except Eob:
 140       end = self.current.copy()
 141       self.tokens.append(Token('identifier', start, end, string_tag.upper()))
 142       raise
 143     try:
 144       while (True):
 145         if (c == '$'):
 146           string_end = self.current.copy()
 147           c = self.next_char()
 148           s = ''
 149           while (c not in NOT_DOLLAR_QUOTED):
 150             s = s + c
 151             c = self.next_char()
 152           if (s == string_tag):
 153             string_end.backward_char()
 154             self.tokens.append(Token('string', start, string_end))
 155             end = self.current.copy()
 156             end.backward_char()
 157             self.tokens.append(Token('dollarquote', start, end, s.upper()))
 158             return c
 159         else:
 160           c = self.next_char()
 161     except Eob:
 162       end = self.current.copy()
 163       self.tokens.append(Token('string', start, end))
 164       raise
 165     end = self.current.copy()
 166     end.backward_char()
 167     self.tokens.append(Token('string', start, end))
 168     return c
 169
 170
 171   def bit_string_constant(self, start):
 172     """Binary and Hexadecimal numeric constants using strings"""
 173     c = self.next_char()
 174     if (c == "'"):
 175       c = self.next_char()
 176       start = self.current.copy()
 177       start.backward_char()
 178       start.backward_char()
 179       start.backward_char()
 180       while (c != "'"):
 181         c = self.next_char()
 182       end = self.current.copy()
 183       self.tokens.append(Token('numeric_constant', start, end))
 184       return self.next_char()
 185     else:
 186       return self.identifier(c, start)
 187
 188
 189   def identifier(self, c, ident = ''):
 190     """An identifier, keyword, type name, etc..."""
 191     start = self.current.copy()
 192     for i in range(0, len(ident) + 1):
 193       start.backward_char()
 194     try:
 195       while (c not in NOT_IDENT_CHAR):
 196         ident = ident + c
 197         c = self.next_char()
 198     except Eob:
 199       end = self.current.copy()
 200       self.tokens.append(Token('identifier', start, end, ident.upper()))
 201       raise
 202     end = self.current.copy()
 203     end.backward_char()
 204     self.tokens.append(Token('identifier', start, end, ident.upper()))
 205     return c
 206
 207
 208   def numeric(self, c):
 209     """A numeric constant"""
 210     start = self.current.copy()
 211     start.backward_char()
 212     try:
 213       while (c in NUMERIC):
 214         c = self.next_char()
 215     except Eob:
 216       end = self.current.copy()
 217       self.tokens.append(Token('numeric_constant', start, end))
 218       raise
 219     end = self.current.copy()
 220     end.backward_char()
 221     self.tokens.append(Token('numeric_constant', start, end))
 222     return c
 223
 224
 225   def simple_comment(self):
 226     """One line comment using --"""
 227     start = self.current.copy()
 228     start.backward_char()
 229     start.backward_char()
 230     c = self.next_char()
 231     try:
 232       while (c != '\n'):
 233         c = self.next_char()
 234     except Eob:
 235       end = self.current.copy()
 236       self.tokens.append(Token('comment', start, end))
 237       raise
 238     end = self.current.copy()
 239     self.tokens.append(Token('comment', start, end))
 240
 241
 242   def comment(self):
 243     """Multi lines comments using /* */"""
 244     start = self.current.copy()
 245     start.backward_char()
 246     start.backward_char()
 247     c = self.next_char()
 248     prev = None
 249     nested = 0
 250     try:
 251       while (True):
 252         if (c == '*'):
 253           c = self.next_char()
 254           if (prev == '/'):
 255             nested = nested + 1
 256             continue
 257           if (c == '/'):
 258             if (nested == 0):
 259               c = self.next_char()
 260               break
 261             else:
 262               nested = nested - 1
 263           else:
 264             prev = c
 265             continue
 266         prev = c
 267         c = self.next_char()
 268     except Eob:
 269       end = self.current.copy()
 270       self.tokens.append(Token('comment', start, end))
 271       raise
 272     end = self.current.copy()
 273     end.backward_char()
 274     self.tokens.append(Token('comment', start, end))
 275     return c
 276
 277
 278   def psql(self):
 279     """A PgSQL Command"""
 280     start = self.current.copy()
 281     start.backward_char()
 282     c = self.next_char()
 283     cmd = '\\'
 284     try:
 285       while (c != '\n') and (c != ';'):
 286         cmd = cmd + c
 287         c = self.next_char()
 288     except:
 289       end = self.current.copy()
 290       self.tokens.append(Token('psql', start, end, cmd))
 291       raise
 292     end = self.current.copy()
 293     self.tokens.append(Token('psql', start, end, cmd))
 294
 295
 296   def lexical_analyser(self, fin):
 297     """A simplified lexical analyser"""
 298     c = self.next_char()
 299     while (self.current.compare(fin) <= 0):
 300       c = self.skip_spaces(c)
 301       # Multi lines comments
 302       if (c == '/'):
 303         c = self.next_char()
 304         if (c == '*'):
 305           c = self.comment()
 306           continue
 307         else:
 308           self.current.backward_char()
 309       # One line comments
 310       elif (c == '-'):
 311         c = self.next_char()
 312         if (c == '-'):
 313           self.simple_comment()
 314         else:
 315           self.current.backward_char()
 316       # psql commands
 317       elif (c == '\\'):
 318         self.psql()
 319       # numeric
 320       elif (c in DIGITS):
 321         c = self.numeric(c)
 322         continue
 323       # bit strings
 324       elif (c == 'B') or (c == 'b') or (c == 'H') or (c == 'h'):
 325         c = self.bit_string_constant(c)
 326         continue
 327       # strings
 328       elif (c == "'"):
 329         c = self.string()
 330         continue
 331       # dollar-quoted strings
 332       elif (c == '$'):
 333         c = self.dollar_string()
 334         continue
 335       # numeric
 336       elif (c == '.'):
 337         c = self.next_char()
 338         if (c in DIGITS):
 339           self.current.backward_char()
 340           c = self.numeric(self.current.get_char())
 341           continue
 342       # quoted identifiers
 343       elif (c == '"'):
 344         c = self.next_char()
 345         while (c != '"'):
 346           c = self.next_char()
 347       # operators
 348       elif (c in OPERATORS):
 349         start = self.current.copy()
 350         start.backward_char()
 351         end = self.current.copy()
 352         self.tokens.append(Token('operator', start, end, c))
 353       # everything else
 354       elif (c not in NOT_IDENT_START):
 355         c = self.identifier(c)
 356         continue
 357       c = self.next_char()