2 # -*- coding: utf-8 -*-
4 """simple parser / string tokenizer
5 rather than returning a list of token types etc, we simply return a list of tokens...
6 each tokenizing function takes a string as input and returns a list of tokens
9 # Copyright 2002, 2003 St James Software
11 # This file is part of translate.
13 # translate is free software; you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation; either version 2 of the License, or
16 # (at your option) any later version.
18 # translate is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with translate; if not, write to the Free Software
25 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
def stringeval(text):
    """Takes away repeated quotes (escapes) and returns the string represented by the text.

    text must start and end with the same quote character (single or
    double quote); doubled occurrences of that quote character inside
    the text are collapsed back to a single one.

    Raises ValueError if text is not a properly quoted string.
    """
    if not text:
        # guard: an empty string cannot be a quoted string (would
        # otherwise raise IndexError on text[0])
        raise ValueError("error parsing escaped string: %r" % text)
    stringchar = text[0]
    if text[-1] != stringchar or stringchar not in ("'", '"'):
        # Python 2 raise-statement syntax replaced with call syntax
        raise ValueError("error parsing escaped string: %r" % text)
    return text[1:-1].replace(stringchar + stringchar, stringchar)
def stringquote(text):
    """Escapes quotes as necessary and returns a string representing the text.

    Prefers single-quote wrapping; switches to double quotes when the
    text contains a single quote, and doubles any embedded double
    quotes when both quote kinds are present.
    """
    if "'" in text:
        if '"' in text:
            # both quote kinds present: wrap in double quotes and
            # escape embedded double quotes by doubling them
            return '"' + text.replace('"', '""') + '"'
        else:
            return '"' + text + '"'
    else:
        return "'" + text + "'"
class ParserError(ValueError):
    """Intelligent parser error: reports the line/char position of the offending token."""

    def __init__(self, parser, message, tokennum):
        """Takes a message and the number of the token that caused the error.

        parser -- the parser instance (must already have tokenized, so
        findtokenpos/getlinepos/tokens are usable)
        message -- description of what went wrong
        tokennum -- index into parser.tokens of the offending token
        """
        tokenpos = parser.findtokenpos(tokennum)
        line, charpos = parser.getlinepos(tokenpos)
        # build a location-annotated message for the ValueError base class
        ValueError.__init__(
            self,
            "%s at line %d, char %d (token %r)"
            % (message, line, charpos, parser.tokens[tokennum]),
        )
        # keep references so callers can inspect the failure context
        self.parser = parser
        self.tokennum = tokennum
57 """this is a simple parser"""
def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
    """Sets up the parser.

    defaulttokenlist -- operator/punctuation tokens separatetokens()
    splits on; None selects the built-in set below
    whitespacechars -- characters treated as whitespace
    includewhitespacetokens -- when true, whitespace runs are kept as
    tokens instead of being dropped
    """
    if defaulttokenlist is None:
        # multi-character operators listed first so they are matched
        # before their single-character prefixes
        self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
        self.defaulttokenlist.extend('(),[]:=+-')
    else:
        self.defaulttokenlist = defaulttokenlist
    self.whitespacechars = whitespacechars
    self.includewhitespacetokens = includewhitespacetokens
    # order matters: extract quoted strings first, then strip
    # whitespace, then split out operator tokens
    self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
    self.quotechars = ('"', "'")
    # maps an opening quote character to its closing counterpart
    self.endquotechars = {'"': '"', "'": "'"}
    # truthy: backslash escaping is honoured inside quoted strings
    self.stringescaping = 1
def stringtokenize(self, text):
    """Makes quoted strings in text into single tokens, splitting around them.

    Recognizes the opening characters in self.quotechars and their
    closing counterparts from self.endquotechars; honours backslash
    escapes when self.stringescaping is true, and doubled closing
    quotes inside a string.  Returns a list of substrings of text.
    """
    tokens = []
    laststart = 0
    instring = 0
    endstringchar, escapechar = '', '\\'
    gotclose, gotescape = 0, 0
    for pos in range(len(text)):
        char = text[pos]
        if instring:
            if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
                # toggle so the character after an escape (including a
                # quote) is consumed without closing the string
                gotescape = not gotescape
            elif char == endstringchar:
                # toggling handles doubled quotes used as escapes
                gotclose = not gotclose
            elif gotclose:
                # first character past the closing quote: emit the string
                tokens.append(text[laststart:pos])
                instring, laststart, endstringchar = 0, pos, ''
        if not instring:
            if char in self.quotechars:
                if pos > laststart:
                    tokens.append(text[laststart:pos])
                instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
    if laststart < len(text):
        tokens.append(text[laststart:])
    return tokens
def keeptogether(self, text):
    """Checks whether a token should be kept together rather than split further."""
    # at present only quoted-string tokens are treated as indivisible
    return self.isstringtoken(text)
def isstringtoken(self, text):
    """Checks whether a token is a (quoted) string token."""
    # slicing (rather than indexing) makes empty text yield '' safely,
    # which is never a quote character
    firstchar = text[:1]
    return firstchar in self.quotechars
def separatetokens(self, text, tokenlist=None):
    """Separates out operator tokens in tokenlist from the surrounding text.

    Quoted-string tokens (per self.keeptogether) pass through untouched.
    tokenlist defaults to self.defaulttokenlist.  Returns a list of
    substrings of text.
    """
    if self.keeptogether(text):
        return [text]
    if tokenlist is None:
        tokenlist = self.defaulttokenlist
    # loop through and put tokens into a list
    tokens = []
    pos = 0
    laststart = 0
    lentext = len(text)
    while pos < lentext:
        foundtoken = 0
        # tokenlist order decides matches: longer operators must be
        # listed before their single-character prefixes
        for token in tokenlist:
            lentoken = len(token)
            if text[pos:pos + lentoken] == token:
                if laststart < pos:
                    tokens.append(text[laststart:pos])
                tokens.append(token)
                pos += lentoken
                foundtoken, laststart = 1, pos
                break
        if not foundtoken:
            pos += 1
    if laststart < lentext:
        tokens.append(text[laststart:])
    return tokens
def removewhitespace(self, text):
    """Removes whitespace, letting it separate the text into tokens.

    Whitespace runs become tokens themselves when
    self.includewhitespacetokens is true; quoted-string tokens (per
    self.keeptogether) pass through untouched.
    """
    if self.keeptogether(text):
        return [text]
    # loop through and put tokens into a list
    tokens = []
    inwhitespace = 0
    laststart = 0
    for pos in range(len(text)):
        char = text[pos]
        if inwhitespace:
            if char not in self.whitespacechars:
                # leaving a whitespace run; emit it only if requested
                if laststart < pos and self.includewhitespacetokens:
                    tokens.append(text[laststart:pos])
                inwhitespace, laststart = 0, pos
        else:
            if char in self.whitespacechars:
                if laststart < pos:
                    tokens.append(text[laststart:pos])
                inwhitespace, laststart = 1, pos
    # emit the trailing token (trailing whitespace only if requested)
    if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
        tokens.append(text[laststart:])
    return tokens
def applytokenizer(self, inputlist, tokenizer):
    """Applies a tokenizer to each text in inputlist, flattening the result."""
    joined = []
    # NOTE: the original used map(joined.extend, tokenizedlists), which
    # is a no-op under Python 3's lazy map; an explicit loop is correct
    # on both Python 2 and 3
    for text in inputlist:
        joined.extend(tokenizer(text))
    return joined
def applytokenizers(self, inputlist, tokenizers):
    """Applies each tokenizer in turn to inputlist, flattening each time.

    Returns the final flattened token list.
    """
    for tokenizer in tokenizers:
        inputlist = self.applytokenizer(inputlist, tokenizer)
    return inputlist
def tokenize(self, source, tokenizers=None):
    """Tokenizes the source string with the given (or standard) tokenizers.

    Stores the source text and resulting token list on self
    (self.source, self.tokens) so token positions can be computed
    later (findtokenpos/getlinepos), and returns the token list.
    """
    self.source = source
    if tokenizers is None:
        tokenizers = self.standardtokenizers
    self.tokens = self.applytokenizers([self.source], tokenizers)
    return self.tokens
def findtokenpos(self, tokennum):
    """Finds the character position of the given token in self.source.

    Walks tokens 0..tokennum in order, resuming each search at the
    position where the previous token was found.
    """
    # NOTE(review): the search resumes AT (not after) the previous
    # match, so repeated/overlapping tokens can resolve to the same
    # position — preserved as-is; confirm before changing
    currenttokenpos = 0
    for currenttokennum in range(tokennum + 1):
        currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos)
    return currenttokenpos
def getlinepos(self, tokenpos):
    """Finds the line and character position of the given character offset.

    Returns (line, charpos), both 1-based, computed against self.source.
    """
    sourcecut = self.source[:tokenpos]
    line = sourcecut.count("\n") + 1
    # rfind returns -1 when no newline precedes tokenpos, which makes
    # the column 1-based on the first line too
    charpos = tokenpos - sourcecut.rfind("\n")
    return line, charpos
def raiseerror(self, message, tokennum):
    """Raises a ParserError annotated with this parser's position information."""
    raise ParserError(self, message, tokennum)