#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""simple parser / string tokenizer
rather than returning a list of token types etc, we simply return a list of tokens...
each tokenizing function takes a string as input and returns a list of tokens
"""

# Copyright 2002, 2003 St James Software
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

def stringeval(text):
    """takes away repeated quotes (escapes) and returns the string represented by the text"""
    stringchar = text[0]
    if text[-1] != stringchar or stringchar not in ("'", '"'):
        # the text must start and end with the same quote character
        raise ValueError("error parsing escaped string: %r" % text)
    return text[1:-1].replace(stringchar + stringchar, stringchar)
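
# Illustrative usage (not part of the original module): stringeval undoes the
# doubled-quote escaping produced by stringquote below, e.g.
#   stringeval("'it''s'")        returns "it's"
#   stringeval('"say ""hi"""')   returns 'say "hi"'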

def stringquote(text):
    """escapes quotes as necessary and returns a string representing the text"""
    if "'" in text:
        if '"' in text:
            return '"' + text.replace('"', '""') + '"'
        else:
            return '"' + text + '"'
    else:
        return "'" + text + "'"

class ParserError(ValueError):
    """Intelligent parser error"""
    def __init__(self, parser, message, tokennum):
        """takes a message and the number of the token that caused the error"""
        tokenpos = parser.findtokenpos(tokennum)
        line, charpos = parser.getlinepos(tokenpos)
        ValueError.__init__(self, "%s at line %d, char %d (token %r)" %
                            (message, line, charpos, parser.tokens[tokennum]))
        self.parser = parser
        self.tokennum = tokennum

class SimpleParser:
    """a simple parser that breaks a string up into a flat list of tokens"""
    def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
        if defaulttokenlist is None:
            self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
            self.defaulttokenlist.extend('(),[]:=+-')
        else:
            self.defaulttokenlist = defaulttokenlist
        self.whitespacechars = whitespacechars
        self.includewhitespacetokens = includewhitespacetokens
        self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
        self.quotechars = ('"', "'")
        self.endquotechars = {'"': '"', "'": "'"}
        self.stringescaping = 1
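
    # Illustrative configuration (not part of the original module): matching in
    # separatetokens is first-match-wins, so multi-character tokens must come
    # before their single-character prefixes, as in the default list above.
    # A custom grammar could be tokenized with e.g.
    #   parser = SimpleParser(defaulttokenlist=['->', '::', '-', ':', '(', ')'])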

    def stringtokenize(self, text):
        """makes strings in text into tokens..."""
        tokens = []
        laststart = 0
        instring = 0
        endstringchar, escapechar = '', '\\'
        gotclose, gotescape = 0, 0
        for pos in range(len(text)):
            char = text[pos]
            if instring:
                if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
                    gotescape = not gotescape
                elif char == endstringchar:
                    # two endstringchars in a row are a doubled (escaped) quote
                    gotclose = not gotclose
                elif gotclose:
                    # the closing quote was followed by an ordinary character,
                    # so the string token ended at the previous position
                    tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar = 0, pos, ''
            if not instring:
                if char in self.quotechars:
                    if pos > laststart: tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
        if laststart < len(text): tokens.append(text[laststart:])
        return tokens
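
    # Illustrative usage (not part of the original module): quoted strings
    # survive as single tokens while the surrounding text is passed through
    # untouched for the later tokenizers:
    #   SimpleParser().stringtokenize("a = 'hello' + b")
    #   returns ["a = ", "'hello'", " + b"]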

    def keeptogether(self, text):
        """checks whether a token should be kept together"""
        return self.isstringtoken(text)

    def isstringtoken(self, text):
        """checks whether a token is a string token"""
        return text[:1] in self.quotechars

    def separatetokens(self, text, tokenlist=None):
        """this separates out tokens in tokenlist from whitespace etc"""
        if self.keeptogether(text): return [text]
        if tokenlist is None:
            tokenlist = self.defaulttokenlist
        # loop through and put tokens into a list
        tokens = []
        pos = 0
        laststart = 0
        lentext = len(text)
        while pos < lentext:
            foundtoken = 0
            for token in tokenlist:
                lentoken = len(token)
                if text[pos:pos+lentoken] == token:
                    if laststart < pos: tokens.append(text[laststart:pos])
                    tokens.append(token)
                    pos += lentoken
                    foundtoken, laststart = 1, pos
                    break
            if not foundtoken: pos += 1
        if laststart < lentext: tokens.append(text[laststart:])
        return tokens
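
    # Illustrative usage (not part of the original module): tokens from the
    # token list are split out of the surrounding text:
    #   SimpleParser().separatetokens("(a+b)")
    #   returns ['(', 'a', '+', 'b', ')']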

    def removewhitespace(self, text):
        """this removes whitespace, using it to split things out into separate tokens"""
        if self.keeptogether(text): return [text]
        # loop through and put tokens into a list
        tokens = []
        inwhitespace = 0
        laststart = 0
        for pos in range(len(text)):
            char = text[pos]
            if inwhitespace:
                if char not in self.whitespacechars:
                    if laststart < pos and self.includewhitespacetokens: tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 0, pos
            else:
                if char in self.whitespacechars:
                    if laststart < pos: tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 1, pos
        if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
            tokens.append(text[laststart:])
        return tokens
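
    # Illustrative usage (not part of the original module): whitespace runs act
    # as token boundaries, and are kept as tokens themselves only when
    # includewhitespacetokens is set:
    #   SimpleParser().removewhitespace("a b  c")
    #   returns ['a', 'b', 'c']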

    def applytokenizer(self, inputlist, tokenizer):
        """apply a tokenizer to a list of texts, flattening the result"""
        tokenizedlists = [tokenizer(text) for text in inputlist]
        joined = []
        # extend in an explicit loop: map(joined.extend, ...) built a throwaway
        # list of Nones and would do nothing at all under Python 3's lazy map
        for tokenizedlist in tokenizedlists:
            joined.extend(tokenizedlist)
        return joined

    def applytokenizers(self, inputlist, tokenizers):
        """apply a set of tokenizers to a set of text, flattening each time"""
        for tokenizer in tokenizers:
            inputlist = self.applytokenizer(inputlist, tokenizer)
        return inputlist

    def tokenize(self, source, tokenizers=None):
        """tokenize the source string, using the standard tokenizers by default"""
        self.source = source
        if tokenizers is None:
            tokenizers = self.standardtokenizers
        self.tokens = self.applytokenizers([self.source], tokenizers)
        return self.tokens
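
    # Illustrative usage (not part of the original module): the standard
    # pipeline applies stringtokenize, removewhitespace and separatetokens in
    # turn, flattening the token list after each pass:
    #   SimpleParser().tokenize("name = 'John Smith'")
    #   returns ['name', '=', "'John Smith'"]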

    def findtokenpos(self, tokennum):
        """finds the position of the given token in the text"""
        currenttokenpos = 0
        # scan forward through the source so that each token is found at or
        # after the position of the token before it
        for currenttokennum in range(tokennum + 1):
            currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos)
        return currenttokenpos

    def getlinepos(self, tokenpos):
        """finds the line and character position of the given character"""
        sourcecut = self.source[:tokenpos]
        line = sourcecut.count("\n") + 1
        # rfind returns -1 when there is no preceding newline, which keeps
        # charpos 1-based on the first line as well
        charpos = tokenpos - sourcecut.rfind("\n")
        return line, charpos

    def raiseerror(self, message, tokennum):
        """raises a ParserError"""
        raise ParserError(self, message, tokennum)
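
if __name__ == "__main__":
    # Minimal demonstration (not part of the original module): tokenize a
    # simple assignment and show how ParserError reports line/character
    # positions for a given token number.
    parser = SimpleParser()
    print(parser.tokenize("name = 'John Smith'"))
    # -> ['name', '=', "'John Smith'"]
    try:
        parser.raiseerror("unexpected string", 2)
    except ParserError as error:
        print(error)
        # -> unexpected string at line 1, char 8 (token "'John Smith'")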