2 # -*- coding: utf-8 -*-
4 """simple parser / string tokenizer
5 rather than returning a list of token types etc, we simply return a list of tokens...
6 each tokenizing function takes a string as input and returns a list of tokens
9 # Copyright 2002, 2003 St James Software
11 # This file is part of translate.
13 # translate is free software; you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation; either version 2 of the License, or
16 # (at your option) any later version.
18 # translate is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with translate; if not, write to the Free Software
25 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
def stringeval(text):
    """Takes away repeated quotes (escapes) and returns the string represented by the text.

    text must start and end with the same quote character (single or
    double quote); doubled occurrences of that quote character inside
    the text are collapsed back to a single one.

    Raises ValueError if text is not a properly quoted string.
    """
    if not text:
        # guard: an empty string cannot be a quoted string (would
        # otherwise raise IndexError on text[0])
        raise ValueError("error parsing escaped string: %r" % text)
    stringchar = text[0]
    if text[-1] != stringchar or stringchar not in ("'", '"'):
        # Python 2 raise-statement syntax replaced with call syntax
        raise ValueError("error parsing escaped string: %r" % text)
    return text[1:-1].replace(stringchar + stringchar, stringchar)
def stringquote(text):
    """Escapes quotes as necessary and returns a string representing the text.

    Prefers single-quote wrapping; switches to double quotes when the
    text contains a single quote, and doubles any embedded double
    quotes when both quote kinds are present.
    """
    if "'" in text:
        if '"' in text:
            # both quote kinds present: wrap in double quotes and
            # escape embedded double quotes by doubling them
            return '"' + text.replace('"', '""') + '"'
        else:
            return '"' + text + '"'
    else:
        return "'" + text + "'"
class ParserError(ValueError):
    """Intelligent parser error: reports the line/char position of the offending token."""

    def __init__(self, parser, message, tokennum):
        """Takes a message and the number of the token that caused the error.

        parser -- the parser instance (must already have tokenized, so
        findtokenpos/getlinepos/tokens are usable)
        message -- description of what went wrong
        tokennum -- index into parser.tokens of the offending token
        """
        tokenpos = parser.findtokenpos(tokennum)
        line, charpos = parser.getlinepos(tokenpos)
        # build a location-annotated message for the ValueError base class
        ValueError.__init__(
            self,
            "%s at line %d, char %d (token %r)"
            % (message, line, charpos, parser.tokens[tokennum]),
        )
        # keep references so callers can inspect the failure context
        self.parser = parser
        self.tokennum = tokennum
57 """this is a simple parser"""
def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
    """Sets up the parser.

    defaulttokenlist -- operator/punctuation tokens separatetokens()
    splits on; None selects the built-in set below
    whitespacechars -- characters treated as whitespace
    includewhitespacetokens -- when true, whitespace runs are kept as
    tokens instead of being dropped
    """
    if defaulttokenlist is None:
        # multi-character operators listed first so they are matched
        # before their single-character prefixes
        self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
        self.defaulttokenlist.extend('(),[]:=+-')
    else:
        self.defaulttokenlist = defaulttokenlist
    self.whitespacechars = whitespacechars
    self.includewhitespacetokens = includewhitespacetokens
    # order matters: extract quoted strings first, then strip
    # whitespace, then split out operator tokens
    self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
    self.quotechars = ('"', "'")
    # maps an opening quote character to its closing counterpart
    self.endquotechars = {'"': '"', "'": "'"}
    # truthy: backslash escaping is honoured inside quoted strings
    self.stringescaping = 1
def stringtokenize(self, text):
    """Makes quoted strings in text into single tokens, splitting around them.

    Recognizes the opening characters in self.quotechars and their
    closing counterparts from self.endquotechars; honours backslash
    escapes when self.stringescaping is true, and doubled closing
    quotes inside a string.  Returns a list of substrings of text.
    """
    tokens = []
    laststart = 0
    instring = 0
    endstringchar, escapechar = '', '\\'
    gotclose, gotescape = 0, 0
    for pos in range(len(text)):
        char = text[pos]
        if instring:
            if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
                # toggle so the character after an escape (including a
                # quote) is consumed without closing the string
                gotescape = not gotescape
            elif char == endstringchar:
                # toggling handles doubled quotes used as escapes
                gotclose = not gotclose
            elif gotclose:
                # first character past the closing quote: emit the string
                tokens.append(text[laststart:pos])
                instring, laststart, endstringchar = 0, pos, ''
        if not instring:
            if char in self.quotechars:
                if pos > laststart:
                    tokens.append(text[laststart:pos])
                instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
    if laststart < len(text):
        tokens.append(text[laststart:])
    return tokens
def keeptogether(self, text):
    """Checks whether a token should be kept together rather than split further."""
    # at present only quoted-string tokens are treated as indivisible
    return self.isstringtoken(text)
def isstringtoken(self, text):
    """Checks whether a token is a (quoted) string token."""
    # slicing (rather than indexing) makes empty text yield '' safely,
    # which is never a quote character
    firstchar = text[:1]
    return firstchar in self.quotechars
def separatetokens(self, text, tokenlist=None):
    """Separates out operator tokens in tokenlist from the surrounding text.

    Quoted-string tokens (per self.keeptogether) pass through untouched.
    tokenlist defaults to self.defaulttokenlist.  Returns a list of
    substrings of text.
    """
    if self.keeptogether(text):
        return [text]
    if tokenlist is None:
        tokenlist = self.defaulttokenlist
    # loop through and put tokens into a list
    tokens = []
    pos = 0
    laststart = 0
    lentext = len(text)
    while pos < lentext:
        foundtoken = 0
        # tokenlist order decides matches: longer operators must be
        # listed before their single-character prefixes
        for token in tokenlist:
            lentoken = len(token)
            if text[pos:pos + lentoken] == token:
                if laststart < pos:
                    tokens.append(text[laststart:pos])
                tokens.append(token)
                pos += lentoken
                foundtoken, laststart = 1, pos
                break
        if not foundtoken:
            pos += 1
    if laststart < lentext:
        tokens.append(text[laststart:])
    return tokens
def removewhitespace(self, text):
    """Removes whitespace, letting it separate the text into tokens.

    Whitespace runs become tokens themselves when
    self.includewhitespacetokens is true; quoted-string tokens (per
    self.keeptogether) pass through untouched.
    """
    if self.keeptogether(text):
        return [text]
    # loop through and put tokens into a list
    tokens = []
    inwhitespace = 0
    laststart = 0
    for pos in range(len(text)):
        char = text[pos]
        if inwhitespace:
            if char not in self.whitespacechars:
                # leaving a whitespace run; emit it only if requested
                if laststart < pos and self.includewhitespacetokens:
                    tokens.append(text[laststart:pos])
                inwhitespace, laststart = 0, pos
        else:
            if char in self.whitespacechars:
                if laststart < pos:
                    tokens.append(text[laststart:pos])
                inwhitespace, laststart = 1, pos
    # emit the trailing token (trailing whitespace only if requested)
    if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
        tokens.append(text[laststart:])
    return tokens
def applytokenizer(self, inputlist, tokenizer):
    """Applies a tokenizer to each text in inputlist, flattening the result."""
    joined = []
    # NOTE: the original used map(joined.extend, tokenizedlists), which
    # is a no-op under Python 3's lazy map; an explicit loop is correct
    # on both Python 2 and 3
    for text in inputlist:
        joined.extend(tokenizer(text))
    return joined
def applytokenizers(self, inputlist, tokenizers):
    """Applies each tokenizer in turn to inputlist, flattening each time.

    Returns the final flattened token list.
    """
    for tokenizer in tokenizers:
        inputlist = self.applytokenizer(inputlist, tokenizer)
    return inputlist
def tokenize(self, source, tokenizers=None):
    """Tokenizes the source string with the given (or standard) tokenizers.

    Stores the source text and resulting token list on self
    (self.source, self.tokens) so token positions can be computed
    later (findtokenpos/getlinepos), and returns the token list.
    """
    self.source = source
    if tokenizers is None:
        tokenizers = self.standardtokenizers
    self.tokens = self.applytokenizers([self.source], tokenizers)
    return self.tokens
def findtokenpos(self, tokennum):
    """Finds the character position of the given token in self.source.

    Walks tokens 0..tokennum in order, resuming each search at the
    position where the previous token was found.
    """
    # NOTE(review): the search resumes AT (not after) the previous
    # match, so repeated/overlapping tokens can resolve to the same
    # position — preserved as-is; confirm before changing
    currenttokenpos = 0
    for currenttokennum in range(tokennum + 1):
        currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos)
    return currenttokenpos
def getlinepos(self, tokenpos):
    """Finds the line and character position of the given character offset.

    Returns (line, charpos), both 1-based, computed against self.source.
    """
    sourcecut = self.source[:tokenpos]
    line = sourcecut.count("\n") + 1
    # rfind returns -1 when no newline precedes tokenpos, which makes
    # the column 1-based on the first line too
    charpos = tokenpos - sourcecut.rfind("\n")
    return line, charpos
def raiseerror(self, message, tokennum):
    """Raises a ParserError annotated with this parser's position information."""
    raise ParserError(self, message, tokennum)